aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c64
-rw-r--r--fs/9p/fid.c8
-rw-r--r--fs/9p/v9fs.c59
-rw-r--r--fs/9p/v9fs_vfs.h4
-rw-r--r--fs/9p/vfs_addr.c13
-rw-r--r--fs/9p/vfs_dentry.c12
-rw-r--r--fs/9p/vfs_dir.c13
-rw-r--r--fs/9p/vfs_file.c34
-rw-r--r--fs/9p/vfs_inode.c203
-rw-r--r--fs/9p/vfs_inode_dotl.c137
-rw-r--r--fs/9p/vfs_super.c14
-rw-r--r--fs/9p/xattr.c16
-rw-r--r--fs/Kconfig10
-rw-r--r--fs/Kconfig.binfmt3
-rw-r--r--fs/Makefile3
-rw-r--r--fs/adfs/super.c4
-rw-r--r--fs/affs/affs.h6
-rw-r--r--fs/affs/amigaffs.c6
-rw-r--r--fs/affs/namei.c8
-rw-r--r--fs/affs/super.c1
-rw-r--r--fs/afs/dir.c12
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/afs/mntpt.c4
-rw-r--r--fs/afs/rxrpc.c3
-rw-r--r--fs/afs/super.c1
-rw-r--r--fs/aio.c35
-rw-r--r--fs/attr.c4
-rw-r--r--fs/autofs4/autofs_i.h4
-rw-r--r--fs/autofs4/dev-ioctl.c11
-rw-r--r--fs/autofs4/expire.c2
-rw-r--r--fs/autofs4/inode.c11
-rw-r--r--fs/autofs4/root.c4
-rw-r--r--fs/autofs4/waitq.c62
-rw-r--r--fs/bad_inode.c6
-rw-r--r--fs/befs/linuxvfs.c1
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/bfs/inode.c1
-rw-r--r--fs/binfmt_aout.c14
-rw-r--r--fs/binfmt_elf.c4
-rw-r--r--fs/binfmt_misc.c6
-rw-r--r--fs/bio.c10
-rw-r--r--fs/block_dev.c67
-rw-r--r--fs/btrfs/async-thread.c17
-rw-r--r--fs/btrfs/async-thread.h4
-rw-r--r--fs/btrfs/backref.c7
-rw-r--r--fs/btrfs/compression.c38
-rw-r--r--fs/btrfs/compression.h2
-rw-r--r--fs/btrfs/ctree.c384
-rw-r--r--fs/btrfs/ctree.h171
-rw-r--r--fs/btrfs/delayed-inode.c33
-rw-r--r--fs/btrfs/delayed-ref.c33
-rw-r--r--fs/btrfs/dir-item.c10
-rw-r--r--fs/btrfs/disk-io.c718
-rw-r--r--fs/btrfs/disk-io.h16
-rw-r--r--fs/btrfs/export.c4
-rw-r--r--fs/btrfs/extent-tree.c658
-rw-r--r--fs/btrfs/extent_io.c1035
-rw-r--r--fs/btrfs/extent_io.h62
-rw-r--r--fs/btrfs/file-item.c53
-rw-r--r--fs/btrfs/file.c57
-rw-r--r--fs/btrfs/free-space-cache.c17
-rw-r--r--fs/btrfs/inode-item.c6
-rw-r--r--fs/btrfs/inode-map.c19
-rw-r--r--fs/btrfs/inode.c462
-rw-r--r--fs/btrfs/ioctl.c130
-rw-r--r--fs/btrfs/locking.c6
-rw-r--r--fs/btrfs/locking.h4
-rw-r--r--fs/btrfs/ordered-data.c60
-rw-r--r--fs/btrfs/ordered-data.h24
-rw-r--r--fs/btrfs/orphan.c2
-rw-r--r--fs/btrfs/reada.c10
-rw-r--r--fs/btrfs/relocation.c130
-rw-r--r--fs/btrfs/root-tree.c25
-rw-r--r--fs/btrfs/scrub.c1404
-rw-r--r--fs/btrfs/struct-funcs.c53
-rw-r--r--fs/btrfs/super.c336
-rw-r--r--fs/btrfs/transaction.c210
-rw-r--r--fs/btrfs/transaction.h3
-rw-r--r--fs/btrfs/tree-log.c96
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/volumes.c242
-rw-r--r--fs/btrfs/volumes.h4
-rw-r--r--fs/buffer.c50
-rw-r--r--fs/cachefiles/interface.c1
-rw-r--r--fs/ceph/caps.c8
-rw-r--r--fs/ceph/dir.c92
-rw-r--r--fs/ceph/export.c6
-rw-r--r--fs/ceph/inode.c4
-rw-r--r--fs/ceph/mds_client.c14
-rw-r--r--fs/ceph/mds_client.h7
-rw-r--r--fs/ceph/super.c37
-rw-r--r--fs/ceph/super.h3
-rw-r--r--fs/ceph/xattr.c26
-rw-r--r--fs/char_dev.c6
-rw-r--r--fs/cifs/Kconfig7
-rw-r--r--fs/cifs/cifs_debug.c11
-rw-r--r--fs/cifs/cifs_fs_sb.h4
-rw-r--r--fs/cifs/cifs_spnego.c10
-rw-r--r--fs/cifs/cifs_unicode.c41
-rw-r--r--fs/cifs/cifs_unicode.h20
-rw-r--r--fs/cifs/cifsacl.c2
-rw-r--r--fs/cifs/cifsencrypt.c21
-rw-r--r--fs/cifs/cifsfs.c10
-rw-r--r--fs/cifs/cifsfs.h6
-rw-r--r--fs/cifs/cifsglob.h6
-rw-r--r--fs/cifs/cifssmb.c162
-rw-r--r--fs/cifs/connect.c320
-rw-r--r--fs/cifs/dir.c26
-rw-r--r--fs/cifs/file.c69
-rw-r--r--fs/cifs/inode.c32
-rw-r--r--fs/cifs/readdir.c9
-rw-r--r--fs/cifs/sess.c45
-rw-r--r--fs/cifs/smbencrypt.c2
-rw-r--r--fs/cifs/xattr.c6
-rw-r--r--fs/coda/cnode.c38
-rw-r--r--fs/coda/coda_fs_i.h4
-rw-r--r--fs/coda/dir.c37
-rw-r--r--fs/coda/inode.c11
-rw-r--r--fs/compat.c69
-rw-r--r--fs/compat_ioctl.c39
-rw-r--r--fs/configfs/configfs_internal.h4
-rw-r--r--fs/configfs/dir.c6
-rw-r--r--fs/configfs/inode.c6
-rw-r--r--fs/cramfs/inode.c3
-rw-r--r--fs/dcache.c184
-rw-r--r--fs/debugfs/file.c117
-rw-r--r--fs/debugfs/inode.c16
-rw-r--r--fs/devpts/inode.c8
-rw-r--r--fs/direct-io.c61
-rw-r--r--fs/dlm/config.c130
-rw-r--r--fs/dlm/config.h17
-rw-r--r--fs/dlm/debug_fs.c28
-rw-r--r--fs/dlm/dir.c1
-rw-r--r--fs/dlm/dlm_internal.h60
-rw-r--r--fs/dlm/lock.c87
-rw-r--r--fs/dlm/lockspace.c71
-rw-r--r--fs/dlm/lowcomms.c2
-rw-r--r--fs/dlm/member.c486
-rw-r--r--fs/dlm/member.h10
-rw-r--r--fs/dlm/rcom.c99
-rw-r--r--fs/dlm/rcom.h2
-rw-r--r--fs/dlm/recover.c87
-rw-r--r--fs/dlm/recoverd.c53
-rw-r--r--fs/dlm/user.c5
-rw-r--r--fs/ecryptfs/crypto.c122
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h11
-rw-r--r--fs/ecryptfs/inode.c79
-rw-r--r--fs/ecryptfs/keystore.c14
-rw-r--r--fs/ecryptfs/miscdev.c140
-rw-r--r--fs/ecryptfs/mmap.c12
-rw-r--r--fs/ecryptfs/read_write.c100
-rw-r--r--fs/ecryptfs/super.c19
-rw-r--r--fs/efs/super.c1
-rw-r--r--fs/eventpoll.c268
-rw-r--r--fs/exec.c57
-rw-r--r--fs/exofs/Kconfig11
-rw-r--r--fs/exofs/Kconfig.ore12
-rw-r--r--fs/exofs/dir.c2
-rw-r--r--fs/exofs/exofs.h2
-rw-r--r--fs/exofs/inode.c2
-rw-r--r--fs/exofs/namei.c6
-rw-r--r--fs/exofs/ore.c8
-rw-r--r--fs/exofs/ore_raid.c78
-rw-r--r--fs/exofs/super.c3
-rw-r--r--fs/ext2/dir.c2
-rw-r--r--fs/ext2/ext2.h2
-rw-r--r--fs/ext2/ialloc.c9
-rw-r--r--fs/ext2/inode.c5
-rw-r--r--fs/ext2/ioctl.c34
-rw-r--r--fs/ext2/namei.c6
-rw-r--r--fs/ext2/super.c8
-rw-r--r--fs/ext2/xattr.c1
-rw-r--r--fs/ext2/xattr_security.c1
-rw-r--r--fs/ext2/xattr_trusted.c1
-rw-r--r--fs/ext2/xattr_user.c1
-rw-r--r--fs/ext3/ialloc.c10
-rw-r--r--fs/ext3/inode.c45
-rw-r--r--fs/ext3/ioctl.c26
-rw-r--r--fs/ext3/namei.c17
-rw-r--r--fs/ext3/super.c22
-rw-r--r--fs/ext3/xattr_security.c1
-rw-r--r--fs/ext3/xattr_trusted.c1
-rw-r--r--fs/ext3/xattr_user.c1
-rw-r--r--fs/ext4/balloc.c4
-rw-r--r--fs/ext4/block_validity.c1
-rw-r--r--fs/ext4/ext4.h31
-rw-r--r--fs/ext4/extents.c11
-rw-r--r--fs/ext4/ialloc.c26
-rw-r--r--fs/ext4/indirect.c1
-rw-r--r--fs/ext4/inode.c148
-rw-r--r--fs/ext4/ioctl.c116
-rw-r--r--fs/ext4/mballoc.c2
-rw-r--r--fs/ext4/migrate.c1
-rw-r--r--fs/ext4/namei.c8
-rw-r--r--fs/ext4/page-io.c1
-rw-r--r--fs/ext4/resize.c1175
-rw-r--r--fs/ext4/super.c34
-rw-r--r--fs/ext4/xattr_security.c6
-rw-r--r--fs/ext4/xattr_trusted.c1
-rw-r--r--fs/ext4/xattr_user.c1
-rw-r--r--fs/fat/fat.h6
-rw-r--r--fs/fat/file.c8
-rw-r--r--fs/fat/inode.c33
-rw-r--r--fs/fat/namei_msdos.c4
-rw-r--r--fs/fat/namei_vfat.c7
-rw-r--r--fs/fhandle.c8
-rw-r--r--fs/file_table.c23
-rw-r--r--fs/filesystems.c1
-rw-r--r--fs/freevxfs/vxfs_inode.c5
-rw-r--r--fs/fs-writeback.c37
-rw-r--r--fs/fuse/dev.c57
-rw-r--r--fs/fuse/dir.c70
-rw-r--r--fs/fuse/file.c58
-rw-r--r--fs/fuse/fuse_i.h12
-rw-r--r--fs/fuse/inode.c11
-rw-r--r--fs/gfs2/acl.c14
-rw-r--r--fs/gfs2/aops.c18
-rw-r--r--fs/gfs2/bmap.c26
-rw-r--r--fs/gfs2/dir.c64
-rw-r--r--fs/gfs2/dir.h2
-rw-r--r--fs/gfs2/export.c3
-rw-r--r--fs/gfs2/file.c38
-rw-r--r--fs/gfs2/glock.c16
-rw-r--r--fs/gfs2/glock.h7
-rw-r--r--fs/gfs2/incore.h80
-rw-r--r--fs/gfs2/inode.c91
-rw-r--r--fs/gfs2/lock_dlm.c993
-rw-r--r--fs/gfs2/log.c6
-rw-r--r--fs/gfs2/main.c13
-rw-r--r--fs/gfs2/meta_io.c4
-rw-r--r--fs/gfs2/ops_fstype.c38
-rw-r--r--fs/gfs2/quota.c91
-rw-r--r--fs/gfs2/recovery.c11
-rw-r--r--fs/gfs2/rgrp.c306
-rw-r--r--fs/gfs2/rgrp.h16
-rw-r--r--fs/gfs2/super.c23
-rw-r--r--fs/gfs2/sys.c33
-rw-r--r--fs/gfs2/sys.h2
-rw-r--r--fs/gfs2/trans.h6
-rw-r--r--fs/gfs2/xattr.c48
-rw-r--r--fs/hfs/dir.c4
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfs/super.c5
-rw-r--r--fs/hfsplus/dir.c6
-rw-r--r--fs/hfsplus/hfsplus_fs.h4
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/ioctl.c4
-rw-r--r--fs/hfsplus/options.c4
-rw-r--r--fs/hfsplus/super.c12
-rw-r--r--fs/hostfs/hostfs.h2
-rw-r--r--fs/hostfs/hostfs_kern.c11
-rw-r--r--fs/hpfs/namei.c6
-rw-r--r--fs/hpfs/super.c1
-rw-r--r--fs/hppfs/hppfs.c3
-rw-r--r--fs/hugetlbfs/inode.c69
-rw-r--r--fs/inode.c103
-rw-r--r--fs/internal.h30
-rw-r--r--fs/ioctl.c2
-rw-r--r--fs/ioprio.c24
-rw-r--r--fs/isofs/inode.c12
-rw-r--r--fs/isofs/isofs.h6
-rw-r--r--fs/jbd/checkpoint.c29
-rw-r--r--fs/jbd/commit.c6
-rw-r--r--fs/jbd/journal.c3
-rw-r--r--fs/jbd/recovery.c4
-rw-r--r--fs/jbd/revoke.c34
-rw-r--r--fs/jbd/transaction.c38
-rw-r--r--fs/jbd2/checkpoint.c2
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/jbd2/journal.c2
-rw-r--r--fs/jbd2/revoke.c34
-rw-r--r--fs/jbd2/transaction.c5
-rw-r--r--fs/jffs2/dir.c14
-rw-r--r--fs/jffs2/erase.c19
-rw-r--r--fs/jffs2/fs.c1
-rw-r--r--fs/jffs2/readinode.c22
-rw-r--r--fs/jffs2/scan.c12
-rw-r--r--fs/jffs2/super.c9
-rw-r--r--fs/jffs2/wbuf.c38
-rw-r--r--fs/jffs2/writev.c32
-rw-r--r--fs/jfs/ioctl.c4
-rw-r--r--fs/jfs/jfs_logmgr.c2
-rw-r--r--fs/jfs/jfs_txnmgr.c4
-rw-r--r--fs/jfs/namei.c6
-rw-r--r--fs/jfs/super.c5
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/lockd/mon.c2
-rw-r--r--fs/lockd/svcsubs.c2
-rw-r--r--fs/logfs/dev_mtd.c80
-rw-r--r--fs/logfs/dir.c8
-rw-r--r--fs/logfs/file.c2
-rw-r--r--fs/logfs/gc.c2
-rw-r--r--fs/logfs/inode.c7
-rw-r--r--fs/logfs/journal.c1
-rw-r--r--fs/logfs/logfs.h7
-rw-r--r--fs/logfs/readwrite.c51
-rw-r--r--fs/logfs/segment.c51
-rw-r--r--fs/logfs/super.c3
-rw-r--r--fs/minix/bitmap.c2
-rw-r--r--fs/minix/inode.c1
-rw-r--r--fs/minix/minix.h2
-rw-r--r--fs/minix/namei.c6
-rw-r--r--fs/mount.h76
-rw-r--r--fs/mpage.c4
-rw-r--r--fs/namei.c153
-rw-r--r--fs/namespace.c831
-rw-r--r--fs/ncpfs/dir.c18
-rw-r--r--fs/ncpfs/inode.c7
-rw-r--r--fs/ncpfs/ioctl.c2
-rw-r--r--fs/ncpfs/ncplib_kernel.h2
-rw-r--r--fs/ncpfs/symlink.c2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c202
-rw-r--r--fs/nfs/blocklayout/blocklayout.h12
-rw-r--r--fs/nfs/blocklayout/extents.c176
-rw-r--r--fs/nfs/callback.h2
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/callback_xdr.c4
-rw-r--r--fs/nfs/client.c12
-rw-r--r--fs/nfs/dir.c33
-rw-r--r--fs/nfs/file.c4
-rw-r--r--fs/nfs/idmap.c83
-rw-r--r--fs/nfs/inode.c48
-rw-r--r--fs/nfs/internal.h4
-rw-r--r--fs/nfs/nfs3proc.c3
-rw-r--r--fs/nfs/nfs4_fs.h3
-rw-r--r--fs/nfs/nfs4filelayout.c9
-rw-r--r--fs/nfs/nfs4filelayoutdev.c2
-rw-r--r--fs/nfs/nfs4proc.c282
-rw-r--r--fs/nfs/nfs4state.c106
-rw-r--r--fs/nfs/nfs4xdr.c140
-rw-r--r--fs/nfs/objlayout/objio_osd.c3
-rw-r--r--fs/nfs/objlayout/objlayout.c4
-rw-r--r--fs/nfs/pnfs.c42
-rw-r--r--fs/nfs/pnfs.h1
-rw-r--r--fs/nfs/proc.c3
-rw-r--r--fs/nfs/super.c92
-rw-r--r--fs/nfs/write.c31
-rw-r--r--fs/nfsd/Kconfig10
-rw-r--r--fs/nfsd/Makefile1
-rw-r--r--fs/nfsd/export.c12
-rw-r--r--fs/nfsd/fault_inject.c91
-rw-r--r--fs/nfsd/fault_inject.h28
-rw-r--r--fs/nfsd/nfs4callback.c2
-rw-r--r--fs/nfsd/nfs4idmap.c11
-rw-r--r--fs/nfsd/nfs4proc.c11
-rw-r--r--fs/nfsd/nfs4recover.c34
-rw-r--r--fs/nfsd/nfs4state.c330
-rw-r--r--fs/nfsd/nfs4xdr.c3
-rw-r--r--fs/nfsd/nfsctl.c12
-rw-r--r--fs/nfsd/nfsd.h20
-rw-r--r--fs/nfsd/nfsfh.c4
-rw-r--r--fs/nfsd/nfsfh.h2
-rw-r--r--fs/nfsd/state.h3
-rw-r--r--fs/nfsd/vfs.c55
-rw-r--r--fs/nfsd/vfs.h12
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/inode.c2
-rw-r--r--fs/nilfs2/ioctl.c24
-rw-r--r--fs/nilfs2/namei.c6
-rw-r--r--fs/nilfs2/nilfs.h2
-rw-r--r--fs/nilfs2/segment.c2
-rw-r--r--fs/nilfs2/super.c8
-rw-r--r--fs/nilfs2/the_nilfs.c7
-rw-r--r--fs/nls/nls_base.c73
-rw-r--r--fs/notify/fanotify/fanotify_user.c6
-rw-r--r--fs/notify/fsnotify.c9
-rw-r--r--fs/notify/mark.c8
-rw-r--r--fs/notify/vfsmount_mark.c19
-rw-r--r--fs/ntfs/attrib.c6
-rw-r--r--fs/ntfs/inode.c9
-rw-r--r--fs/ntfs/inode.h2
-rw-r--r--fs/ntfs/mft.c6
-rw-r--r--fs/ntfs/super.c12
-rw-r--r--fs/ntfs/volume.h4
-rw-r--r--fs/ocfs2/cluster/netdebug.c2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c24
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/ocfs2/ioctl.c4
-rw-r--r--fs/ocfs2/move_extents.c4
-rw-r--r--fs/ocfs2/namei.c10
-rw-r--r--fs/ocfs2/stack_user.c4
-rw-r--r--fs/ocfs2/super.c10
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/ocfs2/xattr.h2
-rw-r--r--fs/omfs/dir.c6
-rw-r--r--fs/omfs/inode.c2
-rw-r--r--fs/omfs/omfs.h2
-rw-r--r--fs/open.c22
-rw-r--r--fs/openpromfs/inode.c1
-rw-r--r--fs/partitions/Kconfig251
-rw-r--r--fs/partitions/Makefile20
-rw-r--r--fs/partitions/acorn.c556
-rw-r--r--fs/partitions/acorn.h14
-rw-r--r--fs/partitions/amiga.c139
-rw-r--r--fs/partitions/amiga.h6
-rw-r--r--fs/partitions/atari.c149
-rw-r--r--fs/partitions/atari.h34
-rw-r--r--fs/partitions/check.c687
-rw-r--r--fs/partitions/check.h49
-rw-r--r--fs/partitions/efi.c675
-rw-r--r--fs/partitions/efi.h134
-rw-r--r--fs/partitions/ibm.c275
-rw-r--r--fs/partitions/ibm.h1
-rw-r--r--fs/partitions/karma.c57
-rw-r--r--fs/partitions/karma.h8
-rw-r--r--fs/partitions/ldm.c1570
-rw-r--r--fs/partitions/ldm.h215
-rw-r--r--fs/partitions/mac.c134
-rw-r--r--fs/partitions/mac.h44
-rw-r--r--fs/partitions/msdos.c552
-rw-r--r--fs/partitions/msdos.h8
-rw-r--r--fs/partitions/osf.c86
-rw-r--r--fs/partitions/osf.h7
-rw-r--r--fs/partitions/sgi.c82
-rw-r--r--fs/partitions/sgi.h8
-rw-r--r--fs/partitions/sun.c122
-rw-r--r--fs/partitions/sun.h8
-rw-r--r--fs/partitions/sysv68.c95
-rw-r--r--fs/partitions/sysv68.h1
-rw-r--r--fs/partitions/ultrix.c48
-rw-r--r--fs/partitions/ultrix.h5
-rw-r--r--fs/pipe.c9
-rw-r--r--fs/pnode.c120
-rw-r--r--fs/pnode.h36
-rw-r--r--fs/proc/array.c17
-rw-r--r--fs/proc/base.c779
-rw-r--r--fs/proc/generic.c8
-rw-r--r--fs/proc/inode.c19
-rw-r--r--fs/proc/internal.h1
-rw-r--r--fs/proc/namespaces.c1
-rw-r--r--fs/proc/proc_net.c2
-rw-r--r--fs/proc/root.c70
-rw-r--r--fs/proc/stat.c61
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/proc/uptime.c11
-rw-r--r--fs/proc_namespace.c333
-rw-r--r--fs/pstore/inode.c3
-rw-r--r--fs/pstore/platform.c36
-rw-r--r--fs/qnx4/inode.c62
-rw-r--r--fs/quota/dquot.c11
-rw-r--r--fs/quota/quota.c25
-rw-r--r--fs/ramfs/inode.c8
-rw-r--r--fs/reiserfs/bitmap.c94
-rw-r--r--fs/reiserfs/inode.c2
-rw-r--r--fs/reiserfs/ioctl.c8
-rw-r--r--fs/reiserfs/journal.c64
-rw-r--r--fs/reiserfs/namei.c8
-rw-r--r--fs/reiserfs/super.c200
-rw-r--r--fs/reiserfs/xattr.c2
-rw-r--r--fs/romfs/mmap-nommu.c28
-rw-r--r--fs/romfs/super.c1
-rw-r--r--fs/select.c2
-rw-r--r--fs/seq_file.c10
-rw-r--r--fs/signalfd.c15
-rw-r--r--fs/splice.c1
-rw-r--r--fs/squashfs/cache.c30
-rw-r--r--fs/squashfs/inode.c4
-rw-r--r--fs/squashfs/squashfs_fs_sb.h1
-rw-r--r--fs/squashfs/super.c3
-rw-r--r--fs/statfs.c21
-rw-r--r--fs/super.c94
-rw-r--r--fs/sync.c1
-rw-r--r--fs/sysfs/file.c10
-rw-r--r--fs/sysfs/group.c2
-rw-r--r--fs/sysfs/inode.c7
-rw-r--r--fs/sysfs/sysfs.h4
-rw-r--r--fs/sysv/ialloc.c2
-rw-r--r--fs/sysv/inode.c1
-rw-r--r--fs/sysv/itree.c2
-rw-r--r--fs/sysv/namei.c6
-rw-r--r--fs/sysv/sysv.h2
-rw-r--r--fs/ubifs/debug.c90
-rw-r--r--fs/ubifs/debug.h75
-rw-r--r--fs/ubifs/dir.c14
-rw-r--r--fs/ubifs/ioctl.c4
-rw-r--r--fs/ubifs/journal.c7
-rw-r--r--fs/ubifs/lpt.c6
-rw-r--r--fs/ubifs/replay.c8
-rw-r--r--fs/ubifs/super.c5
-rw-r--r--fs/ubifs/tnc.c58
-rw-r--r--fs/ubifs/tnc_misc.c10
-rw-r--r--fs/ubifs/ubifs.h2
-rw-r--r--fs/ubifs/xattr.c6
-rw-r--r--fs/udf/file.c8
-rw-r--r--fs/udf/ialloc.c2
-rw-r--r--fs/udf/inode.c63
-rw-r--r--fs/udf/namei.c6
-rw-r--r--fs/udf/super.c25
-rw-r--r--fs/udf/symlink.c14
-rw-r--r--fs/udf/udf_sb.h8
-rw-r--r--fs/udf/udfdecl.h2
-rw-r--r--fs/ufs/ialloc.c2
-rw-r--r--fs/ufs/inode.c4
-rw-r--r--fs/ufs/namei.c6
-rw-r--r--fs/ufs/super.c5
-rw-r--r--fs/ufs/ufs.h2
-rw-r--r--fs/xattr.c4
-rw-r--r--fs/xfs/kmem.h6
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_aops.c29
-rw-r--r--fs/xfs/xfs_attr.c4
-rw-r--r--fs/xfs/xfs_attr_leaf.c9
-rw-r--r--fs/xfs/xfs_bmap.c116
-rw-r--r--fs/xfs/xfs_buf.c10
-rw-r--r--fs/xfs/xfs_buf.h3
-rw-r--r--fs/xfs/xfs_dfrag.c43
-rw-r--r--fs/xfs/xfs_discard.c4
-rw-r--r--fs/xfs/xfs_dquot.c625
-rw-r--r--fs/xfs/xfs_dquot.h39
-rw-r--r--fs/xfs/xfs_dquot_item.c5
-rw-r--r--fs/xfs/xfs_file.c190
-rw-r--r--fs/xfs/xfs_fs_subr.c2
-rw-r--r--fs/xfs/xfs_ialloc.c4
-rw-r--r--fs/xfs/xfs_ialloc.h2
-rw-r--r--fs/xfs/xfs_iget.c25
-rw-r--r--fs/xfs/xfs_inode.c197
-rw-r--r--fs/xfs/xfs_inode.h116
-rw-r--r--fs/xfs/xfs_inode_item.c10
-rw-r--r--fs/xfs/xfs_ioctl.c8
-rw-r--r--fs/xfs/xfs_ioctl32.c8
-rw-r--r--fs/xfs/xfs_iomap.c46
-rw-r--r--fs/xfs/xfs_iops.c54
-rw-r--r--fs/xfs/xfs_log.c79
-rw-r--r--fs/xfs/xfs_log.h8
-rw-r--r--fs/xfs/xfs_log_cil.c98
-rw-r--r--fs/xfs/xfs_log_recover.c8
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_qm.c687
-rw-r--r--fs/xfs/xfs_qm.h20
-rw-r--r--fs/xfs/xfs_qm_stats.c4
-rw-r--r--fs/xfs/xfs_qm_syscalls.c12
-rw-r--r--fs/xfs/xfs_quota.h12
-rw-r--r--fs/xfs/xfs_super.c48
-rw-r--r--fs/xfs/xfs_sync.c15
-rw-r--r--fs/xfs/xfs_trace.h36
-rw-r--r--fs/xfs/xfs_trans.c479
-rw-r--r--fs/xfs/xfs_trans.h3
-rw-r--r--fs/xfs/xfs_trans_dquot.c10
-rw-r--r--fs/xfs/xfs_utils.c2
-rw-r--r--fs/xfs/xfs_utils.h2
-rw-r--r--fs/xfs/xfs_vnodeops.c51
-rw-r--r--fs/xfs/xfs_vnodeops.h4
543 files changed, 15373 insertions, 15927 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 945aa5f02f9b..a9ea73d6dcf3 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -62,8 +62,8 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
uint16_t klen = 0;
v9ses = (struct v9fs_session_info *)cookie_netfs_data;
- P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses,
- buffer, bufmax);
+ p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n",
+ v9ses, buffer, bufmax);
if (v9ses->cachetag)
klen = strlen(v9ses->cachetag);
@@ -72,7 +72,7 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
return 0;
memcpy(buffer, v9ses->cachetag, klen);
- P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag);
+ p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag);
return klen;
}
@@ -91,14 +91,14 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
&v9fs_cache_session_index_def,
v9ses);
- P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses,
- v9ses->fscache);
+ p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n",
+ v9ses, v9ses->fscache);
}
void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
{
- P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses,
- v9ses->fscache);
+ p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n",
+ v9ses, v9ses->fscache);
fscache_relinquish_cookie(v9ses->fscache, 0);
v9ses->fscache = NULL;
}
@@ -109,8 +109,8 @@ static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
{
const struct v9fs_inode *v9inode = cookie_netfs_data;
memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path));
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode,
- v9inode->qid.path);
+ p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n",
+ &v9inode->vfs_inode, v9inode->qid.path);
return sizeof(v9inode->qid.path);
}
@@ -120,8 +120,8 @@ static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
const struct v9fs_inode *v9inode = cookie_netfs_data;
*size = i_size_read(&v9inode->vfs_inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode,
- *size);
+ p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n",
+ &v9inode->vfs_inode, *size);
}
static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
@@ -129,8 +129,8 @@ static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
{
const struct v9fs_inode *v9inode = cookie_netfs_data;
memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version));
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode,
- v9inode->qid.version);
+ p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n",
+ &v9inode->vfs_inode, v9inode->qid.version);
return sizeof(v9inode->qid.version);
}
@@ -206,8 +206,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
&v9fs_cache_inode_index_def,
v9inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
- v9inode->fscache);
+ p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
+ inode, v9inode->fscache);
}
void v9fs_cache_inode_put_cookie(struct inode *inode)
@@ -216,8 +216,8 @@ void v9fs_cache_inode_put_cookie(struct inode *inode)
if (!v9inode->fscache)
return;
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
- v9inode->fscache);
+ p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n",
+ inode, v9inode->fscache);
fscache_relinquish_cookie(v9inode->fscache, 0);
v9inode->fscache = NULL;
@@ -229,8 +229,8 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode)
if (!v9inode->fscache)
return;
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
- v9inode->fscache);
+ p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n",
+ inode, v9inode->fscache);
fscache_relinquish_cookie(v9inode->fscache, 1);
v9inode->fscache = NULL;
@@ -272,8 +272,8 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
&v9fs_cache_inode_index_def,
v9inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
- inode, old, v9inode->fscache);
+ p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
+ inode, old, v9inode->fscache);
spin_unlock(&v9inode->fscache_lock);
}
@@ -323,7 +323,7 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
int ret;
const struct v9fs_inode *v9inode = V9FS_I(inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+ p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
if (!v9inode->fscache)
return -ENOBUFS;
@@ -335,13 +335,13 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
switch (ret) {
case -ENOBUFS:
case -ENODATA:
- P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret);
+ p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret);
return 1;
case 0:
- P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+ p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
return ret;
default:
- P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+ p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
return ret;
}
}
@@ -361,7 +361,7 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
int ret;
const struct v9fs_inode *v9inode = V9FS_I(inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
+ p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages);
if (!v9inode->fscache)
return -ENOBUFS;
@@ -373,15 +373,15 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
switch (ret) {
case -ENOBUFS:
case -ENODATA:
- P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret);
+ p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret);
return 1;
case 0:
BUG_ON(!list_empty(pages));
BUG_ON(*nr_pages != 0);
- P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+ p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
return ret;
default:
- P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+ p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
return ret;
}
}
@@ -396,9 +396,9 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
int ret;
const struct v9fs_inode *v9inode = V9FS_I(inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+ p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
- P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret);
+ p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret);
if (ret != 0)
v9fs_uncache_page(inode, page);
}
@@ -409,7 +409,7 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
{
const struct v9fs_inode *v9inode = V9FS_I(inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+ p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
if (PageFsCache(page))
fscache_wait_on_page_write(v9inode->fscache, page);
}
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 85b67ffa2a43..da8eefbe830d 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -45,8 +45,8 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
{
struct v9fs_dentry *dent;
- P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n",
- fid->fid, dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "fid %d dentry %s\n",
+ fid->fid, dentry->d_name.name);
dent = dentry->d_fsdata;
if (!dent) {
@@ -79,8 +79,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
struct v9fs_dentry *dent;
struct p9_fid *fid, *ret;
- P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
- dentry->d_name.name, dentry, uid, any);
+ p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
+ dentry->d_name.name, dentry, uid, any);
dent = (struct v9fs_dentry *) dentry->d_fsdata;
ret = NULL;
if (dent) {
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 2b78014a124a..1964f98e74be 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -23,6 +23,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/fs.h>
@@ -85,15 +87,15 @@ static int get_cache_mode(char *s)
if (!strcmp(s, "loose")) {
version = CACHE_LOOSE;
- P9_DPRINTK(P9_DEBUG_9P, "Cache mode: loose\n");
+ p9_debug(P9_DEBUG_9P, "Cache mode: loose\n");
} else if (!strcmp(s, "fscache")) {
version = CACHE_FSCACHE;
- P9_DPRINTK(P9_DEBUG_9P, "Cache mode: fscache\n");
+ p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n");
} else if (!strcmp(s, "none")) {
version = CACHE_NONE;
- P9_DPRINTK(P9_DEBUG_9P, "Cache mode: none\n");
+ p9_debug(P9_DEBUG_9P, "Cache mode: none\n");
} else
- printk(KERN_INFO "9p: Unknown Cache mode %s.\n", s);
+ pr_info("Unknown Cache mode %s\n", s);
return version;
}
@@ -140,8 +142,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
case Opt_debug:
r = match_int(&args[0], &option);
if (r < 0) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
ret = r;
continue;
}
@@ -154,8 +156,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
case Opt_dfltuid:
r = match_int(&args[0], &option);
if (r < 0) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
ret = r;
continue;
}
@@ -164,8 +166,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
case Opt_dfltgid:
r = match_int(&args[0], &option);
if (r < 0) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
ret = r;
continue;
}
@@ -174,8 +176,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
case Opt_afid:
r = match_int(&args[0], &option);
if (r < 0) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
ret = r;
continue;
}
@@ -205,8 +207,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
s = match_strdup(&args[0]);
if (!s) {
ret = -ENOMEM;
- P9_DPRINTK(P9_DEBUG_ERROR,
- "problem allocating copy of cache arg\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "problem allocating copy of cache arg\n");
goto free_and_return;
}
ret = get_cache_mode(s);
@@ -223,8 +225,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
s = match_strdup(&args[0]);
if (!s) {
ret = -ENOMEM;
- P9_DPRINTK(P9_DEBUG_ERROR,
- "problem allocating copy of access arg\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "problem allocating copy of access arg\n");
goto free_and_return;
}
@@ -240,8 +242,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
v9ses->uid = simple_strtoul(s, &e, 10);
if (*e != '\0') {
ret = -EINVAL;
- printk(KERN_INFO "9p: Unknown access "
- "argument %s.\n", s);
+ pr_info("Unknown access argument %s\n",
+ s);
kfree(s);
goto free_and_return;
}
@@ -254,9 +256,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
#ifdef CONFIG_9P_FS_POSIX_ACL
v9ses->flags |= V9FS_POSIX_ACL;
#else
- P9_DPRINTK(P9_DEBUG_ERROR,
- "Not defined CONFIG_9P_FS_POSIX_ACL. "
- "Ignoring posixacl option\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n");
#endif
break;
@@ -318,7 +319,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
if (IS_ERR(v9ses->clnt)) {
retval = PTR_ERR(v9ses->clnt);
v9ses->clnt = NULL;
- P9_DPRINTK(P9_DEBUG_ERROR, "problem initializing 9p client\n");
+ p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
goto error;
}
@@ -371,7 +372,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
if (IS_ERR(fid)) {
retval = PTR_ERR(fid);
fid = NULL;
- P9_DPRINTK(P9_DEBUG_ERROR, "cannot attach\n");
+ p9_debug(P9_DEBUG_ERROR, "cannot attach\n");
goto error;
}
@@ -429,7 +430,7 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
*/
void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
- P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
+ p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
p9_client_disconnect(v9ses->clnt);
}
@@ -442,7 +443,7 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
{
- P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
+ p9_debug(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
p9_client_begin_disconnect(v9ses->clnt);
}
@@ -591,23 +592,23 @@ static void v9fs_cache_unregister(void)
static int __init init_v9fs(void)
{
int err;
- printk(KERN_INFO "Installing v9fs 9p2000 file system support\n");
+ pr_info("Installing v9fs 9p2000 file system support\n");
/* TODO: Setup list of registered trasnport modules */
err = register_filesystem(&v9fs_fs_type);
if (err < 0) {
- printk(KERN_ERR "Failed to register filesystem\n");
+ pr_err("Failed to register filesystem\n");
return err;
}
err = v9fs_cache_register();
if (err < 0) {
- printk(KERN_ERR "Failed to register v9fs for caching\n");
+ pr_err("Failed to register v9fs for caching\n");
goto out_fs_unreg;
}
err = v9fs_sysfs_init();
if (err < 0) {
- printk(KERN_ERR "Failed to register with sysfs\n");
+ pr_err("Failed to register with sysfs\n");
goto out_sysfs_cleanup;
}
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 410ffd6ceb5f..dc95a252523d 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -54,9 +54,9 @@ extern struct kmem_cache *v9fs_inode_cache;
struct inode *v9fs_alloc_inode(struct super_block *sb);
void v9fs_destroy_inode(struct inode *inode);
-struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t);
+struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t);
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, int mode, dev_t);
+ struct inode *inode, umode_t mode, dev_t);
void v9fs_evict_inode(struct inode *inode);
ino_t v9fs_qid2ino(struct p9_qid *qid);
void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 2524e4cbb8ea..0ad61c6a65a5 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -56,7 +56,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
struct inode *inode;
inode = page->mapping->host;
- P9_DPRINTK(P9_DEBUG_VFS, "\n");
+ p9_debug(P9_DEBUG_VFS, "\n");
BUG_ON(!PageLocked(page));
@@ -116,14 +116,14 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
struct inode *inode;
inode = mapping->host;
- P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
+ p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
if (ret == 0)
return ret;
ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
- P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret);
+ p9_debug(P9_DEBUG_VFS, " = %d\n", ret);
return ret;
}
@@ -263,10 +263,9 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* Now that we do caching with cache mode enabled, We need
* to support direct IO
*/
- P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
- "off/no(%lld/%lu) EINVAL\n",
- iocb->ki_filp->f_path.dentry->d_name.name,
- (long long) pos, nr_segs);
+ p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n",
+ iocb->ki_filp->f_path.dentry->d_name.name,
+ (long long)pos, nr_segs);
return -EINVAL;
}
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index e022890c6f40..d529437ff442 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -53,8 +53,8 @@
static int v9fs_dentry_delete(const struct dentry *dentry)
{
- P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
- dentry);
+ p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+ dentry->d_name.name, dentry);
return 1;
}
@@ -66,8 +66,8 @@ static int v9fs_dentry_delete(const struct dentry *dentry)
*/
static int v9fs_cached_dentry_delete(const struct dentry *dentry)
{
- P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n",
- dentry->d_name.name, dentry);
+ p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+ dentry->d_name.name, dentry);
/* Don't cache negative dentries */
if (!dentry->d_inode)
@@ -86,8 +86,8 @@ static void v9fs_dentry_release(struct dentry *dentry)
struct v9fs_dentry *dent;
struct p9_fid *temp, *current_fid;
- P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
- dentry);
+ p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+ dentry->d_name.name, dentry);
dent = dentry->d_fsdata;
if (dent) {
list_for_each_entry_safe(current_fid, temp, &dent->fidlist,
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 598fff1a54e5..ff911e779651 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -140,7 +140,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
int reclen = 0;
struct p9_rdir *rdir;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
fid = filp->private_data;
buflen = fid->clnt->msize - P9_IOHDRSZ;
@@ -168,7 +168,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
rdir->tail - rdir->head, &st);
if (err) {
- P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
err = -EIO;
p9stat_free(&st);
goto unlock_and_exit;
@@ -213,7 +213,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
struct p9_dirent curdirent;
u64 oldoffset = 0;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
fid = filp->private_data;
buflen = fid->clnt->msize - P9_READDIRHDRSZ;
@@ -244,7 +244,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
rdir->tail - rdir->head,
&curdirent);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
err = -EIO;
goto unlock_and_exit;
}
@@ -290,9 +290,8 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
struct p9_fid *fid;
fid = filp->private_data;
- P9_DPRINTK(P9_DEBUG_VFS,
- "v9fs_dir_release: inode: %p filp: %p fid: %d\n",
- inode, filp, fid ? fid->fid : -1);
+ p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n",
+ inode, filp, fid ? fid->fid : -1);
if (fid)
p9_client_clunk(fid);
return 0;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 62857a810a79..fc06fd27065e 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
struct p9_fid *fid;
int omode;
- P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
+ p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
v9inode = V9FS_I(inode);
v9ses = v9fs_inode2v9ses(inode);
if (v9fs_proto_dotl(v9ses))
@@ -135,7 +135,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
int res = 0;
struct inode *inode = filp->f_path.dentry->d_inode;
- P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
+ p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
/* No mandatory locks */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -204,7 +204,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
break;
if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
break;
- schedule_timeout_interruptible(P9_LOCK_TIMEOUT);
+ if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0)
+ break;
}
/* map 9p status to VFS status */
@@ -304,8 +305,8 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
struct inode *inode = filp->f_path.dentry->d_inode;
int ret = -ENOLCK;
- P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
- cmd, fl, filp->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
+ filp, cmd, fl, filp->f_path.dentry->d_name.name);
/* No mandatory locks */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -340,8 +341,8 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
struct inode *inode = filp->f_path.dentry->d_inode;
int ret = -ENOLCK;
- P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
- cmd, fl, filp->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
+ filp, cmd, fl, filp->f_path.dentry->d_name.name);
/* No mandatory locks */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -384,8 +385,8 @@ v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count,
{
int n, total, size;
- P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
- (long long unsigned) offset, count);
+ p9_debug(P9_DEBUG_VFS, "fid %d offset %llu count %d\n",
+ fid->fid, (long long unsigned)offset, count);
n = 0;
total = 0;
size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -443,7 +444,7 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
struct p9_fid *fid;
size_t size;
- P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
+ p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
fid = filp->private_data;
size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -470,8 +471,8 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
loff_t origin = *offset;
unsigned long pg_start, pg_end;
- P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
- (int)count, (int)*offset);
+ p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n",
+ data, (int)count, (int)*offset);
clnt = fid->clnt;
do {
@@ -552,7 +553,7 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
return retval;
mutex_lock(&inode->i_mutex);
- P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
+ p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
fid = filp->private_data;
v9fs_blank_wstat(&wstat);
@@ -575,8 +576,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
return retval;
mutex_lock(&inode->i_mutex);
- P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n",
- filp, datasync);
+ p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
fid = filp->private_data;
@@ -607,8 +607,8 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
struct inode *inode = filp->f_path.dentry->d_inode;
- P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n",
- page, (unsigned long)filp->private_data);
+ p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
+ page, (unsigned long)filp->private_data);
v9inode = V9FS_I(inode);
/* make sure the cache has finished storing the page */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 879ed8851737..014c8dd62962 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -23,6 +23,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/fs.h>
@@ -59,15 +61,13 @@ static const struct inode_operations v9fs_symlink_inode_operations;
*
*/
-static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
+static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode)
{
int res;
res = mode & 0777;
if (S_ISDIR(mode))
res |= P9_DMDIR;
if (v9fs_proto_dotu(v9ses)) {
- if (S_ISLNK(mode))
- res |= P9_DMSYMLINK;
if (v9ses->nodev == 0) {
if (S_ISSOCK(mode))
res |= P9_DMSOCKET;
@@ -85,10 +85,33 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
res |= P9_DMSETGID;
if ((mode & S_ISVTX) == S_ISVTX)
res |= P9_DMSETVTX;
- if ((mode & P9_DMLINK))
- res |= P9_DMLINK;
}
+ return res;
+}
+/**
+ * p9mode2perm- convert plan9 mode bits to unix permission bits
+ * @v9ses: v9fs session information
+ * @stat: p9_wstat from which mode need to be derived
+ *
+ */
+static int p9mode2perm(struct v9fs_session_info *v9ses,
+ struct p9_wstat *stat)
+{
+ int res;
+ int mode = stat->mode;
+
+ res = mode & S_IALLUGO;
+ if (v9fs_proto_dotu(v9ses)) {
+ if ((mode & P9_DMSETUID) == P9_DMSETUID)
+ res |= S_ISUID;
+
+ if ((mode & P9_DMSETGID) == P9_DMSETGID)
+ res |= S_ISGID;
+
+ if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
+ res |= S_ISVTX;
+ }
return res;
}
@@ -99,14 +122,14 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
* @rdev: major number, minor number in case of device files.
*
*/
-static int p9mode2unixmode(struct v9fs_session_info *v9ses,
- struct p9_wstat *stat, dev_t *rdev)
+static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
+ struct p9_wstat *stat, dev_t *rdev)
{
int res;
- int mode = stat->mode;
+ u32 mode = stat->mode;
- res = mode & S_IALLUGO;
*rdev = 0;
+ res = p9mode2perm(v9ses, stat);
if ((mode & P9_DMDIR) == P9_DMDIR)
res |= S_IFDIR;
@@ -133,24 +156,13 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses,
res |= S_IFBLK;
break;
default:
- P9_DPRINTK(P9_DEBUG_ERROR,
- "Unknown special type %c %s\n", type,
- stat->extension);
+ p9_debug(P9_DEBUG_ERROR, "Unknown special type %c %s\n",
+ type, stat->extension);
};
*rdev = MKDEV(major, minor);
} else
res |= S_IFREG;
- if (v9fs_proto_dotu(v9ses)) {
- if ((mode & P9_DMSETUID) == P9_DMSETUID)
- res |= S_ISUID;
-
- if ((mode & P9_DMSETGID) == P9_DMSETGID)
- res |= S_ISGID;
-
- if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
- res |= S_ISVTX;
- }
return res;
}
@@ -251,7 +263,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
static void v9fs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(v9fs_inode_cache, V9FS_I(inode));
}
@@ -261,7 +272,7 @@ void v9fs_destroy_inode(struct inode *inode)
}
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, int mode, dev_t rdev)
+ struct inode *inode, umode_t mode, dev_t rdev)
{
int err = 0;
@@ -281,8 +292,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
} else if (v9fs_proto_dotu(v9ses)) {
inode->i_op = &v9fs_file_inode_operations;
} else {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "special files without extended mode\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "special files without extended mode\n");
err = -EINVAL;
goto error;
}
@@ -307,8 +318,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
break;
case S_IFLNK:
if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
- P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
- "legacy protocol.\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "extended modes used with legacy protocol\n");
err = -EINVAL;
goto error;
}
@@ -335,8 +346,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
break;
default:
- P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
- mode, mode & S_IFMT);
+ p9_debug(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n",
+ mode, mode & S_IFMT);
err = -EINVAL;
goto error;
}
@@ -352,17 +363,18 @@ error:
*
*/
-struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev)
+struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
{
int err;
struct inode *inode;
struct v9fs_session_info *v9ses = sb->s_fs_info;
- P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
+ p9_debug(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode);
inode = new_inode(sb);
if (!inode) {
- P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
+ pr_warn("%s (%d): Problem allocating inode\n",
+ __func__, task_pid_nr(current));
return ERR_PTR(-ENOMEM);
}
err = v9fs_init_inode(v9ses, inode, mode, rdev);
@@ -492,7 +504,8 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
int new)
{
dev_t rdev;
- int retval, umode;
+ int retval;
+ umode_t umode;
unsigned long i_ino;
struct inode *inode;
struct v9fs_session_info *v9ses = sb->s_fs_info;
@@ -578,15 +591,15 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
struct p9_fid *v9fid, *dfid;
struct v9fs_session_info *v9ses;
- P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n",
- dir, dentry, flags);
+ p9_debug(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n",
+ dir, dentry, flags);
v9ses = v9fs_inode2v9ses(dir);
inode = dentry->d_inode;
dfid = v9fs_fid_lookup(dentry->d_parent);
if (IS_ERR(dfid)) {
retval = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", retval);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval);
return retval;
}
if (v9fs_proto_dotl(v9ses))
@@ -635,7 +648,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
struct p9_fid *dfid, *ofid, *fid;
struct inode *inode;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
err = 0;
ofid = NULL;
@@ -644,7 +657,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
dfid = v9fs_fid_lookup(dentry->d_parent);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
return ERR_PTR(err);
}
@@ -652,36 +665,41 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
ofid = p9_client_walk(dfid, 0, NULL, 1);
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
return ERR_PTR(err);
}
err = p9_client_fcreate(ofid, name, perm, mode, extension);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
goto error;
}
- /* now walk from the parent so we can get unopened fid */
- fid = p9_client_walk(dfid, 1, &name, 1);
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
- fid = NULL;
- goto error;
- }
-
- /* instantiate inode and assign the unopened fid to the dentry */
- inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
- goto error;
+ if (!(perm & P9_DMLINK)) {
+ /* now walk from the parent so we can get unopened fid */
+ fid = p9_client_walk(dfid, 1, &name, 1);
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ p9_debug(P9_DEBUG_VFS,
+ "p9_client_walk failed %d\n", err);
+ fid = NULL;
+ goto error;
+ }
+ /*
+ * instantiate inode and assign the unopened fid to the dentry
+ */
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ p9_debug(P9_DEBUG_VFS,
+ "inode creation failed %d\n", err);
+ goto error;
+ }
+ err = v9fs_fid_add(dentry, fid);
+ if (err < 0)
+ goto error;
+ d_instantiate(dentry, inode);
}
- err = v9fs_fid_add(dentry, fid);
- if (err < 0)
- goto error;
- d_instantiate(dentry, inode);
return ofid;
error:
if (ofid)
@@ -703,7 +721,7 @@ error:
*/
static int
-v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
+v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
int err;
@@ -786,14 +804,14 @@ error:
*
*/
-static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int err;
u32 perm;
struct p9_fid *fid;
struct v9fs_session_info *v9ses;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
err = 0;
v9ses = v9fs_inode2v9ses(dir);
perm = unixmode2p9mode(v9ses, mode | S_IFDIR);
@@ -831,8 +849,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
char *name;
int result = 0;
- P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
- dir, dentry->d_name.name, dentry, nameidata);
+ p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
+ dir, dentry->d_name.name, dentry, nameidata);
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
@@ -938,7 +956,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct p9_fid *newdirfid;
struct p9_wstat wstat;
- P9_DPRINTK(P9_DEBUG_VFS, "\n");
+ p9_debug(P9_DEBUG_VFS, "\n");
retval = 0;
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
@@ -974,8 +992,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* 9P .u can only handle file rename in the same directory
*/
- P9_DPRINTK(P9_DEBUG_ERROR,
- "old dir and new dir are different\n");
+ p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n");
retval = -EXDEV;
goto clunk_newdir;
}
@@ -1031,7 +1048,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct p9_fid *fid;
struct p9_wstat *st;
- P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
+ p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
err = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
@@ -1068,7 +1085,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
struct p9_fid *fid;
struct p9_wstat wstat;
- P9_DPRINTK(P9_DEBUG_VFS, "\n");
+ p9_debug(P9_DEBUG_VFS, "\n");
retval = inode_change_ok(dentry->d_inode, iattr);
if (retval)
return retval;
@@ -1131,7 +1148,7 @@ void
v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
struct super_block *sb)
{
- mode_t mode;
+ umode_t mode;
char ext[32];
char tag_name[14];
unsigned int i_nlink;
@@ -1167,7 +1184,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
set_nlink(inode, i_nlink);
}
}
- mode = stat->mode & S_IALLUGO;
+ mode = p9mode2perm(v9ses, stat);
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
i_size_write(inode, stat->length);
@@ -1213,7 +1230,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
struct p9_fid *fid;
struct p9_wstat *st;
- P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
retval = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
fid = v9fs_fid_lookup(dentry);
@@ -1235,8 +1252,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
/* copy extension buffer into buffer */
strncpy(buffer, st->extension, buflen);
- P9_DPRINTK(P9_DEBUG_VFS,
- "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer);
+ p9_debug(P9_DEBUG_VFS, "%s -> %s (%s)\n",
+ dentry->d_name.name, st->extension, buffer);
retval = strnlen(buffer, buflen);
done:
@@ -1257,7 +1274,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
int len = 0;
char *link = __getname();
- P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
if (!link)
link = ERR_PTR(-ENOMEM);
@@ -1288,8 +1305,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
{
char *s = nd_get_link(nd);
- P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name,
- IS_ERR(s) ? "<error>" : s);
+ p9_debug(P9_DEBUG_VFS, " %s %s\n",
+ dentry->d_name.name, IS_ERR(s) ? "<error>" : s);
if (!IS_ERR(s))
__putname(s);
}
@@ -1304,19 +1321,17 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
*/
static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
- int mode, const char *extension)
+ u32 perm, const char *extension)
{
- u32 perm;
struct p9_fid *fid;
struct v9fs_session_info *v9ses;
v9ses = v9fs_inode2v9ses(dir);
if (!v9fs_proto_dotu(v9ses)) {
- P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n");
+ p9_debug(P9_DEBUG_ERROR, "not extended\n");
return -EPERM;
}
- perm = unixmode2p9mode(v9ses, mode);
fid = v9fs_create(v9ses, dir, dentry, (char *) extension, perm,
P9_OREAD);
if (IS_ERR(fid))
@@ -1340,10 +1355,10 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
static int
v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
{
- P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino,
- dentry->d_name.name, symname);
+ p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
+ dir->i_ino, dentry->d_name.name, symname);
- return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname);
+ return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname);
}
/**
@@ -1362,9 +1377,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
char *name;
struct p9_fid *oldfid;
- P9_DPRINTK(P9_DEBUG_VFS,
- " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
- old_dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
+ dir->i_ino, dentry->d_name.name, old_dentry->d_name.name);
oldfid = v9fs_fid_clone(old_dentry);
if (IS_ERR(oldfid))
@@ -1398,14 +1412,16 @@ clunk_fid:
*/
static int
-v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
+v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
{
+ struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
int retval;
char *name;
+ u32 perm;
- P9_DPRINTK(P9_DEBUG_VFS,
- " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
- dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
+ p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n",
+ dir->i_ino, dentry->d_name.name, mode,
+ MAJOR(rdev), MINOR(rdev));
if (!new_valid_dev(rdev))
return -EINVAL;
@@ -1427,7 +1443,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
return -EINVAL;
}
- retval = v9fs_vfs_mkspecial(dir, dentry, mode, name);
+ perm = unixmode2p9mode(v9ses, mode);
+ retval = v9fs_vfs_mkspecial(dir, dentry, perm, name);
__putname(name);
return retval;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 0b5745e21946..a1e6c990cd41 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -48,7 +48,7 @@
#include "acl.h"
static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
dev_t rdev);
/**
@@ -253,7 +253,7 @@ int v9fs_open_to_dotl_flags(int flags)
*/
static int
-v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
+v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
struct nameidata *nd)
{
int err = 0;
@@ -283,13 +283,13 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
}
name = (char *) dentry->d_name.name;
- P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
- "mode:0x%x\n", name, flags, omode);
+ p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n",
+ name, flags, omode);
dfid = v9fs_fid_lookup(dentry->d_parent);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
return err;
}
@@ -297,7 +297,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
ofid = p9_client_walk(dfid, 0, NULL, 1);
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
return err;
}
@@ -307,16 +307,15 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
/* Update mode based on ACL value */
err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
if (err) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "Failed to get acl values in creat %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
+ err);
goto error;
}
err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
mode, gid, &qid);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "p9_client_open_dotl failed in creat %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
+ err);
goto error;
}
v9fs_invalidate_inode_attr(dir);
@@ -325,14 +324,14 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
fid = p9_client_walk(dfid, 1, &name, 1);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
fid = NULL;
goto error;
}
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
goto error;
}
err = v9fs_fid_add(dentry, fid);
@@ -395,7 +394,7 @@ err_clunk_old_fid:
*/
static int v9fs_vfs_mkdir_dotl(struct inode *dir,
- struct dentry *dentry, int omode)
+ struct dentry *dentry, umode_t omode)
{
int err;
struct v9fs_session_info *v9ses;
@@ -408,7 +407,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
struct dentry *dir_dentry;
struct posix_acl *dacl = NULL, *pacl = NULL;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
err = 0;
v9ses = v9fs_inode2v9ses(dir);
@@ -420,7 +419,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
dfid = v9fs_fid_lookup(dir_dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
dfid = NULL;
goto error;
}
@@ -430,8 +429,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
/* Update mode based on ACL value */
err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
if (err) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "Failed to get acl values in mkdir %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mkdir %d\n",
+ err);
goto error;
}
name = (char *) dentry->d_name.name;
@@ -444,8 +443,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
fid = p9_client_walk(dfid, 1, &name, 1);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+ err);
fid = NULL;
goto error;
}
@@ -453,8 +452,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
goto error;
}
err = v9fs_fid_add(dentry, fid);
@@ -495,7 +494,7 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
struct p9_fid *fid;
struct p9_stat_dotl *st;
- P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
+ p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
err = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
@@ -523,6 +522,46 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
return 0;
}
+/*
+ * Attribute flags.
+ */
+#define P9_ATTR_MODE (1 << 0)
+#define P9_ATTR_UID (1 << 1)
+#define P9_ATTR_GID (1 << 2)
+#define P9_ATTR_SIZE (1 << 3)
+#define P9_ATTR_ATIME (1 << 4)
+#define P9_ATTR_MTIME (1 << 5)
+#define P9_ATTR_CTIME (1 << 6)
+#define P9_ATTR_ATIME_SET (1 << 7)
+#define P9_ATTR_MTIME_SET (1 << 8)
+
+struct dotl_iattr_map {
+ int iattr_valid;
+ int p9_iattr_valid;
+};
+
+static int v9fs_mapped_iattr_valid(int iattr_valid)
+{
+ int i;
+ int p9_iattr_valid = 0;
+ struct dotl_iattr_map dotl_iattr_map[] = {
+ { ATTR_MODE, P9_ATTR_MODE },
+ { ATTR_UID, P9_ATTR_UID },
+ { ATTR_GID, P9_ATTR_GID },
+ { ATTR_SIZE, P9_ATTR_SIZE },
+ { ATTR_ATIME, P9_ATTR_ATIME },
+ { ATTR_MTIME, P9_ATTR_MTIME },
+ { ATTR_CTIME, P9_ATTR_CTIME },
+ { ATTR_ATIME_SET, P9_ATTR_ATIME_SET },
+ { ATTR_MTIME_SET, P9_ATTR_MTIME_SET },
+ };
+ for (i = 0; i < ARRAY_SIZE(dotl_iattr_map); i++) {
+ if (iattr_valid & dotl_iattr_map[i].iattr_valid)
+ p9_iattr_valid |= dotl_iattr_map[i].p9_iattr_valid;
+ }
+ return p9_iattr_valid;
+}
+
/**
* v9fs_vfs_setattr_dotl - set file metadata
* @dentry: file whose metadata to set
@@ -537,13 +576,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
struct p9_fid *fid;
struct p9_iattr_dotl p9attr;
- P9_DPRINTK(P9_DEBUG_VFS, "\n");
+ p9_debug(P9_DEBUG_VFS, "\n");
retval = inode_change_ok(dentry->d_inode, iattr);
if (retval)
return retval;
- p9attr.valid = iattr->ia_valid;
+ p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid);
p9attr.mode = iattr->ia_mode;
p9attr.uid = iattr->ia_uid;
p9attr.gid = iattr->ia_gid;
@@ -594,7 +633,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
void
v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
{
- mode_t mode;
+ umode_t mode;
struct v9fs_inode *v9inode = V9FS_I(inode);
if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
@@ -670,14 +709,13 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
struct v9fs_session_info *v9ses;
name = (char *) dentry->d_name.name;
- P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
- dir->i_ino, name, symname);
+ p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname);
v9ses = v9fs_inode2v9ses(dir);
dfid = v9fs_fid_lookup(dentry->d_parent);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
return err;
}
@@ -687,7 +725,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
goto error;
}
@@ -697,8 +735,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
fid = p9_client_walk(dfid, 1, &name, 1);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+ err);
fid = NULL;
goto error;
}
@@ -707,8 +745,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
goto error;
}
err = v9fs_fid_add(dentry, fid);
@@ -751,9 +789,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
struct p9_fid *dfid, *oldfid;
struct v9fs_session_info *v9ses;
- P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
- dir->i_ino, old_dentry->d_name.name,
- dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
+ dir->i_ino, old_dentry->d_name.name, dentry->d_name.name);
v9ses = v9fs_inode2v9ses(dir);
dir_dentry = v9fs_dentry_from_dir_inode(dir);
@@ -770,7 +807,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
return err;
}
@@ -799,7 +836,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
*
*/
static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
dev_t rdev)
{
int err;
@@ -813,9 +850,9 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
struct dentry *dir_dentry;
struct posix_acl *dacl = NULL, *pacl = NULL;
- P9_DPRINTK(P9_DEBUG_VFS,
- " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
- dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
+ p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n",
+ dir->i_ino, dentry->d_name.name, omode,
+ MAJOR(rdev), MINOR(rdev));
if (!new_valid_dev(rdev))
return -EINVAL;
@@ -825,7 +862,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
dfid = v9fs_fid_lookup(dir_dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
dfid = NULL;
goto error;
}
@@ -835,8 +872,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
/* Update mode based on ACL value */
err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
if (err) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "Failed to get acl values in mknod %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mknod %d\n",
+ err);
goto error;
}
name = (char *) dentry->d_name.name;
@@ -851,8 +888,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
fid = p9_client_walk(dfid, 1, &name, 1);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+ err);
fid = NULL;
goto error;
}
@@ -860,8 +897,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
goto error;
}
err = v9fs_fid_add(dentry, fid);
@@ -905,7 +942,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
char *link = __getname();
char *target;
- P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
if (!link) {
link = ERR_PTR(-ENOMEM);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index c70251d47ed1..7b0cd87b07c2 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -117,11 +117,11 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
struct inode *inode = NULL;
struct dentry *root = NULL;
struct v9fs_session_info *v9ses = NULL;
- int mode = S_IRWXUGO | S_ISVTX;
+ umode_t mode = S_IRWXUGO | S_ISVTX;
struct p9_fid *fid;
int retval = 0;
- P9_DPRINTK(P9_DEBUG_VFS, " \n");
+ p9_debug(P9_DEBUG_VFS, "\n");
v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
if (!v9ses)
@@ -191,7 +191,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
goto release_sb;
v9fs_fid_add(root, fid);
- P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
+ p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n");
return dget(sb->s_root);
clunk_fid:
@@ -223,7 +223,7 @@ static void v9fs_kill_super(struct super_block *s)
{
struct v9fs_session_info *v9ses = s->s_fs_info;
- P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
+ p9_debug(P9_DEBUG_VFS, " %p\n", s);
kill_anon_super(s);
@@ -231,7 +231,7 @@ static void v9fs_kill_super(struct super_block *s)
v9fs_session_close(v9ses);
kfree(v9ses);
s->s_fs_info = NULL;
- P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n");
+ p9_debug(P9_DEBUG_VFS, "exiting kill_super\n");
}
static void
@@ -303,7 +303,7 @@ static int v9fs_write_inode(struct inode *inode,
* send an fsync request to server irrespective of
* wbc->sync_mode.
*/
- P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
+ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
v9inode = V9FS_I(inode);
if (!v9inode->writeback_fid)
return 0;
@@ -326,7 +326,7 @@ static int v9fs_write_inode_dotl(struct inode *inode,
* send an fsync request to server irrespective of
* wbc->sync_mode.
*/
- P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
+ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
v9inode = V9FS_I(inode);
if (!v9inode->writeback_fid)
return 0;
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index d288773871b3..29653b70a9c3 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -32,8 +32,8 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
if (IS_ERR(attr_fid)) {
retval = PTR_ERR(attr_fid);
- P9_DPRINTK(P9_DEBUG_VFS,
- "p9_client_attrwalk failed %zd\n", retval);
+ p9_debug(P9_DEBUG_VFS, "p9_client_attrwalk failed %zd\n",
+ retval);
attr_fid = NULL;
goto error;
}
@@ -87,8 +87,8 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
{
struct p9_fid *fid;
- P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
- __func__, name, buffer_size);
+ p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n",
+ name, buffer_size);
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return PTR_ERR(fid);
@@ -115,8 +115,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
int retval, msize, write_count;
struct p9_fid *fid = NULL;
- P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n",
- __func__, name, value_len, flags);
+ p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
+ name, value_len, flags);
fid = v9fs_fid_clone(dentry);
if (IS_ERR(fid)) {
@@ -129,8 +129,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
*/
retval = p9_client_xattrcreate(fid, name, value_len, flags);
if (retval < 0) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "p9_client_xattrcreate failed %d\n", retval);
+ p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n",
+ retval);
goto error;
}
msize = fid->clnt->msize;
diff --git a/fs/Kconfig b/fs/Kconfig
index 5f4c45d4aa10..d621f02a3f9e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -218,6 +218,8 @@ source "fs/exofs/Kconfig"
endif # MISC_FILESYSTEMS
+source "fs/exofs/Kconfig.ore"
+
menuconfig NETWORK_FILESYSTEMS
bool "Network File Systems"
default y
@@ -266,14 +268,6 @@ source "fs/9p/Kconfig"
endif # NETWORK_FILESYSTEMS
-if BLOCK
-menu "Partition Types"
-
-source "fs/partitions/Kconfig"
-
-endmenu
-endif
-
source "fs/nls/Kconfig"
source "fs/dlm/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 79e2ca7973b7..e95d1b64082c 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -27,6 +27,9 @@ config COMPAT_BINFMT_ELF
bool
depends on COMPAT && BINFMT_ELF
+config ARCH_BINFMT_ELF_RANDOMIZE_PIE
+ bool
+
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y
diff --git a/fs/Makefile b/fs/Makefile
index d2c3353d5477..93804d4d66e1 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -19,6 +19,8 @@ else
obj-y += no-block.o
endif
+obj-$(CONFIG_PROC_FS) += proc_namespace.o
+
obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
obj-y += notify/
obj-$(CONFIG_EPOLL) += eventpoll.o
@@ -52,7 +54,6 @@ obj-$(CONFIG_FHANDLE) += fhandle.o
obj-y += quota/
obj-$(CONFIG_PROC_FS) += proc/
-obj-y += partitions/
obj-$(CONFIG_SYSFS) += sysfs/
obj-$(CONFIG_CONFIGFS_FS) += configfs/
obj-y += devpts/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index c8bf36a1996a..8e3b36ace305 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -126,9 +126,9 @@ static void adfs_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
}
-static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
+static int adfs_show_options(struct seq_file *seq, struct dentry *root)
{
- struct adfs_sb_info *asb = ADFS_SB(mnt->mnt_sb);
+ struct adfs_sb_info *asb = ADFS_SB(root->d_sb);
if (asb->s_uid != 0)
seq_printf(seq, ",uid=%u", asb->s_uid);
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index c2b9c79eb64e..45a0ce45d7b4 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -136,7 +136,7 @@ extern int affs_remove_header(struct dentry *dentry);
extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh);
extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
extern void secs_to_datestamp(time_t secs, struct affs_date *ds);
-extern mode_t prot_to_mode(u32 prot);
+extern umode_t prot_to_mode(u32 prot);
extern void mode_to_prot(struct inode *inode);
extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...);
extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...);
@@ -156,8 +156,8 @@ extern void affs_free_bitmap(struct super_block *sb);
extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len);
extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *);
extern int affs_unlink(struct inode *dir, struct dentry *dentry);
-extern int affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *);
-extern int affs_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *);
+extern int affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
extern int affs_rmdir(struct inode *dir, struct dentry *dentry);
extern int affs_link(struct dentry *olddentry, struct inode *dir,
struct dentry *dentry);
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index de37ec842340..52a6407682e6 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -390,10 +390,10 @@ secs_to_datestamp(time_t secs, struct affs_date *ds)
ds->ticks = cpu_to_be32(secs * 50);
}
-mode_t
+umode_t
prot_to_mode(u32 prot)
{
- int mode = 0;
+ umode_t mode = 0;
if (!(prot & FIBF_NOWRITE))
mode |= S_IWUSR;
@@ -421,7 +421,7 @@ void
mode_to_prot(struct inode *inode)
{
u32 prot = AFFS_I(inode)->i_protect;
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
if (!(mode & S_IXUSR))
prot |= FIBF_NOEXECUTE;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 780a11dc6318..47806940aac0 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -255,13 +255,13 @@ affs_unlink(struct inode *dir, struct dentry *dentry)
}
int
-affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
+affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd)
{
struct super_block *sb = dir->i_sb;
struct inode *inode;
int error;
- pr_debug("AFFS: create(%lu,\"%.*s\",0%o)\n",dir->i_ino,(int)dentry->d_name.len,
+ pr_debug("AFFS: create(%lu,\"%.*s\",0%ho)\n",dir->i_ino,(int)dentry->d_name.len,
dentry->d_name.name,mode);
inode = affs_new_inode(dir);
@@ -285,12 +285,12 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata
}
int
-affs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
int error;
- pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%o)\n",dir->i_ino,
+ pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%ho)\n",dir->i_ino,
(int)dentry->d_name.len,dentry->d_name.name,mode);
inode = affs_new_inode(dir);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index b31507d0f9b9..8ba73fed7964 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -98,7 +98,6 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
static void affs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 1b0b19550015..e22dc4b4a503 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -28,9 +28,9 @@ static int afs_d_delete(const struct dentry *dentry);
static void afs_d_release(struct dentry *dentry);
static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
-static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
+static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd);
-static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
static int afs_rmdir(struct inode *dir, struct dentry *dentry);
static int afs_unlink(struct inode *dir, struct dentry *dentry);
static int afs_link(struct dentry *from, struct inode *dir,
@@ -764,7 +764,7 @@ static void afs_d_release(struct dentry *dentry)
/*
* create a directory on an AFS filesystem
*/
-static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct afs_file_status status;
struct afs_callback cb;
@@ -777,7 +777,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
dvnode = AFS_FS_I(dir);
- _enter("{%x:%u},{%s},%o",
+ _enter("{%x:%u},{%s},%ho",
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
ret = -ENAMETOOLONG;
@@ -948,7 +948,7 @@ error:
/*
* create a regular file on an AFS filesystem
*/
-static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
+static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
struct afs_file_status status;
@@ -962,7 +962,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
dvnode = AFS_FS_I(dir);
- _enter("{%x:%u},{%s},%o,",
+ _enter("{%x:%u},{%s},%ho,",
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
ret = -ENAMETOOLONG;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index d2b0888126d4..a306bb6d88d9 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -109,7 +109,7 @@ struct afs_call {
unsigned reply_size; /* current size of reply */
unsigned first_offset; /* offset into mapping[first] */
unsigned last_to; /* amount of mapping[last] */
- unsigned short offset; /* offset into received data store */
+ unsigned offset; /* offset into received data store */
unsigned char unmarshall; /* unmarshalling phase */
bool incoming; /* T if incoming call */
bool send_pages; /* T if data from mapping should be sent */
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index aa59184151d0..8f4ce2658b7d 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -242,7 +242,7 @@ struct vfsmount *afs_d_automount(struct path *path)
{
struct vfsmount *newmnt;
- _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name);
+ _enter("{%s}", path->dentry->d_name.name);
newmnt = afs_mntpt_do_automount(path->dentry);
if (IS_ERR(newmnt))
@@ -252,7 +252,7 @@ struct vfsmount *afs_d_automount(struct path *path)
mnt_set_expiry(newmnt, &afs_vfsmounts);
queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
afs_mntpt_expiry_timeout * HZ);
- _leave(" = %p {%s}", newmnt, newmnt->mnt_devname);
+ _leave(" = %p", newmnt);
return newmnt;
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index e45a323aebb4..8ad8c2a0703a 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -314,6 +314,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
struct msghdr msg;
struct kvec iov[1];
int ret;
+ struct sk_buff *skb;
_enter("%x,{%d},", addr->s_addr, ntohs(call->port));
@@ -380,6 +381,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
error_do_abort:
rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
+ while ((skb = skb_dequeue(&call->rx_queue)))
+ afs_free_skb(skb);
rxrpc_kernel_end_call(rxcall);
call->rxcall = NULL;
error_kill_call:
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 356dcf0929e8..983ec59fc80d 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -495,7 +495,6 @@ static void afs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
struct afs_vnode *vnode = AFS_FS_I(inode);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(afs_inode_cachep, vnode);
}
diff --git a/fs/aio.c b/fs/aio.c
index 78c514cfd212..b9d64d89a043 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -228,12 +228,6 @@ static void __put_ioctx(struct kioctx *ctx)
call_rcu(&ctx->rcu_head, ctx_rcu_free);
}
-static inline void get_ioctx(struct kioctx *kioctx)
-{
- BUG_ON(atomic_read(&kioctx->users) <= 0);
- atomic_inc(&kioctx->users);
-}
-
static inline int try_get_ioctx(struct kioctx *kioctx)
{
return atomic_inc_not_zero(&kioctx->users);
@@ -273,7 +267,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
mm = ctx->mm = current->mm;
atomic_inc(&mm->mm_count);
- atomic_set(&ctx->users, 1);
+ atomic_set(&ctx->users, 2);
spin_lock_init(&ctx->ctx_lock);
spin_lock_init(&ctx->ring_info.ring_lock);
init_waitqueue_head(&ctx->wait);
@@ -476,14 +470,23 @@ static void kiocb_batch_init(struct kiocb_batch *batch, long total)
batch->count = total;
}
-static void kiocb_batch_free(struct kiocb_batch *batch)
+static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch)
{
struct kiocb *req, *n;
+ if (list_empty(&batch->head))
+ return;
+
+ spin_lock_irq(&ctx->ctx_lock);
list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
list_del(&req->ki_batch);
+ list_del(&req->ki_list);
kmem_cache_free(kiocb_cachep, req);
+ ctx->reqs_active--;
}
+ if (unlikely(!ctx->reqs_active && ctx->dead))
+ wake_up_all(&ctx->wait);
+ spin_unlock_irq(&ctx->ctx_lock);
}
/*
@@ -600,11 +603,16 @@ static void aio_fput_routine(struct work_struct *data)
fput(req->ki_filp);
/* Link the iocb into the context's free list */
+ rcu_read_lock();
spin_lock_irq(&ctx->ctx_lock);
really_put_req(ctx, req);
+ /*
+ * at that point ctx might've been killed, but actual
+ * freeing is RCU'd
+ */
spin_unlock_irq(&ctx->ctx_lock);
+ rcu_read_unlock();
- put_ioctx(ctx);
spin_lock_irq(&fput_lock);
}
spin_unlock_irq(&fput_lock);
@@ -635,7 +643,6 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
* this function will be executed w/out any aio kthread wakeup.
*/
if (unlikely(!fput_atomic(req->ki_filp))) {
- get_ioctx(ctx);
spin_lock(&fput_lock);
list_add(&req->ki_list, &fput_head);
spin_unlock(&fput_lock);
@@ -1329,10 +1336,10 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
ret = PTR_ERR(ioctx);
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
- if (!ret)
+ if (!ret) {
+ put_ioctx(ioctx);
return 0;
-
- get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */
+ }
io_destroy(ioctx);
}
@@ -1742,7 +1749,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
}
blk_finish_plug(&plug);
- kiocb_batch_free(&batch);
+ kiocb_batch_free(ctx, &batch);
put_ioctx(ctx);
return i ? i : ret;
}
diff --git a/fs/attr.c b/fs/attr.c
index 7ee7ba488313..95053ad8abcc 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -166,7 +166,7 @@ EXPORT_SYMBOL(setattr_copy);
int notify_change(struct dentry * dentry, struct iattr * attr)
{
struct inode *inode = dentry->d_inode;
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
int error;
struct timespec now;
unsigned int ia_valid = attr->ia_valid;
@@ -177,7 +177,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
}
if ((ia_valid & ATTR_MODE)) {
- mode_t amode = attr->ia_mode;
+ umode_t amode = attr->ia_mode;
/* Flag setting protected by i_mutex */
if (is_sxid(amode))
inode->i_flags &= ~S_NOSEC;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 326dc08d3e3f..eb1cc92cd67d 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -110,12 +110,14 @@ struct autofs_sb_info {
int sub_version;
int min_proto;
int max_proto;
+ int compat_daemon;
unsigned long exp_timeout;
unsigned int type;
int reghost_enabled;
int needs_reghost;
struct super_block *sb;
struct mutex wq_mutex;
+ struct mutex pipe_mutex;
spinlock_t fs_lock;
struct autofs_wait_queue *queues; /* Wait queue pointer */
spinlock_t lookup_lock;
@@ -155,7 +157,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
return 0;
}
-struct inode *autofs4_get_inode(struct super_block *, mode_t);
+struct inode *autofs4_get_inode(struct super_block *, umode_t);
void autofs4_free_ino(struct autofs_info *);
/* Expiration */
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 509fe1eb66ae..85f1fcdb30e7 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -194,7 +194,7 @@ static int find_autofs_mount(const char *pathname,
return err;
err = -ENOENT;
while (path.dentry == path.mnt->mnt_root) {
- if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) {
+ if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) {
if (test(&path, data)) {
path_get(&path);
if (!err) /* already found some */
@@ -212,7 +212,7 @@ static int find_autofs_mount(const char *pathname,
static int test_by_dev(struct path *path, void *p)
{
- return path->mnt->mnt_sb->s_dev == *(dev_t *)p;
+ return path->dentry->d_sb->s_dev == *(dev_t *)p;
}
static int test_by_type(struct path *path, void *p)
@@ -385,6 +385,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
sbi->pipefd = pipefd;
sbi->pipe = pipe;
sbi->catatonic = 0;
+ sbi->compat_daemon = is_compat_task();
}
out:
mutex_unlock(&sbi->wq_mutex);
@@ -538,11 +539,11 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
err = find_autofs_mount(name, &path, test_by_type, &type);
if (err)
goto out;
- devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
+ devid = new_encode_dev(path.dentry->d_sb->s_dev);
err = 0;
if (path.mnt->mnt_root == path.dentry) {
err = 1;
- magic = path.mnt->mnt_sb->s_magic;
+ magic = path.dentry->d_sb->s_magic;
}
} else {
dev_t dev = sbi->sb->s_dev;
@@ -556,7 +557,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
err = have_submounts(path.dentry);
if (follow_down_one(&path))
- magic = path.mnt->mnt_sb->s_magic;
+ magic = path.dentry->d_sb->s_magic;
}
param->ismountpoint.out.devid = devid;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 450f529a4eae..1feb68ecef95 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -124,6 +124,7 @@ start:
/* Negative dentry - try next */
if (!simple_positive(q)) {
spin_unlock(&p->d_lock);
+ lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_);
p = q;
goto again;
}
@@ -186,6 +187,7 @@ again:
/* Negative dentry - try next */
if (!simple_positive(ret)) {
spin_unlock(&p->d_lock);
+ lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_);
p = ret;
goto again;
}
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 8179f1ab8175..06858d955120 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -19,6 +19,7 @@
#include <linux/parser.h>
#include <linux/bitops.h>
#include <linux/magic.h>
+#include <linux/compat.h>
#include "autofs_i.h"
#include <linux/module.h>
@@ -70,10 +71,10 @@ out_kill_sb:
kill_litter_super(sb);
}
-static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int autofs4_show_options(struct seq_file *m, struct dentry *root)
{
- struct autofs_sb_info *sbi = autofs4_sbi(mnt->mnt_sb);
- struct inode *root_inode = mnt->mnt_sb->s_root->d_inode;
+ struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
+ struct inode *root_inode = root->d_sb->s_root->d_inode;
if (!sbi)
return 0;
@@ -224,7 +225,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
set_autofs_type_indirect(&sbi->type);
sbi->min_proto = 0;
sbi->max_proto = 0;
+ sbi->compat_daemon = is_compat_task();
mutex_init(&sbi->wq_mutex);
+ mutex_init(&sbi->pipe_mutex);
spin_lock_init(&sbi->fs_lock);
sbi->queues = NULL;
spin_lock_init(&sbi->lookup_lock);
@@ -326,7 +329,7 @@ fail_unlock:
return -EINVAL;
}
-struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
+struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode)
{
struct inode *inode = new_inode(sb);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index f55ae23b137e..75e5f1c8e028 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -26,7 +26,7 @@
static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
static int autofs4_dir_unlink(struct inode *,struct dentry *);
static int autofs4_dir_rmdir(struct inode *,struct dentry *);
-static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
+static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t);
static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
#ifdef CONFIG_COMPAT
static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
@@ -699,7 +699,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
return 0;
}
-static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e1fbdeef85db..9c098db43344 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -56,26 +56,27 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
mutex_unlock(&sbi->wq_mutex);
}
-static int autofs4_write(struct file *file, const void *addr, int bytes)
+static int autofs4_write(struct autofs_sb_info *sbi,
+ struct file *file, const void *addr, int bytes)
{
unsigned long sigpipe, flags;
mm_segment_t fs;
const char *data = (const char *)addr;
ssize_t wr = 0;
- /** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
-
sigpipe = sigismember(&current->pending.signal, SIGPIPE);
/* Save pointer to user space and point back to kernel space */
fs = get_fs();
set_fs(KERNEL_DS);
+ mutex_lock(&sbi->pipe_mutex);
while (bytes &&
(wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
data += wr;
bytes -= wr;
}
+ mutex_unlock(&sbi->pipe_mutex);
set_fs(fs);
@@ -90,7 +91,24 @@ static int autofs4_write(struct file *file, const void *addr, int bytes)
return (bytes > 0);
}
-
+
+/*
+ * The autofs_v5 packet was misdesigned.
+ *
+ * The packets are identical on x86-32 and x86-64, but have different
+ * alignment. Which means that 'sizeof()' will give different results.
+ * Fix it up for the case of running 32-bit user mode on a 64-bit kernel.
+ */
+static noinline size_t autofs_v5_packet_size(struct autofs_sb_info *sbi)
+{
+ size_t pktsz = sizeof(struct autofs_v5_packet);
+#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT)
+ if (sbi->compat_daemon > 0)
+ pktsz -= 4;
+#endif
+ return pktsz;
+}
+
static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
struct autofs_wait_queue *wq,
int type)
@@ -110,6 +128,13 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
pkt.hdr.proto_version = sbi->version;
pkt.hdr.type = type;
+ mutex_lock(&sbi->wq_mutex);
+
+ /* Check if we have become catatonic */
+ if (sbi->catatonic) {
+ mutex_unlock(&sbi->wq_mutex);
+ return;
+ }
switch (type) {
/* Kernel protocol v4 missing and expire packets */
case autofs_ptype_missing:
@@ -147,8 +172,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
{
struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
- pktsz = sizeof(*packet);
-
+ pktsz = autofs_v5_packet_size(sbi);
packet->wait_queue_token = wq->wait_queue_token;
packet->len = wq->name.len;
memcpy(packet->name, wq->name.name, wq->name.len);
@@ -163,22 +187,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
}
default:
printk("autofs4_notify_daemon: bad type %d!\n", type);
+ mutex_unlock(&sbi->wq_mutex);
return;
}
- /* Check if we have become catatonic */
- mutex_lock(&sbi->wq_mutex);
- if (!sbi->catatonic) {
- pipe = sbi->pipe;
- get_file(pipe);
- }
+ pipe = sbi->pipe;
+ get_file(pipe);
+
mutex_unlock(&sbi->wq_mutex);
- if (pipe) {
- if (autofs4_write(pipe, &pkt, pktsz))
- autofs4_catatonic_mode(sbi);
- fput(pipe);
- }
+ if (autofs4_write(sbi, pipe, &pkt, pktsz))
+ autofs4_catatonic_mode(sbi);
+ fput(pipe);
}
static int autofs4_getpath(struct autofs_sb_info *sbi,
@@ -257,6 +277,9 @@ static int validate_request(struct autofs_wait_queue **wait,
struct autofs_wait_queue *wq;
struct autofs_info *ino;
+ if (sbi->catatonic)
+ return -ENOENT;
+
/* Wait in progress, continue; */
wq = autofs4_find_wait(sbi, qstr);
if (wq) {
@@ -289,6 +312,9 @@ static int validate_request(struct autofs_wait_queue **wait,
if (mutex_lock_interruptible(&sbi->wq_mutex))
return -EINTR;
+ if (sbi->catatonic)
+ return -ENOENT;
+
wq = autofs4_find_wait(sbi, qstr);
if (wq) {
*wait = wq;
@@ -389,7 +415,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
ret = validate_request(&wq, sbi, &qstr, dentry, notify);
if (ret <= 0) {
- if (ret == 0)
+ if (ret != -EINTR)
mutex_unlock(&sbi->wq_mutex);
kfree(qstr.name);
return ret;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 9205cf25f1c6..22e9a78872ff 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -173,7 +173,7 @@ static const struct file_operations bad_file_ops =
};
static int bad_inode_create (struct inode *dir, struct dentry *dentry,
- int mode, struct nameidata *nd)
+ umode_t mode, struct nameidata *nd)
{
return -EIO;
}
@@ -202,7 +202,7 @@ static int bad_inode_symlink (struct inode *dir, struct dentry *dentry,
}
static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry,
- int mode)
+ umode_t mode)
{
return -EIO;
}
@@ -213,7 +213,7 @@ static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry)
}
static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
return -EIO;
}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 8342ca67abcd..6e6d536767fe 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -286,7 +286,6 @@ befs_alloc_inode(struct super_block *sb)
static void befs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
}
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 9cc074019479..d12c7966db27 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -84,7 +84,7 @@ const struct file_operations bfs_dir_operations = {
extern void dump_imap(const char *, struct super_block *);
-static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
+static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
int err;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 697af5bf70b3..b0391bc402b1 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -251,7 +251,6 @@ static struct inode *bfs_alloc_inode(struct super_block *sb)
static void bfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
}
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index a6395bdb26ae..1ff94054d35a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -259,6 +259,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
current->mm->free_area_cache = current->mm->mmap_base;
current->mm->cached_hole_size = 0;
+ retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
+ if (retval < 0) {
+ /* Someone check-me: is this error path enough? */
+ send_sig(SIGKILL, current, 0);
+ return retval;
+ }
+
install_exec_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
@@ -352,13 +359,6 @@ beyond_if:
return retval;
}
- retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
- if (retval < 0) {
- /* Someone check-me: is this error path enough? */
- send_sig(SIGKILL, current, 0);
- return retval;
- }
-
current->mm->start_stack =
(unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
#ifdef __alpha__
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 21ac5ee4b43f..07d096c49920 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -794,7 +794,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
* default mmap base, as well as whatever program they
* might try to exec. This is because the brk will
* follow the loader, and is not movable. */
-#if defined(CONFIG_X86) || defined(CONFIG_ARM)
+#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
/* Memory randomization might have been switched off
* in runtime via sysctl.
* If that is the case, retain the original non-zero
@@ -1421,7 +1421,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
for (i = 1; i < view->n; ++i) {
const struct user_regset *regset = &view->regsets[i];
do_thread_regset_writeback(t->task, regset);
- if (regset->core_note_type &&
+ if (regset->core_note_type && regset->get &&
(!regset->active || regset->active(t->task, regset))) {
int ret;
size_t size = regset->n * regset->size;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 1e9edbdeda7e..a9198dfd5f85 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -560,7 +560,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
break;
case 2: set_bit(Enabled, &e->flags);
break;
- case 3: root = dget(file->f_path.mnt->mnt_sb->s_root);
+ case 3: root = dget(file->f_path.dentry->d_sb->s_root);
mutex_lock(&root->d_inode->i_mutex);
kill_node(e);
@@ -587,7 +587,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
Node *e;
struct inode *inode;
struct dentry *root, *dentry;
- struct super_block *sb = file->f_path.mnt->mnt_sb;
+ struct super_block *sb = file->f_path.dentry->d_sb;
int err = 0;
e = create_entry(buffer, count);
@@ -666,7 +666,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
switch (res) {
case 1: enabled = 0; break;
case 2: enabled = 1; break;
- case 3: root = dget(file->f_path.mnt->mnt_sb->s_root);
+ case 3: root = dget(file->f_path.dentry->d_sb->s_root);
mutex_lock(&root->d_inode->i_mutex);
while (!list_empty(&entries))
diff --git a/fs/bio.c b/fs/bio.c
index b1fe82cf88cf..b980ecde026a 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -505,13 +505,9 @@ EXPORT_SYMBOL(bio_clone);
int bio_get_nr_vecs(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
- int nr_pages;
-
- nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (nr_pages > queue_max_segments(q))
- nr_pages = queue_max_segments(q);
-
- return nr_pages;
+ return min_t(unsigned,
+ queue_max_segments(q),
+ queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);
}
EXPORT_SYMBOL(bio_get_nr_vecs);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b07f1da1de4e..5e9f198f7712 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -17,6 +17,7 @@
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/buffer_head.h>
+#include <linux/swap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
@@ -24,7 +25,7 @@
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
-#include <linux/kmemleak.h>
+#include <linux/cleancache.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -82,13 +83,35 @@ static sector_t max_block(struct block_device *bdev)
}
/* Kill _all_ buffers and pagecache , dirty or not.. */
-static void kill_bdev(struct block_device *bdev)
+void kill_bdev(struct block_device *bdev)
{
- if (bdev->bd_inode->i_mapping->nrpages == 0)
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+ if (mapping->nrpages == 0)
return;
+
invalidate_bh_lrus();
- truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
+ truncate_inode_pages(mapping, 0);
}
+EXPORT_SYMBOL(kill_bdev);
+
+/* Invalidate clean unused buffers and pagecache. */
+void invalidate_bdev(struct block_device *bdev)
+{
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+ if (mapping->nrpages == 0)
+ return;
+
+ invalidate_bh_lrus();
+ lru_add_drain_all(); /* make sure all lru add caches are flushed */
+ invalidate_mapping_pages(mapping, 0, -1);
+ /* 99% of the time, we don't need to flush the cleancache on the bdev.
+ * But, for the strange corners, lets be cautious
+ */
+ cleancache_flush_inode(mapping);
+}
+EXPORT_SYMBOL(invalidate_bdev);
int set_blocksize(struct block_device *bdev, int size)
{
@@ -425,7 +448,6 @@ static void bdev_i_callback(struct rcu_head *head)
struct inode *inode = container_of(head, struct inode, i_rcu);
struct bdev_inode *bdi = BDEV_I(inode);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(bdev_cachep, bdi);
}
@@ -493,12 +515,12 @@ static struct file_system_type bd_type = {
.kill_sb = kill_anon_super,
};
-struct super_block *blockdev_superblock __read_mostly;
+static struct super_block *blockdev_superblock __read_mostly;
void __init bdev_cache_init(void)
{
int err;
- struct vfsmount *bd_mnt;
+ static struct vfsmount *bd_mnt;
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -510,12 +532,7 @@ void __init bdev_cache_init(void)
bd_mnt = kern_mount(&bd_type);
if (IS_ERR(bd_mnt))
panic("Cannot create bdev pseudo-fs");
- /*
- * This vfsmount structure is only used to obtain the
- * blockdev_superblock, so tell kmemleak not to report it.
- */
- kmemleak_not_leak(bd_mnt);
- blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
+ blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
}
/*
@@ -639,6 +656,11 @@ static struct block_device *bd_acquire(struct inode *inode)
return bdev;
}
+static inline int sb_is_blkdev_sb(struct super_block *sb)
+{
+ return sb == blockdev_superblock;
+}
+
/* Call when you free inode */
void bd_forget(struct inode *inode)
@@ -1117,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {
bdev->bd_disk = disk;
+ bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
if (!partno) {
struct backing_dev_info *bdi;
@@ -1137,6 +1160,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
bdev->bd_disk = NULL;
+ bdev->bd_queue = NULL;
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
put_disk(disk);
@@ -1159,8 +1183,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
* The latter is necessary to prevent ghost
* partitions on a removed medium.
*/
- if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
- rescan_partitions(disk, bdev);
+ if (bdev->bd_invalidated) {
+ if (!ret)
+ rescan_partitions(disk, bdev);
+ else if (ret == -ENOMEDIUM)
+ invalidate_partitions(disk, bdev);
+ }
if (ret)
goto out_clear;
} else {
@@ -1190,8 +1218,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (bdev->bd_disk->fops->open)
ret = bdev->bd_disk->fops->open(bdev, mode);
/* the same as first opener case, read comment there */
- if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
- rescan_partitions(bdev->bd_disk, bdev);
+ if (bdev->bd_invalidated) {
+ if (!ret)
+ rescan_partitions(bdev->bd_disk, bdev);
+ else if (ret == -ENOMEDIUM)
+ invalidate_partitions(bdev->bd_disk, bdev);
+ }
if (ret)
goto out_unlock_bdev;
}
@@ -1210,6 +1242,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
disk_put_part(bdev->bd_part);
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
+ bdev->bd_queue = NULL;
bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 0b394580d860..42704149b723 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -171,11 +171,11 @@ out:
spin_unlock_irqrestore(&workers->lock, flags);
}
-static noinline int run_ordered_completions(struct btrfs_workers *workers,
+static noinline void run_ordered_completions(struct btrfs_workers *workers,
struct btrfs_work *work)
{
if (!workers->ordered)
- return 0;
+ return;
set_bit(WORK_DONE_BIT, &work->flags);
@@ -213,7 +213,6 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
}
spin_unlock(&workers->order_lock);
- return 0;
}
static void put_worker(struct btrfs_worker_thread *worker)
@@ -334,7 +333,7 @@ again:
if (freezing(current)) {
worker->working = 0;
spin_unlock_irq(&worker->lock);
- refrigerator();
+ try_to_freeze();
} else {
spin_unlock_irq(&worker->lock);
if (!kthread_should_stop()) {
@@ -399,7 +398,7 @@ again:
/*
* this will wait for all the worker threads to shutdown
*/
-int btrfs_stop_workers(struct btrfs_workers *workers)
+void btrfs_stop_workers(struct btrfs_workers *workers)
{
struct list_head *cur;
struct btrfs_worker_thread *worker;
@@ -427,7 +426,6 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
put_worker(worker);
}
spin_unlock_irq(&workers->lock);
- return 0;
}
/*
@@ -615,14 +613,14 @@ found:
* it was taken from. It is intended for use with long running work functions
* that make some progress and want to give the cpu up for others.
*/
-int btrfs_requeue_work(struct btrfs_work *work)
+void btrfs_requeue_work(struct btrfs_work *work)
{
struct btrfs_worker_thread *worker = work->worker;
unsigned long flags;
int wake = 0;
if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
- goto out;
+ return;
spin_lock_irqsave(&worker->lock, flags);
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
@@ -649,9 +647,6 @@ int btrfs_requeue_work(struct btrfs_work *work)
if (wake)
wake_up_process(worker->task);
spin_unlock_irqrestore(&worker->lock, flags);
-out:
-
- return 0;
}
void btrfs_set_work_high_prio(struct btrfs_work *work)
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index f34cc31fa3c9..063698b90ce2 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -111,9 +111,9 @@ struct btrfs_workers {
void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
int btrfs_start_workers(struct btrfs_workers *workers);
-int btrfs_stop_workers(struct btrfs_workers *workers);
+void btrfs_stop_workers(struct btrfs_workers *workers);
void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
struct btrfs_workers *async_starter);
-int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_requeue_work(struct btrfs_work *work);
void btrfs_set_work_high_prio(struct btrfs_work *work);
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 56136d9046f0..f4e90748940a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1359,12 +1359,6 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
inode_to_path, ipath);
}
-/*
- * allocates space to return multiple file system paths for an inode.
- * total_bytes to allocate are passed, note that space usable for actual path
- * information will be total_bytes - sizeof(struct inode_fs_paths).
- * the returned pointer must be freed with free_ipath() in the end.
- */
struct btrfs_data_container *init_data_container(u32 total_bytes)
{
struct btrfs_data_container *data;
@@ -1420,5 +1414,6 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
void free_ipath(struct inode_fs_paths *ipath)
{
+ kfree(ipath->fspath);
kfree(ipath);
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d02c27cd14c7..d11afa67c7d8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -226,8 +226,8 @@ out:
* Clear the writeback bits on all of the file
* pages for a compressed write
*/
-static noinline int end_compressed_writeback(struct inode *inode, u64 start,
- unsigned long ram_size)
+static noinline void end_compressed_writeback(struct inode *inode, u64 start,
+ unsigned long ram_size)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
@@ -253,7 +253,6 @@ static noinline int end_compressed_writeback(struct inode *inode, u64 start,
index += ret;
}
/* the inode may be gone now */
- return 0;
}
/*
@@ -392,16 +391,16 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
*/
atomic_inc(&cb->pending_bios);
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
ret = btrfs_csum_one_bio(root, inode, bio,
start, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
bio_put(bio);
@@ -421,15 +420,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_get(bio);
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
bio_put(bio);
return 0;
@@ -497,7 +496,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* sure they map to this compressed extent on disk.
*/
set_page_extent_mapped(page);
- lock_extent(tree, last_offset, end, GFP_NOFS);
+ lock_extent(tree, last_offset, end);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, last_offset,
PAGE_CACHE_SIZE);
@@ -507,7 +506,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
(last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_sector) {
free_extent_map(em);
- unlock_extent(tree, last_offset, end, GFP_NOFS);
+ unlock_extent(tree, last_offset, end);
unlock_page(page);
page_cache_release(page);
break;
@@ -535,7 +534,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
nr_pages++;
page_cache_release(page);
} else {
- unlock_extent(tree, last_offset, end, GFP_NOFS);
+ unlock_extent(tree, last_offset, end);
unlock_page(page);
page_cache_release(page);
break;
@@ -662,7 +661,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_get(comp_bio);
ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
/*
* inc the count before we submit the bio so
@@ -675,14 +674,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_lookup_bio_sums(root, inode,
comp_bio, sums);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
sums += (comp_bio->bi_size + root->sectorsize - 1) /
root->sectorsize;
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
bio_put(comp_bio);
@@ -698,15 +697,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_get(comp_bio);
ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
bio_put(comp_bio);
return 0;
@@ -734,7 +733,7 @@ struct btrfs_compress_op *btrfs_compress_op[] = {
&btrfs_lzo_compress,
};
-int __init btrfs_init_compress(void)
+void __init btrfs_init_compress(void)
{
int i;
@@ -744,7 +743,6 @@ int __init btrfs_init_compress(void)
atomic_set(&comp_alloc_workspace[i], 0);
init_waitqueue_head(&comp_workspace_wait[i]);
}
- return 0;
}
/*
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index a12059f4f0fd..9afb0a62ae82 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,7 +19,7 @@
#ifndef __BTRFS_COMPRESSION_
#define __BTRFS_COMPRESSION_
-int btrfs_init_compress(void);
+void btrfs_init_compress(void);
void btrfs_exit_compress(void);
int btrfs_compress_pages(int type, struct address_space *mapping,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0639a555e16e..e801f226d7e0 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -36,7 +36,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
-static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
struct btrfs_path *btrfs_alloc_path(void)
@@ -156,10 +156,23 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
- rcu_read_lock();
- eb = rcu_dereference(root->node);
- extent_buffer_get(eb);
- rcu_read_unlock();
+ while (1) {
+ rcu_read_lock();
+ eb = rcu_dereference(root->node);
+
+ /*
+ * RCU really hurts here, we could free up the root node because
+ * it was cow'ed but we may not get the new root node yet so do
+ * the inc_not_zero dance and if it doesn't work then
+ * synchronize_rcu and try again.
+ */
+ if (atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+ break;
+ }
+ rcu_read_unlock();
+ synchronize_rcu();
+ }
return eb;
}
@@ -331,8 +344,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (btrfs_block_can_be_shared(root, buf)) {
ret = btrfs_lookup_extent_info(trans, root, buf->start,
buf->len, &refs, &flags);
- BUG_ON(ret);
- BUG_ON(refs == 0);
+ if (ret)
+ return ret;
+ if (refs == 0) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ return ret;
+ }
} else {
refs = 1;
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
@@ -351,14 +369,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
ret = btrfs_inc_ref(trans, root, buf, 1, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_dec_ref(trans, root, buf, 0, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
ret = btrfs_inc_ref(trans, root, cow, 1, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
} else {
@@ -368,14 +386,15 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
if (new_flags != 0) {
ret = btrfs_set_disk_extent_flags(trans, root,
buf->start,
buf->len,
new_flags, 0);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
} else {
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
@@ -384,9 +403,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
ret = btrfs_dec_ref(trans, root, buf, 1, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
clean_tree_block(trans, root, buf);
*last_ref = 1;
@@ -415,7 +434,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
{
struct btrfs_disk_key disk_key;
struct extent_buffer *cow;
- int level;
+ int level, ret;
int last_ref = 0;
int unlock_orig = 0;
u64 parent_start;
@@ -467,7 +486,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
(unsigned long)btrfs_header_fsid(cow),
BTRFS_FSID_SIZE);
- update_ref_for_cow(trans, root, buf, cow, &last_ref);
+ ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
if (root->ref_cows)
btrfs_reloc_cow_block(trans, root, buf, cow);
@@ -504,7 +527,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
}
if (unlock_orig)
btrfs_tree_unlock(buf);
- free_extent_buffer(buf);
+ free_extent_buffer_stale(buf);
btrfs_mark_buffer_dirty(cow);
*cow_ret = cow;
return 0;
@@ -934,7 +957,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* promote the child to a root */
child = read_node_slot(root, mid, 0);
- BUG_ON(!child);
+ if (!child) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ goto enospc;
+ }
+
btrfs_tree_lock(child);
btrfs_set_lock_blocking(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
@@ -959,7 +987,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
root_sub_used(root, mid->len);
btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
/* once for the root ptr */
- free_extent_buffer(mid);
+ free_extent_buffer_stale(mid);
return 0;
}
if (btrfs_header_nritems(mid) >
@@ -1010,13 +1038,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(right) == 0) {
clean_tree_block(trans, root, right);
btrfs_tree_unlock(right);
- wret = del_ptr(trans, root, path, level + 1, pslot +
- 1);
- if (wret)
- ret = wret;
+ del_ptr(trans, root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
btrfs_free_tree_block(trans, root, right, 0, 1, 0);
- free_extent_buffer(right);
+ free_extent_buffer_stale(right);
right = NULL;
} else {
struct btrfs_disk_key right_key;
@@ -1035,7 +1060,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
* otherwise we would have pulled some pointers from the
* right
*/
- BUG_ON(!left);
+ if (!left) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ goto enospc;
+ }
wret = balance_node_right(trans, root, mid, left);
if (wret < 0) {
ret = wret;
@@ -1051,12 +1080,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(mid) == 0) {
clean_tree_block(trans, root, mid);
btrfs_tree_unlock(mid);
- wret = del_ptr(trans, root, path, level + 1, pslot);
- if (wret)
- ret = wret;
+ del_ptr(trans, root, path, level + 1, pslot);
root_sub_used(root, mid->len);
btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
- free_extent_buffer(mid);
+ free_extent_buffer_stale(mid);
mid = NULL;
} else {
/* update the parent key to reflect our changes */
@@ -1382,7 +1409,8 @@ static noinline int reada_for_balance(struct btrfs_root *root,
* if lowest_unlock is 1, level 0 won't be unlocked
*/
static noinline void unlock_up(struct btrfs_path *path, int level,
- int lowest_unlock)
+ int lowest_unlock, int min_write_lock_level,
+ int *write_lock_level)
{
int i;
int skip_level = level;
@@ -1414,6 +1442,11 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
btrfs_tree_unlock_rw(t, path->locks[i]);
path->locks[i] = 0;
+ if (write_lock_level &&
+ i > min_write_lock_level &&
+ i <= *write_lock_level) {
+ *write_lock_level = i - 1;
+ }
}
}
}
@@ -1637,6 +1670,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
/* everything at write_lock_level or lower must be write locked */
int write_lock_level = 0;
u8 lowest_level = 0;
+ int min_write_lock_level;
lowest_level = p->lowest_level;
WARN_ON(lowest_level && ins_len > 0);
@@ -1664,6 +1698,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
if (cow && (p->keep_locks || p->lowest_level))
write_lock_level = BTRFS_MAX_LEVEL;
+ min_write_lock_level = write_lock_level;
+
again:
/*
* we try very hard to do read locks on the root
@@ -1795,7 +1831,8 @@ cow_done:
goto again;
}
- unlock_up(p, level, lowest_unlock);
+ unlock_up(p, level, lowest_unlock,
+ min_write_lock_level, &write_lock_level);
if (level == lowest_level) {
if (dec)
@@ -1857,7 +1894,8 @@ cow_done:
}
}
if (!p->search_for_split)
- unlock_up(p, level, lowest_unlock);
+ unlock_up(p, level, lowest_unlock,
+ min_write_lock_level, &write_lock_level);
goto done;
}
}
@@ -1881,15 +1919,12 @@ done:
* fixing up pointers when a given leaf/node is not in slot 0 of the
* higher levels
*
- * If this fails to write a tree block, it returns -1, but continues
- * fixing up the blocks in ram so the tree is consistent.
*/
-static int fixup_low_keys(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_disk_key *key, int level)
+static void fixup_low_keys(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_disk_key *key, int level)
{
int i;
- int ret = 0;
struct extent_buffer *t;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -1902,7 +1937,6 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans,
if (tslot != 0)
break;
}
- return ret;
}
/*
@@ -1911,9 +1945,9 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans,
* This function isn't completely safe. It's the caller's responsibility
* that the new key won't break the order
*/
-int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *new_key)
+void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *new_key)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *eb;
@@ -1923,13 +1957,11 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
slot = path->slots[0];
if (slot > 0) {
btrfs_item_key(eb, &disk_key, slot - 1);
- if (comp_keys(&disk_key, new_key) >= 0)
- return -1;
+ BUG_ON(comp_keys(&disk_key, new_key) >= 0);
}
if (slot < btrfs_header_nritems(eb) - 1) {
btrfs_item_key(eb, &disk_key, slot + 1);
- if (comp_keys(&disk_key, new_key) <= 0)
- return -1;
+ BUG_ON(comp_keys(&disk_key, new_key) <= 0);
}
btrfs_cpu_key_to_disk(&disk_key, new_key);
@@ -1937,7 +1969,6 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(eb);
if (slot == 0)
fixup_low_keys(trans, root, path, &disk_key, 1);
- return 0;
}
/*
@@ -2140,12 +2171,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
*
* slot and level indicate where you want the key to go, and
* blocknr is the block the key points to.
- *
- * returns zero on success and < 0 on any error
*/
-static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, struct btrfs_disk_key
- *key, u64 bytenr, int slot, int level)
+static void insert_ptr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_disk_key *key, u64 bytenr,
+ int slot, int level)
{
struct extent_buffer *lower;
int nritems;
@@ -2155,8 +2185,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
- if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
- BUG();
+ BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
if (slot != nritems) {
memmove_extent_buffer(lower,
btrfs_node_key_ptr_offset(slot + 1),
@@ -2169,7 +2198,6 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_set_node_ptr_generation(lower, slot, trans->transid);
btrfs_set_header_nritems(lower, nritems + 1);
btrfs_mark_buffer_dirty(lower);
- return 0;
}
/*
@@ -2190,7 +2218,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
int mid;
int ret;
- int wret;
u32 c_nritems;
c = path->nodes[level];
@@ -2247,11 +2274,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
btrfs_mark_buffer_dirty(split);
- wret = insert_ptr(trans, root, path, &disk_key, split->start,
- path->slots[level + 1] + 1,
- level + 1);
- if (wret)
- ret = wret;
+ insert_ptr(trans, root, path, &disk_key, split->start,
+ path->slots[level + 1] + 1, level + 1);
if (path->slots[level] >= mid) {
path->slots[level] -= mid;
@@ -2320,6 +2344,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
{
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *upper = path->nodes[1];
+ struct btrfs_map_token token;
struct btrfs_disk_key disk_key;
int slot;
u32 i;
@@ -2331,6 +2356,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
u32 data_end;
u32 this_item_size;
+ btrfs_init_map_token(&token);
+
if (empty)
nr = 0;
else
@@ -2408,8 +2435,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
push_space = BTRFS_LEAF_DATA_SIZE(root);
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(right, i);
- push_space -= btrfs_item_size(right, item);
- btrfs_set_item_offset(right, item, push_space);
+ push_space -= btrfs_token_item_size(right, item, &token);
+ btrfs_set_token_item_offset(right, item, push_space, &token);
}
left_nritems -= push_items;
@@ -2537,9 +2564,11 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
u32 old_left_nritems;
u32 nr;
int ret = 0;
- int wret;
u32 this_item_size;
u32 old_left_item_size;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
if (empty)
nr = min(right_nritems, max_slot);
@@ -2600,9 +2629,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
item = btrfs_item_nr(left, i);
- ioff = btrfs_item_offset(left, item);
- btrfs_set_item_offset(left, item,
- ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
+ ioff = btrfs_token_item_offset(left, item, &token);
+ btrfs_set_token_item_offset(left, item,
+ ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
+ &token);
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -2632,8 +2662,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(right, i);
- push_space = push_space - btrfs_item_size(right, item);
- btrfs_set_item_offset(right, item, push_space);
+ push_space = push_space - btrfs_token_item_size(right,
+ item, &token);
+ btrfs_set_token_item_offset(right, item, push_space, &token);
}
btrfs_mark_buffer_dirty(left);
@@ -2643,9 +2674,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
clean_tree_block(trans, root, right);
btrfs_item_key(right, &disk_key, 0);
- wret = fixup_low_keys(trans, root, path, &disk_key, 1);
- if (wret)
- ret = wret;
+ fixup_low_keys(trans, root, path, &disk_key, 1);
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
@@ -2716,7 +2745,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
path->nodes[1], slot - 1, &left);
if (ret) {
/* we hit -ENOSPC, but it isn't fatal here */
- ret = 1;
+ if (ret == -ENOSPC)
+ ret = 1;
goto out;
}
@@ -2738,22 +2768,21 @@ out:
/*
* split the path's leaf in two, making sure there is at least data_size
* available for the resulting leaf level of the path.
- *
- * returns 0 if all went well and < 0 on failure.
*/
-static noinline int copy_for_split(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *l,
- struct extent_buffer *right,
- int slot, int mid, int nritems)
+static noinline void copy_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *l,
+ struct extent_buffer *right,
+ int slot, int mid, int nritems)
{
int data_copy_size;
int rt_data_off;
int i;
- int ret = 0;
- int wret;
struct btrfs_disk_key disk_key;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
@@ -2775,17 +2804,15 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
struct btrfs_item *item = btrfs_item_nr(right, i);
u32 ioff;
- ioff = btrfs_item_offset(right, item);
- btrfs_set_item_offset(right, item, ioff + rt_data_off);
+ ioff = btrfs_token_item_offset(right, item, &token);
+ btrfs_set_token_item_offset(right, item,
+ ioff + rt_data_off, &token);
}
btrfs_set_header_nritems(l, mid);
- ret = 0;
btrfs_item_key(right, &disk_key, 0);
- wret = insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1] + 1, 1);
- if (wret)
- ret = wret;
+ insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
btrfs_mark_buffer_dirty(right);
btrfs_mark_buffer_dirty(l);
@@ -2803,8 +2830,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
}
BUG_ON(path->slots[0] < 0);
-
- return ret;
}
/*
@@ -2993,12 +3018,8 @@ again:
if (split == 0) {
if (mid <= slot) {
btrfs_set_header_nritems(right, 0);
- wret = insert_ptr(trans, root, path,
- &disk_key, right->start,
- path->slots[1] + 1, 1);
- if (wret)
- ret = wret;
-
+ insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -3006,29 +3027,21 @@ again:
path->slots[1] += 1;
} else {
btrfs_set_header_nritems(right, 0);
- wret = insert_ptr(trans, root, path,
- &disk_key,
- right->start,
+ insert_ptr(trans, root, path, &disk_key, right->start,
path->slots[1], 1);
- if (wret)
- ret = wret;
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
path->slots[0] = 0;
- if (path->slots[1] == 0) {
- wret = fixup_low_keys(trans, root,
- path, &disk_key, 1);
- if (wret)
- ret = wret;
- }
+ if (path->slots[1] == 0)
+ fixup_low_keys(trans, root, path,
+ &disk_key, 1);
}
btrfs_mark_buffer_dirty(right);
return ret;
}
- ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
- BUG_ON(ret);
+ copy_for_split(trans, root, path, l, right, slot, mid, nritems);
if (split == 2) {
BUG_ON(num_doubles != 0);
@@ -3036,7 +3049,7 @@ again:
goto again;
}
- return ret;
+ return 0;
push_for_double:
push_for_double_split(trans, root, path, data_size);
@@ -3238,11 +3251,9 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
return ret;
path->slots[0]++;
- ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
- item_size, item_size +
- sizeof(struct btrfs_item), 1);
- BUG_ON(ret);
-
+ setup_items_for_insert(trans, root, path, new_key, &item_size,
+ item_size, item_size +
+ sizeof(struct btrfs_item), 1);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -3257,10 +3268,10 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
* off the end of the item or if we shift the item to chop bytes off
* the front.
*/
-int btrfs_truncate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u32 new_size, int from_end)
+void btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
@@ -3271,13 +3282,16 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
unsigned int old_size;
unsigned int size_diff;
int i;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
slot = path->slots[0];
old_size = btrfs_item_size_nr(leaf, slot);
if (old_size == new_size)
- return 0;
+ return;
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(root, leaf);
@@ -3297,8 +3311,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff + size_diff);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff + size_diff, &token);
}
/* shift the data */
@@ -3350,15 +3365,14 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
btrfs_print_leaf(root, leaf);
BUG();
}
- return 0;
}
/*
* make the item pointed to by the path bigger, data_size is the new size.
*/
-int btrfs_extend_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- u32 data_size)
+void btrfs_extend_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ u32 data_size)
{
int slot;
struct extent_buffer *leaf;
@@ -3368,6 +3382,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
unsigned int old_data;
unsigned int old_size;
int i;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
@@ -3397,8 +3414,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff - data_size);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff - data_size, &token);
}
/* shift the data */
@@ -3416,7 +3434,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
btrfs_print_leaf(root, leaf);
BUG();
}
- return 0;
}
/*
@@ -3441,6 +3458,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
unsigned int data_end;
struct btrfs_disk_key disk_key;
struct btrfs_key found_key;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
for (i = 0; i < nr; i++) {
if (total_size + data_size[i] + sizeof(struct btrfs_item) >
@@ -3506,8 +3526,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff - total_data);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff - total_data, &token);
}
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
@@ -3534,9 +3555,10 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(leaf, slot + i);
- btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+ btrfs_set_token_item_offset(leaf, item,
+ data_end - data_size[i], &token);
data_end -= data_size[i];
- btrfs_set_item_size(leaf, item, data_size[i]);
+ btrfs_set_token_item_size(leaf, item, data_size[i], &token);
}
btrfs_set_header_nritems(leaf, nritems + nr);
btrfs_mark_buffer_dirty(leaf);
@@ -3544,7 +3566,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
ret = 0;
if (slot == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+ fixup_low_keys(trans, root, path, &disk_key, 1);
}
if (btrfs_leaf_free_space(root, leaf) < 0) {
@@ -3562,19 +3584,21 @@ out:
* to save stack depth by doing the bulk of the work in a function
* that doesn't call btrfs_search_slot
*/
-int setup_items_for_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr)
+void setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr)
{
struct btrfs_item *item;
int i;
u32 nritems;
unsigned int data_end;
struct btrfs_disk_key disk_key;
- int ret;
struct extent_buffer *leaf;
int slot;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
slot = path->slots[0];
@@ -3606,8 +3630,9 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff - total_data);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff - total_data, &token);
}
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
@@ -3626,17 +3651,17 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(leaf, slot + i);
- btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+ btrfs_set_token_item_offset(leaf, item,
+ data_end - data_size[i], &token);
data_end -= data_size[i];
- btrfs_set_item_size(leaf, item, data_size[i]);
+ btrfs_set_token_item_size(leaf, item, data_size[i], &token);
}
btrfs_set_header_nritems(leaf, nritems + nr);
- ret = 0;
if (slot == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+ fixup_low_keys(trans, root, path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(leaf);
@@ -3645,7 +3670,6 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
btrfs_print_leaf(root, leaf);
BUG();
}
- return ret;
}
/*
@@ -3672,16 +3696,14 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
if (ret == 0)
return -EEXIST;
if (ret < 0)
- goto out;
+ return ret;
slot = path->slots[0];
BUG_ON(slot < 0);
- ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
+ setup_items_for_insert(trans, root, path, cpu_key, data_size,
total_data, total_size, nr);
-
-out:
- return ret;
+ return 0;
}
/*
@@ -3717,13 +3739,11 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
* the tree should have been previously balanced so the deletion does not
* empty a node.
*/
-static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot)
+static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int level, int slot)
{
struct extent_buffer *parent = path->nodes[level];
u32 nritems;
- int ret = 0;
- int wret;
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
@@ -3743,12 +3763,9 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_node_key(parent, &disk_key, 0);
- wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
- if (wret)
- ret = wret;
+ fixup_low_keys(trans, root, path, &disk_key, level + 1);
}
btrfs_mark_buffer_dirty(parent);
- return ret;
}
/*
@@ -3761,17 +3778,13 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* The path must have already been setup for deleting the leaf, including
* all the proper balancing. path->nodes[1] must be locked.
*/
-static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *leaf)
+static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *leaf)
{
- int ret;
-
WARN_ON(btrfs_header_generation(leaf) != trans->transid);
- ret = del_ptr(trans, root, path, 1, path->slots[1]);
- if (ret)
- return ret;
+ del_ptr(trans, root, path, 1, path->slots[1]);
/*
* btrfs_free_extent is expensive, we want to make sure we
@@ -3781,8 +3794,9 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used(root, leaf->len);
+ extent_buffer_get(leaf);
btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
- return 0;
+ free_extent_buffer_stale(leaf);
}
/*
* delete the item at the leaf level in path. If that empties
@@ -3799,6 +3813,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int wret;
int i;
u32 nritems;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
@@ -3820,8 +3837,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u32 ioff;
item = btrfs_item_nr(leaf, i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff + dsize);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff + dsize, &token);
}
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
@@ -3839,8 +3857,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
} else {
btrfs_set_path_blocking(path);
clean_tree_block(trans, root, leaf);
- ret = btrfs_del_leaf(trans, root, path, leaf);
- BUG_ON(ret);
+ btrfs_del_leaf(trans, root, path, leaf);
}
} else {
int used = leaf_space_used(leaf, 0, nritems);
@@ -3848,10 +3865,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_item_key(leaf, &disk_key, 0);
- wret = fixup_low_keys(trans, root, path,
- &disk_key, 1);
- if (wret)
- ret = wret;
+ fixup_low_keys(trans, root, path, &disk_key, 1);
}
/* delete the leaf if it is mostly empty */
@@ -3879,9 +3893,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (btrfs_header_nritems(leaf) == 0) {
path->slots[1] = slot;
- ret = btrfs_del_leaf(trans, root, path, leaf);
- BUG_ON(ret);
+ btrfs_del_leaf(trans, root, path, leaf);
free_extent_buffer(leaf);
+ ret = 0;
} else {
/* if we're still in the path, make sure
* we're dirty. Otherwise, one of the
@@ -4059,18 +4073,18 @@ find_next_key:
path->slots[level] = slot;
if (level == path->lowest_level) {
ret = 0;
- unlock_up(path, level, 1);
+ unlock_up(path, level, 1, 0, NULL);
goto out;
}
btrfs_set_path_blocking(path);
cur = read_node_slot(root, cur, slot);
- BUG_ON(!cur);
+ BUG_ON(!cur); /* -ENOMEM */
btrfs_tree_read_lock(cur);
path->locks[level - 1] = BTRFS_READ_LOCK;
path->nodes[level - 1] = cur;
- unlock_up(path, level, 1);
+ unlock_up(path, level, 1, 0, NULL);
btrfs_clear_path_blocking(path, NULL, 0);
}
out:
@@ -4306,7 +4320,7 @@ again:
}
ret = 0;
done:
- unlock_up(path, 0, 1);
+ unlock_up(path, 0, 1, 0, NULL);
path->leave_spinning = old_spinning;
if (!old_spinning)
btrfs_set_path_blocking(path);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8e4457e0478e..5b8ef8eb3521 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,6 +48,8 @@ struct btrfs_ordered_sum;
#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_MAX_MIRRORS 2
+
#define BTRFS_MAX_LEVEL 8
#define BTRFS_COMPAT_EXTENT_TREE_V0
@@ -138,6 +140,12 @@ struct btrfs_ordered_sum;
#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
/*
+ * the max metadata block size. This limit is somewhat artificial,
+ * but the memmove costs go through the roof for larger blocks.
+ */
+#define BTRFS_MAX_METADATA_BLOCKSIZE 65536
+
+/*
* we can actually store much bigger names, but lets not confuse the rest
* of linux
*/
@@ -461,6 +469,19 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
+/*
+ * some patches floated around with a second compression method
+ * lets save that incompat here for when they do get in
+ * Note we don't actually support it, we're just reserving the
+ * number
+ */
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4)
+
+/*
+ * older kernels tried to do bigger metadata blocks, but the
+ * code was pretty buggy. Lets not let them try anymore.
+ */
+#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
@@ -468,6 +489,7 @@ struct btrfs_super_block {
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
/*
@@ -829,6 +851,21 @@ struct btrfs_csum_item {
*/
#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
+#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
+ BTRFS_AVAIL_ALLOC_BIT_SINGLE)
+
+static inline u64 chunk_to_extended(u64 flags)
+{
+ if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
+ flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ return flags;
+}
+static inline u64 extended_to_chunk(u64 flags)
+{
+ return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+}
+
struct btrfs_block_group_item {
__le64 used;
__le64 chunk_objectid;
@@ -1503,6 +1540,7 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
+#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1526,6 +1564,17 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
+struct btrfs_map_token {
+ struct extent_buffer *eb;
+ char *kaddr;
+ unsigned long offset;
+};
+
+static inline void btrfs_init_map_token (struct btrfs_map_token *token)
+{
+ memset(token, 0, sizeof(*token));
+}
+
/* some macros to generate set/get funcs for the struct fields. This
* assumes there is a lefoo_to_cpu for every type, so lets make a simple
* one for u8:
@@ -1549,20 +1598,22 @@ struct btrfs_ioctl_defrag_range_args {
#ifndef BTRFS_SETGET_FUNCS
#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
+u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \
+void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\
void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
#endif
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(struct extent_buffer *eb) \
{ \
- type *p = page_address(eb->first_page); \
+ type *p = page_address(eb->pages[0]); \
u##bits res = le##bits##_to_cpu(p->member); \
return res; \
} \
static inline void btrfs_set_##name(struct extent_buffer *eb, \
u##bits val) \
{ \
- type *p = page_address(eb->first_page); \
+ type *p = page_address(eb->pages[0]); \
p->member = cpu_to_le##bits(val); \
}
@@ -2364,7 +2415,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
return btrfs_item_size(eb, e) - offset;
}
-static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
}
@@ -2466,8 +2517,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- u64 search_end, struct btrfs_key *ins,
- u64 data);
+ struct btrfs_key *ins, u64 data);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref, int for_cow);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2484,8 +2534,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len);
-int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@@ -2548,8 +2598,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
u64 num_bytes);
int btrfs_set_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
-int btrfs_set_block_group_rw(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache);
+void btrfs_set_block_group_rw(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_root *root,
@@ -2568,9 +2618,9 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
-int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *new_key);
+void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
@@ -2590,12 +2640,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct extent_buffer **cow_ret, u64 new_root_objectid);
int btrfs_block_can_be_shared(struct btrfs_root *root,
struct extent_buffer *buf);
-int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, u32 data_size);
-int btrfs_truncate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u32 new_size, int from_end);
+void btrfs_extend_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ u32 data_size);
+void btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -2629,10 +2680,10 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
-int setup_items_for_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr);
+void setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
@@ -2659,9 +2710,9 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
}
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-void btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref,
- int for_reloc);
+int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ int update_ref, int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
@@ -2687,24 +2738,6 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->super_for_commit);
kfree(fs_info);
}
-/**
- * profile_is_valid - tests whether a given profile is valid and reduced
- * @flags: profile to validate
- * @extended: if true @flags is treated as an extended profile
- */
-static inline int profile_is_valid(u64 flags, int extended)
-{
- u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
-
- flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
- if (extended)
- mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-
- if (flags & mask)
- return 0;
- /* true if zero or exactly one bit set */
- return (flags & (~flags + 1)) == flags;
-}
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -2723,9 +2756,10 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_root_item
*item);
-int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_key *key, struct btrfs_root_item
- *item);
+int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *key,
+ struct btrfs_root_item *item);
int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
btrfs_root_item *item, struct btrfs_key *key);
int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
@@ -2909,7 +2943,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root);
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
-int btrfs_invalidate_inodes(struct btrfs_root *root);
+void btrfs_invalidate_inodes(struct btrfs_root *root);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_root *root);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -2961,13 +2995,41 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
/* super.c */
int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
+void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...);
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno);
+ unsigned int line, int errno, const char *fmt, ...);
+
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *function,
+ unsigned int line, int errno);
+
+#define btrfs_abort_transaction(trans, root, errno) \
+do { \
+ __btrfs_abort_transaction(trans, root, __func__, \
+ __LINE__, errno); \
+} while (0)
#define btrfs_std_error(fs_info, errno) \
do { \
if ((errno)) \
- __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+ __btrfs_std_error((fs_info), __func__, \
+ __LINE__, (errno), NULL); \
+} while (0)
+
+#define btrfs_error(fs_info, errno, fmt, args...) \
+do { \
+ __btrfs_std_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args); \
+} while (0)
+
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...);
+
+#define btrfs_panic(fs_info, errno, fmt, args...) \
+do { \
+ struct btrfs_fs_info *_i = (fs_info); \
+ __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args); \
+ BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)); \
} while (0)
/* acl.c */
@@ -3003,16 +3065,17 @@ void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
-void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
/* scrub.c */
int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
struct btrfs_scrub_progress *progress, int readonly);
-int btrfs_scrub_pause(struct btrfs_root *root);
-int btrfs_scrub_pause_super(struct btrfs_root *root);
-int btrfs_scrub_continue(struct btrfs_root *root);
-int btrfs_scrub_continue_super(struct btrfs_root *root);
+void btrfs_scrub_pause(struct btrfs_root *root);
+void btrfs_scrub_pause_super(struct btrfs_root *root);
+void btrfs_scrub_continue(struct btrfs_root *root);
+void btrfs_scrub_continue_super(struct btrfs_root *root);
+int __btrfs_scrub_cancel(struct btrfs_fs_info *info);
int btrfs_scrub_cancel(struct btrfs_root *root);
int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index fe4cd0f1cef1..03e3748d84d0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -115,6 +115,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
return NULL;
}
+/* Will return either the node or PTR_ERR(-ENOMEM) */
static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
struct inode *inode)
{
@@ -836,10 +837,8 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
btrfs_clear_path_blocking(path, NULL, 0);
/* insert the keys of the items */
- ret = setup_items_for_insert(trans, root, path, keys, data_size,
- total_data_size, total_size, nitems);
- if (ret)
- goto error;
+ setup_items_for_insert(trans, root, path, keys, data_size,
+ total_data_size, total_size, nitems);
/* insert the dir index items */
slot = path->slots[0];
@@ -1108,16 +1107,25 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
return 0;
}
-/* Called when committing the transaction. */
+/*
+ * Called when committing the transaction.
+ * Returns 0 on success.
+ * Returns < 0 on error and returns with an aborted transaction with any
+ * outstanding delayed items cleaned up.
+ */
int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ struct btrfs_root *curr_root = root;
struct btrfs_delayed_root *delayed_root;
struct btrfs_delayed_node *curr_node, *prev_node;
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret = 0;
+ if (trans->aborted)
+ return -EIO;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1130,17 +1138,18 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
curr_node = btrfs_first_delayed_node(delayed_root);
while (curr_node) {
- root = curr_node->root;
- ret = btrfs_insert_delayed_items(trans, path, root,
+ curr_root = curr_node->root;
+ ret = btrfs_insert_delayed_items(trans, path, curr_root,
curr_node);
if (!ret)
- ret = btrfs_delete_delayed_items(trans, path, root,
- curr_node);
+ ret = btrfs_delete_delayed_items(trans, path,
+ curr_root, curr_node);
if (!ret)
- ret = btrfs_update_delayed_inode(trans, root, path,
- curr_node);
+ ret = btrfs_update_delayed_inode(trans, curr_root,
+ path, curr_node);
if (ret) {
btrfs_release_delayed_node(curr_node);
+ btrfs_abort_transaction(trans, root, ret);
break;
}
@@ -1151,6 +1160,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
trans->block_rsv = block_rsv;
+
return ret;
}
@@ -1371,6 +1381,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root)
btrfs_wq_run_delayed_node(delayed_root, root, 0);
}
+/* Will return 0 or -ENOMEM */
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *name,
int name_len, struct inode *dir,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 66e4f29505a3..69f22e3ab3bc 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -420,7 +420,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
* this does all the dirty work in terms of maintaining the correct
* overall modification count.
*/
-static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info,
+static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes,
@@ -487,20 +487,19 @@ static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(ref);
+ kfree(head_ref);
} else {
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
}
- return 0;
}
/*
* helper to insert a delayed tree ref into the rbtree.
*/
-static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -549,18 +548,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(ref);
+ kfree(full_ref);
} else {
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
}
- return 0;
}
/*
* helper to insert a delayed data ref into the rbtree.
*/
-static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -611,12 +609,11 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(ref);
+ kfree(full_ref);
} else {
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
}
- return 0;
}
/*
@@ -634,7 +631,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- int ret;
BUG_ON(extent_op && extent_op->is_data);
ref = kmalloc(sizeof(*ref), GFP_NOFS);
@@ -656,14 +652,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
num_bytes, action, 0);
- BUG_ON(ret);
- ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
+ add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
num_bytes, parent, ref_root, level, action,
for_cow);
- BUG_ON(ret);
if (!need_ref_seq(for_cow, ref_root) &&
waitqueue_active(&delayed_refs->seq_wait))
wake_up(&delayed_refs->seq_wait);
@@ -685,7 +679,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- int ret;
BUG_ON(extent_op && !extent_op->is_data);
ref = kmalloc(sizeof(*ref), GFP_NOFS);
@@ -707,14 +700,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
num_bytes, action, 1);
- BUG_ON(ret);
- ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
+ add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
num_bytes, parent, ref_root, owner, offset,
action, for_cow);
- BUG_ON(ret);
if (!need_ref_seq(for_cow, ref_root) &&
waitqueue_active(&delayed_refs->seq_wait))
wake_up(&delayed_refs->seq_wait);
@@ -729,7 +720,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
{
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- int ret;
head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
if (!head_ref)
@@ -740,10 +730,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
extent_op->is_data);
- BUG_ON(ret);
if (waitqueue_active(&delayed_refs->seq_wait))
wake_up(&delayed_refs->seq_wait);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 31d84e78129b..c1a074d0696f 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -49,9 +49,8 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
di = btrfs_match_dir_item_name(root, path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
- ret = btrfs_extend_item(trans, root, path, data_size);
- }
- if (ret < 0)
+ btrfs_extend_item(trans, root, path, data_size);
+ } else if (ret < 0)
return ERR_PTR(ret);
WARN_ON(ret > 0);
leaf = path->nodes[0];
@@ -116,6 +115,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
* 'location' is the key to stuff into the directory item, 'type' is the
* type of the inode we're pointing to, and 'index' is the sequence number
* to use for the second index (if one is created).
+ * Will return 0 or -ENOMEM
*/
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
*root, const char *name, int name_len,
@@ -383,8 +383,8 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_len - (ptr + sub_item_len - start));
- ret = btrfs_truncate_item(trans, root, path,
- item_len - sub_item_len, 1);
+ btrfs_truncate_item(trans, root, path,
+ item_len - sub_item_len, 1);
}
return ret;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b801d29f3f10..7b55eee15a51 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -48,20 +48,19 @@
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
-static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only);
-static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
-static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static void btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root);
-static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
-static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages,
int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
struct extent_io_tree *pinned_extents);
-static int btrfs_cleanup_transaction(struct btrfs_root *root);
/*
* end_io_wq structs are used to do processing in task context when an IO is
@@ -99,6 +98,7 @@ struct async_submit_bio {
*/
u64 bio_offset;
struct btrfs_work work;
+ int error;
};
/*
@@ -332,8 +332,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
return 0;
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- 0, &cached_state, GFP_NOFS);
- if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
+ 0, &cached_state);
+ if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
goto out;
@@ -344,7 +344,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
(unsigned long long)parent_transid,
(unsigned long long)btrfs_header_generation(eb));
ret = 1;
- clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
+ clear_extent_buffer_uptodate(eb);
out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state, GFP_NOFS);
@@ -360,9 +360,11 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
u64 start, u64 parent_transid)
{
struct extent_io_tree *io_tree;
+ int failed = 0;
int ret;
int num_copies = 0;
int mirror_num = 0;
+ int failed_mirror = 0;
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
@@ -370,9 +372,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
ret = read_extent_buffer_pages(io_tree, eb, start,
WAIT_COMPLETE,
btree_get_extent, mirror_num);
- if (!ret &&
- !verify_parent_transid(io_tree, eb, parent_transid))
- return ret;
+ if (!ret && !verify_parent_transid(io_tree, eb, parent_transid))
+ break;
/*
* This buffer's crc is fine, but its contents are corrupted, so
@@ -380,18 +381,31 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
* any less wrong.
*/
if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
- return ret;
+ break;
+
+ if (!failed_mirror) {
+ failed = 1;
+ printk(KERN_ERR "failed mirror was %d\n", eb->failed_mirror);
+ failed_mirror = eb->failed_mirror;
+ }
num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
eb->start, eb->len);
if (num_copies == 1)
- return ret;
+ break;
mirror_num++;
+ if (mirror_num == failed_mirror)
+ mirror_num++;
+
if (mirror_num > num_copies)
- return ret;
+ break;
}
- return -EIO;
+
+ if (failed && !ret)
+ repair_eb_io_failure(root, eb, failed_mirror);
+
+ return ret;
}
/*
@@ -404,50 +418,27 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
struct extent_io_tree *tree;
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 found_start;
- unsigned long len;
struct extent_buffer *eb;
- int ret;
tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (page->private == EXTENT_PAGE_PRIVATE) {
- WARN_ON(1);
- goto out;
- }
- if (!page->private) {
- WARN_ON(1);
- goto out;
- }
- len = page->private >> 2;
- WARN_ON(len == 0);
-
- eb = alloc_extent_buffer(tree, start, len, page);
- if (eb == NULL) {
- WARN_ON(1);
- goto out;
- }
- ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
- btrfs_header_generation(eb));
- BUG_ON(ret);
- WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
-
+ eb = (struct extent_buffer *)page->private;
+ if (page != eb->pages[0])
+ return 0;
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
WARN_ON(1);
- goto err;
+ return 0;
}
- if (eb->first_page != page) {
+ if (eb->pages[0] != page) {
WARN_ON(1);
- goto err;
+ return 0;
}
if (!PageUptodate(page)) {
WARN_ON(1);
- goto err;
+ return 0;
}
csum_tree_block(root, eb, 0);
-err:
- free_extent_buffer(eb);
-out:
return 0;
}
@@ -537,34 +528,74 @@ static noinline int check_leaf(struct btrfs_root *root,
return 0;
}
+struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree,
+ struct page *page, int max_walk)
+{
+ struct extent_buffer *eb;
+ u64 start = page_offset(page);
+ u64 target = start;
+ u64 min_start;
+
+ if (start < max_walk)
+ min_start = 0;
+ else
+ min_start = start - max_walk;
+
+ while (start >= min_start) {
+ eb = find_extent_buffer(tree, start, 0);
+ if (eb) {
+ /*
+ * we found an extent buffer and it contains our page
+ * horray!
+ */
+ if (eb->start <= target &&
+ eb->start + eb->len > target)
+ return eb;
+
+ /* we found an extent buffer that wasn't for us */
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ if (start == 0)
+ break;
+ start -= PAGE_CACHE_SIZE;
+ }
+ return NULL;
+}
+
static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state)
{
struct extent_io_tree *tree;
u64 found_start;
int found_level;
- unsigned long len;
struct extent_buffer *eb;
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
int ret = 0;
+ int reads_done;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (page->private == EXTENT_PAGE_PRIVATE)
- goto out;
if (!page->private)
goto out;
- len = page->private >> 2;
- WARN_ON(len == 0);
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ eb = (struct extent_buffer *)page->private;
- eb = alloc_extent_buffer(tree, start, len, page);
- if (eb == NULL) {
+ /* the pending IO might have been the only thing that kept this buffer
+ * in memory. Make sure we have a ref for all this other checks
+ */
+ extent_buffer_get(eb);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ if (!reads_done)
+ goto err;
+
+ if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
ret = -EIO;
- goto out;
+ goto err;
}
found_start = btrfs_header_bytenr(eb);
- if (found_start != start) {
+ if (found_start != eb->start) {
printk_ratelimited(KERN_INFO "btrfs bad tree block start "
"%llu %llu\n",
(unsigned long long)found_start,
@@ -572,13 +603,6 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
ret = -EIO;
goto err;
}
- if (eb->first_page != page) {
- printk(KERN_INFO "btrfs bad first page %lu %lu\n",
- eb->first_page->index, page->index);
- WARN_ON(1);
- ret = -EIO;
- goto err;
- }
if (check_tree_block_fsid(root, eb)) {
printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
(unsigned long long)eb->start);
@@ -606,48 +630,31 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
ret = -EIO;
}
- end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
- end = eb->start + end - 1;
+ if (!ret)
+ set_extent_buffer_uptodate(eb);
err:
if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
btree_readahead_hook(root, eb, eb->start, ret);
}
+ if (ret)
+ clear_extent_buffer_uptodate(eb);
free_extent_buffer(eb);
out:
return ret;
}
-static int btree_io_failed_hook(struct bio *failed_bio,
- struct page *page, u64 start, u64 end,
- int mirror_num, struct extent_state *state)
+static int btree_io_failed_hook(struct page *page, int failed_mirror)
{
- struct extent_io_tree *tree;
- unsigned long len;
struct extent_buffer *eb;
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (page->private == EXTENT_PAGE_PRIVATE)
- goto out;
- if (!page->private)
- goto out;
-
- len = page->private >> 2;
- WARN_ON(len == 0);
-
- eb = alloc_extent_buffer(tree, start, len, page);
- if (eb == NULL)
- goto out;
-
- if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
- clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+ eb = (struct extent_buffer *)page->private;
+ set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ eb->failed_mirror = failed_mirror;
+ if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
btree_readahead_hook(root, eb, eb->start, -EIO);
- }
- free_extent_buffer(eb);
-
-out:
return -EIO; /* we fixed nothing */
}
@@ -719,11 +726,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
+ int ret;
async = container_of(work, struct async_submit_bio, work);
- async->submit_bio_start(async->inode, async->rw, async->bio,
- async->mirror_num, async->bio_flags,
- async->bio_offset);
+ ret = async->submit_bio_start(async->inode, async->rw, async->bio,
+ async->mirror_num, async->bio_flags,
+ async->bio_offset);
+ if (ret)
+ async->error = ret;
}
static void run_one_async_done(struct btrfs_work *work)
@@ -744,6 +754,12 @@ static void run_one_async_done(struct btrfs_work *work)
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
+ /* If an error occured we just want to clean up the bio and move on */
+ if (async->error) {
+ bio_endio(async->bio, async->error);
+ return;
+ }
+
async->submit_bio_done(async->inode, async->rw, async->bio,
async->mirror_num, async->bio_flags,
async->bio_offset);
@@ -785,6 +801,8 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->bio_flags = bio_flags;
async->bio_offset = bio_offset;
+ async->error = 0;
+
atomic_inc(&fs_info->nr_async_submits);
if (rw & REQ_SYNC)
@@ -806,15 +824,18 @@ static int btree_csum_one_bio(struct bio *bio)
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
struct btrfs_root *root;
+ int ret = 0;
WARN_ON(bio->bi_vcnt <= 0);
while (bio_index < bio->bi_vcnt) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
- csum_dirty_buffer(root, bvec->bv_page);
+ ret = csum_dirty_buffer(root, bvec->bv_page);
+ if (ret)
+ break;
bio_index++;
bvec++;
}
- return 0;
+ return ret;
}
static int __btree_submit_bio_start(struct inode *inode, int rw,
@@ -826,8 +847,7 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
* when we're called for a write, we're already in the async
* submission context. Just jump into btrfs_map_bio
*/
- btree_csum_one_bio(bio);
- return 0;
+ return btree_csum_one_bio(bio);
}
static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
@@ -847,15 +867,16 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
{
int ret;
- ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
- bio, 1);
- BUG_ON(ret);
-
if (!(rw & REQ_WRITE)) {
+
/*
* called for a read, do the setup so that checksum validation
* can happen in the async kernel threads
*/
+ ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
+ bio, 1);
+ if (ret)
+ return ret;
return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
mirror_num, 0);
}
@@ -873,7 +894,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
/*
* we can't safely write a btree page from here,
@@ -888,38 +910,10 @@ static int btree_migratepage(struct address_space *mapping,
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}
#endif
-static int btree_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct extent_io_tree *tree;
- struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- struct extent_buffer *eb;
- int was_dirty;
-
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (!(current->flags & PF_MEMALLOC)) {
- return extent_write_full_page(tree, page,
- btree_get_extent, wbc);
- }
-
- redirty_page_for_writepage(wbc, page);
- eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
- WARN_ON(!eb);
-
- was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- if (!was_dirty) {
- spin_lock(&root->fs_info->delalloc_lock);
- root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
- spin_unlock(&root->fs_info->delalloc_lock);
- }
- free_extent_buffer(eb);
-
- unlock_page(page);
- return 0;
-}
static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
@@ -939,7 +933,7 @@ static int btree_writepages(struct address_space *mapping,
if (num_dirty < thresh)
return 0;
}
- return extent_writepages(tree, mapping, btree_get_extent, wbc);
+ return btree_write_cache_pages(mapping, wbc);
}
static int btree_readpage(struct file *file, struct page *page)
@@ -951,16 +945,8 @@ static int btree_readpage(struct file *file, struct page *page)
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
- struct extent_io_tree *tree;
- struct extent_map_tree *map;
- int ret;
-
if (PageWriteback(page) || PageDirty(page))
return 0;
-
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- map = &BTRFS_I(page->mapping->host)->extent_tree;
-
/*
* We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
* slab allocation from alloc_extent_state down the callchain where
@@ -968,18 +954,7 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
*/
gfp_flags &= ~GFP_SLAB_BUG_MASK;
- ret = try_release_extent_state(map, tree, page, gfp_flags);
- if (!ret)
- return 0;
-
- ret = try_release_extent_buffer(tree, page);
- if (ret == 1) {
- ClearPagePrivate(page);
- set_page_private(page, 0);
- page_cache_release(page);
- }
-
- return ret;
+ return try_release_extent_buffer(page, gfp_flags);
}
static void btree_invalidatepage(struct page *page, unsigned long offset)
@@ -997,15 +972,28 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
}
}
+static int btree_set_page_dirty(struct page *page)
+{
+ struct extent_buffer *eb;
+
+ BUG_ON(!PagePrivate(page));
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+ BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(!atomic_read(&eb->refs));
+ btrfs_assert_tree_locked(eb);
+ return __set_page_dirty_nobuffers(page);
+}
+
static const struct address_space_operations btree_aops = {
.readpage = btree_readpage,
- .writepage = btree_writepage,
.writepages = btree_writepages,
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
#ifdef CONFIG_MIGRATION
.migratepage = btree_migratepage,
#endif
+ .set_page_dirty = btree_set_page_dirty,
};
int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -1048,7 +1036,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
free_extent_buffer(buf);
return -EIO;
- } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+ } else if (extent_buffer_uptodate(buf)) {
*eb = buf;
} else {
free_extent_buffer(buf);
@@ -1073,20 +1061,20 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
struct extent_buffer *eb;
eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
- bytenr, blocksize, NULL);
+ bytenr, blocksize);
return eb;
}
int btrfs_write_tree_block(struct extent_buffer *buf)
{
- return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+ return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
buf->start + buf->len - 1);
}
int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
- return filemap_fdatawait_range(buf->first_page->mapping,
+ return filemap_fdatawait_range(buf->pages[0]->mapping,
buf->start, buf->start + buf->len - 1);
}
@@ -1101,17 +1089,13 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
return NULL;
ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
-
- if (ret == 0)
- set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
return buf;
}
-int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf)
+void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf)
{
- struct inode *btree_inode = root->fs_info->btree_inode;
if (btrfs_header_generation(buf) ==
root->fs_info->running_transaction->transid) {
btrfs_assert_tree_locked(buf);
@@ -1120,23 +1104,27 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
spin_lock(&root->fs_info->delalloc_lock);
if (root->fs_info->dirty_metadata_bytes >= buf->len)
root->fs_info->dirty_metadata_bytes -= buf->len;
- else
- WARN_ON(1);
+ else {
+ spin_unlock(&root->fs_info->delalloc_lock);
+ btrfs_panic(root->fs_info, -EOVERFLOW,
+ "Can't clear %lu bytes from "
+ " dirty_mdatadata_bytes (%lu)",
+ buf->len,
+ root->fs_info->dirty_metadata_bytes);
+ }
spin_unlock(&root->fs_info->delalloc_lock);
}
/* ugh, clear_extent_buffer_dirty needs to lock the page */
btrfs_set_lock_blocking(buf);
- clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
- buf);
+ clear_extent_buffer_dirty(buf);
}
- return 0;
}
-static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
- u32 stripesize, struct btrfs_root *root,
- struct btrfs_fs_info *fs_info,
- u64 objectid)
+static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
+ u32 stripesize, struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid)
{
root->node = NULL;
root->commit_root = NULL;
@@ -1150,7 +1138,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->orphan_item_inserted = 0;
root->orphan_cleanup_state = 0;
- root->fs_info = fs_info;
root->objectid = objectid;
root->last_trans = 0;
root->highest_objectid = 0;
@@ -1189,13 +1176,12 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->defrag_running = 0;
root->root_key.objectid = objectid;
root->anon_dev = 0;
- return 0;
}
-static int find_and_setup_root(struct btrfs_root *tree_root,
- struct btrfs_fs_info *fs_info,
- u64 objectid,
- struct btrfs_root *root)
+static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid,
+ struct btrfs_root *root)
{
int ret;
u32 blocksize;
@@ -1208,7 +1194,8 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
&root->root_item, &root->root_key);
if (ret > 0)
return -ENOENT;
- BUG_ON(ret);
+ else if (ret < 0)
+ return ret;
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
@@ -1224,6 +1211,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
return 0;
}
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
+ if (root)
+ root->fs_info = fs_info;
+ return root;
+}
+
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
@@ -1231,7 +1226,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = fs_info->tree_root;
struct extent_buffer *leaf;
- root = kzalloc(sizeof(*root), GFP_NOFS);
+ root = btrfs_alloc_root(fs_info);
if (!root)
return ERR_PTR(-ENOMEM);
@@ -1326,7 +1321,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
u32 blocksize;
int ret = 0;
- root = kzalloc(sizeof(*root), GFP_NOFS);
+ root = btrfs_alloc_root(fs_info);
if (!root)
return ERR_PTR(-ENOMEM);
if (location->offset == (u64)-1) {
@@ -1369,7 +1364,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
root->commit_root = btrfs_root_node(root);
- BUG_ON(!root->node);
+ BUG_ON(!root->node); /* -ENOMEM */
out:
if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
root->ref_cows = 1;
@@ -1505,41 +1500,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
return 0;
}
-static int bio_ready_for_csum(struct bio *bio)
-{
- u64 length = 0;
- u64 buf_len = 0;
- u64 start = 0;
- struct page *page;
- struct extent_io_tree *io_tree = NULL;
- struct bio_vec *bvec;
- int i;
- int ret;
-
- bio_for_each_segment(bvec, bio, i) {
- page = bvec->bv_page;
- if (page->private == EXTENT_PAGE_PRIVATE) {
- length += bvec->bv_len;
- continue;
- }
- if (!page->private) {
- length += bvec->bv_len;
- continue;
- }
- length = bvec->bv_len;
- buf_len = page->private >> 2;
- start = page_offset(page) + bvec->bv_offset;
- io_tree = &BTRFS_I(page->mapping->host)->io_tree;
- }
- /* are we fully contained in this bio? */
- if (buf_len <= length)
- return 1;
-
- ret = extent_range_uptodate(io_tree, start + length,
- start + buf_len - 1);
- return ret;
-}
-
/*
* called by the kthread helper functions to finally call the bio end_io
* functions. This is where read checksum verification actually happens
@@ -1555,17 +1515,6 @@ static void end_workqueue_fn(struct btrfs_work *work)
bio = end_io_wq->bio;
fs_info = end_io_wq->info;
- /* metadata bio reads are special because the whole tree block must
- * be checksummed at once. This makes sure the entire block is in
- * ram and up to date before trying to verify things. For
- * blocksize <= pagesize, it is basically a noop
- */
- if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
- !bio_ready_for_csum(bio)) {
- btrfs_queue_worker(&fs_info->endio_meta_workers,
- &end_io_wq->work);
- return;
- }
error = end_io_wq->error;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
@@ -1588,9 +1537,7 @@ static int cleaner_kthread(void *arg)
btrfs_run_defrag_inodes(root->fs_info);
}
- if (freezing(current)) {
- refrigerator();
- } else {
+ if (!try_to_freeze()) {
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop())
schedule();
@@ -1608,9 +1555,10 @@ static int transaction_kthread(void *arg)
u64 transid;
unsigned long now;
unsigned long delay;
- int ret;
+ bool cannot_commit;
do {
+ cannot_commit = false;
delay = HZ * 30;
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
@@ -1632,11 +1580,14 @@ static int transaction_kthread(void *arg)
transid = cur->transid;
spin_unlock(&root->fs_info->trans_lock);
+ /* If the file system is aborted, this will always fail. */
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ cannot_commit = true;
+ goto sleep;
+ }
if (transid == trans->transid) {
- ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
+ btrfs_commit_transaction(trans, root);
} else {
btrfs_end_transaction(trans, root);
}
@@ -1644,12 +1595,11 @@ sleep:
wake_up_process(root->fs_info->cleaner_kthread);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
- if (freezing(current)) {
- refrigerator();
- } else {
+ if (!try_to_freeze()) {
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop() &&
- !btrfs_transaction_blocked(root->fs_info))
+ (!btrfs_transaction_blocked(root->fs_info) ||
+ cannot_commit))
schedule_timeout(delay);
__set_current_state(TASK_RUNNING);
}
@@ -1886,9 +1836,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
}
-struct btrfs_root *open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options)
+int open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options)
{
u32 sectorsize;
u32 nodesize;
@@ -1900,8 +1850,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_key location;
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
- struct btrfs_root *tree_root = btrfs_sb(sb);
- struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *tree_root;
struct btrfs_root *extent_root;
struct btrfs_root *csum_root;
struct btrfs_root *chunk_root;
@@ -1912,16 +1862,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
int num_backups_tried = 0;
int backup_index = 0;
- extent_root = fs_info->extent_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- csum_root = fs_info->csum_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- chunk_root = fs_info->chunk_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- dev_root = fs_info->dev_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
+ extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
+ csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
+ chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
+ dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
- if (!extent_root || !csum_root || !chunk_root || !dev_root) {
+ if (!tree_root || !extent_root || !csum_root ||
+ !chunk_root || !dev_root) {
err = -ENOMEM;
goto fail;
}
@@ -2040,6 +1988,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
fs_info->btree_inode->i_mapping);
+ BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
@@ -2102,7 +2051,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
/* check FS state, whether FS is broken. */
fs_info->fs_state |= btrfs_super_flags(disk_super);
- btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+ ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+ if (ret) {
+ printk(KERN_ERR "btrfs: superblock contains fatal errors\n");
+ err = ret;
+ goto fail_alloc;
+ }
/*
* run through our array of backup supers and setup
@@ -2133,10 +2087,38 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_alloc;
}
+ if (btrfs_super_leafsize(disk_super) !=
+ btrfs_super_nodesize(disk_super)) {
+ printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+ "blocksizes don't match. node %d leaf %d\n",
+ btrfs_super_nodesize(disk_super),
+ btrfs_super_leafsize(disk_super));
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+ if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+ "blocksize (%d) was too large\n",
+ btrfs_super_leafsize(disk_super));
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
features = btrfs_super_incompat_flags(disk_super);
features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+
+ /*
+ * flag our filesystem as having big metadata blocks if
+ * they are bigger than the page size
+ */
+ if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+ if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+ printk(KERN_INFO "btrfs flagging fs with big metadata feature\n");
+ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+ }
+
btrfs_set_super_incompat_flags(disk_super, features);
features = btrfs_super_compat_ro_flags(disk_super) &
@@ -2264,6 +2246,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_sb_buffer;
}
+ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ (leafsize != nodesize || sectorsize != nodesize)) {
+ printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes "
+ "are not allowed for mixed block groups on %s\n",
+ sb->s_id);
+ goto fail_sb_buffer;
+ }
+
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_read_sys_array(tree_root);
mutex_unlock(&fs_info->chunk_mutex);
@@ -2283,7 +2273,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
chunk_root->node = read_tree_block(chunk_root,
btrfs_super_chunk_root(disk_super),
blocksize, generation);
- BUG_ON(!chunk_root->node);
+ BUG_ON(!chunk_root->node); /* -ENOMEM */
if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
sb->s_id);
@@ -2411,7 +2401,7 @@ retry_root_backup:
btrfs_level_size(tree_root,
btrfs_super_log_root_level(disk_super));
- log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ log_tree_root = btrfs_alloc_root(fs_info);
if (!log_tree_root) {
err = -ENOMEM;
goto fail_trans_kthread;
@@ -2423,21 +2413,31 @@ retry_root_backup:
log_tree_root->node = read_tree_block(tree_root, bytenr,
blocksize,
generation + 1);
+ /* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_error(tree_root->fs_info, ret,
+ "Failed to recover log tree");
+ free_extent_buffer(log_tree_root->node);
+ kfree(log_tree_root);
+ goto fail_trans_kthread;
+ }
if (sb->s_flags & MS_RDONLY) {
- ret = btrfs_commit_super(tree_root);
- BUG_ON(ret);
+ ret = btrfs_commit_super(tree_root);
+ if (ret)
+ goto fail_trans_kthread;
}
}
ret = btrfs_find_orphan_roots(tree_root);
- BUG_ON(ret);
+ if (ret)
+ goto fail_trans_kthread;
if (!(sb->s_flags & MS_RDONLY)) {
ret = btrfs_cleanup_fs_roots(fs_info);
- BUG_ON(ret);
+ if (ret) {
+ }
ret = btrfs_recover_relocation(tree_root);
if (ret < 0) {
@@ -2472,11 +2472,11 @@ retry_root_backup:
if (err) {
close_ctree(tree_root);
- return ERR_PTR(err);
+ return err;
}
}
- return tree_root;
+ return 0;
fail_trans_kthread:
kthread_stop(fs_info->transaction_kthread);
@@ -2522,8 +2522,7 @@ fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
btrfs_close_devices(fs_info->fs_devices);
- free_fs_info(fs_info);
- return ERR_PTR(err);
+ return err;
recovery_tree_root:
if (!btrfs_test_opt(tree_root, RECOVERY))
@@ -2858,6 +2857,8 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
if (total_errors > max_errors) {
printk(KERN_ERR "btrfs: %d errors while writing supers\n",
total_errors);
+
+ /* This shouldn't happen. FUA is masked off if unsupported */
BUG();
}
@@ -2874,9 +2875,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
}
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (total_errors > max_errors) {
- printk(KERN_ERR "btrfs: %d errors while writing supers\n",
- total_errors);
- BUG();
+ btrfs_error(root->fs_info, -EIO,
+ "%d errors while writing supers", total_errors);
+ return -EIO;
}
return 0;
}
@@ -2890,7 +2891,20 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+/* Kill all outstanding I/O */
+void btrfs_abort_devices(struct btrfs_root *root)
+{
+ struct list_head *head;
+ struct btrfs_device *dev;
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ head = &root->fs_info->fs_devices->devices;
+ list_for_each_entry_rcu(dev, head, dev_list) {
+ blk_abort_queue(dev->bdev->bd_disk->queue);
+ }
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+}
+
+void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
{
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
@@ -2903,7 +2917,6 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
__btrfs_remove_free_space_cache(root->free_ino_pinned);
__btrfs_remove_free_space_cache(root->free_ino_ctl);
free_fs_root(root);
- return 0;
}
static void free_fs_root(struct btrfs_root *root)
@@ -2920,7 +2933,7 @@ static void free_fs_root(struct btrfs_root *root)
kfree(root);
}
-static int del_fs_roots(struct btrfs_fs_info *fs_info)
+static void del_fs_roots(struct btrfs_fs_info *fs_info)
{
int ret;
struct btrfs_root *gang[8];
@@ -2949,7 +2962,6 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
for (i = 0; i < ret; i++)
btrfs_free_fs_root(fs_info, gang[i]);
}
- return 0;
}
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2998,14 +3010,21 @@ int btrfs_commit_super(struct btrfs_root *root)
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
+ if (ret)
+ return ret;
/* run commit again to drop the original snapshot */
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
ret = btrfs_write_and_wait_transaction(NULL, root);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_error(root->fs_info, ret,
+ "Failed to sync btree inode to disk.");
+ return ret;
+ }
ret = write_ctree_super(NULL, root, 0);
return ret;
@@ -3029,7 +3048,7 @@ int close_ctree(struct btrfs_root *root)
(atomic_read(&fs_info->defrag_running) == 0));
/* clear out the rbtree of defraggable inodes */
- btrfs_run_defrag_inodes(root->fs_info);
+ btrfs_run_defrag_inodes(fs_info);
/*
* Here come 2 situations when btrfs is broken to flip readonly:
@@ -3058,8 +3077,8 @@ int close_ctree(struct btrfs_root *root)
btrfs_put_block_group_cache(fs_info);
- kthread_stop(root->fs_info->transaction_kthread);
- kthread_stop(root->fs_info->cleaner_kthread);
+ kthread_stop(fs_info->transaction_kthread);
+ kthread_stop(fs_info->cleaner_kthread);
fs_info->closing = 2;
smp_mb();
@@ -3077,14 +3096,14 @@ int close_ctree(struct btrfs_root *root)
free_extent_buffer(fs_info->extent_root->commit_root);
free_extent_buffer(fs_info->tree_root->node);
free_extent_buffer(fs_info->tree_root->commit_root);
- free_extent_buffer(root->fs_info->chunk_root->node);
- free_extent_buffer(root->fs_info->chunk_root->commit_root);
- free_extent_buffer(root->fs_info->dev_root->node);
- free_extent_buffer(root->fs_info->dev_root->commit_root);
- free_extent_buffer(root->fs_info->csum_root->node);
- free_extent_buffer(root->fs_info->csum_root->commit_root);
+ free_extent_buffer(fs_info->chunk_root->node);
+ free_extent_buffer(fs_info->chunk_root->commit_root);
+ free_extent_buffer(fs_info->dev_root->node);
+ free_extent_buffer(fs_info->dev_root->commit_root);
+ free_extent_buffer(fs_info->csum_root->node);
+ free_extent_buffer(fs_info->csum_root->commit_root);
- btrfs_free_block_groups(root->fs_info);
+ btrfs_free_block_groups(fs_info);
del_fs_roots(fs_info);
@@ -3115,18 +3134,15 @@ int close_ctree(struct btrfs_root *root)
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
- free_fs_info(fs_info);
-
return 0;
}
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
{
int ret;
- struct inode *btree_inode = buf->first_page->mapping->host;
+ struct inode *btree_inode = buf->pages[0]->mapping->host;
- ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
- NULL);
+ ret = extent_buffer_uptodate(buf);
if (!ret)
return ret;
@@ -3137,16 +3153,13 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
{
- struct inode *btree_inode = buf->first_page->mapping->host;
- return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
- buf);
+ return set_extent_buffer_uptodate(buf);
}
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
- struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+ struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
u64 transid = btrfs_header_generation(buf);
- struct inode *btree_inode = root->fs_info->btree_inode;
int was_dirty;
btrfs_assert_tree_locked(buf);
@@ -3158,8 +3171,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
(unsigned long long)root->fs_info->generation);
WARN_ON(1);
}
- was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
- buf);
+ was_dirty = set_extent_buffer_dirty(buf);
if (!was_dirty) {
spin_lock(&root->fs_info->delalloc_lock);
root->fs_info->dirty_metadata_bytes += buf->len;
@@ -3213,12 +3225,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
{
- struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
- int ret;
- ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
- if (ret == 0)
- set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
- return ret;
+ struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+ return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
}
static int btree_lock_page_hook(struct page *page, void *data,
@@ -3226,17 +3234,21 @@ static int btree_lock_page_hook(struct page *page, void *data,
{
struct inode *inode = page->mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_buffer *eb;
- unsigned long len;
- u64 bytenr = page_offset(page);
- if (page->private == EXTENT_PAGE_PRIVATE)
+ /*
+ * We culled this eb but the page is still hanging out on the mapping,
+ * carry on.
+ */
+ if (!PagePrivate(page))
goto out;
- len = page->private >> 2;
- eb = find_extent_buffer(io_tree, bytenr, len);
- if (!eb)
+ eb = (struct extent_buffer *)page->private;
+ if (!eb) {
+ WARN_ON(1);
+ goto out;
+ }
+ if (page != eb->pages[0])
goto out;
if (!btrfs_try_tree_write_lock(eb)) {
@@ -3255,7 +3267,6 @@ static int btree_lock_page_hook(struct page *page, void *data,
}
btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
out:
if (!trylock_page(page)) {
flush_fn(data);
@@ -3264,15 +3275,23 @@ out:
return 0;
}
-static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
+ if (btrfs_super_csum_type(fs_info->super_copy) >= ARRAY_SIZE(btrfs_csum_sizes)) {
+ printk(KERN_ERR "btrfs: unsupported checksum algorithm\n");
+ return -EINVAL;
+ }
+
if (read_only)
- return;
+ return 0;
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
printk(KERN_WARNING "warning: mount fs with errors, "
"running btrfsck is recommended\n");
+ }
+
+ return 0;
}
int btrfs_error_commit_super(struct btrfs_root *root)
@@ -3294,7 +3313,7 @@ int btrfs_error_commit_super(struct btrfs_root *root)
return ret;
}
-static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;
@@ -3316,11 +3335,9 @@ static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
spin_unlock(&root->fs_info->ordered_extent_lock);
mutex_unlock(&root->fs_info->ordered_operations_mutex);
-
- return 0;
}
-static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct list_head splice;
struct btrfs_ordered_extent *ordered;
@@ -3352,12 +3369,10 @@ static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
}
spin_unlock(&root->fs_info->ordered_extent_lock);
-
- return 0;
}
-static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
- struct btrfs_root *root)
+int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_root *root)
{
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -3366,6 +3381,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
delayed_refs = &trans->delayed_refs;
+again:
spin_lock(&delayed_refs->lock);
if (delayed_refs->num_entries == 0) {
spin_unlock(&delayed_refs->lock);
@@ -3387,6 +3403,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_delayed_ref_head *head;
head = btrfs_delayed_node_to_head(ref);
+ spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
kfree(head->extent_op);
delayed_refs->num_heads--;
@@ -3394,8 +3411,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
delayed_refs->num_heads_ready--;
list_del_init(&head->cluster);
mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(ref);
+ goto again;
}
-
spin_unlock(&delayed_refs->lock);
btrfs_put_delayed_ref(ref);
@@ -3408,7 +3426,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
return ret;
}
-static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
{
struct btrfs_pending_snapshot *snapshot;
struct list_head splice;
@@ -3426,11 +3444,9 @@ static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
kfree(snapshot);
}
-
- return 0;
}
-static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;
@@ -3450,8 +3466,6 @@ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
}
spin_unlock(&root->fs_info->delalloc_lock);
-
- return 0;
}
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3542,13 +3556,43 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
return 0;
}
-static int btrfs_cleanup_transaction(struct btrfs_root *root)
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
+ struct btrfs_root *root)
+{
+ btrfs_destroy_delayed_refs(cur_trans, root);
+ btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
+ cur_trans->dirty_pages.dirty_bytes);
+
+ /* FIXME: cleanup wait for commit */
+ cur_trans->in_commit = 1;
+ cur_trans->blocked = 1;
+ if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
+ wake_up(&root->fs_info->transaction_blocked_wait);
+
+ cur_trans->blocked = 0;
+ if (waitqueue_active(&root->fs_info->transaction_wait))
+ wake_up(&root->fs_info->transaction_wait);
+
+ cur_trans->commit_done = 1;
+ if (waitqueue_active(&cur_trans->commit_wait))
+ wake_up(&cur_trans->commit_wait);
+
+ btrfs_destroy_pending_snapshots(cur_trans);
+
+ btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
+ EXTENT_DIRTY);
+
+ /*
+ memset(cur_trans, 0, sizeof(*cur_trans));
+ kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ */
+}
+
+int btrfs_cleanup_transaction(struct btrfs_root *root)
{
struct btrfs_transaction *t;
LIST_HEAD(list);
- WARN_ON(1);
-
mutex_lock(&root->fs_info->transaction_kthread_mutex);
spin_lock(&root->fs_info->trans_lock);
@@ -3613,6 +3657,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
return 0;
}
+static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page,
+ u64 start, u64 end,
+ struct extent_state *state)
+{
+ struct super_block *sb = page->mapping->host->i_sb;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ btrfs_error(fs_info, -EIO,
+ "Error occured while writing out btree at %llu", start);
+ return -EIO;
+}
+
static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
@@ -3620,4 +3675,5 @@ static struct extent_io_ops btree_extent_io_ops = {
.submit_bio_hook = btree_submit_bio_hook,
/* note we're sharing with inode.c for the merge bio hook */
.merge_bio_hook = btrfs_merge_bio_hook,
+ .writepage_io_failed_hook = btree_writepage_io_failed_hook,
};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c99d0a8f13fa..a7ace1a2dd12 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -44,11 +44,11 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
-int clean_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf);
-struct btrfs_root *open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options);
+void clean_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf);
+int open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options);
int close_ctree(struct btrfs_root *root);
int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
@@ -64,7 +64,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
-int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
@@ -85,6 +85,10 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_cleanup_transaction(struct btrfs_root *root);
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
+ struct btrfs_root *root);
+void btrfs_abort_devices(struct btrfs_root *root);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1b8dc33778f9..e887ee62b6d4 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
u64 root_objectid, u32 generation,
int check_generation)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
struct inode *inode;
struct btrfs_key key;
@@ -193,7 +193,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
if (ret < 0)
goto fail;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Key with offset of -1 found */
if (path->slots[0] == 0) {
ret = -ENOENT;
goto fail;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 37e0a800d34e..8fe517bd8521 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -245,7 +245,7 @@ static int exclude_super_stripes(struct btrfs_root *root,
cache->bytes_super += stripe_len;
ret = add_excluded_extent(root, cache->key.objectid,
stripe_len);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -253,13 +253,13 @@ static int exclude_super_stripes(struct btrfs_root *root,
ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
cache->key.objectid, bytenr,
0, &logical, &nr, &stripe_len);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
while (nr--) {
cache->bytes_super += stripe_len;
ret = add_excluded_extent(root, logical[nr],
stripe_len);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
kfree(logical);
@@ -321,7 +321,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
total_added += size;
ret = btrfs_add_free_space(block_group, start,
size);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or logic error */
start = extent_end + 1;
} else {
break;
@@ -332,7 +332,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
size = end - start;
total_added += size;
ret = btrfs_add_free_space(block_group, start, size);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or logic error */
}
return total_added;
@@ -474,7 +474,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
int ret = 0;
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
- BUG_ON(!caching_ctl);
+ if (!caching_ctl)
+ return -ENOMEM;
INIT_LIST_HEAD(&caching_ctl->list);
mutex_init(&caching_ctl->mutex);
@@ -982,7 +983,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
ret = btrfs_next_leaf(root, path);
if (ret < 0)
return ret;
- BUG_ON(ret > 0);
+ BUG_ON(ret > 0); /* Corruption */
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key,
@@ -1008,9 +1009,9 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
new_size + extra_size, 1);
if (ret < 0)
return ret;
- BUG_ON(ret);
+ BUG_ON(ret); /* Corruption */
- ret = btrfs_extend_item(trans, root, path, new_size);
+ btrfs_extend_item(trans, root, path, new_size);
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1478,7 +1479,11 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
err = ret;
goto out;
}
- BUG_ON(ret);
+ if (ret && !insert) {
+ err = -ENOENT;
+ goto out;
+ }
+ BUG_ON(ret); /* Corruption */
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -1592,13 +1597,13 @@ out:
* helper to add new inline back ref
*/
static noinline_for_stack
-int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_extent_inline_ref *iref,
- u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int refs_to_add,
- struct btrfs_delayed_extent_op *extent_op)
+void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int refs_to_add,
+ struct btrfs_delayed_extent_op *extent_op)
{
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -1608,7 +1613,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
u64 refs;
int size;
int type;
- int ret;
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1617,7 +1621,7 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
type = extent_ref_type(parent, owner);
size = btrfs_extent_inline_ref_size(type);
- ret = btrfs_extend_item(trans, root, path, size);
+ btrfs_extend_item(trans, root, path, size);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
@@ -1652,7 +1656,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
}
btrfs_mark_buffer_dirty(leaf);
- return 0;
}
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
@@ -1687,12 +1690,12 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
* helper to update/remove inline back ref
*/
static noinline_for_stack
-int update_inline_extent_backref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_extent_inline_ref *iref,
- int refs_to_mod,
- struct btrfs_delayed_extent_op *extent_op)
+void update_inline_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ int refs_to_mod,
+ struct btrfs_delayed_extent_op *extent_op)
{
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -1703,7 +1706,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
u32 item_size;
int size;
int type;
- int ret;
u64 refs;
leaf = path->nodes[0];
@@ -1745,10 +1747,9 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
memmove_extent_buffer(leaf, ptr, ptr + size,
end - ptr - size);
item_size -= size;
- ret = btrfs_truncate_item(trans, root, path, item_size, 1);
+ btrfs_truncate_item(trans, root, path, item_size, 1);
}
btrfs_mark_buffer_dirty(leaf);
- return 0;
}
static noinline_for_stack
@@ -1768,13 +1769,13 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
root_objectid, owner, offset, 1);
if (ret == 0) {
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
- ret = update_inline_extent_backref(trans, root, path, iref,
- refs_to_add, extent_op);
+ update_inline_extent_backref(trans, root, path, iref,
+ refs_to_add, extent_op);
} else if (ret == -ENOENT) {
- ret = setup_inline_extent_backref(trans, root, path, iref,
- parent, root_objectid,
- owner, offset, refs_to_add,
- extent_op);
+ setup_inline_extent_backref(trans, root, path, iref, parent,
+ root_objectid, owner, offset,
+ refs_to_add, extent_op);
+ ret = 0;
}
return ret;
}
@@ -1804,12 +1805,12 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_extent_inline_ref *iref,
int refs_to_drop, int is_data)
{
- int ret;
+ int ret = 0;
BUG_ON(!is_data && refs_to_drop != 1);
if (iref) {
- ret = update_inline_extent_backref(trans, root, path, iref,
- -refs_to_drop, NULL);
+ update_inline_extent_backref(trans, root, path, iref,
+ -refs_to_drop, NULL);
} else if (is_data) {
ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
} else {
@@ -1835,6 +1836,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
/* Tell the block device(s) that the sectors can be discarded */
ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
bytenr, &num_bytes, &bbio, 0);
+ /* Error condition is -ENOMEM */
if (!ret) {
struct btrfs_bio_stripe *stripe = bbio->stripes;
int i;
@@ -1850,7 +1852,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
if (!ret)
discarded_bytes += stripe->length;
else if (ret != -EOPNOTSUPP)
- break;
+ break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
/*
* Just in case we get back EOPNOTSUPP for some reason,
@@ -1869,6 +1871,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
return ret;
}
+/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -1944,7 +1947,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ret = insert_extent_backref(trans, root->fs_info->extent_root,
path, bytenr, parent, root_objectid,
owner, offset, refs_to_add);
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
out:
btrfs_free_path(path);
return err;
@@ -2031,6 +2035,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
int ret;
int err = 0;
+ if (trans->aborted)
+ return 0;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2128,7 +2135,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_extent_op *extent_op,
int insert_reserved)
{
- int ret;
+ int ret = 0;
+
+ if (trans->aborted)
+ return 0;
+
if (btrfs_delayed_ref_is_head(node)) {
struct btrfs_delayed_ref_head *head;
/*
@@ -2146,11 +2157,10 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
ret = btrfs_del_csums(trans, root,
node->bytenr,
node->num_bytes);
- BUG_ON(ret);
}
}
mutex_unlock(&head->mutex);
- return 0;
+ return ret;
}
if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
@@ -2197,6 +2207,10 @@ again:
return NULL;
}
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct list_head *cluster)
@@ -2285,9 +2299,13 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ret = run_delayed_extent_op(trans, root,
ref, extent_op);
- BUG_ON(ret);
kfree(extent_op);
+ if (ret) {
+ printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
+ return ret;
+ }
+
goto next;
}
@@ -2308,11 +2326,16 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ret = run_one_delayed_ref(trans, root, ref, extent_op,
must_insert_reserved);
- BUG_ON(ret);
btrfs_put_delayed_ref(ref);
kfree(extent_op);
count++;
+
+ if (ret) {
+ printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
+ return ret;
+ }
+
next:
do_chunk_alloc(trans, root->fs_info->extent_root,
2 * 1024 * 1024,
@@ -2347,6 +2370,9 @@ static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
* 0, which means to process everything in the tree at the start
* of the run (but not newly added entries), or it can be some target
* number you'd like to process.
+ *
+ * Returns 0 on success or if called with an aborted transaction
+ * Returns <0 on error and aborts the transaction
*/
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count)
@@ -2362,6 +2388,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
unsigned long num_refs = 0;
int consider_waiting;
+ /* We'll clean this up in btrfs_cleanup_transaction */
+ if (trans->aborted)
+ return 0;
+
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
@@ -2419,7 +2449,11 @@ again:
}
ret = run_clustered_refs(trans, root, &cluster);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
count -= min_t(unsigned long, ret, count);
@@ -2584,7 +2618,7 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
ret = -ENOENT;
if (path->slots[0] == 0)
@@ -2738,7 +2772,6 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
}
return 0;
fail:
- BUG();
return ret;
}
@@ -2767,7 +2800,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
if (ret < 0)
goto fail;
- BUG_ON(ret);
+ BUG_ON(ret); /* Corruption */
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -2775,8 +2808,10 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
fail:
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
return ret;
+ }
return 0;
}
@@ -2949,7 +2984,8 @@ again:
if (last == 0) {
err = btrfs_run_delayed_refs(trans, root,
(unsigned long)-1);
- BUG_ON(err);
+ if (err) /* File system offline */
+ goto out;
}
cache = btrfs_lookup_first_block_group(root->fs_info, last);
@@ -2976,7 +3012,9 @@ again:
last = cache->key.objectid + cache->key.offset;
err = write_one_cache_group(trans, root, path, cache);
- BUG_ON(err);
+ if (err) /* File system offline */
+ goto out;
+
btrfs_put_block_group(cache);
}
@@ -2989,7 +3027,8 @@ again:
if (last == 0) {
err = btrfs_run_delayed_refs(trans, root,
(unsigned long)-1);
- BUG_ON(err);
+ if (err) /* File system offline */
+ goto out;
}
cache = btrfs_lookup_first_block_group(root->fs_info, last);
@@ -3014,20 +3053,21 @@ again:
continue;
}
- btrfs_write_out_cache(root, trans, cache, path);
+ err = btrfs_write_out_cache(root, trans, cache, path);
/*
* If we didn't have an error then the cache state is still
* NEED_WRITE, so we can set it to WRITTEN.
*/
- if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+ if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
cache->disk_cache_state = BTRFS_DC_WRITTEN;
last = cache->key.objectid + cache->key.offset;
btrfs_put_block_group(cache);
}
+out:
btrfs_free_path(path);
- return 0;
+ return err;
}
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -3098,11 +3138,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
- u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
-
- /* chunk -> extended profile */
- if (extra_flags == 0)
- extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
if (flags & BTRFS_BLOCK_GROUP_DATA)
fs_info->avail_data_alloc_bits |= extra_flags;
@@ -3113,6 +3150,35 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
/*
+ * returns target flags in extended format or 0 if restripe for this
+ * chunk_type is not in progress
+ */
+static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ u64 target = 0;
+
+ BUG_ON(!mutex_is_locked(&fs_info->volume_mutex) &&
+ !spin_is_locked(&fs_info->balance_lock));
+
+ if (!bctl)
+ return 0;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA &&
+ bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+ bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+ bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+ }
+
+ return target;
+}
+
+/*
* @flags: available profiles in extended format (see ctree.h)
*
* Returns reduced profile in chunk format. If profile changing is in
@@ -3128,31 +3194,19 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
*/
u64 num_devices = root->fs_info->fs_devices->rw_devices +
root->fs_info->fs_devices->missing_devices;
+ u64 target;
- /* pick restriper's target profile if it's available */
+ /*
+ * see if restripe for this chunk_type is in progress, if so
+ * try to reduce to the target profile
+ */
spin_lock(&root->fs_info->balance_lock);
- if (root->fs_info->balance_ctl) {
- struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
- u64 tgt = 0;
-
- if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
- (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (flags & bctl->data.target)) {
- tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
- } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
- (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (flags & bctl->sys.target)) {
- tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
- } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
- (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (flags & bctl->meta.target)) {
- tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
- }
-
- if (tgt) {
+ target = get_restripe_target(root->fs_info, flags);
+ if (target) {
+ /* pick target profile only if it's already available */
+ if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
spin_unlock(&root->fs_info->balance_lock);
- flags = tgt;
- goto out;
+ return extended_to_chunk(target);
}
}
spin_unlock(&root->fs_info->balance_lock);
@@ -3180,10 +3234,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
flags &= ~BTRFS_BLOCK_GROUP_RAID0;
}
-out:
- /* extended -> chunk profile */
- flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
- return flags;
+ return extended_to_chunk(flags);
}
static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
@@ -3405,15 +3456,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
int wait_for_alloc = 0;
int ret = 0;
- BUG_ON(!profile_is_valid(flags, 0));
-
space_info = __find_space_info(extent_root->fs_info, flags);
if (!space_info) {
ret = update_space_info(extent_root->fs_info, flags,
0, 0, &space_info);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
- BUG_ON(!space_info);
+ BUG_ON(!space_info); /* Logic error */
again:
spin_lock(&space_info->lock);
@@ -3678,8 +3727,10 @@ again:
ret = wait_event_interruptible(space_info->wait,
!space_info->flush);
/* Must have been interrupted, return */
- if (ret)
+ if (ret) {
+ printk(KERN_DEBUG "btrfs: %s returning -EINTR\n", __func__);
return -EINTR;
+ }
spin_lock(&space_info->lock);
}
@@ -3836,8 +3887,9 @@ out:
return ret;
}
-static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static struct btrfs_block_rsv *get_block_rsv(
+ const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root)
{
struct btrfs_block_rsv *block_rsv = NULL;
@@ -4204,6 +4256,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
trans->bytes_reserved = 0;
}
+/* Can only return 0 or -ENOSPC */
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode)
{
@@ -4540,7 +4593,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
while (total) {
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache)
- return -1;
+ return -ENOENT;
if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -4643,7 +4696,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
struct btrfs_block_group_cache *cache;
cache = btrfs_lookup_block_group(root->fs_info, bytenr);
- BUG_ON(!cache);
+ BUG_ON(!cache); /* Logic error */
pin_down_extent(root, cache, bytenr, num_bytes, reserved);
@@ -4661,7 +4714,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *cache;
cache = btrfs_lookup_block_group(root->fs_info, bytenr);
- BUG_ON(!cache);
+ BUG_ON(!cache); /* Logic error */
/*
* pull in the free space cache (if any) so that our pin
@@ -4706,6 +4759,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
{
struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
+
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
if (reserve != RESERVE_FREE) {
@@ -4734,7 +4788,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
return ret;
}
-int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4764,7 +4818,6 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
up_write(&fs_info->extent_commit_sem);
update_global_block_rsv(fs_info);
- return 0;
}
static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
@@ -4779,7 +4832,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
if (cache)
btrfs_put_block_group(cache);
cache = btrfs_lookup_block_group(fs_info, start);
- BUG_ON(!cache);
+ BUG_ON(!cache); /* Logic error */
}
len = cache->key.objectid + cache->key.offset - start;
@@ -4816,6 +4869,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
u64 end;
int ret;
+ if (trans->aborted)
+ return 0;
+
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
unpin = &fs_info->freed_extents[1];
else
@@ -4901,7 +4957,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = remove_extent_backref(trans, extent_root, path,
NULL, refs_to_drop,
is_data);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
btrfs_release_path(path);
path->leave_spinning = 1;
@@ -4919,10 +4976,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_print_leaf(extent_root,
path->nodes[0]);
}
- BUG_ON(ret);
+ if (ret < 0)
+ goto abort;
extent_slot = path->slots[0];
}
- } else {
+ } else if (ret == -ENOENT) {
btrfs_print_leaf(extent_root, path->nodes[0]);
WARN_ON(1);
printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
@@ -4932,6 +4990,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(unsigned long long)root_objectid,
(unsigned long long)owner_objectid,
(unsigned long long)owner_offset);
+ } else {
+ goto abort;
}
leaf = path->nodes[0];
@@ -4941,7 +5001,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(found_extent || extent_slot != path->slots[0]);
ret = convert_extent_item_v0(trans, extent_root, path,
owner_objectid, 0);
- BUG_ON(ret < 0);
+ if (ret < 0)
+ goto abort;
btrfs_release_path(path);
path->leave_spinning = 1;
@@ -4958,7 +5019,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(unsigned long long)bytenr);
btrfs_print_leaf(extent_root, path->nodes[0]);
}
- BUG_ON(ret);
+ if (ret < 0)
+ goto abort;
extent_slot = path->slots[0];
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -4995,7 +5057,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = remove_extent_backref(trans, extent_root, path,
iref, refs_to_drop,
is_data);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
}
} else {
if (found_extent) {
@@ -5012,23 +5075,27 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
btrfs_release_path(path);
if (is_data) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
- BUG_ON(ret);
- } else {
- invalidate_mapping_pages(info->btree_inode->i_mapping,
- bytenr >> PAGE_CACHE_SHIFT,
- (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
+ if (ret)
+ goto abort;
}
ret = update_block_group(trans, root, bytenr, num_bytes, 0);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
}
+out:
btrfs_free_path(path);
return ret;
+
+abort:
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
}
/*
@@ -5124,7 +5191,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
parent, root->root_key.objectid,
btrfs_header_level(buf),
BTRFS_DROP_DELAYED_REF, NULL, for_cow);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
if (!last_ref)
@@ -5158,6 +5225,7 @@ out:
btrfs_put_block_group(cache);
}
+/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
u64 owner, u64 offset, int for_cow)
@@ -5179,14 +5247,12 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
num_bytes,
parent, root_objectid, (int)owner,
BTRFS_DROP_DELAYED_REF, NULL, for_cow);
- BUG_ON(ret);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, owner,
offset, BTRFS_DROP_DELAYED_REF,
NULL, for_cow);
- BUG_ON(ret);
}
return ret;
}
@@ -5243,28 +5309,34 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return 0;
}
-static int get_block_group_index(struct btrfs_block_group_cache *cache)
+static int __get_block_group_index(u64 flags)
{
int index;
- if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
+
+ if (flags & BTRFS_BLOCK_GROUP_RAID10)
index = 0;
- else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1)
index = 1;
- else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
+ else if (flags & BTRFS_BLOCK_GROUP_DUP)
index = 2;
- else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
+ else if (flags & BTRFS_BLOCK_GROUP_RAID0)
index = 3;
else
index = 4;
+
return index;
}
+static int get_block_group_index(struct btrfs_block_group_cache *cache)
+{
+ return __get_block_group_index(cache->flags);
+}
+
enum btrfs_loop_type {
- LOOP_FIND_IDEAL = 0,
- LOOP_CACHING_NOWAIT = 1,
- LOOP_CACHING_WAIT = 2,
- LOOP_ALLOC_CHUNK = 3,
- LOOP_NO_EMPTY_SIZE = 4,
+ LOOP_CACHING_NOWAIT = 0,
+ LOOP_CACHING_WAIT = 1,
+ LOOP_ALLOC_CHUNK = 2,
+ LOOP_NO_EMPTY_SIZE = 3,
};
/*
@@ -5278,7 +5350,6 @@ enum btrfs_loop_type {
static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *orig_root,
u64 num_bytes, u64 empty_size,
- u64 search_start, u64 search_end,
u64 hint_byte, struct btrfs_key *ins,
u64 data)
{
@@ -5287,6 +5358,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
struct btrfs_block_group_cache *used_block_group;
+ u64 search_start = 0;
int empty_cluster = 2 * 1024 * 1024;
int allowed_chunk_alloc = 0;
int done_chunk_alloc = 0;
@@ -5300,8 +5372,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
bool failed_alloc = false;
bool use_cluster = true;
bool have_caching_bg = false;
- u64 ideal_cache_percent = 0;
- u64 ideal_cache_offset = 0;
WARN_ON(num_bytes < root->sectorsize);
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -5351,7 +5421,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
empty_cluster = 0;
if (search_start == hint_byte) {
-ideal_cache:
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
used_block_group = block_group;
@@ -5363,8 +5432,7 @@ ideal_cache:
* picked out then we don't care that the block group is cached.
*/
if (block_group && block_group_bits(block_group, data) &&
- (block_group->cached != BTRFS_CACHE_NO ||
- search_start == ideal_cache_offset)) {
+ block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
block_group->ro) {
@@ -5418,44 +5486,13 @@ search:
have_block_group:
cached = block_group_cache_done(block_group);
if (unlikely(!cached)) {
- u64 free_percent;
-
found_uncached_bg = true;
ret = cache_block_group(block_group, trans,
- orig_root, 1);
- if (block_group->cached == BTRFS_CACHE_FINISHED)
- goto alloc;
-
- free_percent = btrfs_block_group_used(&block_group->item);
- free_percent *= 100;
- free_percent = div64_u64(free_percent,
- block_group->key.offset);
- free_percent = 100 - free_percent;
- if (free_percent > ideal_cache_percent &&
- likely(!block_group->ro)) {
- ideal_cache_offset = block_group->key.objectid;
- ideal_cache_percent = free_percent;
- }
-
- /*
- * The caching workers are limited to 2 threads, so we
- * can queue as much work as we care to.
- */
- if (loop > LOOP_FIND_IDEAL) {
- ret = cache_block_group(block_group, trans,
- orig_root, 0);
- BUG_ON(ret);
- }
-
- /*
- * If loop is set for cached only, try the next block
- * group.
- */
- if (loop == LOOP_FIND_IDEAL)
- goto loop;
+ orig_root, 0);
+ BUG_ON(ret < 0);
+ ret = 0;
}
-alloc:
if (unlikely(block_group->ro))
goto loop;
@@ -5606,11 +5643,6 @@ unclustered_alloc:
}
checks:
search_start = stripe_align(root, offset);
- /* move on to the next group */
- if (search_start + num_bytes >= search_end) {
- btrfs_add_free_space(used_block_group, offset, num_bytes);
- goto loop;
- }
/* move on to the next group */
if (search_start + num_bytes >
@@ -5661,9 +5693,7 @@ loop:
if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
goto search;
- /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
- * for them to make caching progress. Also
- * determine the best possible bg to cache
+ /*
* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
* caching kthreads as we move along
* LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
@@ -5673,50 +5703,17 @@ loop:
*/
if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
index = 0;
- if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
- found_uncached_bg = false;
- loop++;
- if (!ideal_cache_percent)
- goto search;
-
- /*
- * 1 of the following 2 things have happened so far
- *
- * 1) We found an ideal block group for caching that
- * is mostly full and will cache quickly, so we might
- * as well wait for it.
- *
- * 2) We searched for cached only and we didn't find
- * anything, and we didn't start any caching kthreads
- * either, so chances are we will loop through and
- * start a couple caching kthreads, and then come back
- * around and just wait for them. This will be slower
- * because we will have 2 caching kthreads reading at
- * the same time when we could have just started one
- * and waited for it to get far enough to give us an
- * allocation, so go ahead and go to the wait caching
- * loop.
- */
- loop = LOOP_CACHING_WAIT;
- search_start = ideal_cache_offset;
- ideal_cache_percent = 0;
- goto ideal_cache;
- } else if (loop == LOOP_FIND_IDEAL) {
- /*
- * Didn't find a uncached bg, wait on anything we find
- * next.
- */
- loop = LOOP_CACHING_WAIT;
- goto search;
- }
-
loop++;
-
if (loop == LOOP_ALLOC_CHUNK) {
if (allowed_chunk_alloc) {
ret = do_chunk_alloc(trans, root, num_bytes +
2 * 1024 * 1024, data,
CHUNK_ALLOC_LIMITED);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans,
+ root, ret);
+ goto out;
+ }
allowed_chunk_alloc = 0;
if (ret == 1)
done_chunk_alloc = 1;
@@ -5745,6 +5742,7 @@ loop:
} else if (ins->objectid) {
ret = 0;
}
+out:
return ret;
}
@@ -5798,12 +5796,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- u64 search_end, struct btrfs_key *ins,
- u64 data)
+ struct btrfs_key *ins, u64 data)
{
bool final_tried = false;
int ret;
- u64 search_start = 0;
data = btrfs_get_alloc_profile(root, data);
again:
@@ -5811,23 +5807,31 @@ again:
* the only place that sets empty_size is btrfs_realloc_node, which
* is not called recursively on allocations
*/
- if (empty_size || root->ref_cows)
+ if (empty_size || root->ref_cows) {
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
num_bytes + 2 * 1024 * 1024, data,
CHUNK_ALLOC_NO_FORCE);
+ if (ret < 0 && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ }
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(trans, root, num_bytes, empty_size,
- search_start, search_end, hint_byte,
- ins, data);
+ hint_byte, ins, data);
if (ret == -ENOSPC) {
if (!final_tried) {
num_bytes = num_bytes >> 1;
num_bytes = num_bytes & ~(root->sectorsize - 1);
num_bytes = max(num_bytes, min_alloc_size);
- do_chunk_alloc(trans, root->fs_info->extent_root,
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
num_bytes, data, CHUNK_ALLOC_FORCE);
+ if (ret < 0 && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
if (num_bytes == min_alloc_size)
final_tried = true;
goto again;
@@ -5838,7 +5842,8 @@ again:
printk(KERN_ERR "btrfs allocation failed flags %llu, "
"wanted %llu\n", (unsigned long long)data,
(unsigned long long)num_bytes);
- dump_space_info(sinfo, num_bytes, 1);
+ if (sinfo)
+ dump_space_info(sinfo, num_bytes, 1);
}
}
@@ -5917,7 +5922,10 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_free_path(path);
+ return ret;
+ }
leaf = path->nodes[0];
extent_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -5947,7 +5955,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
- if (ret) {
+ if (ret) { /* -ENOENT, logic error */
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
(unsigned long long)ins->offset);
@@ -5978,7 +5986,10 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_free_path(path);
+ return ret;
+ }
leaf = path->nodes[0];
extent_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -6008,7 +6019,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
- if (ret) {
+ if (ret) { /* -ENOENT, logic error */
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
(unsigned long long)ins->offset);
@@ -6056,28 +6067,28 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
if (!caching_ctl) {
BUG_ON(!block_group_cache_done(block_group));
ret = btrfs_remove_free_space(block_group, start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
} else {
mutex_lock(&caching_ctl->mutex);
if (start >= caching_ctl->progress) {
ret = add_excluded_extent(root, start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
} else if (start + num_bytes <= caching_ctl->progress) {
ret = btrfs_remove_free_space(block_group,
start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
} else {
num_bytes = caching_ctl->progress - start;
ret = btrfs_remove_free_space(block_group,
start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
start = caching_ctl->progress;
num_bytes = ins->objectid + ins->offset -
caching_ctl->progress;
ret = add_excluded_extent(root, start, num_bytes);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
mutex_unlock(&caching_ctl->mutex);
@@ -6086,7 +6097,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_update_reserved_bytes(block_group, ins->offset,
RESERVE_ALLOC_NO_ACCOUNT);
- BUG_ON(ret);
+ BUG_ON(ret); /* logic error */
btrfs_put_block_group(block_group);
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
@@ -6107,6 +6118,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
clean_tree_block(trans, root, buf);
+ clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
btrfs_set_lock_blocking(buf);
btrfs_set_buffer_uptodate(buf);
@@ -6214,7 +6226,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
return ERR_CAST(block_rsv);
ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
- empty_size, hint, (u64)-1, &ins, 0);
+ empty_size, hint, &ins, 0);
if (ret) {
unuse_block_rsv(root->fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
@@ -6222,7 +6234,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
buf = btrfs_init_new_buffer(trans, root, ins.objectid,
blocksize, level);
- BUG_ON(IS_ERR(buf));
+ BUG_ON(IS_ERR(buf)); /* -ENOMEM */
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
if (parent == 0)
@@ -6234,7 +6246,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_delayed_extent_op *extent_op;
extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
+ BUG_ON(!extent_op); /* -ENOMEM */
if (key)
memcpy(&extent_op->key, key, sizeof(extent_op->key));
else
@@ -6249,7 +6261,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
ins.offset, parent, root_objectid,
level, BTRFS_ADD_DELAYED_EXTENT,
extent_op, for_cow);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
return buf;
}
@@ -6319,7 +6331,9 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
/* We don't lock the tree block, it's OK to be racy here */
ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
&refs, &flags);
- BUG_ON(ret);
+ /* We don't care about errors in readahead. */
+ if (ret < 0)
+ continue;
BUG_ON(refs == 0);
if (wc->stage == DROP_REFERENCE) {
@@ -6386,7 +6400,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
eb->start, eb->len,
&wc->refs[level],
&wc->flags[level]);
- BUG_ON(ret);
+ BUG_ON(ret == -ENOMEM);
+ if (ret)
+ return ret;
BUG_ON(wc->refs[level] == 0);
}
@@ -6405,12 +6421,12 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
if (!(wc->flags[level] & flag)) {
BUG_ON(!path->locks[level]);
ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
eb->len, flag, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
}
@@ -6482,7 +6498,11 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
&wc->refs[level - 1],
&wc->flags[level - 1]);
- BUG_ON(ret);
+ if (ret < 0) {
+ btrfs_tree_unlock(next);
+ return ret;
+ }
+
BUG_ON(wc->refs[level - 1] == 0);
*lookup_info = 0;
@@ -6551,7 +6571,7 @@ skip:
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
root->root_key.objectid, level - 1, 0, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
btrfs_tree_unlock(next);
free_extent_buffer(next);
@@ -6609,7 +6629,10 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
eb->start, eb->len,
&wc->refs[level],
&wc->flags[level]);
- BUG_ON(ret);
+ if (ret < 0) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ return ret;
+ }
BUG_ON(wc->refs[level] == 0);
if (wc->refs[level] == 1) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
@@ -6629,7 +6652,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
else
ret = btrfs_dec_ref(trans, root, eb, 0,
wc->for_reloc);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
/* make block locked assertion in clean_tree_block happy */
if (!path->locks[level] &&
@@ -6738,7 +6761,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
* also make sure backrefs for the shared block and all lower level
* blocks are properly updated.
*/
-void btrfs_drop_snapshot(struct btrfs_root *root,
+int btrfs_drop_snapshot(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, int update_ref,
int for_reloc)
{
@@ -6766,7 +6789,10 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
}
trans = btrfs_start_transaction(tree_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
if (block_rsv)
trans->block_rsv = block_rsv;
@@ -6791,7 +6817,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
path->lowest_level = 0;
if (ret < 0) {
err = ret;
- goto out_free;
+ goto out_end_trans;
}
WARN_ON(ret > 0);
@@ -6811,7 +6837,10 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
path->nodes[level]->len,
&wc->refs[level],
&wc->flags[level]);
- BUG_ON(ret);
+ if (ret < 0) {
+ err = ret;
+ goto out_end_trans;
+ }
BUG_ON(wc->refs[level] == 0);
if (level == root_item->drop_level)
@@ -6862,26 +6891,40 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ err = ret;
+ goto out_end_trans;
+ }
btrfs_end_transaction_throttle(trans, tree_root);
trans = btrfs_start_transaction(tree_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
if (block_rsv)
trans->block_rsv = block_rsv;
}
}
btrfs_release_path(path);
- BUG_ON(err);
+ if (err)
+ goto out_end_trans;
ret = btrfs_del_root(trans, tree_root, &root->root_key);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ goto out_end_trans;
+ }
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
NULL, NULL);
- BUG_ON(ret < 0);
- if (ret > 0) {
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ err = ret;
+ goto out_end_trans;
+ } else if (ret > 0) {
/* if we fail to delete the orphan item this time
* around, it'll get picked up the next time.
*
@@ -6899,14 +6942,15 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
free_extent_buffer(root->commit_root);
kfree(root);
}
-out_free:
+out_end_trans:
btrfs_end_transaction_throttle(trans, tree_root);
+out_free:
kfree(wc);
btrfs_free_path(path);
out:
if (err)
btrfs_std_error(root->fs_info, err);
- return;
+ return err;
}
/*
@@ -6983,31 +7027,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
{
u64 num_devices;
- u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
-
- if (root->fs_info->balance_ctl) {
- struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
- u64 tgt = 0;
-
- /* pick restriper's target profile and return */
- if (flags & BTRFS_BLOCK_GROUP_DATA &&
- bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
- tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
- } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
- bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
- tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
- } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
- bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
- tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
- }
+ u64 stripped;
- if (tgt) {
- /* extended -> chunk profile */
- tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
- return tgt;
- }
- }
+ /*
+ * if restripe for this chunk_type is on pick target profile and
+ * return, otherwise do the usual balance
+ */
+ stripped = get_restripe_target(root->fs_info, flags);
+ if (stripped)
+ return extended_to_chunk(stripped);
/*
* we add in the count of missing devices because we want
@@ -7017,6 +7045,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
num_devices = root->fs_info->fs_devices->rw_devices +
root->fs_info->fs_devices->missing_devices;
+ stripped = BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+
if (num_devices == 1) {
stripped |= BTRFS_BLOCK_GROUP_DUP;
stripped = flags & ~stripped;
@@ -7029,7 +7060,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
return stripped | BTRFS_BLOCK_GROUP_DUP;
- return flags;
} else {
/* they already had raid on here, just return */
if (flags & stripped)
@@ -7042,9 +7072,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
if (flags & BTRFS_BLOCK_GROUP_DUP)
return stripped | BTRFS_BLOCK_GROUP_RAID1;
- /* turn single device chunks into raid0 */
- return stripped | BTRFS_BLOCK_GROUP_RAID0;
+ /* this is drive concat, leave it alone */
}
+
return flags;
}
@@ -7103,12 +7133,16 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
BUG_ON(cache->ro);
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
alloc_flags = update_block_group_flags(root, cache->flags);
- if (alloc_flags != cache->flags)
- do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
- CHUNK_ALLOC_FORCE);
+ if (alloc_flags != cache->flags) {
+ ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ CHUNK_ALLOC_FORCE);
+ if (ret < 0)
+ goto out;
+ }
ret = set_block_group_ro(cache, 0);
if (!ret)
@@ -7188,7 +7222,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
-int btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_set_block_group_rw(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
struct btrfs_space_info *sinfo = cache->space_info;
@@ -7204,7 +7238,6 @@ int btrfs_set_block_group_rw(struct btrfs_root *root,
cache->ro = 0;
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
- return 0;
}
/*
@@ -7222,6 +7255,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
u64 min_free;
u64 dev_min = 1;
u64 dev_nr = 0;
+ u64 target;
int index;
int full = 0;
int ret = 0;
@@ -7262,13 +7296,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
/*
* ok we don't have enough space, but maybe we have free space on our
* devices to allocate new chunks for relocation, so loop through our
- * alloc devices and guess if we have enough space. However, if we
- * were marked as full, then we know there aren't enough chunks, and we
- * can just return.
+ * alloc devices and guess if we have enough space. if this block
+ * group is going to be restriped, run checks against the target
+ * profile instead of the current one.
*/
ret = -1;
- if (full)
- goto out;
/*
* index:
@@ -7278,7 +7310,20 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* 3: raid0
* 4: single
*/
- index = get_block_group_index(block_group);
+ target = get_restripe_target(root->fs_info, block_group->flags);
+ if (target) {
+ index = __get_block_group_index(extended_to_chunk(target));
+ } else {
+ /*
+ * this is just a balance, so if we were marked as full
+ * we know there is no space for a new chunk
+ */
+ if (full)
+ goto out;
+
+ index = get_block_group_index(block_group);
+ }
+
if (index == 0) {
dev_min = 4;
/* Divide by 2 */
@@ -7572,7 +7617,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
ret = update_space_info(info, cache->flags, found_key.offset,
btrfs_block_group_used(&cache->item),
&space_info);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
cache->space_info = space_info;
spin_lock(&cache->space_info->lock);
cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7581,7 +7626,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
__link_block_group(space_info, cache);
ret = btrfs_add_block_group_cache(root->fs_info, cache);
- BUG_ON(ret);
+ BUG_ON(ret); /* Logic error */
set_avail_alloc_bits(root->fs_info, cache->flags);
if (btrfs_chunk_readonly(root, cache->key.objectid))
@@ -7663,7 +7708,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
update_global_block_rsv(root->fs_info);
spin_lock(&cache->space_info->lock);
@@ -7673,11 +7718,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
__link_block_group(cache->space_info, cache);
ret = btrfs_add_block_group_cache(root->fs_info, cache);
- BUG_ON(ret);
+ BUG_ON(ret); /* Logic error */
ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
sizeof(cache->item));
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ return ret;
+ }
set_avail_alloc_bits(extent_root->fs_info, type);
@@ -7686,11 +7734,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
- u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
-
- /* chunk -> extended profile */
- if (extra_flags == 0)
- extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
if (flags & BTRFS_BLOCK_GROUP_DATA)
fs_info->avail_data_alloc_bits &= ~extra_flags;
@@ -7758,7 +7803,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
inode = lookup_free_space_inode(tree_root, block_group, path);
if (!IS_ERR(inode)) {
ret = btrfs_orphan_add(trans, inode);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_add_delayed_iput(inode);
+ goto out;
+ }
clear_nlink(inode);
/* One for the block groups ref */
spin_lock(&block_group->lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a55fbe6252de..0c3ec003f273 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -19,6 +19,7 @@
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
+#include "locking.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -53,6 +54,13 @@ struct extent_page_data {
unsigned int sync_io:1;
};
+static noinline void flush_write_bio(void *data);
+static inline struct btrfs_fs_info *
+tree_fs_info(struct extent_io_tree *tree)
+{
+ return btrfs_sb(tree->mapping->host->i_sb);
+}
+
int __init extent_io_init(void)
{
extent_state_cache = kmem_cache_create("extent_state",
@@ -136,6 +144,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
#endif
atomic_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
+ trace_alloc_extent_state(state, mask, _RET_IP_);
return state;
}
@@ -153,6 +162,7 @@ void free_extent_state(struct extent_state *state)
list_del(&state->leak_list);
spin_unlock_irqrestore(&leak_lock, flags);
#endif
+ trace_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
}
}
@@ -439,6 +449,13 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
return prealloc;
}
+void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+ btrfs_panic(tree_fs_info(tree), err, "Locking error: "
+ "Extent tree was modified by another "
+ "thread while locked.");
+}
+
/*
* clear some bits on a range in the tree. This may require splitting
* or inserting elements in the tree, so the gfp mask is used to
@@ -449,8 +466,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
*
* the range [start, end] is inclusive.
*
- * This takes the tree lock, and returns < 0 on error, > 0 if any of the
- * bits were already set, or zero if none of the bits were already set.
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
*/
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bits, int wake, int delete,
@@ -464,7 +480,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct rb_node *node;
u64 last_end;
int err;
- int set = 0;
int clear = 0;
if (delete)
@@ -542,12 +557,14 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
prealloc = NULL;
if (err)
goto out;
if (state->end <= end) {
- set |= clear_state_bit(tree, state, &bits, wake);
+ clear_state_bit(tree, state, &bits, wake);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -564,17 +581,19 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, end + 1);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
if (wake)
wake_up(&state->wq);
- set |= clear_state_bit(tree, prealloc, &bits, wake);
+ clear_state_bit(tree, prealloc, &bits, wake);
prealloc = NULL;
goto out;
}
- set |= clear_state_bit(tree, state, &bits, wake);
+ clear_state_bit(tree, state, &bits, wake);
next:
if (last_end == (u64)-1)
goto out;
@@ -591,7 +610,7 @@ out:
if (prealloc)
free_extent_state(prealloc);
- return set;
+ return 0;
search_again:
if (start > end)
@@ -602,8 +621,8 @@ search_again:
goto again;
}
-static int wait_on_state(struct extent_io_tree *tree,
- struct extent_state *state)
+static void wait_on_state(struct extent_io_tree *tree,
+ struct extent_state *state)
__releases(tree->lock)
__acquires(tree->lock)
{
@@ -613,7 +632,6 @@ static int wait_on_state(struct extent_io_tree *tree,
schedule();
spin_lock(&tree->lock);
finish_wait(&state->wq, &wait);
- return 0;
}
/*
@@ -621,7 +639,7 @@ static int wait_on_state(struct extent_io_tree *tree,
* The range [start, end] is inclusive.
* The tree lock is taken by this function
*/
-int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
{
struct extent_state *state;
struct rb_node *node;
@@ -658,7 +676,6 @@ again:
}
out:
spin_unlock(&tree->lock);
- return 0;
}
static void set_state_bits(struct extent_io_tree *tree,
@@ -706,9 +723,10 @@ static void uncache_state(struct extent_state **cached_ptr)
* [start, end] is inclusive This takes the tree lock.
*/
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask)
+static int __must_check
+__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -742,8 +760,10 @@ again:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = insert_state(tree, prealloc, start, end, &bits);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
prealloc = NULL;
- BUG_ON(err == -EEXIST);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
@@ -809,7 +829,9 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
prealloc = NULL;
if (err)
goto out;
@@ -846,12 +868,9 @@ hit_next:
*/
err = insert_state(tree, prealloc, start, this_end,
&bits);
- BUG_ON(err == -EEXIST);
- if (err) {
- free_extent_state(prealloc);
- prealloc = NULL;
- goto out;
- }
+ if (err)
+ extent_io_tree_panic(tree, err);
+
cache_state(prealloc, cached_state);
prealloc = NULL;
start = this_end + 1;
@@ -873,7 +892,8 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, end + 1);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
set_state_bits(tree, prealloc, &bits);
cache_state(prealloc, cached_state);
@@ -900,6 +920,15 @@ search_again:
goto again;
}
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
+ u64 *failed_start, struct extent_state **cached_state,
+ gfp_t mask)
+{
+ return __set_extent_bit(tree, start, end, bits, 0, failed_start,
+ cached_state, mask);
+}
+
+
/**
* convert_extent - convert all bits in a given range from one bit to another
* @tree: the io tree to search
@@ -946,7 +975,8 @@ again:
}
err = insert_state(tree, prealloc, start, end, &bits);
prealloc = NULL;
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
@@ -1002,7 +1032,8 @@ hit_next:
goto out;
}
err = split_state(tree, state, prealloc, start);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
prealloc = NULL;
if (err)
goto out;
@@ -1041,12 +1072,8 @@ hit_next:
*/
err = insert_state(tree, prealloc, start, this_end,
&bits);
- BUG_ON(err == -EEXIST);
- if (err) {
- free_extent_state(prealloc);
- prealloc = NULL;
- goto out;
- }
+ if (err)
+ extent_io_tree_panic(tree, err);
prealloc = NULL;
start = this_end + 1;
goto search_again;
@@ -1065,7 +1092,8 @@ hit_next:
}
err = split_state(tree, state, prealloc, end + 1);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
set_state_bits(tree, prealloc, &bits);
clear_state_bit(tree, prealloc, &clear_bits, 0);
@@ -1095,14 +1123,14 @@ search_again:
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
NULL, mask);
}
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask)
{
- return set_extent_bit(tree, start, end, bits, 0, NULL,
+ return set_extent_bit(tree, start, end, bits, NULL,
NULL, mask);
}
@@ -1117,7 +1145,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE,
- 0, NULL, cached_state, mask);
+ NULL, cached_state, mask);
}
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1131,7 +1159,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
NULL, mask);
}
@@ -1139,7 +1167,7 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
- NULL, cached_state, mask);
+ cached_state, mask);
}
static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -1155,42 +1183,40 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
* us if waiting is desired.
*/
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, struct extent_state **cached_state, gfp_t mask)
+ int bits, struct extent_state **cached_state)
{
int err;
u64 failed_start;
while (1) {
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
- EXTENT_LOCKED, &failed_start,
- cached_state, mask);
- if (err == -EEXIST && (mask & __GFP_WAIT)) {
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+ EXTENT_LOCKED, &failed_start,
+ cached_state, GFP_NOFS);
+ if (err == -EEXIST) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
- } else {
+ } else
break;
- }
WARN_ON(start > end);
}
return err;
}
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
- return lock_extent_bits(tree, start, end, 0, NULL, mask);
+ return lock_extent_bits(tree, start, end, 0, NULL);
}
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
int err;
u64 failed_start;
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, mask);
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+ &failed_start, NULL, GFP_NOFS);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, 1, 0, NULL, mask);
+ EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
return 0;
}
return 1;
@@ -1203,10 +1229,10 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
mask);
}
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
- mask);
+ GFP_NOFS);
}
/*
@@ -1220,7 +1246,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
while (index <= end_index) {
page = find_get_page(tree->mapping, index);
- BUG_ON(!page);
+ BUG_ON(!page); /* Pages should be in the extent_io_tree */
set_page_writeback(page);
page_cache_release(page);
index++;
@@ -1343,9 +1369,9 @@ out:
return found;
}
-static noinline int __unlock_for_delalloc(struct inode *inode,
- struct page *locked_page,
- u64 start, u64 end)
+static noinline void __unlock_for_delalloc(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end)
{
int ret;
struct page *pages[16];
@@ -1355,7 +1381,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
int i;
if (index == locked_page->index && end_index == index)
- return 0;
+ return;
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
@@ -1370,7 +1396,6 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
index += ret;
cond_resched();
}
- return 0;
}
static noinline int lock_delalloc_pages(struct inode *inode,
@@ -1500,11 +1525,10 @@ again:
goto out_failed;
}
}
- BUG_ON(ret);
+ BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
/* step three, lock the state bits for the whole range */
- lock_extent_bits(tree, delalloc_start, delalloc_end,
- 0, &cached_state, GFP_NOFS);
+ lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
@@ -1761,39 +1785,34 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
* helper function to set a given page up to date if all the
* extents in the tree for that page are up to date
*/
-static int check_page_uptodate(struct extent_io_tree *tree,
- struct page *page)
+static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
SetPageUptodate(page);
- return 0;
}
/*
* helper function to unlock a page if all the extents in the tree
* for that page are unlocked
*/
-static int check_page_locked(struct extent_io_tree *tree,
- struct page *page)
+static void check_page_locked(struct extent_io_tree *tree, struct page *page)
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
unlock_page(page);
- return 0;
}
/*
* helper function to end page writeback if all the extents
* in the tree for that page are done with writeback
*/
-static int check_page_writeback(struct extent_io_tree *tree,
- struct page *page)
+static void check_page_writeback(struct extent_io_tree *tree,
+ struct page *page)
{
end_page_writeback(page);
- return 0;
}
/*
@@ -1912,6 +1931,26 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
return 0;
}
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+ int mirror_num)
+{
+ struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ u64 start = eb->start;
+ unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
+ int ret;
+
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+ ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
+ start, p, mirror_num);
+ if (ret)
+ break;
+ start += PAGE_CACHE_SIZE;
+ }
+
+ return ret;
+}
+
/*
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
@@ -2258,6 +2297,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
u64 start;
u64 end;
int whole_page;
+ int failed_mirror;
int ret;
if (err)
@@ -2304,9 +2344,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
else
clean_io_failure(start, page);
}
- if (!uptodate) {
- int failed_mirror;
+
+ if (!uptodate)
failed_mirror = (int)(unsigned long)bio->bi_bdev;
+
+ if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
+ ret = tree->ops->readpage_io_failed_hook(page, failed_mirror);
+ if (!ret && !err &&
+ test_bit(BIO_UPTODATE, &bio->bi_flags))
+ uptodate = 1;
+ } else if (!uptodate) {
/*
* The generic bio_readpage_error handles errors the
* following way: If possible, new read requests are
@@ -2320,7 +2367,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
ret = bio_readpage_error(bio, page, start, end,
failed_mirror, NULL);
if (ret == 0) {
-error_handled:
uptodate =
test_bit(BIO_UPTODATE, &bio->bi_flags);
if (err)
@@ -2328,16 +2374,9 @@ error_handled:
uncache_state(&cached);
continue;
}
- if (tree->ops && tree->ops->readpage_io_failed_hook) {
- ret = tree->ops->readpage_io_failed_hook(
- bio, page, start, end,
- failed_mirror, state);
- if (ret == 0)
- goto error_handled;
- }
}
- if (uptodate) {
+ if (uptodate && tree->track_uptodate) {
set_extent_uptodate(tree, start, end, &cached,
GFP_ATOMIC);
}
@@ -2386,8 +2425,12 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
return bio;
}
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+/*
+ * Since writes are async, they will only return -ENOMEM.
+ * Reads can return the full range of I/O error conditions.
+ */
+static int __must_check submit_one_bio(int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
int ret = 0;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -2413,6 +2456,19 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
return ret;
}
+static int merge_bio(struct extent_io_tree *tree, struct page *page,
+ unsigned long offset, size_t size, struct bio *bio,
+ unsigned long bio_flags)
+{
+ int ret = 0;
+ if (tree->ops && tree->ops->merge_bio_hook)
+ ret = tree->ops->merge_bio_hook(page, offset, size, bio,
+ bio_flags);
+ BUG_ON(ret < 0);
+ return ret;
+
+}
+
static int submit_extent_page(int rw, struct extent_io_tree *tree,
struct page *page, sector_t sector,
size_t size, unsigned long offset,
@@ -2441,12 +2497,12 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
sector;
if (prev_bio_flags != bio_flags || !contig ||
- (tree->ops && tree->ops->merge_bio_hook &&
- tree->ops->merge_bio_hook(page, offset, page_size, bio,
- bio_flags)) ||
+ merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
bio_add_page(bio, page, page_size, offset) < page_size) {
ret = submit_one_bio(rw, bio, mirror_num,
prev_bio_flags);
+ if (ret < 0)
+ return ret;
bio = NULL;
} else {
return 0;
@@ -2473,25 +2529,31 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
return ret;
}
-void set_page_extent_mapped(struct page *page)
+void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
{
if (!PagePrivate(page)) {
SetPagePrivate(page);
page_cache_get(page);
- set_page_private(page, EXTENT_PAGE_PRIVATE);
+ set_page_private(page, (unsigned long)eb);
+ } else {
+ WARN_ON(page->private != (unsigned long)eb);
}
}
-static void set_page_extent_head(struct page *page, unsigned long len)
+void set_page_extent_mapped(struct page *page)
{
- WARN_ON(!PagePrivate(page));
- set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ set_page_private(page, EXTENT_PAGE_PRIVATE);
+ }
}
/*
* basic readpage implementation. Locked extent state structs are inserted
* into the tree that are removed when the IO is done (by the end_io
* handlers)
+ * XXX JDM: This needs looking at to ensure proper page locking
*/
static int __extent_read_full_page(struct extent_io_tree *tree,
struct page *page,
@@ -2531,11 +2593,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
end = page_end;
while (1) {
- lock_extent(tree, start, end, GFP_NOFS);
+ lock_extent(tree, start, end);
ordered = btrfs_lookup_ordered_extent(inode, start);
if (!ordered)
break;
- unlock_extent(tree, start, end, GFP_NOFS);
+ unlock_extent(tree, start, end);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
}
@@ -2572,7 +2634,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
end - cur + 1, 0);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
- unlock_extent(tree, cur, end, GFP_NOFS);
+ unlock_extent(tree, cur, end);
break;
}
extent_offset = cur - em->start;
@@ -2624,7 +2686,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
if (test_range_bit(tree, cur, cur_end,
EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
- unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ unlock_extent(tree, cur, cur + iosize - 1);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -2634,7 +2696,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
*/
if (block_start == EXTENT_MAP_INLINE) {
SetPageError(page);
- unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ unlock_extent(tree, cur, cur + iosize - 1);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -2654,6 +2716,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
end_bio_extent_readpage, mirror_num,
*bio_flags,
this_bio_flag);
+ BUG_ON(ret == -ENOMEM);
nr++;
*bio_flags = this_bio_flag;
}
@@ -2795,7 +2858,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
delalloc_end,
&page_started,
&nr_written);
- BUG_ON(ret);
+ /* File system has been set read-only */
+ if (ret) {
+ SetPageError(page);
+ goto done;
+ }
/*
* delalloc_end is already one less than the total
* length, so we don't subtract one from
@@ -2968,6 +3035,275 @@ done_unlocked:
return 0;
}
+static int eb_wait(void *word)
+{
+ io_schedule();
+ return 0;
+}
+
+static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static int lock_extent_buffer_for_io(struct extent_buffer *eb,
+ struct btrfs_fs_info *fs_info,
+ struct extent_page_data *epd)
+{
+ unsigned long i, num_pages;
+ int flush = 0;
+ int ret = 0;
+
+ if (!btrfs_try_tree_write_lock(eb)) {
+ flush = 1;
+ flush_write_bio(epd);
+ btrfs_tree_lock(eb);
+ }
+
+ if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+ btrfs_tree_unlock(eb);
+ if (!epd->sync_io)
+ return 0;
+ if (!flush) {
+ flush_write_bio(epd);
+ flush = 1;
+ }
+ while (1) {
+ wait_on_extent_buffer_writeback(eb);
+ btrfs_tree_lock(eb);
+ if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
+ break;
+ btrfs_tree_unlock(eb);
+ }
+ }
+
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+ spin_lock(&fs_info->delalloc_lock);
+ if (fs_info->dirty_metadata_bytes >= eb->len)
+ fs_info->dirty_metadata_bytes -= eb->len;
+ else
+ WARN_ON(1);
+ spin_unlock(&fs_info->delalloc_lock);
+ ret = 1;
+ }
+
+ btrfs_tree_unlock(eb);
+
+ if (!ret)
+ return ret;
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+
+ if (!trylock_page(p)) {
+ if (!flush) {
+ flush_write_bio(epd);
+ flush = 1;
+ }
+ lock_page(p);
+ }
+ }
+
+ return ret;
+}
+
+static void end_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+}
+
+static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+{
+ int uptodate = err == 0;
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct extent_buffer *eb;
+ int done;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ bvec--;
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+ done = atomic_dec_and_test(&eb->io_pages);
+
+ if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+ set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+
+ end_page_writeback(page);
+
+ if (!done)
+ continue;
+
+ end_extent_buffer_writeback(eb);
+ } while (bvec >= bio->bi_io_vec);
+
+ bio_put(bio);
+
+}
+
+static int write_one_eb(struct extent_buffer *eb,
+ struct btrfs_fs_info *fs_info,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct block_device *bdev = fs_info->fs_devices->latest_bdev;
+ u64 offset = eb->start;
+ unsigned long i, num_pages;
+ int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
+ int ret;
+
+ clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ num_pages = num_extent_pages(eb->start, eb->len);
+ atomic_set(&eb->io_pages, num_pages);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+
+ clear_page_dirty_for_io(p);
+ set_page_writeback(p);
+ ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
+ PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
+ -1, end_bio_extent_buffer_writepage,
+ 0, 0, 0);
+ if (ret) {
+ set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ SetPageError(p);
+ if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
+ end_extent_buffer_writeback(eb);
+ ret = -EIO;
+ break;
+ }
+ offset += PAGE_CACHE_SIZE;
+ update_nr_written(p, wbc, 1);
+ unlock_page(p);
+ }
+
+ if (unlikely(ret)) {
+ for (; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+ unlock_page(p);
+ }
+ }
+
+ return ret;
+}
+
+int btree_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
+ struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
+ struct extent_buffer *eb, *prev_eb = NULL;
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .extent_locked = 0,
+ .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ };
+ int ret = 0;
+ int done = 0;
+ int nr_to_write_done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ int scanned = 0;
+ int tag;
+
+ pagevec_init(&pvec, 0);
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ scanned = 1;
+ }
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag_pages_for_writeback(mapping, index, end);
+ while (!done && !nr_to_write_done && (index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ unsigned i;
+
+ scanned = 1;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (!PagePrivate(page))
+ continue;
+
+ if (!wbc->range_cyclic && page->index > end) {
+ done = 1;
+ break;
+ }
+
+ eb = (struct extent_buffer *)page->private;
+ if (!eb) {
+ WARN_ON(1);
+ continue;
+ }
+
+ if (eb == prev_eb)
+ continue;
+
+ if (!atomic_inc_not_zero(&eb->refs)) {
+ WARN_ON(1);
+ continue;
+ }
+
+ prev_eb = eb;
+ ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
+ if (!ret) {
+ free_extent_buffer(eb);
+ continue;
+ }
+
+ ret = write_one_eb(eb, fs_info, wbc, &epd);
+ if (ret) {
+ done = 1;
+ free_extent_buffer(eb);
+ break;
+ }
+ free_extent_buffer(eb);
+
+ /*
+ * the filesystem may choose to bump up nr_to_write.
+ * We have to make sure to honor the new nr_to_write
+ * at any time
+ */
+ nr_to_write_done = wbc->nr_to_write <= 0;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ if (!scanned && !done) {
+ /*
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ scanned = 1;
+ index = 0;
+ goto retry;
+ }
+ flush_write_bio(&epd);
+ return ret;
+}
+
/**
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
* @mapping: address space structure to write
@@ -3099,10 +3435,14 @@ retry:
static void flush_epd_write_bio(struct extent_page_data *epd)
{
if (epd->bio) {
+ int rw = WRITE;
+ int ret;
+
if (epd->sync_io)
- submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
- else
- submit_one_bio(WRITE, epd->bio, 0, 0);
+ rw = WRITE_SYNC;
+
+ ret = submit_one_bio(rw, epd->bio, 0, 0);
+ BUG_ON(ret < 0); /* -ENOMEM */
epd->bio = NULL;
}
}
@@ -3219,7 +3559,7 @@ int extent_readpages(struct extent_io_tree *tree,
}
BUG_ON(!list_empty(pages));
if (bio)
- submit_one_bio(READ, bio, 0, bio_flags);
+ return submit_one_bio(READ, bio, 0, bio_flags);
return 0;
}
@@ -3240,7 +3580,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
if (start > end)
return 0;
- lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
+ lock_extent_bits(tree, start, end, 0, &cached_state);
wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@ -3454,7 +3794,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent,
get_extent);
@@ -3548,26 +3888,7 @@ out:
inline struct page *extent_buffer_page(struct extent_buffer *eb,
unsigned long i)
{
- struct page *p;
- struct address_space *mapping;
-
- if (i == 0)
- return eb->first_page;
- i += eb->start >> PAGE_CACHE_SHIFT;
- mapping = eb->first_page->mapping;
- if (!mapping)
- return NULL;
-
- /*
- * extent_buffer_page is only called after pinning the page
- * by increasing the reference count. So we know the page must
- * be in the radix tree.
- */
- rcu_read_lock();
- p = radix_tree_lookup(&mapping->page_tree, i);
- rcu_read_unlock();
-
- return p;
+ return eb->pages[i];
}
inline unsigned long num_extent_pages(u64 start, u64 len)
@@ -3576,6 +3897,19 @@ inline unsigned long num_extent_pages(u64 start, u64 len)
(start >> PAGE_CACHE_SHIFT);
}
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+#if LEAK_DEBUG
+ unsigned long flags;
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(&eb->leak_list);
+ spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+ if (eb->pages && eb->pages != eb->inline_pages)
+ kfree(eb->pages);
+ kmem_cache_free(extent_buffer_cache, eb);
+}
+
static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
u64 start,
unsigned long len,
@@ -3591,6 +3925,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
return NULL;
eb->start = start;
eb->len = len;
+ eb->tree = tree;
rwlock_init(&eb->lock);
atomic_set(&eb->write_locks, 0);
atomic_set(&eb->read_locks, 0);
@@ -3607,20 +3942,32 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
list_add(&eb->leak_list, &buffers);
spin_unlock_irqrestore(&leak_lock, flags);
#endif
+ spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
+ atomic_set(&eb->io_pages, 0);
+
+ if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
+ struct page **pages;
+ int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ pages = kzalloc(num_pages, mask);
+ if (!pages) {
+ __free_extent_buffer(eb);
+ return NULL;
+ }
+ eb->pages = pages;
+ } else {
+ eb->pages = eb->inline_pages;
+ }
return eb;
}
-static void __free_extent_buffer(struct extent_buffer *eb)
+static int extent_buffer_under_io(struct extent_buffer *eb)
{
-#if LEAK_DEBUG
- unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
- list_del(&eb->leak_list);
- spin_unlock_irqrestore(&leak_lock, flags);
-#endif
- kmem_cache_free(extent_buffer_cache, eb);
+ return (atomic_read(&eb->io_pages) ||
+ test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+ test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
/*
@@ -3632,8 +3979,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
unsigned long index;
struct page *page;
- if (!eb->first_page)
- return;
+ BUG_ON(extent_buffer_under_io(eb));
index = num_extent_pages(eb->start, eb->len);
if (start_idx >= index)
@@ -3642,8 +3988,34 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
do {
index--;
page = extent_buffer_page(eb, index);
- if (page)
+ if (page) {
+ spin_lock(&page->mapping->private_lock);
+ /*
+ * We do this since we'll remove the pages after we've
+ * removed the eb from the radix tree, so we could race
+ * and have this page now attached to the new eb. So
+ * only clear page_private if it's still connected to
+ * this eb.
+ */
+ if (PagePrivate(page) &&
+ page->private == (unsigned long)eb) {
+ BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(PageDirty(page));
+ BUG_ON(PageWriteback(page));
+ /*
+ * We need to make sure we haven't be attached
+ * to a new eb.
+ */
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ /* One for the page private */
+ page_cache_release(page);
+ }
+ spin_unlock(&page->mapping->private_lock);
+
+ /* One for when we alloced the page */
page_cache_release(page);
+ }
} while (index != start_idx);
}
@@ -3656,9 +4028,50 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
__free_extent_buffer(eb);
}
+static void check_buffer_tree_ref(struct extent_buffer *eb)
+{
+ /* the ref bit is tricky. We have to make sure it is set
+ * if we have the buffer dirty. Otherwise the
+ * code to free a buffer can end up dropping a dirty
+ * page
+ *
+ * Once the ref bit is set, it won't go away while the
+ * buffer is dirty or in writeback, and it also won't
+ * go away while we have the reference count on the
+ * eb bumped.
+ *
+ * We can't just set the ref bit without bumping the
+ * ref on the eb because free_extent_buffer might
+ * see the ref bit and try to clear it. If this happens
+ * free_extent_buffer might end up dropping our original
+ * ref by mistake and freeing the page before we are able
+ * to add one more ref.
+ *
+ * So bump the ref count first, then set the bit. If someone
+ * beat us to it, drop the ref we added.
+ */
+ if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ atomic_inc(&eb->refs);
+ if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+ }
+}
+
+static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+{
+ unsigned long num_pages, i;
+
+ check_buffer_tree_ref(eb);
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+ mark_page_accessed(p);
+ }
+}
+
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
- u64 start, unsigned long len,
- struct page *page0)
+ u64 start, unsigned long len)
{
unsigned long num_pages = num_extent_pages(start, len);
unsigned long i;
@@ -3674,7 +4087,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
- mark_page_accessed(eb->first_page);
+ mark_extent_buffer_accessed(eb);
return eb;
}
rcu_read_unlock();
@@ -3683,32 +4096,43 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
if (!eb)
return NULL;
- if (page0) {
- eb->first_page = page0;
- i = 1;
- index++;
- page_cache_get(page0);
- mark_page_accessed(page0);
- set_page_extent_mapped(page0);
- set_page_extent_head(page0, len);
- uptodate = PageUptodate(page0);
- } else {
- i = 0;
- }
- for (; i < num_pages; i++, index++) {
+ for (i = 0; i < num_pages; i++, index++) {
p = find_or_create_page(mapping, index, GFP_NOFS);
if (!p) {
WARN_ON(1);
goto free_eb;
}
- set_page_extent_mapped(p);
- mark_page_accessed(p);
- if (i == 0) {
- eb->first_page = p;
- set_page_extent_head(p, len);
- } else {
- set_page_private(p, EXTENT_PAGE_PRIVATE);
+
+ spin_lock(&mapping->private_lock);
+ if (PagePrivate(p)) {
+ /*
+ * We could have already allocated an eb for this page
+ * and attached one so lets see if we can get a ref on
+ * the existing eb, and if we can we know it's good and
+ * we can just return that one, else we know we can just
+ * overwrite page->private.
+ */
+ exists = (struct extent_buffer *)p->private;
+ if (atomic_inc_not_zero(&exists->refs)) {
+ spin_unlock(&mapping->private_lock);
+ unlock_page(p);
+ mark_extent_buffer_accessed(exists);
+ goto free_eb;
+ }
+
+ /*
+ * Do this so attach doesn't complain and we need to
+ * drop the ref the old guy had.
+ */
+ ClearPagePrivate(p);
+ WARN_ON(PageDirty(p));
+ page_cache_release(p);
}
+ attach_extent_buffer_page(eb, p);
+ spin_unlock(&mapping->private_lock);
+ WARN_ON(PageDirty(p));
+ mark_page_accessed(p);
+ eb->pages[i] = p;
if (!PageUptodate(p))
uptodate = 0;
@@ -3716,12 +4140,10 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
* see below about how we avoid a nasty race with release page
* and why we unlock later
*/
- if (i != 0)
- unlock_page(p);
}
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
+again:
ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
if (ret)
goto free_eb;
@@ -3731,14 +4153,21 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
if (ret == -EEXIST) {
exists = radix_tree_lookup(&tree->buffer,
start >> PAGE_CACHE_SHIFT);
- /* add one reference for the caller */
- atomic_inc(&exists->refs);
+ if (!atomic_inc_not_zero(&exists->refs)) {
+ spin_unlock(&tree->buffer_lock);
+ radix_tree_preload_end();
+ exists = NULL;
+ goto again;
+ }
spin_unlock(&tree->buffer_lock);
radix_tree_preload_end();
+ mark_extent_buffer_accessed(exists);
goto free_eb;
}
/* add one reference for the tree */
- atomic_inc(&eb->refs);
+ spin_lock(&eb->refs_lock);
+ check_buffer_tree_ref(eb);
+ spin_unlock(&eb->refs_lock);
spin_unlock(&tree->buffer_lock);
radix_tree_preload_end();
@@ -3751,15 +4180,20 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
* after the extent buffer is in the radix tree so
* it doesn't get lost
*/
- set_page_extent_mapped(eb->first_page);
- set_page_extent_head(eb->first_page, eb->len);
- if (!page0)
- unlock_page(eb->first_page);
+ SetPageChecked(eb->pages[0]);
+ for (i = 1; i < num_pages; i++) {
+ p = extent_buffer_page(eb, i);
+ ClearPageChecked(p);
+ unlock_page(p);
+ }
+ unlock_page(eb->pages[0]);
return eb;
free_eb:
- if (eb->first_page && !page0)
- unlock_page(eb->first_page);
+ for (i = 0; i < num_pages; i++) {
+ if (eb->pages[i])
+ unlock_page(eb->pages[i]);
+ }
if (!atomic_dec_and_test(&eb->refs))
return exists;
@@ -3776,7 +4210,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
- mark_page_accessed(eb->first_page);
+ mark_extent_buffer_accessed(eb);
return eb;
}
rcu_read_unlock();
@@ -3784,19 +4218,71 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
return NULL;
}
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+ struct extent_buffer *eb =
+ container_of(head, struct extent_buffer, rcu_head);
+
+ __free_extent_buffer(eb);
+}
+
+/* Expects to have eb->eb_lock already held */
+static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
+{
+ WARN_ON(atomic_read(&eb->refs) == 0);
+ if (atomic_dec_and_test(&eb->refs)) {
+ struct extent_io_tree *tree = eb->tree;
+
+ spin_unlock(&eb->refs_lock);
+
+ spin_lock(&tree->buffer_lock);
+ radix_tree_delete(&tree->buffer,
+ eb->start >> PAGE_CACHE_SHIFT);
+ spin_unlock(&tree->buffer_lock);
+
+ /* Should be safe to release our pages at this point */
+ btrfs_release_extent_buffer_page(eb, 0);
+
+ call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
+ return;
+ }
+ spin_unlock(&eb->refs_lock);
+}
+
void free_extent_buffer(struct extent_buffer *eb)
{
if (!eb)
return;
- if (!atomic_dec_and_test(&eb->refs))
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) == 2 &&
+ test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
+ !extent_buffer_under_io(eb) &&
+ test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+
+ /*
+ * I know this is terrible, but it's temporary until we stop tracking
+ * the uptodate bits and such for the extent buffers.
+ */
+ release_extent_buffer(eb, GFP_ATOMIC);
+}
+
+void free_extent_buffer_stale(struct extent_buffer *eb)
+{
+ if (!eb)
return;
- WARN_ON(1);
+ spin_lock(&eb->refs_lock);
+ set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
+
+ if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+ test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+ release_extent_buffer(eb, GFP_NOFS);
}
-int clear_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb)
+void clear_extent_buffer_dirty(struct extent_buffer *eb)
{
unsigned long i;
unsigned long num_pages;
@@ -3812,10 +4298,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
lock_page(page);
WARN_ON(!PagePrivate(page));
- set_page_extent_mapped(page);
- if (i == 0)
- set_page_extent_head(page, eb->len);
-
clear_page_dirty_for_io(page);
spin_lock_irq(&page->mapping->tree_lock);
if (!PageDirty(page)) {
@@ -3827,24 +4309,29 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
ClearPageError(page);
unlock_page(page);
}
- return 0;
+ WARN_ON(atomic_read(&eb->refs) == 0);
}
-int set_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb)
+int set_extent_buffer_dirty(struct extent_buffer *eb)
{
unsigned long i;
unsigned long num_pages;
int was_dirty = 0;
+ check_buffer_tree_ref(eb);
+
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+
num_pages = num_extent_pages(eb->start, eb->len);
+ WARN_ON(atomic_read(&eb->refs) == 0);
+ WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+
for (i = 0; i < num_pages; i++)
- __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+ set_page_dirty(extent_buffer_page(eb, i));
return was_dirty;
}
-static int __eb_straddles_pages(u64 start, u64 len)
+static int range_straddles_pages(u64 start, u64 len)
{
if (len < PAGE_CACHE_SIZE)
return 1;
@@ -3855,25 +4342,14 @@ static int __eb_straddles_pages(u64 start, u64 len)
return 0;
}
-static int eb_straddles_pages(struct extent_buffer *eb)
-{
- return __eb_straddles_pages(eb->start, eb->len);
-}
-
-int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state **cached_state)
+int clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
unsigned long num_pages;
- num_pages = num_extent_pages(eb->start, eb->len);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
- clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
- cached_state, GFP_NOFS);
-
+ num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
if (page)
@@ -3882,27 +4358,16 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
return 0;
}
-int set_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb)
+int set_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
unsigned long num_pages;
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
-
- if (eb_straddles_pages(eb)) {
- set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
- NULL, GFP_NOFS);
- }
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
- if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
- ((i == num_pages - 1) &&
- ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
- check_page_uptodate(tree, page);
- continue;
- }
SetPageUptodate(page);
}
return 0;
@@ -3917,7 +4382,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
int uptodate;
unsigned long index;
- if (__eb_straddles_pages(start, end - start + 1)) {
+ if (range_straddles_pages(start, end - start + 1)) {
ret = test_range_bit(tree, start, end,
EXTENT_UPTODATE, 1, NULL);
if (ret)
@@ -3939,35 +4404,9 @@ int extent_range_uptodate(struct extent_io_tree *tree,
return pg_uptodate;
}
-int extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state *cached_state)
+int extent_buffer_uptodate(struct extent_buffer *eb)
{
- int ret = 0;
- unsigned long num_pages;
- unsigned long i;
- struct page *page;
- int pg_uptodate = 1;
-
- if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
- return 1;
-
- if (eb_straddles_pages(eb)) {
- ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1, cached_state);
- if (ret)
- return ret;
- }
-
- num_pages = num_extent_pages(eb->start, eb->len);
- for (i = 0; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
- if (!PageUptodate(page)) {
- pg_uptodate = 0;
- break;
- }
- }
- return pg_uptodate;
+ return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}
int read_extent_buffer_pages(struct extent_io_tree *tree,
@@ -3981,21 +4420,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
int ret = 0;
int locked_pages = 0;
int all_uptodate = 1;
- int inc_all_pages = 0;
unsigned long num_pages;
+ unsigned long num_reads = 0;
struct bio *bio = NULL;
unsigned long bio_flags = 0;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
- if (eb_straddles_pages(eb)) {
- if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1, NULL)) {
- return 0;
- }
- }
-
if (start) {
WARN_ON(start < eb->start);
start_i = (start >> PAGE_CACHE_SHIFT) -
@@ -4014,8 +4446,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
lock_page(page);
}
locked_pages++;
- if (!PageUptodate(page))
+ if (!PageUptodate(page)) {
+ num_reads++;
all_uptodate = 0;
+ }
}
if (all_uptodate) {
if (start_i == 0)
@@ -4023,20 +4457,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
goto unlock_exit;
}
+ clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ eb->failed_mirror = 0;
+ atomic_set(&eb->io_pages, num_reads);
for (i = start_i; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
-
- WARN_ON(!PagePrivate(page));
-
- set_page_extent_mapped(page);
- if (i == 0)
- set_page_extent_head(page, eb->len);
-
- if (inc_all_pages)
- page_cache_get(page);
if (!PageUptodate(page)) {
- if (start_i == 0)
- inc_all_pages = 1;
ClearPageError(page);
err = __extent_read_full_page(tree, page,
get_extent, &bio,
@@ -4048,8 +4474,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
}
}
- if (bio)
- submit_one_bio(READ, bio, mirror_num, bio_flags);
+ if (bio) {
+ err = submit_one_bio(READ, bio, mirror_num, bio_flags);
+ if (err)
+ return err;
+ }
if (ret || wait != WAIT_COMPLETE)
return ret;
@@ -4061,8 +4490,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
ret = -EIO;
}
- if (!ret)
- set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
return ret;
unlock_exit:
@@ -4304,15 +4731,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
{
char *dst_kaddr = page_address(dst_page);
char *src_kaddr;
+ int must_memmove = 0;
if (dst_page != src_page) {
src_kaddr = page_address(src_page);
} else {
src_kaddr = dst_kaddr;
- BUG_ON(areas_overlap(src_off, dst_off, len));
+ if (areas_overlap(src_off, dst_off, len))
+ must_memmove = 1;
}
- memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+ if (must_memmove)
+ memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
+ else
+ memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}
void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
@@ -4382,7 +4814,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
"len %lu len %lu\n", dst_offset, len, dst->len);
BUG_ON(1);
}
- if (!areas_overlap(src_offset, dst_offset, len)) {
+ if (dst_offset < src_offset) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return;
}
@@ -4408,47 +4840,48 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
}
}
-static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+int try_release_extent_buffer(struct page *page, gfp_t mask)
{
- struct extent_buffer *eb =
- container_of(head, struct extent_buffer, rcu_head);
-
- btrfs_release_extent_buffer(eb);
-}
-
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
-{
- u64 start = page_offset(page);
struct extent_buffer *eb;
- int ret = 1;
- spin_lock(&tree->buffer_lock);
- eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
- if (!eb) {
- spin_unlock(&tree->buffer_lock);
- return ret;
+ /*
+ * We need to make sure noboody is attaching this page to an eb right
+ * now.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&page->mapping->private_lock);
+ return 1;
}
- if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
- ret = 0;
- goto out;
- }
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
/*
- * set @eb->refs to 0 if it is already 1, and then release the @eb.
- * Or go back.
+ * This is a little awful but should be ok, we need to make sure that
+ * the eb doesn't disappear out from under us while we're looking at
+ * this page.
*/
- if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
- ret = 0;
- goto out;
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ spin_unlock(&eb->refs_lock);
+ spin_unlock(&page->mapping->private_lock);
+ return 0;
}
+ spin_unlock(&page->mapping->private_lock);
- radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-out:
- spin_unlock(&tree->buffer_lock);
+ if ((mask & GFP_NOFS) == GFP_NOFS)
+ mask = GFP_NOFS;
- /* at this point we can safely release the extent buffer */
- if (atomic_read(&eb->refs) == 0)
- call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
- return ret;
+ /*
+ * If tree ref isn't set then we know the ref on this eb is a real ref,
+ * so just return, this page will likely be freed soon anyway.
+ */
+ if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ spin_unlock(&eb->refs_lock);
+ return 0;
+ }
+ release_extent_buffer(eb, mask);
+
+ return 1;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index cecc3518c121..faf10eb57f75 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -35,6 +35,10 @@
#define EXTENT_BUFFER_DIRTY 2
#define EXTENT_BUFFER_CORRUPT 3
#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
+#define EXTENT_BUFFER_TREE_REF 5
+#define EXTENT_BUFFER_STALE 6
+#define EXTENT_BUFFER_WRITEBACK 7
+#define EXTENT_BUFFER_IOERR 8
/* these are flags for extent_clear_unlock_delalloc */
#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -54,6 +58,7 @@
#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
struct extent_state;
+struct btrfs_root;
typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
@@ -69,9 +74,7 @@ struct extent_io_ops {
size_t size, struct bio *bio,
unsigned long bio_flags);
int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
- int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
- u64 start, u64 end, int failed_mirror,
- struct extent_state *state);
+ int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
u64 start, u64 end,
struct extent_state *state);
@@ -97,6 +100,7 @@ struct extent_io_tree {
struct radix_tree_root buffer;
struct address_space *mapping;
u64 dirty_bytes;
+ int track_uptodate;
spinlock_t lock;
spinlock_t buffer_lock;
struct extent_io_ops *ops;
@@ -119,16 +123,21 @@ struct extent_state {
struct list_head leak_list;
};
+#define INLINE_EXTENT_BUFFER_PAGES 16
+#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE)
struct extent_buffer {
u64 start;
unsigned long len;
unsigned long map_start;
unsigned long map_len;
- struct page *first_page;
unsigned long bflags;
+ struct extent_io_tree *tree;
+ spinlock_t refs_lock;
+ atomic_t refs;
+ atomic_t io_pages;
+ int failed_mirror;
struct list_head leak_list;
struct rcu_head rcu_head;
- atomic_t refs;
pid_t lock_owner;
/* count of read lock holders on the extent buffer */
@@ -152,6 +161,9 @@ struct extent_buffer {
* to unlock
*/
wait_queue_head_t read_lock_wq;
+ wait_queue_head_t lock_wq;
+ struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES];
+ struct page **pages;
};
static inline void extent_set_compress_type(unsigned long *bio_flags,
@@ -178,18 +190,17 @@ void extent_io_tree_init(struct extent_io_tree *tree,
int try_release_extent_mapping(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
+int try_release_extent_buffer(struct page *page, gfp_t mask);
int try_release_extent_state(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, struct extent_state **cached, gfp_t mask);
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+ int bits, struct extent_state **cached);
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached, gfp_t mask);
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num);
int __init extent_io_init(void);
@@ -210,7 +221,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive_bits, u64 *failed_start,
+ int bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
@@ -240,6 +251,8 @@ int extent_writepages(struct extent_io_tree *tree,
struct address_space *mapping,
get_extent_t *get_extent,
struct writeback_control *wbc);
+int btree_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc);
int extent_readpages(struct extent_io_tree *tree,
struct address_space *mapping,
struct list_head *pages, unsigned nr_pages,
@@ -251,11 +264,11 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
- u64 start, unsigned long len,
- struct page *page0);
+ u64 start, unsigned long len);
struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len);
void free_extent_buffer(struct extent_buffer *eb);
+void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_NONE 0
#define WAIT_COMPLETE 1
#define WAIT_PAGE_LOCK 2
@@ -287,19 +300,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_offset, unsigned long len);
void memset_extent_buffer(struct extent_buffer *eb, char c,
unsigned long start, unsigned long len);
-int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
-int clear_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int set_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int set_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state **cached_state);
-int extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state *cached_state);
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
+void clear_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_uptodate(struct extent_buffer *eb);
+int clear_extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_uptodate(struct extent_buffer *eb);
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
unsigned long min_len, char **map,
unsigned long *map_start,
@@ -320,4 +326,6 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
u64 length, u64 logical, struct page *page,
int mirror_num);
int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+ int mirror_num);
#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index c7fb3a4247d3..a14dbca5974e 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -59,7 +59,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
sizeof(*item));
if (ret < 0)
goto out;
- BUG_ON(ret);
+ BUG_ON(ret); /* Can't happen */
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -284,6 +284,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_ordered_sum *sums;
struct btrfs_sector_sum *sector_sum;
struct btrfs_csum_item *item;
+ LIST_HEAD(tmplist);
unsigned long offset;
int ret;
size_t size;
@@ -358,7 +359,10 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
MAX_ORDERED_SUM_BYTES(root));
sums = kzalloc(btrfs_ordered_sum_size(root, size),
GFP_NOFS);
- BUG_ON(!sums);
+ if (!sums) {
+ ret = -ENOMEM;
+ goto fail;
+ }
sector_sum = sums->sums;
sums->bytenr = start;
@@ -380,12 +384,19 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
offset += csum_size;
sector_sum++;
}
- list_add_tail(&sums->list, list);
+ list_add_tail(&sums->list, &tmplist);
}
path->slots[0]++;
}
ret = 0;
fail:
+ while (ret < 0 && !list_empty(&tmplist)) {
+ sums = list_entry(&tmplist, struct btrfs_ordered_sum, list);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ list_splice_tail(&tmplist, list);
+
btrfs_free_path(path);
return ret;
}
@@ -420,7 +431,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
ordered = btrfs_lookup_ordered_extent(inode, offset);
- BUG_ON(!ordered);
+ BUG_ON(!ordered); /* Logic error */
sums->bytenr = ordered->start;
while (bio_index < bio->bi_vcnt) {
@@ -439,11 +450,11 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
GFP_NOFS);
- BUG_ON(!sums);
+ BUG_ON(!sums); /* -ENOMEM */
sector_sum = sums->sums;
sums->len = bytes_left;
ordered = btrfs_lookup_ordered_extent(inode, offset);
- BUG_ON(!ordered);
+ BUG_ON(!ordered); /* Logic error */
sums->bytenr = ordered->start;
}
@@ -483,18 +494,17 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
* This calls btrfs_truncate_item with the correct args based on the
* overlap, and fixes up the key as required.
*/
-static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *key,
- u64 bytenr, u64 len)
+static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ u64 bytenr, u64 len)
{
struct extent_buffer *leaf;
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
u64 csum_end;
u64 end_byte = bytenr + len;
u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
- int ret;
leaf = path->nodes[0];
csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
@@ -510,7 +520,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
*/
u32 new_size = (bytenr - key->offset) >> blocksize_bits;
new_size *= csum_size;
- ret = btrfs_truncate_item(trans, root, path, new_size, 1);
+ btrfs_truncate_item(trans, root, path, new_size, 1);
} else if (key->offset >= bytenr && csum_end > end_byte &&
end_byte > key->offset) {
/*
@@ -522,15 +532,13 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
u32 new_size = (csum_end - end_byte) >> blocksize_bits;
new_size *= csum_size;
- ret = btrfs_truncate_item(trans, root, path, new_size, 0);
+ btrfs_truncate_item(trans, root, path, new_size, 0);
key->offset = end_byte;
- ret = btrfs_set_item_key_safe(trans, root, path, key);
- BUG_ON(ret);
+ btrfs_set_item_key_safe(trans, root, path, key);
} else {
BUG();
}
- return 0;
}
/*
@@ -635,13 +643,14 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
* item changed size or key
*/
ret = btrfs_split_item(trans, root, path, &key, offset);
- BUG_ON(ret && ret != -EAGAIN);
+ if (ret && ret != -EAGAIN) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
key.offset = end_byte - 1;
} else {
- ret = truncate_one_csum(trans, root, path,
- &key, bytenr, len);
- BUG_ON(ret);
+ truncate_one_csum(trans, root, path, &key, bytenr, len);
if (key.offset < bytenr)
break;
}
@@ -772,7 +781,7 @@ again:
if (diff != csum_size)
goto insert;
- ret = btrfs_extend_item(trans, root, path, diff);
+ btrfs_extend_item(trans, root, path, diff);
goto csum;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3a520899b024..d83260d7498f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -452,7 +452,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split = alloc_extent_map();
if (!split2)
split2 = alloc_extent_map();
- BUG_ON(!split || !split2);
+ BUG_ON(!split || !split2); /* -ENOMEM */
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -494,7 +494,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->flags = flags;
split->compress_type = em->compress_type;
ret = add_extent_mapping(em_tree, split);
- BUG_ON(ret);
+ BUG_ON(ret); /* Logic error */
free_extent_map(split);
split = split2;
split2 = NULL;
@@ -520,7 +520,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
}
ret = add_extent_mapping(em_tree, split);
- BUG_ON(ret);
+ BUG_ON(ret); /* Logic error */
free_extent_map(split);
split = NULL;
}
@@ -679,7 +679,7 @@ next_slot:
root->root_key.objectid,
new_key.objectid,
start - extent_offset, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
*hint_byte = disk_bytenr;
}
key.offset = start;
@@ -754,7 +754,7 @@ next_slot:
root->root_key.objectid,
key.objectid, key.offset -
extent_offset, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
inode_sub_bytes(inode,
extent_end - key.offset);
*hint_byte = disk_bytenr;
@@ -770,7 +770,10 @@ next_slot:
ret = btrfs_del_items(trans, root, path, del_slot,
del_nr);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
del_nr = 0;
del_slot = 0;
@@ -782,11 +785,13 @@ next_slot:
BUG_ON(1);
}
- if (del_nr > 0) {
+ if (!ret && del_nr > 0) {
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
}
+out:
btrfs_free_path(path);
return ret;
}
@@ -944,7 +949,10 @@ again:
btrfs_release_path(path);
goto again;
}
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
@@ -963,7 +971,7 @@ again:
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
root->root_key.objectid,
ino, orig_offset, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (split == start) {
key.offset = start;
@@ -990,7 +998,7 @@ again:
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
ino, orig_offset, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
other_start = 0;
other_end = start;
@@ -1007,7 +1015,7 @@ again:
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
ino, orig_offset, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
if (del_nr == 0) {
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -1025,7 +1033,10 @@ again:
btrfs_mark_buffer_dirty(leaf);
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
- BUG_ON(ret);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
}
out:
btrfs_free_path(path);
@@ -1081,7 +1092,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
again:
for (i = 0; i < num_pages; i++) {
pages[i] = find_or_create_page(inode->i_mapping, index + i,
- mask);
+ mask | __GFP_WRITE);
if (!pages[i]) {
faili = i - 1;
err = -ENOMEM;
@@ -1105,8 +1116,7 @@ again:
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos - 1, 0, &cached_state,
- GFP_NOFS);
+ start_pos, last_pos - 1, 0, &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode,
last_pos - 1);
if (ordered &&
@@ -1136,7 +1146,8 @@ again:
GFP_NOFS);
}
for (i = 0; i < num_pages; i++) {
- clear_page_dirty_for_io(pages[i]);
+ if (clear_page_dirty_for_io(pages[i]))
+ account_page_redirty(pages[i]);
set_page_extent_mapped(pages[i]);
WARN_ON(!PageLocked(pages[i]));
}
@@ -1637,7 +1648,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* transaction
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, 0, &cached_state, GFP_NOFS);
+ locked_end, 0, &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode,
alloc_end - 1);
if (ordered &&
@@ -1666,7 +1677,13 @@ static long btrfs_fallocate(struct file *file, int mode,
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
- BUG_ON(IS_ERR_OR_NULL(em));
+ if (IS_ERR_OR_NULL(em)) {
+ if (!em)
+ ret = -ENOMEM;
+ else
+ ret = PTR_ERR(em);
+ break;
+ }
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = (last_byte + mask) & ~mask;
@@ -1736,7 +1753,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
return -ENXIO;
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
/*
* Delalloc is such a pain. If we have a hole and we have pending
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index b30242f4435e..054707ed5791 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -230,11 +230,13 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
if (ret) {
trans->block_rsv = rsv;
- WARN_ON(1);
+ btrfs_abort_transaction(trans, root, ret);
return ret;
}
ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
trans->block_rsv = rsv;
return ret;
@@ -425,7 +427,7 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
}
if (index == 0)
- offset = sizeof(u32) * io_ctl->num_pages;;
+ offset = sizeof(u32) * io_ctl->num_pages;
crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
PAGE_CACHE_SIZE - offset);
@@ -869,7 +871,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
io_ctl_prepare_pages(&io_ctl, inode, 0);
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- 0, &cached_state, GFP_NOFS);
+ 0, &cached_state);
node = rb_first(&ctl->free_space_offset);
if (!node && cluster) {
@@ -1948,14 +1950,14 @@ again:
*/
ret = btrfs_add_free_space(block_group, old_start,
offset - old_start);
- WARN_ON(ret);
+ WARN_ON(ret); /* -ENOMEM */
goto out;
}
ret = remove_from_bitmap(ctl, info, &offset, &bytes);
if (ret == -EAGAIN)
goto again;
- BUG_ON(ret);
+ BUG_ON(ret); /* logic error */
out_lock:
spin_unlock(&ctl->tree_lock);
out:
@@ -2346,7 +2348,7 @@ again:
rb_erase(&entry->offset_index, &ctl->free_space_offset);
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -EEXIST; Logic error */
trace_btrfs_setup_cluster(block_group, cluster,
total_found * block_group->sectorsize, 1);
@@ -2439,7 +2441,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 0);
total_size += entry->bytes;
- BUG_ON(ret);
+ BUG_ON(ret); /* -EEXIST; Logic error */
} while (node && entry != last);
cluster->max_size = max_extent;
@@ -2830,6 +2832,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
int ret;
ret = search_bitmap(ctl, entry, &offset, &count);
+ /* Logic error; Should be empty if it can't find anything */
BUG_ON(ret);
ino = offset;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index baa74f3db691..a13cf1a96c73 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -19,6 +19,7 @@
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
+#include "print-tree.h"
static int find_name_in_backref(struct btrfs_path *path, const char *name,
int name_len, struct btrfs_inode_ref **ref_ret)
@@ -128,13 +129,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
- ret = btrfs_truncate_item(trans, root, path,
+ btrfs_truncate_item(trans, root, path,
item_size - sub_item_len, 1);
out:
btrfs_free_path(path);
return ret;
}
+/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
@@ -165,7 +167,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
goto out;
old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
- ret = btrfs_extend_item(trans, root, path, ins_len);
+ btrfs_extend_item(trans, root, path, ins_len);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ee15d88b33d2..7ca46e6e11ae 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root)
tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
root->root_key.objectid);
- BUG_ON(IS_ERR(tsk));
+ BUG_ON(IS_ERR(tsk)); /* -ENOMEM */
}
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
@@ -271,7 +271,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
break;
info = rb_entry(n, struct btrfs_free_space, offset_index);
- BUG_ON(info->bitmap);
+ BUG_ON(info->bitmap); /* Logic error */
if (info->offset > root->cache_progress)
goto free;
@@ -443,13 +443,13 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
trans->bytes_reserved, 1);
again:
inode = lookup_free_ino_inode(root, path);
- if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+ if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
ret = PTR_ERR(inode);
goto out_release;
}
if (IS_ERR(inode)) {
- BUG_ON(retry);
+ BUG_ON(retry); /* Logic error */
retry = true;
ret = create_free_ino_inode(root, trans, path);
@@ -460,12 +460,17 @@ again:
BTRFS_I(inode)->generation = 0;
ret = btrfs_update_inode(trans, root, inode);
- WARN_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_put;
+ }
if (i_size_read(inode) > 0) {
ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
goto out_put;
+ }
}
spin_lock(&root->cache_lock);
@@ -532,7 +537,7 @@ static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
goto error;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
if (path->slots[0] > 0) {
slot = path->slots[0] - 1;
l = path->nodes[0];
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cbeb2e36cebc..eb6aec7bbacb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -150,7 +150,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
inode_add_bytes(inode, size);
ret = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
- BUG_ON(ret);
if (ret) {
err = ret;
goto fail;
@@ -206,9 +205,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
* could end up racing with unlink.
*/
BTRFS_I(inode)->disk_i_size = inode->i_size;
- btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, inode);
- return 0;
+ return ret;
fail:
btrfs_free_path(path);
return err;
@@ -250,14 +249,18 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
ret = btrfs_drop_extents(trans, inode, start, aligned_end,
&hint_byte, 1);
- BUG_ON(ret);
+ if (ret)
+ return ret;
if (isize > actual_end)
inline_len = min_t(u64, isize, actual_end);
ret = insert_inline_extent(trans, root, inode, start,
inline_len, compressed_size,
compress_type, compressed_pages);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
return 0;
@@ -293,7 +296,7 @@ static noinline int add_async_extent(struct async_cow *cow,
struct async_extent *async_extent;
async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
- BUG_ON(!async_extent);
+ BUG_ON(!async_extent); /* -ENOMEM */
async_extent->start = start;
async_extent->ram_size = ram_size;
async_extent->compressed_size = compressed_size;
@@ -433,7 +436,11 @@ again:
cont:
if (start == 0) {
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto cleanup_and_out;
+ }
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
/* lets try to make an inline extent */
@@ -450,11 +457,11 @@ cont:
total_compressed,
compress_type, pages);
}
- if (ret == 0) {
+ if (ret <= 0) {
/*
- * inline extent creation worked, we don't need
- * to create any more async work items. Unlock
- * and free up our temp pages.
+ * inline extent creation worked or returned error,
+ * we don't need to create any more async work items.
+ * Unlock and free up our temp pages.
*/
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
@@ -547,7 +554,7 @@ cleanup_and_bail_uncompressed:
}
out:
- return 0;
+ return ret;
free_pages_out:
for (i = 0; i < nr_pages_ret; i++) {
@@ -557,6 +564,20 @@ free_pages_out:
kfree(pages);
goto out;
+
+cleanup_and_out:
+ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
+ if (!trans || IS_ERR(trans))
+ btrfs_error(root->fs_info, ret, "Failed to join transaction");
+ else
+ btrfs_abort_transaction(trans, root, ret);
+ goto free_pages_out;
}
/*
@@ -597,7 +618,7 @@ retry:
lock_extent(io_tree, async_extent->start,
async_extent->start +
- async_extent->ram_size - 1, GFP_NOFS);
+ async_extent->ram_size - 1);
/* allocate blocks */
ret = cow_file_range(inode, async_cow->locked_page,
@@ -606,6 +627,8 @@ retry:
async_extent->ram_size - 1,
&page_started, &nr_written, 0);
+ /* JDM XXX */
+
/*
* if page_started, cow_file_range inserted an
* inline extent and took care of all the unlocking
@@ -625,18 +648,21 @@ retry:
}
lock_extent(io_tree, async_extent->start,
- async_extent->start + async_extent->ram_size - 1,
- GFP_NOFS);
+ async_extent->start + async_extent->ram_size - 1);
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
- ret = btrfs_reserve_extent(trans, root,
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ } else {
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ ret = btrfs_reserve_extent(trans, root,
async_extent->compressed_size,
async_extent->compressed_size,
- 0, alloc_hint,
- (u64)-1, &ins, 1);
- btrfs_end_transaction(trans, root);
+ 0, alloc_hint, &ins, 1);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ }
if (ret) {
int i;
@@ -649,8 +675,10 @@ retry:
async_extent->pages = NULL;
unlock_extent(io_tree, async_extent->start,
async_extent->start +
- async_extent->ram_size - 1, GFP_NOFS);
- goto retry;
+ async_extent->ram_size - 1);
+ if (ret == -ENOSPC)
+ goto retry;
+ goto out_free; /* JDM: Requeue? */
}
/*
@@ -662,7 +690,7 @@ retry:
async_extent->ram_size - 1, 0);
em = alloc_extent_map();
- BUG_ON(!em);
+ BUG_ON(!em); /* -ENOMEM */
em->start = async_extent->start;
em->len = async_extent->ram_size;
em->orig_start = em->start;
@@ -694,7 +722,7 @@ retry:
ins.offset,
BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
/*
* clear dirty, set writeback and unlock the pages.
@@ -716,13 +744,17 @@ retry:
ins.offset, async_extent->pages,
async_extent->nr_pages);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
alloc_hint = ins.objectid + ins.offset;
kfree(async_extent);
cond_resched();
}
-
- return 0;
+ ret = 0;
+out:
+ return ret;
+out_free:
+ kfree(async_extent);
+ goto out;
}
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
@@ -791,7 +823,18 @@ static noinline int cow_file_range(struct inode *inode,
BUG_ON(btrfs_is_free_space_inode(root, inode));
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
+ return PTR_ERR(trans);
+ }
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
num_bytes = (end - start + blocksize) & ~(blocksize - 1);
@@ -821,8 +864,10 @@ static noinline int cow_file_range(struct inode *inode,
*nr_written = *nr_written +
(end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
*page_started = 1;
- ret = 0;
goto out;
+ } else if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_unlock;
}
}
@@ -838,11 +883,14 @@ static noinline int cow_file_range(struct inode *inode,
cur_alloc_size = disk_num_bytes;
ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
- (u64)-1, &ins, 1);
- BUG_ON(ret);
+ &ins, 1);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_unlock;
+ }
em = alloc_extent_map();
- BUG_ON(!em);
+ BUG_ON(!em); /* -ENOMEM */
em->start = start;
em->orig_start = em->start;
ram_size = ins.offset;
@@ -868,13 +916,16 @@ static noinline int cow_file_range(struct inode *inode,
cur_alloc_size = ins.offset;
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
ram_size, cur_alloc_size, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_unlock;
+ }
}
if (disk_num_bytes < cur_alloc_size)
@@ -899,11 +950,23 @@ static noinline int cow_file_range(struct inode *inode,
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
}
-out:
ret = 0;
+out:
btrfs_end_transaction(trans, root);
return ret;
+out_unlock:
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
+
+ goto out;
}
/*
@@ -969,7 +1032,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1, 0, NULL, GFP_NOFS);
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
- BUG_ON(!async_cow);
+ BUG_ON(!async_cow); /* -ENOMEM */
async_cow->inode = inode;
async_cow->root = root;
async_cow->locked_page = locked_page;
@@ -1060,7 +1123,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
u64 disk_bytenr;
u64 num_bytes;
int extent_type;
- int ret;
+ int ret, err;
int type;
int nocow;
int check_prev = 1;
@@ -1078,7 +1141,11 @@ static noinline int run_delalloc_nocow(struct inode *inode,
else
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
cow_start = (u64)-1;
@@ -1086,7 +1153,10 @@ static noinline int run_delalloc_nocow(struct inode *inode,
while (1) {
ret = btrfs_lookup_file_extent(trans, root, path, ino,
cur_offset, 0);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error;
+ }
if (ret > 0 && path->slots[0] > 0 && check_prev) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key,
@@ -1100,8 +1170,10 @@ next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- BUG_ON(1);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error;
+ }
if (ret > 0)
break;
leaf = path->nodes[0];
@@ -1189,7 +1261,10 @@ out_check:
ret = cow_file_range(inode, locked_page, cow_start,
found_key.offset - 1, page_started,
nr_written, 1);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error;
+ }
cow_start = (u64)-1;
}
@@ -1198,7 +1273,7 @@ out_check:
struct extent_map_tree *em_tree;
em_tree = &BTRFS_I(inode)->extent_tree;
em = alloc_extent_map();
- BUG_ON(!em);
+ BUG_ON(!em); /* -ENOMEM */
em->start = cur_offset;
em->orig_start = em->start;
em->len = num_bytes;
@@ -1224,13 +1299,16 @@ out_check:
ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
num_bytes, num_bytes, type);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, cur_offset,
num_bytes);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error;
+ }
}
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
@@ -1249,18 +1327,23 @@ out_check:
if (cow_start != (u64)-1) {
ret = cow_file_range(inode, locked_page, cow_start, end,
page_started, nr_written, 1);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error;
+ }
}
+error:
if (nolock) {
- ret = btrfs_end_transaction_nolock(trans, root);
- BUG_ON(ret);
+ err = btrfs_end_transaction_nolock(trans, root);
} else {
- ret = btrfs_end_transaction(trans, root);
- BUG_ON(ret);
+ err = btrfs_end_transaction(trans, root);
}
+ if (!ret)
+ ret = err;
+
btrfs_free_path(path);
- return 0;
+ return ret;
}
/*
@@ -1425,10 +1508,11 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
map_length = length;
ret = btrfs_map_block(map_tree, READ, logical,
&map_length, NULL, 0);
-
+ /* Will always return 0 or 1 with map_multi == NULL */
+ BUG_ON(ret < 0);
if (map_length < length + size)
return 1;
- return ret;
+ return 0;
}
/*
@@ -1448,7 +1532,7 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
int ret = 0;
ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
return 0;
}
@@ -1479,14 +1563,16 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
int skip_sum;
+ int metadata = 0;
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
if (btrfs_is_free_space_inode(root, inode))
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
- else
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- BUG_ON(ret);
+ metadata = 2;
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
+ if (ret)
+ return ret;
if (!(rw & REQ_WRITE)) {
if (bio_flags & EXTENT_BIO_COMPRESSED) {
@@ -1571,7 +1657,7 @@ again:
page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
/* already ordered? We're done */
if (PagePrivate2(page))
@@ -1675,13 +1761,15 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
*/
ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
&hint, 0);
- BUG_ON(ret);
+ if (ret)
+ goto out;
ins.objectid = btrfs_ino(inode);
ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
- BUG_ON(ret);
+ if (ret)
+ goto out;
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -1709,10 +1797,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_alloc_reserved_file_extent(trans, root,
root->root_key.objectid,
btrfs_ino(inode), file_pos, &ins);
- BUG_ON(ret);
+out:
btrfs_free_path(path);
- return 0;
+ return ret;
}
/*
@@ -1740,35 +1828,41 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
end - start + 1);
if (!ret)
return 0;
- BUG_ON(!ordered_extent);
+ BUG_ON(!ordered_extent); /* Logic error */
nolock = btrfs_is_free_space_inode(root, inode);
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
- BUG_ON(!list_empty(&ordered_extent->list));
+ BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (!ret) {
if (nolock)
trans = btrfs_join_transaction_nolock(root);
else
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
ret = btrfs_update_inode_fallback(trans, root, inode);
- BUG_ON(ret);
+ if (ret) /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, root, ret);
}
goto out;
}
lock_extent_bits(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
- 0, &cached_state, GFP_NOFS);
+ 0, &cached_state);
if (nolock)
trans = btrfs_join_transaction_nolock(root);
else
trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out_unlock;
+ }
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -1779,7 +1873,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->file_offset,
ordered_extent->file_offset +
ordered_extent->len);
- BUG_ON(ret);
} else {
BUG_ON(root == root->fs_info->tree_root);
ret = insert_reserved_file_extent(trans, inode,
@@ -1793,11 +1886,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset,
ordered_extent->len);
- BUG_ON(ret);
}
unlock_extent_cached(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset +
ordered_extent->len - 1, &cached_state, GFP_NOFS);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
add_pending_csums(trans, inode, ordered_extent->file_offset,
&ordered_extent->list);
@@ -1805,7 +1901,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
ret = btrfs_update_inode_fallback(trans, root, inode);
- BUG_ON(ret);
+ if (ret) { /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
}
ret = 0;
out:
@@ -1824,6 +1923,11 @@ out:
btrfs_put_ordered_extent(ordered_extent);
return 0;
+out_unlock:
+ unlock_extent_cached(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1, &cached_state, GFP_NOFS);
+ goto out;
}
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
@@ -1905,6 +2009,8 @@ struct delayed_iput {
struct inode *inode;
};
+/* JDM: If this is fs-wide, why can't we add a pointer to
+ * btrfs_inode instead and avoid the allocation? */
void btrfs_add_delayed_iput(struct inode *inode)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
@@ -1954,7 +2060,7 @@ enum btrfs_orphan_cleanup_state {
};
/*
- * This is called in transaction commmit time. If there are no orphan
+ * This is called in transaction commit time. If there are no orphan
* files in the subvolume, it removes orphan item and frees block_rsv
* structure.
*/
@@ -2051,20 +2157,27 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
/* grab metadata reservation from transaction handle */
if (reserve) {
ret = btrfs_orphan_reserve_metadata(trans, inode);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
}
/* insert an orphan item to track this unlinked/truncated file */
if (insert >= 1) {
ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
- BUG_ON(ret && ret != -EEXIST);
+ if (ret && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ ret = 0;
}
/* insert an orphan item to track subvolume contains orphan files */
if (insert >= 2) {
ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
root->root_key.objectid);
- BUG_ON(ret);
+ if (ret && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
}
return 0;
}
@@ -2094,7 +2207,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
if (trans && delete_item) {
ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
}
if (release_rsv)
@@ -2228,7 +2341,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
}
ret = btrfs_del_orphan_item(trans, root,
found_key.objectid);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
btrfs_end_transaction(trans, root);
continue;
}
@@ -2610,16 +2723,22 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
"inode %llu parent %llu\n", name_len, name,
(unsigned long long)ino, (unsigned long long)dir_ino);
+ btrfs_abort_transaction(trans, root, ret);
goto err;
}
ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
goto err;
+ }
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
inode, dir_ino);
- BUG_ON(ret != 0 && ret != -ENOENT);
+ if (ret != 0 && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto err;
+ }
ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
dir, index);
@@ -2777,7 +2896,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
err = ret;
goto out;
}
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
if (check_path_shared(root, path))
goto out;
btrfs_release_path(path);
@@ -2810,7 +2929,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
err = PTR_ERR(ref);
goto out;
}
- BUG_ON(!ref);
+ BUG_ON(!ref); /* Logic error */
if (check_path_shared(root, path))
goto out;
index = btrfs_inode_ref_index(path->nodes[0], ref);
@@ -2917,23 +3036,42 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
- BUG_ON(IS_ERR_OR_NULL(di));
+ if (IS_ERR_OR_NULL(di)) {
+ if (!di)
+ ret = -ENOENT;
+ else
+ ret = PTR_ERR(di);
+ goto out;
+ }
leaf = path->nodes[0];
btrfs_dir_item_key_to_cpu(leaf, di, &key);
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
btrfs_release_path(path);
ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
objectid, root->root_key.objectid,
dir_ino, &index, name, name_len);
if (ret < 0) {
- BUG_ON(ret != -ENOENT);
+ if (ret != -ENOENT) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
di = btrfs_search_dir_index_item(root, path, dir_ino,
name, name_len);
- BUG_ON(IS_ERR_OR_NULL(di));
+ if (IS_ERR_OR_NULL(di)) {
+ if (!di)
+ ret = -ENOENT;
+ else
+ ret = PTR_ERR(di);
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
@@ -2943,15 +3081,19 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, dir);
- BUG_ON(ret);
-
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+out:
btrfs_free_path(path);
- return 0;
+ return ret;
}
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -3161,8 +3303,8 @@ search_again:
}
size =
btrfs_file_extent_calc_inline_size(size);
- ret = btrfs_truncate_item(trans, root, path,
- size, 1);
+ btrfs_truncate_item(trans, root, path,
+ size, 1);
} else if (root->ref_cows) {
inode_sub_bytes(inode, item_end + 1 -
found_key.offset);
@@ -3210,7 +3352,11 @@ delete:
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
pending_del_nr);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans,
+ root, ret);
+ goto error;
+ }
pending_del_nr = 0;
}
btrfs_release_path(path);
@@ -3223,8 +3369,10 @@ out:
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path, pending_del_slot,
pending_del_nr);
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
}
+error:
btrfs_free_path(path);
return err;
}
@@ -3282,8 +3430,7 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
- GFP_NOFS);
+ lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
set_page_extent_mapped(page);
ordered = btrfs_lookup_ordered_extent(inode, page_start);
@@ -3359,7 +3506,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
btrfs_wait_ordered_range(inode, hole_start,
block_end - hole_start);
lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, hole_start);
if (!ordered)
break;
@@ -3372,7 +3519,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
while (1) {
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
block_end - cur_offset, 0);
- BUG_ON(IS_ERR_OR_NULL(em));
+ if (IS_ERR(em)) {
+ err = PTR_ERR(em);
+ break;
+ }
last_byte = min(extent_map_end(em), block_end);
last_byte = (last_byte + mask) & ~mask;
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
@@ -3389,7 +3539,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
cur_offset + hole_size,
&hint_byte, 1);
if (err) {
- btrfs_update_inode(trans, root, inode);
+ btrfs_abort_transaction(trans, root, err);
btrfs_end_transaction(trans, root);
break;
}
@@ -3399,7 +3549,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
0, hole_size, 0, hole_size,
0, 0, 0);
if (err) {
- btrfs_update_inode(trans, root, inode);
+ btrfs_abort_transaction(trans, root, err);
btrfs_end_transaction(trans, root);
break;
}
@@ -3779,7 +3929,7 @@ static void inode_tree_del(struct inode *inode)
}
}
-int btrfs_invalidate_inodes(struct btrfs_root *root)
+void btrfs_invalidate_inodes(struct btrfs_root *root)
{
struct rb_node *node;
struct rb_node *prev;
@@ -3839,7 +3989,6 @@ again:
node = rb_next(node);
}
spin_unlock(&root->inode_lock);
- return 0;
}
static int btrfs_init_locked_inode(struct inode *inode, void *p)
@@ -4424,8 +4573,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *dir,
const char *name, int name_len,
- u64 ref_objectid, u64 objectid, int mode,
- u64 *index)
+ u64 ref_objectid, u64 objectid,
+ umode_t mode, u64 *index)
{
struct inode *inode;
struct btrfs_inode_item *inode_item;
@@ -4581,18 +4730,26 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
parent_ino, index);
}
- if (ret == 0) {
- ret = btrfs_insert_dir_item(trans, root, name, name_len,
- parent_inode, &key,
- btrfs_inode_type(inode), index);
- if (ret)
- goto fail_dir_item;
+ /* Nothing to clean up yet */
+ if (ret)
+ return ret;
- btrfs_i_size_write(parent_inode, parent_inode->i_size +
- name_len * 2);
- parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
- ret = btrfs_update_inode(trans, root, parent_inode);
+ ret = btrfs_insert_dir_item(trans, root, name, name_len,
+ parent_inode, &key,
+ btrfs_inode_type(inode), index);
+ if (ret == -EEXIST)
+ goto fail_dir_item;
+ else if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
}
+
+ btrfs_i_size_write(parent_inode, parent_inode->i_size +
+ name_len * 2);
+ parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, parent_inode);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
return ret;
fail_dir_item:
@@ -4626,7 +4783,7 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
}
static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -4695,7 +4852,7 @@ out_unlock:
}
static int btrfs_create(struct inode *dir, struct dentry *dentry,
- int mode, struct nameidata *nd)
+ umode_t mode, struct nameidata *nd)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -4806,7 +4963,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
} else {
struct dentry *parent = dentry->d_parent;
err = btrfs_update_inode(trans, root, inode);
- BUG_ON(err);
+ if (err)
+ goto fail;
d_instantiate(dentry, inode);
btrfs_log_new_name(trans, inode, NULL, parent);
}
@@ -4822,7 +4980,7 @@ fail:
return err;
}
-static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
@@ -5137,7 +5295,7 @@ again:
ret = uncompress_inline(path, inode, page,
pg_offset,
extent_offset, item);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
} else {
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -5252,6 +5410,7 @@ out:
free_extent_map(em);
return ERR_PTR(err);
}
+ BUG_ON(!em); /* Error is always set */
return em;
}
@@ -5414,7 +5573,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
alloc_hint = get_extent_allocation_hint(inode, start, len);
ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
- alloc_hint, (u64)-1, &ins, 1);
+ alloc_hint, &ins, 1);
if (ret) {
em = ERR_PTR(ret);
goto out;
@@ -5602,7 +5761,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
free_extent_map(em);
/* DIO will do one hole at a time, so just unlock a sector */
unlock_extent(&BTRFS_I(inode)->io_tree, start,
- start + root->sectorsize - 1, GFP_NOFS);
+ start + root->sectorsize - 1);
return 0;
}
@@ -5743,7 +5902,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
} while (bvec <= bvec_end);
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
- dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+ dip->logical_offset + dip->bytes - 1);
bio->bi_private = dip->private;
kfree(dip->csums);
@@ -5794,7 +5953,7 @@ again:
lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
ordered->file_offset + ordered->len - 1, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
ret = btrfs_mark_extent_written(trans, inode,
@@ -5868,7 +6027,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
int ret;
struct btrfs_root *root = BTRFS_I(inode)->root;
ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
return 0;
}
@@ -6209,7 +6368,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
while (1) {
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, &cached_state, GFP_NOFS);
+ 0, &cached_state);
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure theres no ordered
@@ -6233,7 +6392,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
if (writing) {
write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- EXTENT_DELALLOC, 0, NULL, &cached_state,
+ EXTENT_DELALLOC, NULL, &cached_state,
GFP_NOFS);
if (ret) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
@@ -6363,8 +6522,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
btrfs_releasepage(page, GFP_NOFS);
return;
}
- lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
- GFP_NOFS);
+ lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
ordered = btrfs_lookup_ordered_extent(page->mapping->host,
page_offset(page));
if (ordered) {
@@ -6386,8 +6544,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
}
btrfs_put_ordered_extent(ordered);
cached_state = NULL;
- lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
- GFP_NOFS);
+ lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
}
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@ -6462,8 +6619,7 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
- GFP_NOFS);
+ lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
set_page_extent_mapped(page);
/*
@@ -6737,10 +6893,9 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
btrfs_i_size_write(inode, 0);
err = btrfs_update_inode(trans, new_root, inode);
- BUG_ON(err);
iput(inode);
- return 0;
+ return err;
}
struct inode *btrfs_alloc_inode(struct super_block *sb)
@@ -6783,6 +6938,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
extent_map_tree_init(&ei->extent_tree);
extent_io_tree_init(&ei->io_tree, &inode->i_data);
extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
+ ei->io_tree.track_uptodate = 1;
+ ei->io_failure_tree.track_uptodate = 1;
mutex_init(&ei->log_mutex);
mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -6797,7 +6954,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
static void btrfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
@@ -7073,7 +7229,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!ret)
ret = btrfs_update_inode(trans, root, old_inode);
}
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
if (new_inode) {
new_inode->i_ctime = CURRENT_TIME;
@@ -7091,11 +7250,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_dentry->d_name.name,
new_dentry->d_name.len);
}
- BUG_ON(ret);
- if (new_inode->i_nlink == 0) {
+ if (!ret && new_inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, new_dentry->d_inode);
BUG_ON(ret);
}
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
}
fixup_inode_flags(new_dir, old_inode);
@@ -7103,7 +7265,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
ret = btrfs_add_link(trans, new_dir, old_inode,
new_dentry->d_name.name,
new_dentry->d_name.len, 0, index);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
struct dentry *parent = new_dentry->d_parent;
@@ -7316,7 +7481,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
- 0, *alloc_hint, (u64)-1, &ins, 1);
+ 0, *alloc_hint, &ins, 1);
if (ret) {
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -7328,7 +7493,12 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
ins.offset, ins.offset,
ins.offset, 0, 0, 0,
BTRFS_FILE_EXTENT_PREALLOC);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ if (own_trans)
+ btrfs_end_transaction(trans, root);
+ break;
+ }
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + ins.offset -1, 0);
@@ -7350,7 +7520,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
+
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ if (own_trans)
+ btrfs_end_transaction(trans, root);
+ break;
+ }
if (own_trans)
btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 013c6371e3e8..a979ab7d3967 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -206,7 +206,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
}
}
- ret = mnt_want_write(file->f_path.mnt);
+ ret = mnt_want_write_file(file);
if (ret)
goto out_unlock;
@@ -271,7 +271,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
inode->i_flags = i_oldflags;
}
- mnt_drop_write(file->f_path.mnt);
+ mnt_drop_write_file(file);
out_unlock:
mutex_unlock(&inode->i_mutex);
return ret;
@@ -286,14 +286,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
{
- struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
struct btrfs_device *device;
struct request_queue *q;
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
- u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+ u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -322,7 +321,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
range.len = min(range.len, total_bytes - range.start);
range.minlen = max(range.minlen, minlen);
- ret = btrfs_trim_fs(root, &range);
+ ret = btrfs_trim_fs(fs_info->tree_root, &range);
if (ret < 0)
return ret;
@@ -426,22 +425,37 @@ static noinline int create_subvol(struct btrfs_root *root,
key.offset = (u64)-1;
new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
- BUG_ON(IS_ERR(new_root));
+ if (IS_ERR(new_root)) {
+ btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
+ ret = PTR_ERR(new_root);
+ goto fail;
+ }
btrfs_record_root_in_trans(trans, new_root);
ret = btrfs_create_subvol_root(trans, new_root, new_dirid);
+ if (ret) {
+ /* We potentially lose an unused inode item here */
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
/*
* insert the directory item
*/
ret = btrfs_set_inode_index(dir, &index);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
ret = btrfs_insert_dir_item(trans, root,
name, namelen, dir, &key,
BTRFS_FT_DIR, index);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
goto fail;
+ }
btrfs_i_size_write(dir, dir->i_size + namelen * 2);
ret = btrfs_update_inode(trans, root, dir);
@@ -798,9 +812,9 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
if (!em) {
/* get the big lock and read metadata off disk */
- lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+ lock_extent(io_tree, start, start + len - 1);
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
- unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+ unlock_extent(io_tree, start, start + len - 1);
if (IS_ERR(em))
return 0;
@@ -888,10 +902,10 @@ again:
page_start = page_offset(page);
page_end = page_start + PAGE_CACHE_SIZE - 1;
while (1) {
- lock_extent(tree, page_start, page_end, GFP_NOFS);
+ lock_extent(tree, page_start, page_end);
ordered = btrfs_lookup_ordered_extent(inode,
page_start);
- unlock_extent(tree, page_start, page_end, GFP_NOFS);
+ unlock_extent(tree, page_start, page_end);
if (!ordered)
break;
@@ -947,8 +961,7 @@ again:
page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, 0, &cached_state,
- GFP_NOFS);
+ page_start, page_end - 1, 0, &cached_state);
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
@@ -1883,7 +1896,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out;
}
- err = mnt_want_write(file->f_path.mnt);
+ err = mnt_want_write_file(file);
if (err)
goto out;
@@ -1967,7 +1980,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dest->root_key.objectid,
dentry->d_name.name,
dentry->d_name.len);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_end_trans;
+ }
btrfs_record_root_in_trans(trans, dest);
@@ -1980,11 +1997,16 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
ret = btrfs_insert_orphan_item(trans,
root->fs_info->tree_root,
dest->root_key.objectid);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ err = ret;
+ goto out_end_trans;
+ }
}
-
+out_end_trans:
ret = btrfs_end_transaction(trans, root);
- BUG_ON(ret);
+ if (ret && !err)
+ err = ret;
inode->i_flags |= S_DEAD;
out_up_write:
up_write(&root->fs_info->subvol_sem);
@@ -1999,7 +2021,7 @@ out_dput:
dput(dentry);
out_unlock_dir:
mutex_unlock(&dir->i_mutex);
- mnt_drop_write(file->f_path.mnt);
+ mnt_drop_write_file(file);
out:
kfree(vol_args);
return err;
@@ -2015,7 +2037,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
if (btrfs_root_readonly(root))
return -EROFS;
- ret = mnt_want_write(file->f_path.mnt);
+ ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -2068,7 +2090,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EINVAL;
}
out:
- mnt_drop_write(file->f_path.mnt);
+ mnt_drop_write_file(file);
return ret;
}
@@ -2245,7 +2267,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (btrfs_root_readonly(root))
return -EROFS;
- ret = mnt_want_write(file->f_path.mnt);
+ ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -2327,13 +2349,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
another, and lock file content */
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+ lock_extent(&BTRFS_I(src)->io_tree, off, off+len);
ordered = btrfs_lookup_first_ordered_extent(src, off+len);
if (!ordered &&
!test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
EXTENT_DELALLOC, 0, NULL))
break;
- unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
if (ordered)
btrfs_put_ordered_extent(ordered);
btrfs_wait_ordered_range(src, off, len);
@@ -2448,11 +2470,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
new_key.offset,
new_key.offset + datal,
&hint_byte, 1);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root,
+ ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root,
+ ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
leaf = path->nodes[0];
slot = path->slots[0];
@@ -2479,7 +2511,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_ino(inode),
new_key.offset - datao,
0);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans,
+ root,
+ ret);
+ btrfs_end_transaction(trans,
+ root);
+ goto out;
+
+ }
}
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
u64 skip = 0;
@@ -2504,11 +2544,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
new_key.offset,
new_key.offset + datal,
&hint_byte, 1);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root,
+ ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root,
+ ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
if (skip) {
u32 start =
@@ -2542,8 +2592,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_i_size_write(inode, endoff);
ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
- btrfs_end_transaction(trans, root);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ ret = btrfs_end_transaction(trans, root);
}
next:
btrfs_release_path(path);
@@ -2552,7 +2606,7 @@ next:
ret = 0;
out:
btrfs_release_path(path);
- unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
out_unlock:
mutex_unlock(&src->i_mutex);
mutex_unlock(&inode->i_mutex);
@@ -2561,7 +2615,7 @@ out_unlock:
out_fput:
fput(src_file);
out_drop_write:
- mnt_drop_write(file->f_path.mnt);
+ mnt_drop_write_file(file);
return ret;
}
@@ -2600,7 +2654,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
if (btrfs_root_readonly(root))
goto out;
- ret = mnt_want_write(file->f_path.mnt);
+ ret = mnt_want_write_file(file);
if (ret)
goto out;
@@ -2616,7 +2670,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
out_drop:
atomic_dec(&root->fs_info->open_ioctl_trans);
- mnt_drop_write(file->f_path.mnt);
+ mnt_drop_write_file(file);
out:
return ret;
}
@@ -2851,7 +2905,7 @@ long btrfs_ioctl_trans_end(struct file *file)
atomic_dec(&root->fs_info->open_ioctl_trans);
- mnt_drop_write(file->f_path.mnt);
+ mnt_drop_write_file(file);
return 0;
}
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 5e178d8f7167..272f911203ff 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -208,7 +208,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
* take a spinning write lock. This will wait for both
* blocking readers or writers
*/
-int btrfs_tree_lock(struct extent_buffer *eb)
+void btrfs_tree_lock(struct extent_buffer *eb)
{
again:
wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
@@ -230,13 +230,12 @@ again:
atomic_inc(&eb->spinning_writers);
atomic_inc(&eb->write_locks);
eb->lock_owner = current->pid;
- return 0;
}
/*
* drop a spinning or a blocking write lock.
*/
-int btrfs_tree_unlock(struct extent_buffer *eb)
+void btrfs_tree_unlock(struct extent_buffer *eb)
{
int blockers = atomic_read(&eb->blocking_writers);
@@ -255,7 +254,6 @@ int btrfs_tree_unlock(struct extent_buffer *eb)
atomic_dec(&eb->spinning_writers);
write_unlock(&eb->lock);
}
- return 0;
}
void btrfs_assert_tree_locked(struct extent_buffer *eb)
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 17247ddb81a0..ca52681e5f40 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -24,8 +24,8 @@
#define BTRFS_WRITE_LOCK_BLOCKING 3
#define BTRFS_READ_LOCK_BLOCKING 4
-int btrfs_tree_lock(struct extent_buffer *eb);
-int btrfs_tree_unlock(struct extent_buffer *eb);
+void btrfs_tree_lock(struct extent_buffer *eb);
+void btrfs_tree_unlock(struct extent_buffer *eb);
int btrfs_try_spin_lock(struct extent_buffer *eb);
void btrfs_tree_read_lock(struct extent_buffer *eb);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a1c940425307..bbf6d0d9aebe 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -59,6 +59,14 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
return NULL;
}
+static void ordered_data_tree_panic(struct inode *inode, int errno,
+ u64 offset)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
+ "%llu\n", (unsigned long long)offset);
+}
+
/*
* look for a given offset in the tree, and if it can't be found return the
* first lesser offset
@@ -207,7 +215,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
spin_lock(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
&entry->rb_node);
- BUG_ON(node);
+ if (node)
+ ordered_data_tree_panic(inode, -EEXIST, file_offset);
spin_unlock(&tree->lock);
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
@@ -215,7 +224,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
&BTRFS_I(inode)->root->fs_info->ordered_extents);
spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
- BUG_ON(node);
return 0;
}
@@ -249,9 +257,9 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
* when an ordered extent is finished. If the list covers more than one
* ordered extent, it is split across multiples.
*/
-int btrfs_add_ordered_sum(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- struct btrfs_ordered_sum *sum)
+void btrfs_add_ordered_sum(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ struct btrfs_ordered_sum *sum)
{
struct btrfs_ordered_inode_tree *tree;
@@ -259,7 +267,6 @@ int btrfs_add_ordered_sum(struct inode *inode,
spin_lock(&tree->lock);
list_add_tail(&sum->list, &entry->list);
spin_unlock(&tree->lock);
- return 0;
}
/*
@@ -384,7 +391,7 @@ out:
* used to drop a reference on an ordered extent. This will free
* the extent if the last reference is dropped
*/
-int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
+void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
struct list_head *cur;
struct btrfs_ordered_sum *sum;
@@ -400,7 +407,6 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
}
kfree(entry);
}
- return 0;
}
/*
@@ -408,8 +414,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
* and you must wake_up entry->wait. You must hold the tree lock
* while you call this function.
*/
-static int __btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
+static void __btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -436,35 +442,30 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
list_del_init(&BTRFS_I(inode)->ordered_operations);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
-
- return 0;
}
/*
* remove an ordered extent from the tree. No references are dropped
* but any waiters are woken.
*/
-int btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
+void btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry)
{
struct btrfs_ordered_inode_tree *tree;
- int ret;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock(&tree->lock);
- ret = __btrfs_remove_ordered_extent(inode, entry);
+ __btrfs_remove_ordered_extent(inode, entry);
spin_unlock(&tree->lock);
wake_up(&entry->wait);
-
- return ret;
}
/*
* wait for all the ordered extents in a root. This is done when balancing
* space between drives.
*/
-int btrfs_wait_ordered_extents(struct btrfs_root *root,
- int nocow_only, int delay_iput)
+void btrfs_wait_ordered_extents(struct btrfs_root *root,
+ int nocow_only, int delay_iput)
{
struct list_head splice;
struct list_head *cur;
@@ -512,7 +513,6 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root,
spin_lock(&root->fs_info->ordered_extent_lock);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
- return 0;
}
/*
@@ -525,7 +525,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root,
* extra check to make sure the ordered operation list really is empty
* before we return
*/
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+void btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
{
struct btrfs_inode *btrfs_inode;
struct inode *inode;
@@ -573,8 +573,6 @@ again:
spin_unlock(&root->fs_info->ordered_extent_lock);
mutex_unlock(&root->fs_info->ordered_operations_mutex);
-
- return 0;
}
/*
@@ -609,7 +607,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
/*
* Used to wait on ordered extents across a large range of bytes.
*/
-int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
u64 end;
u64 orig_end;
@@ -664,7 +662,6 @@ again:
schedule_timeout(1);
goto again;
}
- return 0;
}
/*
@@ -948,9 +945,8 @@ out:
* If trans is not null, we'll do a friendly check for a transaction that
* is already flushing things and force the IO down ourselves.
*/
-int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode)
+void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
{
u64 last_mod;
@@ -961,7 +957,7 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
* commit, we can safely return without doing anything
*/
if (last_mod < root->fs_info->last_trans_committed)
- return 0;
+ return;
/*
* the transaction is already committing. Just start the IO and
@@ -969,7 +965,7 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
*/
if (trans && root->fs_info->running_transaction->blocked) {
btrfs_wait_ordered_range(inode, 0, (u64)-1);
- return 0;
+ return;
}
spin_lock(&root->fs_info->ordered_extent_lock);
@@ -978,6 +974,4 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
&root->fs_info->ordered_operations);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
-
- return 0;
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ff1f69aa1883..c355ad4dc1a6 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -138,8 +138,8 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
t->last = NULL;
}
-int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
-int btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
+void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry);
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
@@ -154,14 +154,14 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int compress_type);
-int btrfs_add_ordered_sum(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- struct btrfs_ordered_sum *sum);
+void btrfs_add_ordered_sum(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
u64 file_offset);
void btrfs_start_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry, int wait);
-int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
@@ -170,10 +170,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
-int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode);
-int btrfs_wait_ordered_extents(struct btrfs_root *root,
- int nocow_only, int delay_iput);
+void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode);
+void btrfs_wait_ordered_extents(struct btrfs_root *root,
+ int nocow_only, int delay_iput);
#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index f8be250963a0..24cad1695af7 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -58,7 +58,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
- if (ret) {
+ if (ret) { /* JDM: Really? */
ret = -ENOENT;
goto out;
}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 22db04550f6a..dc5d33146fdb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -54,7 +54,6 @@
* than the 2 started one after another.
*/
-#define MAX_MIRRORS 2
#define MAX_IN_FLIGHT 6
struct reada_extctl {
@@ -71,7 +70,7 @@ struct reada_extent {
struct list_head extctl;
struct kref refcnt;
spinlock_t lock;
- struct reada_zone *zones[MAX_MIRRORS];
+ struct reada_zone *zones[BTRFS_MAX_MIRRORS];
int nzones;
struct btrfs_device *scheduled_for;
};
@@ -84,7 +83,8 @@ struct reada_zone {
spinlock_t lock;
int locked;
struct btrfs_device *device;
- struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
+ struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl
+ * self */
int ndevs;
struct kref refcnt;
};
@@ -365,9 +365,9 @@ again:
if (ret || !bbio || length < blocksize)
goto error;
- if (bbio->num_stripes > MAX_MIRRORS) {
+ if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
printk(KERN_ERR "btrfs readahead: more than %d copies not "
- "supported", MAX_MIRRORS);
+ "supported", BTRFS_MAX_MIRRORS);
goto error;
}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 8c1aae2c845d..017281dbb2a7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -326,6 +326,19 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
return NULL;
}
+void backref_tree_panic(struct rb_node *rb_node, int errno,
+ u64 bytenr)
+{
+
+ struct btrfs_fs_info *fs_info = NULL;
+ struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
+ rb_node);
+ if (bnode->root)
+ fs_info = bnode->root->fs_info;
+ btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
+ "found at offset %llu\n", (unsigned long long)bytenr);
+}
+
/*
* walk up backref nodes until reach node presents tree root
*/
@@ -452,7 +465,8 @@ static void update_backref_node(struct backref_cache *cache,
rb_erase(&node->rb_node, &cache->rb_root);
node->bytenr = bytenr;
rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, bytenr);
}
/*
@@ -999,7 +1013,8 @@ next:
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, node->bytenr,
&node->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, node->bytenr);
list_add_tail(&node->lower, &cache->leaves);
}
@@ -1034,7 +1049,9 @@ next:
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, upper->bytenr,
&upper->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST,
+ upper->bytenr);
}
list_add_tail(&edge->list[UPPER], &upper->lower);
@@ -1180,7 +1197,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
&new_node->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
if (!new_node->lowest) {
list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
@@ -1203,14 +1221,15 @@ fail:
/*
* helper to add 'address of tree root -> reloc tree' mapping
*/
-static int __add_reloc_root(struct btrfs_root *root)
+static int __must_check __add_reloc_root(struct btrfs_root *root)
{
struct rb_node *rb_node;
struct mapping_node *node;
struct reloc_control *rc = root->fs_info->reloc_ctl;
node = kmalloc(sizeof(*node), GFP_NOFS);
- BUG_ON(!node);
+ if (!node)
+ return -ENOMEM;
node->bytenr = root->node->start;
node->data = root;
@@ -1219,7 +1238,12 @@ static int __add_reloc_root(struct btrfs_root *root)
rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
- BUG_ON(rb_node);
+ if (rb_node) {
+ kfree(node);
+ btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
+ "for start=%llu while inserting into relocation "
+ "tree\n");
+ }
list_add_tail(&root->root_list, &rc->reloc_roots);
return 0;
@@ -1252,7 +1276,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, node->bytenr);
} else {
list_del_init(&root->root_list);
kfree(node);
@@ -1334,6 +1359,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *reloc_root;
struct reloc_control *rc = root->fs_info->reloc_ctl;
int clear_rsv = 0;
+ int ret;
if (root->reloc_root) {
reloc_root = root->reloc_root;
@@ -1353,7 +1379,8 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
if (clear_rsv)
trans->block_rsv = NULL;
- __add_reloc_root(reloc_root);
+ ret = __add_reloc_root(reloc_root);
+ BUG_ON(ret < 0);
root->reloc_root = reloc_root;
return 0;
}
@@ -1577,15 +1604,14 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
WARN_ON(!IS_ALIGNED(end, root->sectorsize));
end--;
ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end,
- GFP_NOFS);
+ key.offset, end);
if (!ret)
continue;
btrfs_drop_extent_cache(inode, key.offset, end,
1);
unlock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end, GFP_NOFS);
+ key.offset, end);
}
}
@@ -1956,9 +1982,9 @@ static int invalidate_extent_cache(struct btrfs_root *root,
}
/* the lock_extent waits for readpage to complete */
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end);
btrfs_drop_extent_cache(inode, start, end, 1);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
}
return 0;
}
@@ -2246,7 +2272,8 @@ again:
} else {
list_del_init(&reloc_root->root_list);
}
- btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
+ ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
+ BUG_ON(ret < 0);
}
if (found) {
@@ -2862,12 +2889,12 @@ int prealloc_file_extent_cluster(struct inode *inode,
else
end = cluster->end - offset;
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end);
num_bytes = end + 1 - start;
ret = btrfs_prealloc_file_range(inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
if (ret)
break;
nr++;
@@ -2899,7 +2926,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end);
while (1) {
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
@@ -2910,7 +2937,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
}
btrfs_drop_extent_cache(inode, start, end, 0);
}
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
return ret;
}
@@ -2990,8 +3017,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
page_start = (u64)page->index << PAGE_CACHE_SHIFT;
page_end = page_start + PAGE_CACHE_SIZE - 1;
- lock_extent(&BTRFS_I(inode)->io_tree,
- page_start, page_end, GFP_NOFS);
+ lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
set_page_extent_mapped(page);
@@ -3007,7 +3033,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
set_page_dirty(page);
unlock_extent(&BTRFS_I(inode)->io_tree,
- page_start, page_end, GFP_NOFS);
+ page_start, page_end);
unlock_page(page);
page_cache_release(page);
@@ -3154,7 +3180,8 @@ static int add_tree_block(struct reloc_control *rc,
block->key_ready = 0;
rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, block->bytenr);
return 0;
}
@@ -3426,7 +3453,9 @@ static int find_data_references(struct reloc_control *rc,
block->key_ready = 1;
rb_node = tree_insert(blocks, block->bytenr,
&block->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST,
+ block->bytenr);
}
if (counted)
added = 1;
@@ -4073,10 +4102,11 @@ out:
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
- int ret;
+ int ret, err;
trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
memset(&root->root_item.drop_progress, 0,
sizeof(root->root_item.drop_progress));
@@ -4084,11 +4114,11 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
btrfs_set_root_refs(&root->root_item, 0);
ret = btrfs_update_root(trans, root->fs_info->tree_root,
&root->root_key, &root->root_item);
- BUG_ON(ret);
- ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
- BUG_ON(ret);
- return 0;
+ err = btrfs_end_transaction(trans, root->fs_info->tree_root);
+ if (err)
+ return err;
+ return ret;
}
/*
@@ -4156,7 +4186,11 @@ int btrfs_recover_relocation(struct btrfs_root *root)
err = ret;
goto out;
}
- mark_garbage_root(reloc_root);
+ ret = mark_garbage_root(reloc_root);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
}
}
@@ -4202,13 +4236,19 @@ int btrfs_recover_relocation(struct btrfs_root *root)
fs_root = read_fs_root(root->fs_info,
reloc_root->root_key.offset);
- BUG_ON(IS_ERR(fs_root));
+ if (IS_ERR(fs_root)) {
+ err = PTR_ERR(fs_root);
+ goto out_free;
+ }
- __add_reloc_root(reloc_root);
+ err = __add_reloc_root(reloc_root);
+ BUG_ON(err < 0); /* -ENOMEM or logic error */
fs_root->reloc_root = reloc_root;
}
- btrfs_commit_transaction(trans, rc->extent_root);
+ err = btrfs_commit_transaction(trans, rc->extent_root);
+ if (err)
+ goto out_free;
merge_reloc_roots(rc);
@@ -4218,7 +4258,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
if (IS_ERR(trans))
err = PTR_ERR(trans);
else
- btrfs_commit_transaction(trans, rc->extent_root);
+ err = btrfs_commit_transaction(trans, rc->extent_root);
out_free:
kfree(rc);
out:
@@ -4267,6 +4307,8 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
disk_bytenr + len - 1, &list, 0);
+ if (ret)
+ goto out;
while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
@@ -4284,6 +4326,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
btrfs_add_ordered_sum(inode, ordered, sums);
}
+out:
btrfs_put_ordered_extent(ordered);
return ret;
}
@@ -4380,7 +4423,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
* called after snapshot is created. migrate block reservation
* and create reloc root for the newly created snapshot
*/
-void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending)
{
struct btrfs_root *root = pending->root;
@@ -4390,7 +4433,7 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
int ret;
if (!root->reloc_root)
- return;
+ return 0;
rc = root->fs_info->reloc_ctl;
rc->merging_rsv_size += rc->nodes_relocated;
@@ -4399,18 +4442,21 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_block_rsv_migrate(&pending->block_rsv,
rc->block_rsv,
rc->nodes_relocated);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
new_root = pending->snap;
reloc_root = create_reloc_root(trans, root->reloc_root,
new_root->root_key.objectid);
+ if (IS_ERR(reloc_root))
+ return PTR_ERR(reloc_root);
- __add_reloc_root(reloc_root);
+ ret = __add_reloc_root(reloc_root);
+ BUG_ON(ret < 0);
new_root->reloc_root = reloc_root;
- if (rc->create_reloc_tree) {
+ if (rc->create_reloc_tree)
ret = clone_backref_node(trans, rc, root, reloc_root);
- BUG_ON(ret);
- }
+ return ret;
}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index f4099904565a..24fb8ce4e071 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -93,10 +93,14 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
unsigned long ptr;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
goto out;
+ }
if (ret != 0) {
btrfs_print_leaf(root, path->nodes[0]);
@@ -116,13 +120,10 @@ out:
return ret;
}
-int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_key *key, struct btrfs_root_item
- *item)
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_root_item *item)
{
- int ret;
- ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
- return ret;
+ return btrfs_insert_item(trans, root, key, item, sizeof(*item));
}
/*
@@ -384,6 +385,8 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
*
* For a back ref the root_id is the id of the subvol or snapshot and
* ref_id is the id of the tree referencing it.
+ *
+ * Will return 0, -ENOMEM, or anything from the CoW path
*/
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
@@ -407,7 +410,11 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name_len);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_free_path(path);
+ return ret;
+ }
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b9b84cdfc359..c9a2c1aef4bd 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -36,37 +36,30 @@
* Future enhancements:
* - In case an unrepairable extent is encountered, track which files are
* affected and report them
- * - In case of a read error on files with nodatasum, map the file and read
- * the extent to trigger a writeback of the good copy
* - track and record media errors, throw out bad devices
* - add a mode to also read unallocated space
*/
-struct scrub_bio;
-struct scrub_page;
+struct scrub_block;
struct scrub_dev;
-static void scrub_bio_end_io(struct bio *bio, int err);
-static void scrub_checksum(struct btrfs_work *work);
-static int scrub_checksum_data(struct scrub_dev *sdev,
- struct scrub_page *spag, void *buffer);
-static int scrub_checksum_tree_block(struct scrub_dev *sdev,
- struct scrub_page *spag, u64 logical,
- void *buffer);
-static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
-static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
-static void scrub_fixup_end_io(struct bio *bio, int err);
-static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
- struct page *page);
-static void scrub_fixup(struct scrub_bio *sbio, int ix);
#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */
#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */
+#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
struct scrub_page {
+ struct scrub_block *sblock;
+ struct page *page;
+ struct block_device *bdev;
u64 flags; /* extent flags */
u64 generation;
- int mirror_num;
- int have_csum;
+ u64 logical;
+ u64 physical;
+ struct {
+ unsigned int mirror_num:8;
+ unsigned int have_csum:1;
+ unsigned int io_error:1;
+ };
u8 csum[BTRFS_CSUM_SIZE];
};
@@ -77,12 +70,25 @@ struct scrub_bio {
int err;
u64 logical;
u64 physical;
- struct scrub_page spag[SCRUB_PAGES_PER_BIO];
- u64 count;
+ struct scrub_page *pagev[SCRUB_PAGES_PER_BIO];
+ int page_count;
int next_free;
struct btrfs_work work;
};
+struct scrub_block {
+ struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK];
+ int page_count;
+ atomic_t outstanding_pages;
+ atomic_t ref_count; /* free mem on transition to zero */
+ struct scrub_dev *sdev;
+ struct {
+ unsigned int header_error:1;
+ unsigned int checksum_error:1;
+ unsigned int no_io_error_seen:1;
+ };
+};
+
struct scrub_dev {
struct scrub_bio *bios[SCRUB_BIOS_PER_DEV];
struct btrfs_device *dev;
@@ -96,6 +102,10 @@ struct scrub_dev {
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
+ int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
+ u32 sectorsize;
+ u32 nodesize;
+ u32 leafsize;
/*
* statistics
*/
@@ -124,6 +134,43 @@ struct scrub_warning {
int scratch_bufsize;
};
+
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
+static int scrub_setup_recheck_block(struct scrub_dev *sdev,
+ struct btrfs_mapping_tree *map_tree,
+ u64 length, u64 logical,
+ struct scrub_block *sblock);
+static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock, int is_metadata,
+ int have_csum, u8 *csum, u64 generation,
+ u16 csum_size);
+static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock,
+ int is_metadata, int have_csum,
+ const u8 *csum, u64 generation,
+ u16 csum_size);
+static void scrub_complete_bio_end_io(struct bio *bio, int err);
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int force_write);
+static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int page_num, int force_write);
+static int scrub_checksum_data(struct scrub_block *sblock);
+static int scrub_checksum_tree_block(struct scrub_block *sblock);
+static int scrub_checksum_super(struct scrub_block *sblock);
+static void scrub_block_get(struct scrub_block *sblock);
+static void scrub_block_put(struct scrub_block *sblock);
+static int scrub_add_page_to_bio(struct scrub_dev *sdev,
+ struct scrub_page *spage);
+static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
+ u64 physical, u64 flags, u64 gen, int mirror_num,
+ u8 *csum, int force);
+static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_bio_end_io_worker(struct btrfs_work *work);
+static void scrub_block_complete(struct scrub_block *sblock);
+
+
static void scrub_free_csums(struct scrub_dev *sdev)
{
while (!list_empty(&sdev->csum_list)) {
@@ -135,23 +182,6 @@ static void scrub_free_csums(struct scrub_dev *sdev)
}
}
-static void scrub_free_bio(struct bio *bio)
-{
- int i;
- struct page *last_page = NULL;
-
- if (!bio)
- return;
-
- for (i = 0; i < bio->bi_vcnt; ++i) {
- if (bio->bi_io_vec[i].bv_page == last_page)
- continue;
- last_page = bio->bi_io_vec[i].bv_page;
- __free_page(last_page);
- }
- bio_put(bio);
-}
-
static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
{
int i;
@@ -159,13 +189,23 @@ static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
if (!sdev)
return;
+ /* this can happen when scrub is cancelled */
+ if (sdev->curr != -1) {
+ struct scrub_bio *sbio = sdev->bios[sdev->curr];
+
+ for (i = 0; i < sbio->page_count; i++) {
+ BUG_ON(!sbio->pagev[i]);
+ BUG_ON(!sbio->pagev[i]->page);
+ scrub_block_put(sbio->pagev[i]->sblock);
+ }
+ bio_put(sbio->bio);
+ }
+
for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
struct scrub_bio *sbio = sdev->bios[i];
if (!sbio)
break;
-
- scrub_free_bio(sbio->bio);
kfree(sbio);
}
@@ -179,11 +219,16 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
struct scrub_dev *sdev;
int i;
struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+ int pages_per_bio;
+ pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
+ bio_get_nr_vecs(dev->bdev));
sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
if (!sdev)
goto nomem;
sdev->dev = dev;
+ sdev->pages_per_bio = pages_per_bio;
+ sdev->curr = -1;
for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
struct scrub_bio *sbio;
@@ -194,8 +239,8 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
sbio->index = i;
sbio->sdev = sdev;
- sbio->count = 0;
- sbio->work.func = scrub_checksum;
+ sbio->page_count = 0;
+ sbio->work.func = scrub_bio_end_io_worker;
if (i != SCRUB_BIOS_PER_DEV-1)
sdev->bios[i]->next_free = i + 1;
@@ -203,7 +248,9 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
sdev->bios[i]->next_free = -1;
}
sdev->first_free = 0;
- sdev->curr = -1;
+ sdev->nodesize = dev->dev_root->nodesize;
+ sdev->leafsize = dev->dev_root->leafsize;
+ sdev->sectorsize = dev->dev_root->sectorsize;
atomic_set(&sdev->in_flight, 0);
atomic_set(&sdev->fixup_cnt, 0);
atomic_set(&sdev->cancel_req, 0);
@@ -294,10 +341,9 @@ err:
return 0;
}
-static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
- int ix)
+static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
- struct btrfs_device *dev = sbio->sdev->dev;
+ struct btrfs_device *dev = sblock->sdev->dev;
struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
struct btrfs_path *path;
struct btrfs_key found_key;
@@ -316,8 +362,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
- swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
- swarn.logical = sbio->logical + ix * PAGE_SIZE;
+ BUG_ON(sblock->page_count < 1);
+ swarn.sector = (sblock->pagev[0].physical) >> 9;
+ swarn.logical = sblock->pagev[0].logical;
swarn.errstr = errstr;
swarn.dev = dev;
swarn.msg_bufsize = bufsize;
@@ -342,7 +389,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
do {
ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
&ref_root, &ref_level);
- printk(KERN_WARNING "%s at logical %llu on dev %s, "
+ printk(KERN_WARNING
+ "btrfs: %s at logical %llu on dev %s, "
"sector %llu: metadata %s (level %d) in tree "
"%llu\n", errstr, swarn.logical, dev->name,
(unsigned long long)swarn.sector,
@@ -531,9 +579,9 @@ out:
spin_lock(&sdev->stat_lock);
++sdev->stat.uncorrectable_errors;
spin_unlock(&sdev->stat_lock);
- printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
- "(nodatasum) error at logical %llu\n",
- fixup->logical);
+ printk_ratelimited(KERN_ERR
+ "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+ (unsigned long long)fixup->logical, sdev->dev->name);
}
btrfs_free_path(path);
@@ -550,91 +598,168 @@ out:
}
/*
- * scrub_recheck_error gets called when either verification of the page
- * failed or the bio failed to read, e.g. with EIO. In the latter case,
- * recheck_error gets called for every page in the bio, even though only
- * one may be bad
+ * scrub_handle_errored_block gets called when either verification of the
+ * pages failed or the bio failed to read, e.g. with EIO. In the latter
+ * case, this function handles all pages in the bio, even though only one
+ * may be bad.
+ * The goal of this function is to repair the errored block by using the
+ * contents of one of the mirrors.
*/
-static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
- struct scrub_dev *sdev = sbio->sdev;
- u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+ struct scrub_dev *sdev = sblock_to_check->sdev;
+ struct btrfs_fs_info *fs_info;
+ u64 length;
+ u64 logical;
+ u64 generation;
+ unsigned int failed_mirror_index;
+ unsigned int is_metadata;
+ unsigned int have_csum;
+ u8 *csum;
+ struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
+ struct scrub_block *sblock_bad;
+ int ret;
+ int mirror_index;
+ int page_num;
+ int success;
static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
+ DEFAULT_RATELIMIT_BURST);
+
+ BUG_ON(sblock_to_check->page_count < 1);
+ fs_info = sdev->dev->dev_root->fs_info;
+ length = sblock_to_check->page_count * PAGE_SIZE;
+ logical = sblock_to_check->pagev[0].logical;
+ generation = sblock_to_check->pagev[0].generation;
+ BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
+ failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
+ is_metadata = !(sblock_to_check->pagev[0].flags &
+ BTRFS_EXTENT_FLAG_DATA);
+ have_csum = sblock_to_check->pagev[0].have_csum;
+ csum = sblock_to_check->pagev[0].csum;
- if (sbio->err) {
- if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
- sbio->bio->bi_io_vec[ix].bv_page) == 0) {
- if (scrub_fixup_check(sbio, ix) == 0)
- return 0;
- }
- if (__ratelimit(&_rs))
- scrub_print_warning("i/o error", sbio, ix);
- } else {
- if (__ratelimit(&_rs))
- scrub_print_warning("checksum error", sbio, ix);
+ /*
+ * read all mirrors one after the other. This includes to
+ * re-read the extent or metadata block that failed (that was
+ * the cause that this fixup code is called) another time,
+ * page by page this time in order to know which pages
+ * caused I/O errors and which ones are good (for all mirrors).
+ * It is the goal to handle the situation when more than one
+ * mirror contains I/O errors, but the errors do not
+ * overlap, i.e. the data can be repaired by selecting the
+ * pages from those mirrors without I/O error on the
+ * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
+ * would be that mirror #1 has an I/O error on the first page,
+ * the second page is good, and mirror #2 has an I/O error on
+ * the second page, but the first page is good.
+ * Then the first page of the first mirror can be repaired by
+ * taking the first page of the second mirror, and the
+ * second page of the second mirror can be repaired by
+ * copying the contents of the 2nd page of the 1st mirror.
+ * One more note: if the pages of one mirror contain I/O
+ * errors, the checksum cannot be verified. In order to get
+ * the best data for repairing, the first attempt is to find
+ * a mirror without I/O errors and with a validated checksum.
+ * Only if this is not possible, the pages are picked from
+ * mirrors with I/O errors without considering the checksum.
+ * If the latter is the case, at the end, the checksum of the
+ * repaired area is verified in order to correctly maintain
+ * the statistics.
+ */
+
+ sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
+ sizeof(*sblocks_for_recheck),
+ GFP_NOFS);
+ if (!sblocks_for_recheck) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.malloc_errors++;
+ sdev->stat.read_errors++;
+ sdev->stat.uncorrectable_errors++;
+ spin_unlock(&sdev->stat_lock);
+ goto out;
}
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.read_errors;
- spin_unlock(&sdev->stat_lock);
+ /* setup the context, map the logical blocks and alloc the pages */
+ ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
+ logical, sblocks_for_recheck);
+ if (ret) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.read_errors++;
+ sdev->stat.uncorrectable_errors++;
+ spin_unlock(&sdev->stat_lock);
+ goto out;
+ }
+ BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
+ sblock_bad = sblocks_for_recheck + failed_mirror_index;
- scrub_fixup(sbio, ix);
- return 1;
-}
+ /* build and submit the bios for the failed mirror, check checksums */
+ ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
+ csum, generation, sdev->csum_size);
+ if (ret) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.read_errors++;
+ sdev->stat.uncorrectable_errors++;
+ spin_unlock(&sdev->stat_lock);
+ goto out;
+ }
-static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
-{
- int ret = 1;
- struct page *page;
- void *buffer;
- u64 flags = sbio->spag[ix].flags;
+ if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
+ sblock_bad->no_io_error_seen) {
+ /*
+ * the error disappeared after reading page by page, or
+ * the area was part of a huge bio and other parts of the
+ * bio caused I/O errors, or the block layer merged several
+ * read requests into one and the error is caused by a
+ * different bio (usually one of the two latter cases is
+ * the cause)
+ */
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.unverified_errors++;
+ spin_unlock(&sdev->stat_lock);
- page = sbio->bio->bi_io_vec[ix].bv_page;
- buffer = kmap_atomic(page, KM_USER0);
- if (flags & BTRFS_EXTENT_FLAG_DATA) {
- ret = scrub_checksum_data(sbio->sdev,
- sbio->spag + ix, buffer);
- } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- ret = scrub_checksum_tree_block(sbio->sdev,
- sbio->spag + ix,
- sbio->logical + ix * PAGE_SIZE,
- buffer);
- } else {
- WARN_ON(1);
+ goto out;
}
- kunmap_atomic(buffer, KM_USER0);
- return ret;
-}
+ if (!sblock_bad->no_io_error_seen) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.read_errors++;
+ spin_unlock(&sdev->stat_lock);
+ if (__ratelimit(&_rs))
+ scrub_print_warning("i/o error", sblock_to_check);
+ } else if (sblock_bad->checksum_error) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.csum_errors++;
+ spin_unlock(&sdev->stat_lock);
+ if (__ratelimit(&_rs))
+ scrub_print_warning("checksum error", sblock_to_check);
+ } else if (sblock_bad->header_error) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.verify_errors++;
+ spin_unlock(&sdev->stat_lock);
+ if (__ratelimit(&_rs))
+ scrub_print_warning("checksum/header error",
+ sblock_to_check);
+ }
-static void scrub_fixup_end_io(struct bio *bio, int err)
-{
- complete((struct completion *)bio->bi_private);
-}
+ if (sdev->readonly)
+ goto did_not_correct_error;
-static void scrub_fixup(struct scrub_bio *sbio, int ix)
-{
- struct scrub_dev *sdev = sbio->sdev;
- struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
- struct btrfs_bio *bbio = NULL;
- struct scrub_fixup_nodatasum *fixup;
- u64 logical = sbio->logical + ix * PAGE_SIZE;
- u64 length;
- int i;
- int ret;
- DECLARE_COMPLETION_ONSTACK(complete);
-
- if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
- (sbio->spag[ix].have_csum == 0)) {
- fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
- if (!fixup)
- goto uncorrectable;
- fixup->sdev = sdev;
- fixup->logical = logical;
- fixup->root = fs_info->extent_root;
- fixup->mirror_num = sbio->spag[ix].mirror_num;
+ if (!is_metadata && !have_csum) {
+ struct scrub_fixup_nodatasum *fixup_nodatasum;
+
+ /*
+ * !is_metadata and !have_csum, this means that the data
+ * might not be COW'ed, that it might be modified
+ * concurrently. The general strategy to work on the
+ * commit root does not help in the case when COW is not
+ * used.
+ */
+ fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
+ if (!fixup_nodatasum)
+ goto did_not_correct_error;
+ fixup_nodatasum->sdev = sdev;
+ fixup_nodatasum->logical = logical;
+ fixup_nodatasum->root = fs_info->extent_root;
+ fixup_nodatasum->mirror_num = failed_mirror_index + 1;
/*
* increment scrubs_running to prevent cancel requests from
* completing as long as a fixup worker is running. we must also
@@ -649,235 +774,529 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
atomic_inc(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
atomic_inc(&sdev->fixup_cnt);
- fixup->work.func = scrub_fixup_nodatasum;
- btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
- return;
+ fixup_nodatasum->work.func = scrub_fixup_nodatasum;
+ btrfs_queue_worker(&fs_info->scrub_workers,
+ &fixup_nodatasum->work);
+ goto out;
}
- length = PAGE_SIZE;
- ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
- &bbio, 0);
- if (ret || !bbio || length < PAGE_SIZE) {
- printk(KERN_ERR
- "scrub_fixup: btrfs_map_block failed us for %llu\n",
- (unsigned long long)logical);
- WARN_ON(1);
- kfree(bbio);
- return;
+ /*
+ * now build and submit the bios for the other mirrors, check
+ * checksums
+ */
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ if (mirror_index == failed_mirror_index)
+ continue;
+
+ /* build and submit the bios, check checksums */
+ ret = scrub_recheck_block(fs_info,
+ sblocks_for_recheck + mirror_index,
+ is_metadata, have_csum, csum,
+ generation, sdev->csum_size);
+ if (ret)
+ goto did_not_correct_error;
}
- if (bbio->num_stripes == 1)
- /* there aren't any replicas */
- goto uncorrectable;
+ /*
+ * first try to pick the mirror which is completely without I/O
+ * errors and also does not have a checksum error.
+ * If one is found, and if a checksum is present, the full block
+ * that is known to contain an error is rewritten. Afterwards
+ * the block is known to be corrected.
+ * If a mirror is found which is completely correct, and no
+ * checksum is present, only those pages are rewritten that had
+ * an I/O error in the block to be repaired, since it cannot be
+ * determined, which copy of the other pages is better (and it
+ * could happen otherwise that a correct page would be
+ * overwritten by a bad one).
+ */
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ struct scrub_block *sblock_other = sblocks_for_recheck +
+ mirror_index;
+
+ if (!sblock_other->header_error &&
+ !sblock_other->checksum_error &&
+ sblock_other->no_io_error_seen) {
+ int force_write = is_metadata || have_csum;
+
+ ret = scrub_repair_block_from_good_copy(sblock_bad,
+ sblock_other,
+ force_write);
+ if (0 == ret)
+ goto corrected_error;
+ }
+ }
/*
- * first find a good copy
+ * in case of I/O errors in the area that is supposed to be
+ * repaired, continue by picking good copies of those pages.
+ * Select the good pages from mirrors to rewrite bad pages from
+ * the area to fix. Afterwards verify the checksum of the block
+ * that is supposed to be repaired. This verification step is
+ * only done for the purpose of statistic counting and for the
+ * final scrub report, whether errors remain.
+ * A perfect algorithm could make use of the checksum and try
+ * all possible combinations of pages from the different mirrors
+ * until the checksum verification succeeds. For example, when
+ * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
+ * of mirror #2 is readable but the final checksum test fails,
+ * then the 2nd page of mirror #3 could be tried, whether now
+ * the final checksum succeedes. But this would be a rare
+ * exception and is therefore not implemented. At least it is
+ * avoided that the good copy is overwritten.
+ * A more useful improvement would be to pick the sectors
+ * without I/O error based on sector sizes (512 bytes on legacy
+ * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
+ * mirror could be repaired by taking 512 byte of a different
+ * mirror, even if other 512 byte sectors in the same PAGE_SIZE
+ * area are unreadable.
*/
- for (i = 0; i < bbio->num_stripes; ++i) {
- if (i + 1 == sbio->spag[ix].mirror_num)
- continue;
- if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
- bbio->stripes[i].physical >> 9,
- sbio->bio->bi_io_vec[ix].bv_page)) {
- /* I/O-error, this is not a good copy */
+ /* can only fix I/O errors from here on */
+ if (sblock_bad->no_io_error_seen)
+ goto did_not_correct_error;
+
+ success = 1;
+ for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+ struct scrub_page *page_bad = sblock_bad->pagev + page_num;
+
+ if (!page_bad->io_error)
continue;
+
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ struct scrub_block *sblock_other = sblocks_for_recheck +
+ mirror_index;
+ struct scrub_page *page_other = sblock_other->pagev +
+ page_num;
+
+ if (!page_other->io_error) {
+ ret = scrub_repair_page_from_good_copy(
+ sblock_bad, sblock_other, page_num, 0);
+ if (0 == ret) {
+ page_bad->io_error = 0;
+ break; /* succeeded for this page */
+ }
+ }
}
- if (scrub_fixup_check(sbio, ix) == 0)
- break;
+ if (page_bad->io_error) {
+ /* did not find a mirror to copy the page from */
+ success = 0;
+ }
}
- if (i == bbio->num_stripes)
- goto uncorrectable;
- if (!sdev->readonly) {
- /*
- * bi_io_vec[ix].bv_page now contains good data, write it back
- */
- if (scrub_fixup_io(WRITE, sdev->dev->bdev,
- (sbio->physical + ix * PAGE_SIZE) >> 9,
- sbio->bio->bi_io_vec[ix].bv_page)) {
- /* I/O-error, writeback failed, give up */
- goto uncorrectable;
+ if (success) {
+ if (is_metadata || have_csum) {
+ /*
+ * need to verify the checksum now that all
+ * sectors on disk are repaired (the write
+ * request for data to be repaired is on its way).
+ * Just be lazy and use scrub_recheck_block()
+ * which re-reads the data before the checksum
+ * is verified, but most likely the data comes out
+ * of the page cache.
+ */
+ ret = scrub_recheck_block(fs_info, sblock_bad,
+ is_metadata, have_csum, csum,
+ generation, sdev->csum_size);
+ if (!ret && !sblock_bad->header_error &&
+ !sblock_bad->checksum_error &&
+ sblock_bad->no_io_error_seen)
+ goto corrected_error;
+ else
+ goto did_not_correct_error;
+ } else {
+corrected_error:
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.corrected_errors++;
+ spin_unlock(&sdev->stat_lock);
+ printk_ratelimited(KERN_ERR
+ "btrfs: fixed up error at logical %llu on dev %s\n",
+ (unsigned long long)logical, sdev->dev->name);
}
+ } else {
+did_not_correct_error:
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.uncorrectable_errors++;
+ spin_unlock(&sdev->stat_lock);
+ printk_ratelimited(KERN_ERR
+ "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
+ (unsigned long long)logical, sdev->dev->name);
}
- kfree(bbio);
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.corrected_errors;
- spin_unlock(&sdev->stat_lock);
+out:
+ if (sblocks_for_recheck) {
+ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
+ mirror_index++) {
+ struct scrub_block *sblock = sblocks_for_recheck +
+ mirror_index;
+ int page_index;
+
+ for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
+ page_index++)
+ if (sblock->pagev[page_index].page)
+ __free_page(
+ sblock->pagev[page_index].page);
+ }
+ kfree(sblocks_for_recheck);
+ }
- printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
- (unsigned long long)logical);
- return;
+ return 0;
+}
-uncorrectable:
- kfree(bbio);
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.uncorrectable_errors;
- spin_unlock(&sdev->stat_lock);
+static int scrub_setup_recheck_block(struct scrub_dev *sdev,
+ struct btrfs_mapping_tree *map_tree,
+ u64 length, u64 logical,
+ struct scrub_block *sblocks_for_recheck)
+{
+ int page_index;
+ int mirror_index;
+ int ret;
+
+ /*
+ * note: the three members sdev, ref_count and outstanding_pages
+ * are not used (and not set) in the blocks that are used for
+ * the recheck procedure
+ */
+
+ page_index = 0;
+ while (length > 0) {
+ u64 sublen = min_t(u64, length, PAGE_SIZE);
+ u64 mapped_length = sublen;
+ struct btrfs_bio *bbio = NULL;
+
+ /*
+ * with a length of PAGE_SIZE, each returned stripe
+ * represents one mirror
+ */
+ ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
+ &bbio, 0);
+ if (ret || !bbio || mapped_length < sublen) {
+ kfree(bbio);
+ return -EIO;
+ }
- printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
- "logical %llu\n", (unsigned long long)logical);
+ BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
+ for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
+ mirror_index++) {
+ struct scrub_block *sblock;
+ struct scrub_page *page;
+
+ if (mirror_index >= BTRFS_MAX_MIRRORS)
+ continue;
+
+ sblock = sblocks_for_recheck + mirror_index;
+ page = sblock->pagev + page_index;
+ page->logical = logical;
+ page->physical = bbio->stripes[mirror_index].physical;
+ page->bdev = bbio->stripes[mirror_index].dev->bdev;
+ page->mirror_num = mirror_index + 1;
+ page->page = alloc_page(GFP_NOFS);
+ if (!page->page) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.malloc_errors++;
+ spin_unlock(&sdev->stat_lock);
+ return -ENOMEM;
+ }
+ sblock->page_count++;
+ }
+ kfree(bbio);
+ length -= sublen;
+ logical += sublen;
+ page_index++;
+ }
+
+ return 0;
}
-static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
- struct page *page)
+/*
+ * this function will check the on disk data for checksum errors, header
+ * errors and read I/O errors. If any I/O errors happen, the exact pages
+ * which are errored are marked as being bad. The goal is to enable scrub
+ * to take those pages that are not errored from all the mirrors so that
+ * the pages that are errored in the just handled mirror can be repaired.
+ */
+static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock, int is_metadata,
+ int have_csum, u8 *csum, u64 generation,
+ u16 csum_size)
{
- struct bio *bio = NULL;
- int ret;
- DECLARE_COMPLETION_ONSTACK(complete);
+ int page_num;
+
+ sblock->no_io_error_seen = 1;
+ sblock->header_error = 0;
+ sblock->checksum_error = 0;
+
+ for (page_num = 0; page_num < sblock->page_count; page_num++) {
+ struct bio *bio;
+ int ret;
+ struct scrub_page *page = sblock->pagev + page_num;
+ DECLARE_COMPLETION_ONSTACK(complete);
+
+ BUG_ON(!page->page);
+ bio = bio_alloc(GFP_NOFS, 1);
+ bio->bi_bdev = page->bdev;
+ bio->bi_sector = page->physical >> 9;
+ bio->bi_end_io = scrub_complete_bio_end_io;
+ bio->bi_private = &complete;
+
+ ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
+ if (PAGE_SIZE != ret) {
+ bio_put(bio);
+ return -EIO;
+ }
+ btrfsic_submit_bio(READ, bio);
- bio = bio_alloc(GFP_NOFS, 1);
- bio->bi_bdev = bdev;
- bio->bi_sector = sector;
- bio_add_page(bio, page, PAGE_SIZE, 0);
- bio->bi_end_io = scrub_fixup_end_io;
- bio->bi_private = &complete;
- btrfsic_submit_bio(rw, bio);
+ /* this will also unplug the queue */
+ wait_for_completion(&complete);
- /* this will also unplug the queue */
- wait_for_completion(&complete);
+ page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ sblock->no_io_error_seen = 0;
+ bio_put(bio);
+ }
- ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
- bio_put(bio);
- return ret;
+ if (sblock->no_io_error_seen)
+ scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
+ have_csum, csum, generation,
+ csum_size);
+
+ return 0;
}
-static void scrub_bio_end_io(struct bio *bio, int err)
+static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock,
+ int is_metadata, int have_csum,
+ const u8 *csum, u64 generation,
+ u16 csum_size)
{
- struct scrub_bio *sbio = bio->bi_private;
- struct scrub_dev *sdev = sbio->sdev;
- struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+ int page_num;
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ u32 crc = ~(u32)0;
+ struct btrfs_root *root = fs_info->extent_root;
+ void *mapped_buffer;
+
+ BUG_ON(!sblock->pagev[0].page);
+ if (is_metadata) {
+ struct btrfs_header *h;
+
+ mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0);
+ h = (struct btrfs_header *)mapped_buffer;
+
+ if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
+ generation != le64_to_cpu(h->generation) ||
+ memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
+ memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+ BTRFS_UUID_SIZE))
+ sblock->header_error = 1;
+ csum = h->csum;
+ } else {
+ if (!have_csum)
+ return;
- sbio->err = err;
- sbio->bio = bio;
+ mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0);
+ }
- btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+ for (page_num = 0;;) {
+ if (page_num == 0 && is_metadata)
+ crc = btrfs_csum_data(root,
+ ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
+ crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
+ else
+ crc = btrfs_csum_data(root, mapped_buffer, crc,
+ PAGE_SIZE);
+
+ kunmap_atomic(mapped_buffer, KM_USER0);
+ page_num++;
+ if (page_num >= sblock->page_count)
+ break;
+ BUG_ON(!sblock->pagev[page_num].page);
+
+ mapped_buffer = kmap_atomic(sblock->pagev[page_num].page,
+ KM_USER0);
+ }
+
+ btrfs_csum_final(crc, calculated_csum);
+ if (memcmp(calculated_csum, csum, csum_size))
+ sblock->checksum_error = 1;
}
-static void scrub_checksum(struct btrfs_work *work)
+static void scrub_complete_bio_end_io(struct bio *bio, int err)
{
- struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
- struct scrub_dev *sdev = sbio->sdev;
- struct page *page;
- void *buffer;
- int i;
- u64 flags;
- u64 logical;
- int ret;
+ complete((struct completion *)bio->bi_private);
+}
- if (sbio->err) {
- ret = 0;
- for (i = 0; i < sbio->count; ++i)
- ret |= scrub_recheck_error(sbio, i);
- if (!ret) {
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.unverified_errors;
- spin_unlock(&sdev->stat_lock);
- }
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int force_write)
+{
+ int page_num;
+ int ret = 0;
- sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
- sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
- sbio->bio->bi_phys_segments = 0;
- sbio->bio->bi_idx = 0;
+ for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+ int ret_sub;
- for (i = 0; i < sbio->count; i++) {
- struct bio_vec *bi;
- bi = &sbio->bio->bi_io_vec[i];
- bi->bv_offset = 0;
- bi->bv_len = PAGE_SIZE;
- }
- goto out;
+ ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
+ sblock_good,
+ page_num,
+ force_write);
+ if (ret_sub)
+ ret = ret_sub;
}
- for (i = 0; i < sbio->count; ++i) {
- page = sbio->bio->bi_io_vec[i].bv_page;
- buffer = kmap_atomic(page, KM_USER0);
- flags = sbio->spag[i].flags;
- logical = sbio->logical + i * PAGE_SIZE;
- ret = 0;
- if (flags & BTRFS_EXTENT_FLAG_DATA) {
- ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
- } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
- logical, buffer);
- } else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
- BUG_ON(i);
- (void)scrub_checksum_super(sbio, buffer);
- } else {
- WARN_ON(1);
- }
- kunmap_atomic(buffer, KM_USER0);
- if (ret) {
- ret = scrub_recheck_error(sbio, i);
- if (!ret) {
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.unverified_errors;
- spin_unlock(&sdev->stat_lock);
- }
+
+ return ret;
+}
+
+static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int page_num, int force_write)
+{
+ struct scrub_page *page_bad = sblock_bad->pagev + page_num;
+ struct scrub_page *page_good = sblock_good->pagev + page_num;
+
+ BUG_ON(sblock_bad->pagev[page_num].page == NULL);
+ BUG_ON(sblock_good->pagev[page_num].page == NULL);
+ if (force_write || sblock_bad->header_error ||
+ sblock_bad->checksum_error || page_bad->io_error) {
+ struct bio *bio;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(complete);
+
+ bio = bio_alloc(GFP_NOFS, 1);
+ bio->bi_bdev = page_bad->bdev;
+ bio->bi_sector = page_bad->physical >> 9;
+ bio->bi_end_io = scrub_complete_bio_end_io;
+ bio->bi_private = &complete;
+
+ ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
+ if (PAGE_SIZE != ret) {
+ bio_put(bio);
+ return -EIO;
}
+ btrfsic_submit_bio(WRITE, bio);
+
+ /* this will also unplug the queue */
+ wait_for_completion(&complete);
+ bio_put(bio);
}
-out:
- scrub_free_bio(sbio->bio);
- sbio->bio = NULL;
- spin_lock(&sdev->list_lock);
- sbio->next_free = sdev->first_free;
- sdev->first_free = sbio->index;
- spin_unlock(&sdev->list_lock);
- atomic_dec(&sdev->in_flight);
- wake_up(&sdev->list_wait);
+ return 0;
+}
+
+static void scrub_checksum(struct scrub_block *sblock)
+{
+ u64 flags;
+ int ret;
+
+ BUG_ON(sblock->page_count < 1);
+ flags = sblock->pagev[0].flags;
+ ret = 0;
+ if (flags & BTRFS_EXTENT_FLAG_DATA)
+ ret = scrub_checksum_data(sblock);
+ else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ ret = scrub_checksum_tree_block(sblock);
+ else if (flags & BTRFS_EXTENT_FLAG_SUPER)
+ (void)scrub_checksum_super(sblock);
+ else
+ WARN_ON(1);
+ if (ret)
+ scrub_handle_errored_block(sblock);
}
-static int scrub_checksum_data(struct scrub_dev *sdev,
- struct scrub_page *spag, void *buffer)
+static int scrub_checksum_data(struct scrub_block *sblock)
{
+ struct scrub_dev *sdev = sblock->sdev;
u8 csum[BTRFS_CSUM_SIZE];
+ u8 *on_disk_csum;
+ struct page *page;
+ void *buffer;
u32 crc = ~(u32)0;
int fail = 0;
struct btrfs_root *root = sdev->dev->dev_root;
+ u64 len;
+ int index;
- if (!spag->have_csum)
+ BUG_ON(sblock->page_count < 1);
+ if (!sblock->pagev[0].have_csum)
return 0;
- crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
+ on_disk_csum = sblock->pagev[0].csum;
+ page = sblock->pagev[0].page;
+ buffer = kmap_atomic(page, KM_USER0);
+
+ len = sdev->sectorsize;
+ index = 0;
+ for (;;) {
+ u64 l = min_t(u64, len, PAGE_SIZE);
+
+ crc = btrfs_csum_data(root, buffer, crc, l);
+ kunmap_atomic(buffer, KM_USER0);
+ len -= l;
+ if (len == 0)
+ break;
+ index++;
+ BUG_ON(index >= sblock->page_count);
+ BUG_ON(!sblock->pagev[index].page);
+ page = sblock->pagev[index].page;
+ buffer = kmap_atomic(page, KM_USER0);
+ }
+
btrfs_csum_final(crc, csum);
- if (memcmp(csum, spag->csum, sdev->csum_size))
+ if (memcmp(csum, on_disk_csum, sdev->csum_size))
fail = 1;
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.data_extents_scrubbed;
- sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
- if (fail)
+ if (fail) {
+ spin_lock(&sdev->stat_lock);
++sdev->stat.csum_errors;
- spin_unlock(&sdev->stat_lock);
+ spin_unlock(&sdev->stat_lock);
+ }
return fail;
}
-static int scrub_checksum_tree_block(struct scrub_dev *sdev,
- struct scrub_page *spag, u64 logical,
- void *buffer)
+static int scrub_checksum_tree_block(struct scrub_block *sblock)
{
+ struct scrub_dev *sdev = sblock->sdev;
struct btrfs_header *h;
struct btrfs_root *root = sdev->dev->dev_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- u8 csum[BTRFS_CSUM_SIZE];
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ u8 on_disk_csum[BTRFS_CSUM_SIZE];
+ struct page *page;
+ void *mapped_buffer;
+ u64 mapped_size;
+ void *p;
u32 crc = ~(u32)0;
int fail = 0;
int crc_fail = 0;
+ u64 len;
+ int index;
+
+ BUG_ON(sblock->page_count < 1);
+ page = sblock->pagev[0].page;
+ mapped_buffer = kmap_atomic(page, KM_USER0);
+ h = (struct btrfs_header *)mapped_buffer;
+ memcpy(on_disk_csum, h->csum, sdev->csum_size);
/*
* we don't use the getter functions here, as we
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
- h = (struct btrfs_header *)buffer;
- if (logical != le64_to_cpu(h->bytenr))
+ if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr))
++fail;
- if (spag->generation != le64_to_cpu(h->generation))
+ if (sblock->pagev[0].generation != le64_to_cpu(h->generation))
++fail;
if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -887,51 +1306,99 @@ static int scrub_checksum_tree_block(struct scrub_dev *sdev,
BTRFS_UUID_SIZE))
++fail;
- crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
- PAGE_SIZE - BTRFS_CSUM_SIZE);
- btrfs_csum_final(crc, csum);
- if (memcmp(csum, h->csum, sdev->csum_size))
+ BUG_ON(sdev->nodesize != sdev->leafsize);
+ len = sdev->nodesize - BTRFS_CSUM_SIZE;
+ mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+ p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+ index = 0;
+ for (;;) {
+ u64 l = min_t(u64, len, mapped_size);
+
+ crc = btrfs_csum_data(root, p, crc, l);
+ kunmap_atomic(mapped_buffer, KM_USER0);
+ len -= l;
+ if (len == 0)
+ break;
+ index++;
+ BUG_ON(index >= sblock->page_count);
+ BUG_ON(!sblock->pagev[index].page);
+ page = sblock->pagev[index].page;
+ mapped_buffer = kmap_atomic(page, KM_USER0);
+ mapped_size = PAGE_SIZE;
+ p = mapped_buffer;
+ }
+
+ btrfs_csum_final(crc, calculated_csum);
+ if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
++crc_fail;
- spin_lock(&sdev->stat_lock);
- ++sdev->stat.tree_extents_scrubbed;
- sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
- if (crc_fail)
- ++sdev->stat.csum_errors;
- if (fail)
- ++sdev->stat.verify_errors;
- spin_unlock(&sdev->stat_lock);
+ if (crc_fail || fail) {
+ spin_lock(&sdev->stat_lock);
+ if (crc_fail)
+ ++sdev->stat.csum_errors;
+ if (fail)
+ ++sdev->stat.verify_errors;
+ spin_unlock(&sdev->stat_lock);
+ }
return fail || crc_fail;
}
-static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
+static int scrub_checksum_super(struct scrub_block *sblock)
{
struct btrfs_super_block *s;
- u64 logical;
- struct scrub_dev *sdev = sbio->sdev;
+ struct scrub_dev *sdev = sblock->sdev;
struct btrfs_root *root = sdev->dev->dev_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- u8 csum[BTRFS_CSUM_SIZE];
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ u8 on_disk_csum[BTRFS_CSUM_SIZE];
+ struct page *page;
+ void *mapped_buffer;
+ u64 mapped_size;
+ void *p;
u32 crc = ~(u32)0;
int fail = 0;
+ u64 len;
+ int index;
- s = (struct btrfs_super_block *)buffer;
- logical = sbio->logical;
+ BUG_ON(sblock->page_count < 1);
+ page = sblock->pagev[0].page;
+ mapped_buffer = kmap_atomic(page, KM_USER0);
+ s = (struct btrfs_super_block *)mapped_buffer;
+ memcpy(on_disk_csum, s->csum, sdev->csum_size);
- if (logical != le64_to_cpu(s->bytenr))
+ if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
++fail;
- if (sbio->spag[0].generation != le64_to_cpu(s->generation))
+ if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
++fail;
if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
++fail;
- crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
- PAGE_SIZE - BTRFS_CSUM_SIZE);
- btrfs_csum_final(crc, csum);
- if (memcmp(csum, s->csum, sbio->sdev->csum_size))
+ len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
+ mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+ p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+ index = 0;
+ for (;;) {
+ u64 l = min_t(u64, len, mapped_size);
+
+ crc = btrfs_csum_data(root, p, crc, l);
+ kunmap_atomic(mapped_buffer, KM_USER0);
+ len -= l;
+ if (len == 0)
+ break;
+ index++;
+ BUG_ON(index >= sblock->page_count);
+ BUG_ON(!sblock->pagev[index].page);
+ page = sblock->pagev[index].page;
+ mapped_buffer = kmap_atomic(page, KM_USER0);
+ mapped_size = PAGE_SIZE;
+ p = mapped_buffer;
+ }
+
+ btrfs_csum_final(crc, calculated_csum);
+ if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
++fail;
if (fail) {
@@ -948,29 +1415,42 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
return fail;
}
-static int scrub_submit(struct scrub_dev *sdev)
+static void scrub_block_get(struct scrub_block *sblock)
+{
+ atomic_inc(&sblock->ref_count);
+}
+
+static void scrub_block_put(struct scrub_block *sblock)
+{
+ if (atomic_dec_and_test(&sblock->ref_count)) {
+ int i;
+
+ for (i = 0; i < sblock->page_count; i++)
+ if (sblock->pagev[i].page)
+ __free_page(sblock->pagev[i].page);
+ kfree(sblock);
+ }
+}
+
+static void scrub_submit(struct scrub_dev *sdev)
{
struct scrub_bio *sbio;
if (sdev->curr == -1)
- return 0;
+ return;
sbio = sdev->bios[sdev->curr];
- sbio->err = 0;
sdev->curr = -1;
atomic_inc(&sdev->in_flight);
btrfsic_submit_bio(READ, sbio->bio);
-
- return 0;
}
-static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
- u64 physical, u64 flags, u64 gen, int mirror_num,
- u8 *csum, int force)
+static int scrub_add_page_to_bio(struct scrub_dev *sdev,
+ struct scrub_page *spage)
{
+ struct scrub_block *sblock = spage->sblock;
struct scrub_bio *sbio;
- struct page *page;
int ret;
again:
@@ -983,7 +1463,7 @@ again:
if (sdev->curr != -1) {
sdev->first_free = sdev->bios[sdev->curr]->next_free;
sdev->bios[sdev->curr]->next_free = -1;
- sdev->bios[sdev->curr]->count = 0;
+ sdev->bios[sdev->curr]->page_count = 0;
spin_unlock(&sdev->list_lock);
} else {
spin_unlock(&sdev->list_lock);
@@ -991,62 +1471,200 @@ again:
}
}
sbio = sdev->bios[sdev->curr];
- if (sbio->count == 0) {
+ if (sbio->page_count == 0) {
struct bio *bio;
- sbio->physical = physical;
- sbio->logical = logical;
- bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
- if (!bio)
- return -ENOMEM;
+ sbio->physical = spage->physical;
+ sbio->logical = spage->logical;
+ bio = sbio->bio;
+ if (!bio) {
+ bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio);
+ if (!bio)
+ return -ENOMEM;
+ sbio->bio = bio;
+ }
bio->bi_private = sbio;
bio->bi_end_io = scrub_bio_end_io;
bio->bi_bdev = sdev->dev->bdev;
- bio->bi_sector = sbio->physical >> 9;
+ bio->bi_sector = spage->physical >> 9;
sbio->err = 0;
- sbio->bio = bio;
- } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
- sbio->logical + sbio->count * PAGE_SIZE != logical) {
- ret = scrub_submit(sdev);
- if (ret)
- return ret;
+ } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+ spage->physical ||
+ sbio->logical + sbio->page_count * PAGE_SIZE !=
+ spage->logical) {
+ scrub_submit(sdev);
goto again;
}
- sbio->spag[sbio->count].flags = flags;
- sbio->spag[sbio->count].generation = gen;
- sbio->spag[sbio->count].have_csum = 0;
- sbio->spag[sbio->count].mirror_num = mirror_num;
-
- page = alloc_page(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
- if (!ret) {
- __free_page(page);
- ret = scrub_submit(sdev);
- if (ret)
- return ret;
+ sbio->pagev[sbio->page_count] = spage;
+ ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+ if (ret != PAGE_SIZE) {
+ if (sbio->page_count < 1) {
+ bio_put(sbio->bio);
+ sbio->bio = NULL;
+ return -EIO;
+ }
+ scrub_submit(sdev);
goto again;
}
- if (csum) {
- sbio->spag[sbio->count].have_csum = 1;
- memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
+ scrub_block_get(sblock); /* one for the added page */
+ atomic_inc(&sblock->outstanding_pages);
+ sbio->page_count++;
+ if (sbio->page_count == sdev->pages_per_bio)
+ scrub_submit(sdev);
+
+ return 0;
+}
+
+static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
+ u64 physical, u64 flags, u64 gen, int mirror_num,
+ u8 *csum, int force)
+{
+ struct scrub_block *sblock;
+ int index;
+
+ sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+ if (!sblock) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.malloc_errors++;
+ spin_unlock(&sdev->stat_lock);
+ return -ENOMEM;
}
- ++sbio->count;
- if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
+
+ /* one ref inside this function, plus one for each page later on */
+ atomic_set(&sblock->ref_count, 1);
+ sblock->sdev = sdev;
+ sblock->no_io_error_seen = 1;
+
+ for (index = 0; len > 0; index++) {
+ struct scrub_page *spage = sblock->pagev + index;
+ u64 l = min_t(u64, len, PAGE_SIZE);
+
+ BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ spage->page = alloc_page(GFP_NOFS);
+ if (!spage->page) {
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.malloc_errors++;
+ spin_unlock(&sdev->stat_lock);
+ while (index > 0) {
+ index--;
+ __free_page(sblock->pagev[index].page);
+ }
+ kfree(sblock);
+ return -ENOMEM;
+ }
+ spage->sblock = sblock;
+ spage->bdev = sdev->dev->bdev;
+ spage->flags = flags;
+ spage->generation = gen;
+ spage->logical = logical;
+ spage->physical = physical;
+ spage->mirror_num = mirror_num;
+ if (csum) {
+ spage->have_csum = 1;
+ memcpy(spage->csum, csum, sdev->csum_size);
+ } else {
+ spage->have_csum = 0;
+ }
+ sblock->page_count++;
+ len -= l;
+ logical += l;
+ physical += l;
+ }
+
+ BUG_ON(sblock->page_count == 0);
+ for (index = 0; index < sblock->page_count; index++) {
+ struct scrub_page *spage = sblock->pagev + index;
int ret;
- ret = scrub_submit(sdev);
- if (ret)
+ ret = scrub_add_page_to_bio(sdev, spage);
+ if (ret) {
+ scrub_block_put(sblock);
return ret;
+ }
}
+ if (force)
+ scrub_submit(sdev);
+
+ /* last one frees, either here or in bio completion for last page */
+ scrub_block_put(sblock);
return 0;
}
+static void scrub_bio_end_io(struct bio *bio, int err)
+{
+ struct scrub_bio *sbio = bio->bi_private;
+ struct scrub_dev *sdev = sbio->sdev;
+ struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+
+ sbio->err = err;
+ sbio->bio = bio;
+
+ btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+}
+
+static void scrub_bio_end_io_worker(struct btrfs_work *work)
+{
+ struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+ struct scrub_dev *sdev = sbio->sdev;
+ int i;
+
+ BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
+ if (sbio->err) {
+ for (i = 0; i < sbio->page_count; i++) {
+ struct scrub_page *spage = sbio->pagev[i];
+
+ spage->io_error = 1;
+ spage->sblock->no_io_error_seen = 0;
+ }
+ }
+
+ /* now complete the scrub_block items that have all pages completed */
+ for (i = 0; i < sbio->page_count; i++) {
+ struct scrub_page *spage = sbio->pagev[i];
+ struct scrub_block *sblock = spage->sblock;
+
+ if (atomic_dec_and_test(&sblock->outstanding_pages))
+ scrub_block_complete(sblock);
+ scrub_block_put(sblock);
+ }
+
+ if (sbio->err) {
+ /* what is this good for??? */
+ sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+ sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
+ sbio->bio->bi_phys_segments = 0;
+ sbio->bio->bi_idx = 0;
+
+ for (i = 0; i < sbio->page_count; i++) {
+ struct bio_vec *bi;
+ bi = &sbio->bio->bi_io_vec[i];
+ bi->bv_offset = 0;
+ bi->bv_len = PAGE_SIZE;
+ }
+ }
+
+ bio_put(sbio->bio);
+ sbio->bio = NULL;
+ spin_lock(&sdev->list_lock);
+ sbio->next_free = sdev->first_free;
+ sdev->first_free = sbio->index;
+ spin_unlock(&sdev->list_lock);
+ atomic_dec(&sdev->in_flight);
+ wake_up(&sdev->list_wait);
+}
+
+static void scrub_block_complete(struct scrub_block *sblock)
+{
+ if (!sblock->no_io_error_seen)
+ scrub_handle_errored_block(sblock);
+ else
+ scrub_checksum(sblock);
+}
+
static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
u8 *csum)
{
@@ -1054,7 +1672,6 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
int ret = 0;
unsigned long i;
unsigned long num_sectors;
- u32 sectorsize = sdev->dev->dev_root->sectorsize;
while (!list_empty(&sdev->csum_list)) {
sum = list_first_entry(&sdev->csum_list,
@@ -1072,7 +1689,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
if (!sum)
return 0;
- num_sectors = sum->len / sectorsize;
+ num_sectors = sum->len / sdev->sectorsize;
for (i = 0; i < num_sectors; ++i) {
if (sum->sums[i].bytenr == logical) {
memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
@@ -1093,9 +1710,28 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
{
int ret;
u8 csum[BTRFS_CSUM_SIZE];
+ u32 blocksize;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ blocksize = sdev->sectorsize;
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.data_extents_scrubbed++;
+ sdev->stat.data_bytes_scrubbed += len;
+ spin_unlock(&sdev->stat_lock);
+ } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ BUG_ON(sdev->nodesize != sdev->leafsize);
+ blocksize = sdev->nodesize;
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.tree_extents_scrubbed++;
+ sdev->stat.tree_bytes_scrubbed += len;
+ spin_unlock(&sdev->stat_lock);
+ } else {
+ blocksize = sdev->sectorsize;
+ BUG_ON(1);
+ }
while (len) {
- u64 l = min_t(u64, len, PAGE_SIZE);
+ u64 l = min_t(u64, len, blocksize);
int have_csum = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
@@ -1104,8 +1740,8 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
if (have_csum == 0)
++sdev->stat.no_csum;
}
- ret = scrub_page(sdev, logical, l, physical, flags, gen,
- mirror_num, have_csum ? csum : NULL, 0);
+ ret = scrub_pages(sdev, logical, l, physical, flags, gen,
+ mirror_num, have_csum ? csum : NULL, 0);
if (ret)
return ret;
len -= l;
@@ -1170,6 +1806,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
if (!path)
return -ENOMEM;
+ /*
+ * work on commit root. The related disk blocks are static as
+ * long as COW is applied. This means, it is save to rewrite
+ * them to repair disk errors without any race conditions
+ */
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -1516,15 +2157,18 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
struct btrfs_device *device = sdev->dev;
struct btrfs_root *root = device->dev_root;
+ if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ return -EIO;
+
gen = root->fs_info->last_trans_committed;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
break;
- ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
- BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
+ ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+ BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
if (ret)
return ret;
}
@@ -1583,10 +2227,30 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
/*
* check some assumptions
*/
- if (root->sectorsize != PAGE_SIZE ||
- root->sectorsize != root->leafsize ||
- root->sectorsize != root->nodesize) {
- printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
+ if (root->nodesize != root->leafsize) {
+ printk(KERN_ERR
+ "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
+ root->nodesize, root->leafsize);
+ return -EINVAL;
+ }
+
+ if (root->nodesize > BTRFS_STRIPE_LEN) {
+ /*
+ * in this case scrub is unable to calculate the checksum
+ * the way scrub is implemented. Do not handle this
+ * situation at all because it won't ever happen.
+ */
+ printk(KERN_ERR
+ "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
+ root->nodesize, BTRFS_STRIPE_LEN);
+ return -EINVAL;
+ }
+
+ if (root->sectorsize != PAGE_SIZE) {
+ /* not supported for data w/o checksums */
+ printk(KERN_ERR
+ "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
+ root->sectorsize, (unsigned long long)PAGE_SIZE);
return -EINVAL;
}
@@ -1656,7 +2320,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
return ret;
}
-int btrfs_scrub_pause(struct btrfs_root *root)
+void btrfs_scrub_pause(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1671,34 +2335,28 @@ int btrfs_scrub_pause(struct btrfs_root *root)
mutex_lock(&fs_info->scrub_lock);
}
mutex_unlock(&fs_info->scrub_lock);
-
- return 0;
}
-int btrfs_scrub_continue(struct btrfs_root *root)
+void btrfs_scrub_continue(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
atomic_dec(&fs_info->scrub_pause_req);
wake_up(&fs_info->scrub_pause_wait);
- return 0;
}
-int btrfs_scrub_pause_super(struct btrfs_root *root)
+void btrfs_scrub_pause_super(struct btrfs_root *root)
{
down_write(&root->fs_info->scrub_super_lock);
- return 0;
}
-int btrfs_scrub_continue_super(struct btrfs_root *root)
+void btrfs_scrub_continue_super(struct btrfs_root *root)
{
up_write(&root->fs_info->scrub_super_lock);
- return 0;
}
-int btrfs_scrub_cancel(struct btrfs_root *root)
+int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
mutex_lock(&fs_info->scrub_lock);
if (!atomic_read(&fs_info->scrubs_running)) {
@@ -1719,6 +2377,11 @@ int btrfs_scrub_cancel(struct btrfs_root *root)
return 0;
}
+int btrfs_scrub_cancel(struct btrfs_root *root)
+{
+ return __btrfs_scrub_cancel(root->fs_info);
+}
+
int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1741,6 +2404,7 @@ int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
return 0;
}
+
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
{
struct btrfs_fs_info *fs_info = root->fs_info;
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index bc1f6ad18442..c6ffa5812419 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -44,8 +44,9 @@
#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \
-u##bits btrfs_##name(struct extent_buffer *eb, \
- type *s) \
+void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token); \
+u##bits btrfs_token_##name(struct extent_buffer *eb, \
+ type *s, struct btrfs_map_token *token) \
{ \
unsigned long part_offset = (unsigned long)s; \
unsigned long offset = part_offset + offsetof(type, member); \
@@ -54,9 +55,18 @@ u##bits btrfs_##name(struct extent_buffer *eb, \
char *kaddr; \
unsigned long map_start; \
unsigned long map_len; \
+ unsigned long mem_len = sizeof(((type *)0)->member); \
u##bits res; \
+ if (token && token->kaddr && token->offset <= offset && \
+ token->eb == eb && \
+ (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \
+ kaddr = token->kaddr; \
+ p = (type *)(kaddr + part_offset - token->offset); \
+ res = le##bits##_to_cpu(p->member); \
+ return res; \
+ } \
err = map_private_extent_buffer(eb, offset, \
- sizeof(((type *)0)->member), \
+ mem_len, \
&kaddr, &map_start, &map_len); \
if (err) { \
__le##bits leres; \
@@ -65,10 +75,15 @@ u##bits btrfs_##name(struct extent_buffer *eb, \
} \
p = (type *)(kaddr + part_offset - map_start); \
res = le##bits##_to_cpu(p->member); \
+ if (token) { \
+ token->kaddr = kaddr; \
+ token->offset = map_start; \
+ token->eb = eb; \
+ } \
return res; \
} \
-void btrfs_set_##name(struct extent_buffer *eb, \
- type *s, u##bits val) \
+void btrfs_set_token_##name(struct extent_buffer *eb, \
+ type *s, u##bits val, struct btrfs_map_token *token) \
{ \
unsigned long part_offset = (unsigned long)s; \
unsigned long offset = part_offset + offsetof(type, member); \
@@ -77,8 +92,17 @@ void btrfs_set_##name(struct extent_buffer *eb, \
char *kaddr; \
unsigned long map_start; \
unsigned long map_len; \
+ unsigned long mem_len = sizeof(((type *)0)->member); \
+ if (token && token->kaddr && token->offset <= offset && \
+ token->eb == eb && \
+ (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \
+ kaddr = token->kaddr; \
+ p = (type *)(kaddr + part_offset - token->offset); \
+ p->member = cpu_to_le##bits(val); \
+ return; \
+ } \
err = map_private_extent_buffer(eb, offset, \
- sizeof(((type *)0)->member), \
+ mem_len, \
&kaddr, &map_start, &map_len); \
if (err) { \
__le##bits val2; \
@@ -88,7 +112,22 @@ void btrfs_set_##name(struct extent_buffer *eb, \
} \
p = (type *)(kaddr + part_offset - map_start); \
p->member = cpu_to_le##bits(val); \
-}
+ if (token) { \
+ token->kaddr = kaddr; \
+ token->offset = map_start; \
+ token->eb = eb; \
+ } \
+} \
+void btrfs_set_##name(struct extent_buffer *eb, \
+ type *s, u##bits val) \
+{ \
+ btrfs_set_token_##name(eb, s, val, NULL); \
+} \
+u##bits btrfs_##name(struct extent_buffer *eb, \
+ type *s) \
+{ \
+ return btrfs_token_##name(eb, s, NULL); \
+} \
#include "ctree.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5239003d453e..84571d7da12e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,7 +40,6 @@
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/cleancache.h>
-#include <linux/mnt_namespace.h>
#include <linux/ratelimit.h>
#include "compat.h"
#include "delayed-inode.h"
@@ -77,6 +76,9 @@ static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
case -EROFS:
errstr = "Readonly filesystem";
break;
+ case -EEXIST:
+ errstr = "Object already exists";
+ break;
default:
if (nbuf) {
if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
@@ -117,6 +119,8 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
sb->s_flags |= MS_RDONLY;
printk(KERN_INFO "btrfs is forced readonly\n");
+ __btrfs_scrub_cancel(fs_info);
+// WARN_ON(1);
}
}
@@ -125,36 +129,143 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
* invokes the approciate error response.
*/
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno)
+ unsigned int line, int errno, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
char nbuf[16];
const char *errstr;
+ va_list args;
+ va_start(args, fmt);
/*
* Special case: if the error is EROFS, and we're already
* under MS_RDONLY, then it is safe here.
*/
if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
- return;
+ return;
- errstr = btrfs_decode_error(fs_info, errno, nbuf);
- printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
- sb->s_id, function, line, errstr);
- save_error_info(fs_info);
+ errstr = btrfs_decode_error(fs_info, errno, nbuf);
+ if (fmt) {
+ struct va_format vaf = {
+ .fmt = fmt,
+ .va = &args,
+ };
- btrfs_handle_error(fs_info);
+ printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n",
+ sb->s_id, function, line, errstr, &vaf);
+ } else {
+ printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
+ sb->s_id, function, line, errstr);
+ }
+
+ /* Don't go through full error handling during mount */
+ if (sb->s_flags & MS_BORN) {
+ save_error_info(fs_info);
+ btrfs_handle_error(fs_info);
+ }
+ va_end(args);
}
-static void btrfs_put_super(struct super_block *sb)
+const char *logtypes[] = {
+ "emergency",
+ "alert",
+ "critical",
+ "error",
+ "warning",
+ "notice",
+ "info",
+ "debug",
+};
+
+void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
- struct btrfs_root *root = btrfs_sb(sb);
- int ret;
+ struct super_block *sb = fs_info->sb;
+ char lvl[4];
+ struct va_format vaf;
+ va_list args;
+ const char *type = logtypes[4];
+
+ va_start(args, fmt);
+
+ if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') {
+ strncpy(lvl, fmt, 3);
+ fmt += 3;
+ type = logtypes[fmt[1] - '0'];
+ } else
+ *lvl = '\0';
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf);
+}
- ret = close_ctree(root);
- sb->s_fs_info = NULL;
+/*
+ * We only mark the transaction aborted and then set the file system read-only.
+ * This will prevent new transactions from starting or trying to join this
+ * one.
+ *
+ * This means that error recovery at the call site is limited to freeing
+ * any local memory allocations and passing the error code up without
+ * further cleanup. The transaction should complete as it normally would
+ * in the call path but will return -EIO.
+ *
+ * We'll complete the cleanup in btrfs_end_transaction and
+ * btrfs_commit_transaction.
+ */
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *function,
+ unsigned int line, int errno)
+{
+ WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted");
+ trans->aborted = errno;
+ /* Nothing used. The other threads that have joined this
+ * transaction may be able to continue. */
+ if (!trans->blocks_used) {
+ btrfs_printk(root->fs_info, "Aborting unused transaction.\n");
+ return;
+ }
+ trans->transaction->aborted = errno;
+ __btrfs_std_error(root->fs_info, function, line, errno, NULL);
+}
+/*
+ * __btrfs_panic decodes unexpected, fatal errors from the caller,
+ * issues an alert, and either panics or BUGs, depending on mount options.
+ */
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...)
+{
+ char nbuf[16];
+ char *s_id = "<unknown>";
+ const char *errstr;
+ struct va_format vaf = { .fmt = fmt };
+ va_list args;
+
+ if (fs_info)
+ s_id = fs_info->sb->s_id;
- (void)ret; /* FIXME: need to fix VFS to return error? */
+ va_start(args, fmt);
+ vaf.va = &args;
+
+ errstr = btrfs_decode_error(fs_info, errno, nbuf);
+ if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)
+ panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
+ s_id, function, line, &vaf, errstr);
+
+ printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
+ s_id, function, line, &vaf, errstr);
+ va_end(args);
+ /* Caller calls BUG() */
+}
+
+static void btrfs_put_super(struct super_block *sb)
+{
+ (void)close_ctree(btrfs_sb(sb)->tree_root);
+ /* FIXME: need to fix VFS to return error? */
+ /* AV: return it _where_? ->put_super() can be triggered by any number
+ * of async events, up to and including delivery of SIGKILL to the
+ * last process that kept it busy. Or segfault in the aforementioned
+ * process... Whom would you report that to?
+ */
}
enum {
@@ -167,7 +278,7 @@ enum {
Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
Opt_check_integrity, Opt_check_integrity_including_extent_data,
- Opt_check_integrity_print_mask,
+ Opt_check_integrity_print_mask, Opt_fatal_errors,
Opt_err,
};
@@ -207,12 +318,14 @@ static match_table_t tokens = {
{Opt_check_integrity, "check_int"},
{Opt_check_integrity_including_extent_data, "check_int_data"},
{Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
+ {Opt_fatal_errors, "fatal_errors=%s"},
{Opt_err, NULL},
};
/*
* Regular mount options parser. Everything that is needed only when
* reading in a new superblock is parsed here.
+ * XXX JDM: This needs to be cleaned up for remount.
*/
int btrfs_parse_options(struct btrfs_root *root, char *options)
{
@@ -439,6 +552,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
ret = -EINVAL;
goto out;
#endif
+ case Opt_fatal_errors:
+ if (strcmp(args[0].from, "panic") == 0)
+ btrfs_set_opt(info->mount_opt,
+ PANIC_ON_FATAL_ERROR);
+ else if (strcmp(args[0].from, "bug") == 0)
+ btrfs_clear_opt(info->mount_opt,
+ PANIC_ON_FATAL_ERROR);
+ else {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
case Opt_err:
printk(KERN_INFO "btrfs: unrecognized mount option "
"'%s'\n", p);
@@ -542,7 +667,8 @@ out:
static struct dentry *get_default_root(struct super_block *sb,
u64 subvol_objectid)
{
- struct btrfs_root *root = sb->s_fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
struct btrfs_root *new_root;
struct btrfs_dir_item *di;
struct btrfs_path *path;
@@ -572,7 +698,7 @@ static struct dentry *get_default_root(struct super_block *sb,
* will mount by default if we haven't been given a specific subvolume
* to mount.
*/
- dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
+ dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
@@ -586,7 +712,7 @@ static struct dentry *get_default_root(struct super_block *sb,
*/
btrfs_free_path(path);
dir_id = BTRFS_FIRST_FREE_OBJECTID;
- new_root = root->fs_info->fs_root;
+ new_root = fs_info->fs_root;
goto setup_root;
}
@@ -594,7 +720,7 @@ static struct dentry *get_default_root(struct super_block *sb,
btrfs_free_path(path);
find_root:
- new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+ new_root = btrfs_read_fs_root_no_name(fs_info, &location);
if (IS_ERR(new_root))
return ERR_CAST(new_root);
@@ -630,7 +756,7 @@ static int btrfs_fill_super(struct super_block *sb,
{
struct inode *inode;
struct dentry *root_dentry;
- struct btrfs_root *tree_root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_key key;
int err;
@@ -645,18 +771,16 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_flags |= MS_POSIXACL;
#endif
- tree_root = open_ctree(sb, fs_devices, (char *)data);
-
- if (IS_ERR(tree_root)) {
+ err = open_ctree(sb, fs_devices, (char *)data);
+ if (err) {
printk("btrfs: open_ctree failed\n");
- return PTR_ERR(tree_root);
+ return err;
}
- sb->s_fs_info = tree_root;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
+ inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail_close;
@@ -673,23 +797,25 @@ static int btrfs_fill_super(struct super_block *sb,
save_mount_options(sb, data);
cleancache_init_fs(sb);
+ sb->s_flags |= MS_ACTIVE;
return 0;
fail_close:
- close_ctree(tree_root);
+ close_ctree(fs_info->tree_root);
return err;
}
int btrfs_sync_fs(struct super_block *sb, int wait)
{
struct btrfs_trans_handle *trans;
- struct btrfs_root *root = btrfs_sb(sb);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
int ret;
trace_btrfs_sync_fs(wait);
if (!wait) {
- filemap_flush(root->fs_info->btree_inode->i_mapping);
+ filemap_flush(fs_info->btree_inode->i_mapping);
return 0;
}
@@ -703,10 +829,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return ret;
}
-static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
- struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
- struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
+ struct btrfs_root *root = info->tree_root;
char *compress_type;
if (btrfs_test_opt(root, DEGRADED))
@@ -766,28 +892,25 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",inode_cache");
if (btrfs_test_opt(root, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
+ if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
+ seq_puts(seq, ",fatal_errors=panic");
return 0;
}
static int btrfs_test_super(struct super_block *s, void *data)
{
- struct btrfs_root *test_root = data;
- struct btrfs_root *root = btrfs_sb(s);
+ struct btrfs_fs_info *p = data;
+ struct btrfs_fs_info *fs_info = btrfs_sb(s);
- /*
- * If this super block is going away, return false as it
- * can't match as an existing super block.
- */
- if (!atomic_read(&s->s_active))
- return 0;
- return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
+ return fs_info->fs_devices == p->fs_devices;
}
static int btrfs_set_super(struct super_block *s, void *data)
{
- s->s_fs_info = data;
-
- return set_anon_super(s, data);
+ int err = set_anon_super(s, data);
+ if (!err)
+ s->s_fs_info = data;
+ return err;
}
/*
@@ -947,12 +1070,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
if (!fs_info)
return ERR_PTR(-ENOMEM);
- fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- if (!fs_info->tree_root) {
- error = -ENOMEM;
- goto error_fs_info;
- }
- fs_info->tree_root->fs_info = fs_info;
fs_info->fs_devices = fs_devices;
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
@@ -972,43 +1089,30 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
}
bdev = fs_devices->latest_bdev;
- s = sget(fs_type, btrfs_test_super, btrfs_set_super,
- fs_info->tree_root);
+ s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
if (IS_ERR(s)) {
error = PTR_ERR(s);
goto error_close_devices;
}
if (s->s_root) {
- if ((flags ^ s->s_flags) & MS_RDONLY) {
- deactivate_locked_super(s);
- error = -EBUSY;
- goto error_close_devices;
- }
-
btrfs_close_devices(fs_devices);
free_fs_info(fs_info);
+ if ((flags ^ s->s_flags) & MS_RDONLY)
+ error = -EBUSY;
} else {
char b[BDEVNAME_SIZE];
s->s_flags = flags | MS_NOSEC;
strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
- btrfs_sb(s)->fs_info->bdev_holder = fs_type;
+ btrfs_sb(s)->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
-
- s->s_flags |= MS_ACTIVE;
}
- root = get_default_root(s, subvol_objectid);
- if (IS_ERR(root)) {
+ root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
+ if (IS_ERR(root))
deactivate_locked_super(s);
- return root;
- }
return root;
@@ -1021,12 +1125,22 @@ error_fs_info:
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
- struct btrfs_root *root = btrfs_sb(sb);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
+ unsigned old_flags = sb->s_flags;
+ unsigned long old_opts = fs_info->mount_opt;
+ unsigned long old_compress_type = fs_info->compress_type;
+ u64 old_max_inline = fs_info->max_inline;
+ u64 old_alloc_start = fs_info->alloc_start;
+ int old_thread_pool_size = fs_info->thread_pool_size;
+ unsigned int old_metadata_ratio = fs_info->metadata_ratio;
int ret;
ret = btrfs_parse_options(root, data);
- if (ret)
- return -EINVAL;
+ if (ret) {
+ ret = -EINVAL;
+ goto restore;
+ }
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
return 0;
@@ -1034,26 +1148,44 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (*flags & MS_RDONLY) {
sb->s_flags |= MS_RDONLY;
- ret = btrfs_commit_super(root);
- WARN_ON(ret);
+ ret = btrfs_commit_super(root);
+ if (ret)
+ goto restore;
} else {
- if (root->fs_info->fs_devices->rw_devices == 0)
- return -EACCES;
+ if (fs_info->fs_devices->rw_devices == 0)
+ ret = -EACCES;
+ goto restore;
- if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
- return -EINVAL;
+ if (btrfs_super_log_root(fs_info->super_copy) != 0)
+ ret = -EINVAL;
+ goto restore;
- ret = btrfs_cleanup_fs_roots(root->fs_info);
- WARN_ON(ret);
+ ret = btrfs_cleanup_fs_roots(fs_info);
+ if (ret)
+ goto restore;
/* recover relocation */
ret = btrfs_recover_relocation(root);
- WARN_ON(ret);
+ if (ret)
+ goto restore;
sb->s_flags &= ~MS_RDONLY;
}
return 0;
+
+restore:
+ /* We've hit an error - don't reset MS_RDONLY */
+ if (sb->s_flags & MS_RDONLY)
+ old_flags |= MS_RDONLY;
+ sb->s_flags = old_flags;
+ fs_info->mount_opt = old_opts;
+ fs_info->compress_type = old_compress_type;
+ fs_info->max_inline = old_max_inline;
+ fs_info->alloc_start = old_alloc_start;
+ fs_info->thread_pool_size = old_thread_pool_size;
+ fs_info->metadata_ratio = old_metadata_ratio;
+ return ret;
}
/* Used to sort the devices by max_avail(descending sort) */
@@ -1212,18 +1344,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct btrfs_root *root = btrfs_sb(dentry->d_sb);
- struct btrfs_super_block *disk_super = root->fs_info->super_copy;
- struct list_head *head = &root->fs_info->space_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ struct list_head *head = &fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
int bits = dentry->d_sb->s_blocksize_bits;
- __be32 *fsid = (__be32 *)root->fs_info->fsid;
+ __be32 *fsid = (__be32 *)fs_info->fsid;
int ret;
/* holding chunk_muext to avoid allocating new chunks */
- mutex_lock(&root->fs_info->chunk_mutex);
+ mutex_lock(&fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1242,14 +1374,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bavail = total_free_data;
- ret = btrfs_calc_avail_data_space(root, &total_free_data);
+ ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
if (ret) {
- mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
buf->f_bavail += total_free_data;
buf->f_bavail = buf->f_bavail >> bits;
- mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
/* We treat it as constant endianness (it doesn't matter _which_)
because we want the fsid to come out the same whether mounted
@@ -1263,11 +1395,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
+static void btrfs_kill_super(struct super_block *sb)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ kill_anon_super(sb);
+ free_fs_info(fs_info);
+}
+
static struct file_system_type btrfs_fs_type = {
.owner = THIS_MODULE,
.name = "btrfs",
.mount = btrfs_mount,
- .kill_sb = kill_anon_super,
+ .kill_sb = btrfs_kill_super,
.fs_flags = FS_REQUIRES_DEV,
};
@@ -1301,17 +1440,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
static int btrfs_freeze(struct super_block *sb)
{
- struct btrfs_root *root = btrfs_sb(sb);
- mutex_lock(&root->fs_info->transaction_kthread_mutex);
- mutex_lock(&root->fs_info->cleaner_mutex);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ mutex_lock(&fs_info->transaction_kthread_mutex);
+ mutex_lock(&fs_info->cleaner_mutex);
return 0;
}
static int btrfs_unfreeze(struct super_block *sb)
{
- struct btrfs_root *root = btrfs_sb(sb);
- mutex_unlock(&root->fs_info->cleaner_mutex);
- mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ mutex_unlock(&fs_info->transaction_kthread_mutex);
return 0;
}
@@ -1376,9 +1515,7 @@ static int __init init_btrfs_fs(void)
if (err)
return err;
- err = btrfs_init_compress();
- if (err)
- goto free_sysfs;
+ btrfs_init_compress();
err = btrfs_init_cachep();
if (err)
@@ -1421,7 +1558,6 @@ free_cachep:
btrfs_destroy_cachep();
free_compress:
btrfs_exit_compress();
-free_sysfs:
btrfs_exit_sysfs();
return err;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index cd220f283638..63f835aa9788 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -31,7 +31,7 @@
#define BTRFS_ROOT_TRANS_TAG 0
-static noinline void put_transaction(struct btrfs_transaction *transaction)
+void put_transaction(struct btrfs_transaction *transaction)
{
WARN_ON(atomic_read(&transaction->use_count) == 0);
if (atomic_dec_and_test(&transaction->use_count)) {
@@ -58,6 +58,12 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
spin_lock(&root->fs_info->trans_lock);
loop:
+ /* The file system has been taken offline. No new transactions. */
+ if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ spin_unlock(&root->fs_info->trans_lock);
+ return -EROFS;
+ }
+
if (root->fs_info->trans_no_join) {
if (!nofail) {
spin_unlock(&root->fs_info->trans_lock);
@@ -67,6 +73,8 @@ loop:
cur_trans = root->fs_info->running_transaction;
if (cur_trans) {
+ if (cur_trans->aborted)
+ return cur_trans->aborted;
atomic_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
cur_trans->num_joined++;
@@ -123,6 +131,7 @@ loop:
root->fs_info->generation++;
cur_trans->transid = root->fs_info->generation;
root->fs_info->running_transaction = cur_trans;
+ cur_trans->aborted = 0;
spin_unlock(&root->fs_info->trans_lock);
return 0;
@@ -318,6 +327,7 @@ again:
h->use_count = 1;
h->block_rsv = NULL;
h->orig_rsv = NULL;
+ h->aborted = 0;
smp_mb();
if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -440,6 +450,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_block_rsv *rsv = trans->block_rsv;
int updates;
+ int err;
smp_mb();
if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
@@ -453,8 +464,11 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
updates = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
- if (updates)
- btrfs_run_delayed_refs(trans, root, updates);
+ if (updates) {
+ err = btrfs_run_delayed_refs(trans, root, updates);
+ if (err) /* Error code will also eval true */
+ return err;
+ }
trans->block_rsv = rsv;
@@ -525,6 +539,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (throttle)
btrfs_run_delayed_iputs(root);
+ if (trans->aborted ||
+ root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ return -EIO;
+ }
+
return 0;
}
@@ -690,11 +709,13 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
&root->root_item);
- BUG_ON(ret);
+ if (ret)
+ return ret;
old_root_used = btrfs_root_used(&root->root_item);
ret = btrfs_write_dirty_block_groups(trans, root);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
if (root != root->fs_info->extent_root)
@@ -705,6 +726,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
/*
* update all the cowonly tree roots on disk
+ *
+ * The error handling in this function may not be obvious. Any of the
+ * failures will cause the file system to go offline. We still need
+ * to clean up the delayed refs.
*/
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
@@ -715,23 +740,30 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
int ret;
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- BUG_ON(ret);
+ if (ret)
+ return ret;
eb = btrfs_lock_root_node(fs_info->tree_root);
- ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
- BUG_ON(ret);
+ ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
+ 0, &eb);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
+ if (ret)
+ return ret;
+
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- BUG_ON(ret);
+ if (ret)
+ return ret;
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
next = fs_info->dirty_cowonly_roots.next;
list_del_init(next);
root = list_entry(next, struct btrfs_root, dirty_list);
- update_cowonly_root(trans, root);
+ ret = update_cowonly_root(trans, root);
+ if (ret)
+ return ret;
}
down_write(&fs_info->extent_commit_sem);
@@ -875,7 +907,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
if (!new_root_item) {
- pending->error = -ENOMEM;
+ ret = pending->error = -ENOMEM;
goto fail;
}
@@ -912,21 +944,24 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* insert the directory item
*/
ret = btrfs_set_inode_index(parent_inode, &index);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
ret = btrfs_insert_dir_item(trans, parent_root,
dentry->d_name.name, dentry->d_name.len,
parent_inode, &key,
BTRFS_FT_DIR, index);
- if (ret) {
+ if (ret == -EEXIST) {
pending->error = -EEXIST;
dput(parent);
goto fail;
+ } else if (ret) {
+ goto abort_trans_dput;
}
btrfs_i_size_write(parent_inode, parent_inode->i_size +
dentry->d_name.len * 2);
ret = btrfs_update_inode(trans, parent_root, parent_inode);
- BUG_ON(ret);
+ if (ret)
+ goto abort_trans_dput;
/*
* pull in the delayed directory update
@@ -935,7 +970,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* snapshot
*/
ret = btrfs_run_delayed_items(trans, root);
- BUG_ON(ret);
+ if (ret) { /* Transaction aborted */
+ dput(parent);
+ goto fail;
+ }
record_root_in_trans(trans, root);
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
@@ -951,12 +989,20 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
old = btrfs_lock_root_node(root);
ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_tree_unlock(old);
+ free_extent_buffer(old);
+ goto abort_trans_dput;
+ }
+
btrfs_set_lock_blocking(old);
- btrfs_copy_root(trans, root, old, &tmp, objectid);
+ ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
+ /* clean up in any case */
btrfs_tree_unlock(old);
free_extent_buffer(old);
+ if (ret)
+ goto abort_trans_dput;
/* see comments in should_cow_block() */
root->force_cow = 1;
@@ -968,7 +1014,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
btrfs_tree_unlock(tmp);
free_extent_buffer(tmp);
- BUG_ON(ret);
+ if (ret)
+ goto abort_trans_dput;
/*
* insert root back/forward references
@@ -977,19 +1024,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
parent_root->root_key.objectid,
btrfs_ino(parent_inode), index,
dentry->d_name.name, dentry->d_name.len);
- BUG_ON(ret);
dput(parent);
+ if (ret)
+ goto fail;
key.offset = (u64)-1;
pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
- BUG_ON(IS_ERR(pending->snap));
+ if (IS_ERR(pending->snap)) {
+ ret = PTR_ERR(pending->snap);
+ goto abort_trans;
+ }
- btrfs_reloc_post_snapshot(trans, pending);
+ ret = btrfs_reloc_post_snapshot(trans, pending);
+ if (ret)
+ goto abort_trans;
+ ret = 0;
fail:
kfree(new_root_item);
trans->block_rsv = rsv;
btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
- return 0;
+ return ret;
+
+abort_trans_dput:
+ dput(parent);
+abort_trans:
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
}
/*
@@ -1126,6 +1186,33 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
return 0;
}
+
+static void cleanup_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+
+ WARN_ON(trans->use_count > 1);
+
+ spin_lock(&root->fs_info->trans_lock);
+ list_del_init(&cur_trans->list);
+ spin_unlock(&root->fs_info->trans_lock);
+
+ btrfs_cleanup_one_transaction(trans->transaction, root);
+
+ put_transaction(cur_trans);
+ put_transaction(cur_trans);
+
+ trace_btrfs_transaction_commit(root);
+
+ btrfs_scrub_continue(root);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+}
+
/*
* btrfs_transaction state sequence:
* in_commit = 0, blocked = 0 (initial)
@@ -1137,10 +1224,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
unsigned long joined = 0;
- struct btrfs_transaction *cur_trans;
+ struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
DEFINE_WAIT(wait);
- int ret;
+ int ret = -EIO;
int should_grow = 0;
unsigned long now = get_seconds();
int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
@@ -1150,13 +1237,18 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
+ if (cur_trans->aborted)
+ goto cleanup_transaction;
+
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
ret = btrfs_run_delayed_refs(trans, root, 0);
- BUG_ON(ret);
+ if (ret)
+ goto cleanup_transaction;
cur_trans = trans->transaction;
+
/*
* set the flushing flag so procs in this transaction have to
* start sending their work down.
@@ -1164,19 +1256,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
cur_trans->delayed_refs.flushing = 1;
ret = btrfs_run_delayed_refs(trans, root, 0);
- BUG_ON(ret);
+ if (ret)
+ goto cleanup_transaction;
spin_lock(&cur_trans->commit_lock);
if (cur_trans->in_commit) {
spin_unlock(&cur_trans->commit_lock);
atomic_inc(&cur_trans->use_count);
- btrfs_end_transaction(trans, root);
+ ret = btrfs_end_transaction(trans, root);
wait_for_commit(root, cur_trans);
put_transaction(cur_trans);
- return 0;
+ return ret;
}
trans->transaction->in_commit = 1;
@@ -1216,12 +1309,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (flush_on_commit || snap_pending) {
btrfs_start_delalloc_inodes(root, 1);
- ret = btrfs_wait_ordered_extents(root, 0, 1);
- BUG_ON(ret);
+ btrfs_wait_ordered_extents(root, 0, 1);
}
ret = btrfs_run_delayed_items(trans, root);
- BUG_ON(ret);
+ if (ret)
+ goto cleanup_transaction;
/*
* rename don't use btrfs_join_transaction, so, once we
@@ -1263,13 +1356,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_lock(&root->fs_info->reloc_mutex);
ret = btrfs_run_delayed_items(trans, root);
- BUG_ON(ret);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto cleanup_transaction;
+ }
ret = create_pending_snapshots(trans, root->fs_info);
- BUG_ON(ret);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto cleanup_transaction;
+ }
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- BUG_ON(ret);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto cleanup_transaction;
+ }
/*
* make sure none of the code above managed to slip in a
@@ -1296,7 +1398,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_lock(&root->fs_info->tree_log_mutex);
ret = commit_fs_roots(trans, root);
- BUG_ON(ret);
+ if (ret) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ goto cleanup_transaction;
+ }
/* commit_fs_roots gets rid of all the tree log roots, it is now
* safe to free the root of tree log roots
@@ -1304,7 +1409,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_free_log_root_tree(trans, root->fs_info);
ret = commit_cowonly_roots(trans, root);
- BUG_ON(ret);
+ if (ret) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ goto cleanup_transaction;
+ }
btrfs_prepare_extent_commit(trans, root);
@@ -1338,8 +1446,18 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
wake_up(&root->fs_info->transaction_wait);
ret = btrfs_write_and_wait_transaction(trans, root);
- BUG_ON(ret);
- write_ctree_super(trans, root, 0);
+ if (ret) {
+ btrfs_error(root->fs_info, ret,
+ "Error while writing out transaction.");
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ goto cleanup_transaction;
+ }
+
+ ret = write_ctree_super(trans, root, 0);
+ if (ret) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ goto cleanup_transaction;
+ }
/*
* the super is written, we can safely allow the tree-loggers
@@ -1375,6 +1493,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_run_delayed_iputs(root);
return ret;
+
+cleanup_transaction:
+ btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
+// WARN_ON(1);
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+ cleanup_transaction(trans, root);
+
+ return ret;
}
/*
@@ -1390,6 +1517,8 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
spin_unlock(&fs_info->trans_lock);
while (!list_empty(&list)) {
+ int ret;
+
root = list_entry(list.next, struct btrfs_root, root_list);
list_del(&root->root_list);
@@ -1397,9 +1526,10 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
- btrfs_drop_snapshot(root, NULL, 0, 0);
+ ret = btrfs_drop_snapshot(root, NULL, 0, 0);
else
- btrfs_drop_snapshot(root, NULL, 1, 0);
+ ret =btrfs_drop_snapshot(root, NULL, 1, 0);
+ BUG_ON(ret < 0);
}
return 0;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 02564e6230ac..fe27379e368b 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -43,6 +43,7 @@ struct btrfs_transaction {
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
struct btrfs_delayed_ref_root delayed_refs;
+ int aborted;
};
struct btrfs_trans_handle {
@@ -55,6 +56,7 @@ struct btrfs_trans_handle {
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *orig_rsv;
+ int aborted;
};
struct btrfs_pending_snapshot {
@@ -114,4 +116,5 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
+void put_transaction(struct btrfs_transaction *transaction);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 966cc74f5d6c..d017283ae6f5 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -212,14 +212,13 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
* indicate we're done making changes to the log tree
* and wake up anyone waiting to do a sync
*/
-int btrfs_end_log_trans(struct btrfs_root *root)
+void btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
smp_mb();
if (waitqueue_active(&root->log_writer_wait))
wake_up(&root->log_writer_wait);
}
- return 0;
}
@@ -378,12 +377,11 @@ insert:
u32 found_size;
found_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
- if (found_size > item_size) {
+ if (found_size > item_size)
btrfs_truncate_item(trans, root, path, item_size, 1);
- } else if (found_size < item_size) {
- ret = btrfs_extend_item(trans, root, path,
- item_size - found_size);
- }
+ else if (found_size < item_size)
+ btrfs_extend_item(trans, root, path,
+ item_size - found_size);
} else if (ret) {
return ret;
}
@@ -1763,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_free_and_pin_reserved_extent(root,
bytenr, blocksize);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or logic errors */
}
free_extent_buffer(next);
continue;
@@ -1871,20 +1869,26 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
wret = walk_down_log_tree(trans, log, path, &level, wc);
if (wret > 0)
break;
- if (wret < 0)
+ if (wret < 0) {
ret = wret;
+ goto out;
+ }
wret = walk_up_log_tree(trans, log, path, &level, wc);
if (wret > 0)
break;
- if (wret < 0)
+ if (wret < 0) {
ret = wret;
+ goto out;
+ }
}
/* was the root node processed? if not, catch it here */
if (path->nodes[orig_level]) {
- wc->process_func(log, path->nodes[orig_level], wc,
+ ret = wc->process_func(log, path->nodes[orig_level], wc,
btrfs_header_generation(path->nodes[orig_level]));
+ if (ret)
+ goto out;
if (wc->free) {
struct extent_buffer *next;
@@ -1900,10 +1904,11 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_free_and_pin_reserved_extent(log, next->start,
next->len);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or logic errors */
}
}
+out:
for (i = 0; i <= orig_level; i++) {
if (path->nodes[i]) {
free_extent_buffer(path->nodes[i]);
@@ -1963,8 +1968,8 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
return 0;
}
-static int wait_for_writer(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static void wait_for_writer(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
DEFINE_WAIT(wait);
while (root->fs_info->last_trans_log_full_commit !=
@@ -1978,7 +1983,6 @@ static int wait_for_writer(struct btrfs_trans_handle *trans,
mutex_lock(&root->log_mutex);
finish_wait(&root->log_writer_wait, &wait);
}
- return 0;
}
/*
@@ -2046,7 +2050,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* wait for them until later.
*/
ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ mutex_unlock(&root->log_mutex);
+ goto out;
+ }
btrfs_set_root_node(&log->root_item, log->node);
@@ -2077,7 +2085,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (ret) {
- BUG_ON(ret != -ENOSPC);
+ if (ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, root, ret);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out;
+ }
root->fs_info->last_trans_log_full_commit = trans->transid;
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2117,7 +2129,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_write_and_wait_marked_extents(log_root_tree,
&log_root_tree->dirty_log_pages,
EXTENT_DIRTY | EXTENT_NEW);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ }
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_set_super_log_root(root->fs_info->super_for_commit,
@@ -2326,7 +2342,9 @@ out_unlock:
if (ret == -ENOSPC) {
root->fs_info->last_trans_log_full_commit = trans->transid;
ret = 0;
- }
+ } else if (ret < 0)
+ btrfs_abort_transaction(trans, root, ret);
+
btrfs_end_log_trans(root);
return err;
@@ -2357,7 +2375,8 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
if (ret == -ENOSPC) {
root->fs_info->last_trans_log_full_commit = trans->transid;
ret = 0;
- }
+ } else if (ret < 0 && ret != -ENOENT)
+ btrfs_abort_transaction(trans, root, ret);
btrfs_end_log_trans(root);
return ret;
@@ -3169,13 +3188,20 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
fs_info->log_root_recovering = 1;
trans = btrfs_start_transaction(fs_info->tree_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto error;
+ }
wc.trans = trans;
wc.pin = 1;
ret = walk_log_tree(trans, log_root_tree, &wc);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_error(fs_info, ret, "Failed to pin buffers while "
+ "recovering log root tree.");
+ goto error;
+ }
again:
key.objectid = BTRFS_TREE_LOG_OBJECTID;
@@ -3184,8 +3210,12 @@ again:
while (1) {
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
- if (ret < 0)
- break;
+
+ if (ret < 0) {
+ btrfs_error(fs_info, ret,
+ "Couldn't find tree log root.");
+ goto error;
+ }
if (ret > 0) {
if (path->slots[0] == 0)
break;
@@ -3199,14 +3229,24 @@ again:
log = btrfs_read_fs_root_no_radix(log_root_tree,
&found_key);
- BUG_ON(IS_ERR(log));
+ if (IS_ERR(log)) {
+ ret = PTR_ERR(log);
+ btrfs_error(fs_info, ret,
+ "Couldn't read tree log root.");
+ goto error;
+ }
tmp_key.objectid = found_key.offset;
tmp_key.type = BTRFS_ROOT_ITEM_KEY;
tmp_key.offset = (u64)-1;
wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
- BUG_ON(IS_ERR_OR_NULL(wc.replay_dest));
+ if (IS_ERR(wc.replay_dest)) {
+ ret = PTR_ERR(wc.replay_dest);
+ btrfs_error(fs_info, ret, "Couldn't read target root "
+ "for tree log recovery.");
+ goto error;
+ }
wc.replay_dest->log_root = log;
btrfs_record_root_in_trans(trans, wc.replay_dest);
@@ -3254,6 +3294,10 @@ again:
kfree(log_root_tree);
return 0;
+
+error:
+ btrfs_free_path(path);
+ return ret;
}
/*
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 2270ac58d746..862ac813f6b8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -38,7 +38,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct inode *inode, u64 dirid);
-int btrfs_end_log_trans(struct btrfs_root *root);
+void btrfs_end_log_trans(struct btrfs_root *root);
int btrfs_pin_log_trans(struct btrfs_root *root);
int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7eefdef8a14c..68a1754fe367 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -67,7 +67,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
kfree(fs_devices);
}
-int btrfs_cleanup_fs_uuids(void)
+void btrfs_cleanup_fs_uuids(void)
{
struct btrfs_fs_devices *fs_devices;
@@ -77,7 +77,6 @@ int btrfs_cleanup_fs_uuids(void)
list_del(&fs_devices->list);
free_fs_devices(fs_devices);
}
- return 0;
}
static noinline struct btrfs_device *__find_device(struct list_head *head,
@@ -130,7 +129,7 @@ static void requeue_list(struct btrfs_pending_bios *pending_bios,
* the list if the block device is congested. This way, multiple devices
* can make progress from a single worker thread.
*/
-static noinline int run_scheduled_bios(struct btrfs_device *device)
+static noinline void run_scheduled_bios(struct btrfs_device *device)
{
struct bio *pending;
struct backing_dev_info *bdi;
@@ -316,7 +315,6 @@ loop_lock:
done:
blk_finish_plug(&plug);
- return 0;
}
static void pending_bios_fn(struct btrfs_work *work)
@@ -455,7 +453,7 @@ error:
return ERR_PTR(-ENOMEM);
}
-int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *next;
@@ -503,7 +501,6 @@ again:
fs_devices->latest_trans = latest_transid;
mutex_unlock(&uuid_mutex);
- return 0;
}
static void __free_device(struct work_struct *work)
@@ -552,10 +549,10 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
fs_devices->num_can_discard--;
new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
- BUG_ON(!new_device);
+ BUG_ON(!new_device); /* -ENOMEM */
memcpy(new_device, device, sizeof(*new_device));
new_device->name = kstrdup(device->name, GFP_NOFS);
- BUG_ON(device->name && !new_device->name);
+ BUG_ON(device->name && !new_device->name); /* -ENOMEM */
new_device->bdev = NULL;
new_device->writeable = 0;
new_device->in_fs_metadata = 0;
@@ -723,8 +720,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
u64 devid;
u64 transid;
- mutex_lock(&uuid_mutex);
-
flags |= FMODE_EXCL;
bdev = blkdev_get_by_path(path, flags, holder);
@@ -733,6 +728,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
goto error;
}
+ mutex_lock(&uuid_mutex);
ret = set_blocksize(bdev, 4096);
if (ret)
goto error_close;
@@ -754,9 +750,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
brelse(bh);
error_close:
+ mutex_unlock(&uuid_mutex);
blkdev_put(bdev, flags);
error:
- mutex_unlock(&uuid_mutex);
return ret;
}
@@ -1040,8 +1036,10 @@ again:
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
+ } else {
+ btrfs_error(root->fs_info, ret, "Slot search failed");
+ goto out;
}
- BUG_ON(ret);
if (device->bytes_used > 0) {
u64 len = btrfs_dev_extent_length(leaf, extent);
@@ -1051,7 +1049,10 @@ again:
spin_unlock(&root->fs_info->free_chunk_lock);
}
ret = btrfs_del_item(trans, root, path);
-
+ if (ret) {
+ btrfs_error(root->fs_info, ret,
+ "Failed to remove dev extent item");
+ }
out:
btrfs_free_path(path);
return ret;
@@ -1079,7 +1080,8 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*extent));
- BUG_ON(ret);
+ if (ret)
+ goto out;
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1094,6 +1096,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
btrfs_mark_buffer_dirty(leaf);
+out:
btrfs_free_path(path);
return ret;
}
@@ -1119,7 +1122,7 @@ static noinline int find_next_chunk(struct btrfs_root *root,
if (ret < 0)
goto error;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
if (ret) {
@@ -1163,7 +1166,7 @@ static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
if (ret < 0)
goto error;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
BTRFS_DEV_ITEM_KEY);
@@ -1597,7 +1600,7 @@ next_slot:
(unsigned long)btrfs_device_fsid(dev_item),
BTRFS_UUID_SIZE);
device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
- BUG_ON(!device);
+ BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
btrfs_set_device_generation(leaf, dev_item,
@@ -1707,7 +1710,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if (seeding_dev) {
sb->s_flags &= ~MS_RDONLY;
ret = btrfs_prepare_sprout(root);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
device->fs_devices = root->fs_info->fs_devices;
@@ -1745,11 +1748,15 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if (seeding_dev) {
ret = init_first_rw_device(trans, root, device);
- BUG_ON(ret);
+ if (ret)
+ goto error_trans;
ret = btrfs_finish_sprout(trans, root);
- BUG_ON(ret);
+ if (ret)
+ goto error_trans;
} else {
ret = btrfs_add_device(trans, root, device);
+ if (ret)
+ goto error_trans;
}
/*
@@ -1759,17 +1766,31 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
btrfs_clear_space_info_full(root->fs_info);
unlock_chunks(root);
- btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
+ if (ret) /* transaction commit */
+ return ret;
+
ret = btrfs_relocate_sys_chunks(root);
- BUG_ON(ret);
+ if (ret < 0)
+ btrfs_error(root->fs_info, ret,
+ "Failed to relocate sys chunks after "
+ "device initialization. This can be fixed "
+ "using the \"btrfs balance\" command.");
}
return ret;
+
+error_trans:
+ unlock_chunks(root);
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ kfree(device->name);
+ kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
if (seeding_dev) {
@@ -1877,10 +1898,20 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
key.type = BTRFS_CHUNK_ITEM_KEY;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- BUG_ON(ret);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0) { /* Logic error or corruption */
+ btrfs_error(root->fs_info, -ENOENT,
+ "Failed lookup while freeing chunk.");
+ ret = -ENOENT;
+ goto out;
+ }
ret = btrfs_del_item(trans, root, path);
-
+ if (ret < 0)
+ btrfs_error(root->fs_info, ret,
+ "Failed to delete chunk item.");
+out:
btrfs_free_path(path);
return ret;
}
@@ -2042,7 +2073,7 @@ again:
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
@@ -2251,15 +2282,13 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info)
* Balance filters. Return 1 if chunk should be filtered out
* (should not be balanced).
*/
-static int chunk_profiles_filter(u64 chunk_profile,
+static int chunk_profiles_filter(u64 chunk_type,
struct btrfs_balance_args *bargs)
{
- chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+ chunk_type = chunk_to_extended(chunk_type) &
+ BTRFS_EXTENDED_PROFILE_MASK;
- if (chunk_profile == 0)
- chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-
- if (bargs->profiles & chunk_profile)
+ if (bargs->profiles & chunk_type)
return 0;
return 1;
@@ -2366,18 +2395,16 @@ static int chunk_vrange_filter(struct extent_buffer *leaf,
return 1;
}
-static int chunk_soft_convert_filter(u64 chunk_profile,
+static int chunk_soft_convert_filter(u64 chunk_type,
struct btrfs_balance_args *bargs)
{
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return 0;
- chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
-
- if (chunk_profile == 0)
- chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ chunk_type = chunk_to_extended(chunk_type) &
+ BTRFS_EXTENDED_PROFILE_MASK;
- if (bargs->target & chunk_profile)
+ if (bargs->target == chunk_type)
return 1;
return 0;
@@ -2603,6 +2630,30 @@ error:
return ret;
}
+/**
+ * alloc_profile_is_valid - see if a given profile is valid and reduced
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
+ */
+static int alloc_profile_is_valid(u64 flags, int extended)
+{
+ u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
+ BTRFS_BLOCK_GROUP_PROFILE_MASK);
+
+ flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ /* 1) check that all other bits are zeroed */
+ if (flags & ~mask)
+ return 0;
+
+ /* 2) see if profile is reduced */
+ if (flags == 0)
+ return !extended; /* "0" is valid for usual profiles */
+
+ /* true if exactly one bit set */
+ return (flags & (flags - 1)) == 0;
+}
+
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
{
/* cancel requested || normal exit path */
@@ -2631,6 +2682,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
{
struct btrfs_fs_info *fs_info = bctl->fs_info;
u64 allowed;
+ int mixed = 0;
int ret;
if (btrfs_fs_closing(fs_info) ||
@@ -2640,13 +2692,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
goto out;
}
+ allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+ if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ mixed = 1;
+
/*
* In case of mixed groups both data and meta should be picked,
* and identical options should be given for both of them.
*/
- allowed = btrfs_super_incompat_flags(fs_info->super_copy);
- if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
- (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
+ allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
+ if (mixed && (bctl->flags & allowed)) {
if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
!(bctl->flags & BTRFS_BALANCE_METADATA) ||
memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
@@ -2657,14 +2712,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
}
- /*
- * Profile changing sanity checks. Skip them if a simple
- * balance is requested.
- */
- if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
- BTRFS_BALANCE_ARGS_CONVERT))
- goto do_balance;
-
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
if (fs_info->fs_devices->num_devices == 1)
allowed |= BTRFS_BLOCK_GROUP_DUP;
@@ -2674,24 +2721,27 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10);
- if (!profile_is_valid(bctl->data.target, 1) ||
- bctl->data.target & ~allowed) {
+ if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl->data.target, 1) ||
+ (bctl->data.target & ~allowed))) {
printk(KERN_ERR "btrfs: unable to start balance with target "
"data profile %llu\n",
(unsigned long long)bctl->data.target);
ret = -EINVAL;
goto out;
}
- if (!profile_is_valid(bctl->meta.target, 1) ||
- bctl->meta.target & ~allowed) {
+ if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl->meta.target, 1) ||
+ (bctl->meta.target & ~allowed))) {
printk(KERN_ERR "btrfs: unable to start balance with target "
"metadata profile %llu\n",
(unsigned long long)bctl->meta.target);
ret = -EINVAL;
goto out;
}
- if (!profile_is_valid(bctl->sys.target, 1) ||
- bctl->sys.target & ~allowed) {
+ if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl->sys.target, 1) ||
+ (bctl->sys.target & ~allowed))) {
printk(KERN_ERR "btrfs: unable to start balance with target "
"system profile %llu\n",
(unsigned long long)bctl->sys.target);
@@ -2699,7 +2749,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
goto out;
}
- if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
+ /* allow dup'ed data chunks only in mixed mode */
+ if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
printk(KERN_ERR "btrfs: dup for data is not allowed\n");
ret = -EINVAL;
goto out;
@@ -2725,7 +2777,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
}
-do_balance:
ret = insert_balance_item(fs_info->tree_root, bctl);
if (ret && ret != -EEXIST)
goto out;
@@ -2968,7 +3019,7 @@ again:
key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
- while (1) {
+ do {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto done;
@@ -3010,8 +3061,7 @@ again:
goto done;
if (ret == -ENOSPC)
failed++;
- key.offset -= 1;
- }
+ } while (key.offset-- > 0);
if (failed && !retried) {
failed = 0;
@@ -3129,11 +3179,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
int i;
int j;
- if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
- (type & BTRFS_BLOCK_GROUP_DUP)) {
- WARN_ON(1);
- type &= ~BTRFS_BLOCK_GROUP_DUP;
- }
+ BUG_ON(!alloc_profile_is_valid(type, 0));
if (list_empty(&fs_devices->alloc_list))
return -ENOSPC;
@@ -3329,13 +3375,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
- BUG_ON(ret);
free_extent_map(em);
+ if (ret)
+ goto error;
ret = btrfs_make_block_group(trans, extent_root, 0, type,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
start, num_bytes);
- BUG_ON(ret);
+ if (ret)
+ goto error;
for (i = 0; i < map->num_stripes; ++i) {
struct btrfs_device *device;
@@ -3348,7 +3396,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
info->chunk_root->root_key.objectid,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
start, dev_offset, stripe_size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto error;
+ }
}
kfree(devices_info);
@@ -3384,7 +3435,8 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
device = map->stripes[index].dev;
device->bytes_used += stripe_size;
ret = btrfs_update_device(trans, device);
- BUG_ON(ret);
+ if (ret)
+ goto out_free;
index++;
}
@@ -3421,16 +3473,19 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
key.offset = chunk_offset;
ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
- BUG_ON(ret);
- if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ /*
+ * TODO: Cleanup of inserted chunk root in case of
+ * failure.
+ */
ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
item_size);
- BUG_ON(ret);
}
+out_free:
kfree(chunk);
- return 0;
+ return ret;
}
/*
@@ -3462,7 +3517,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
chunk_size, stripe_size);
- BUG_ON(ret);
+ if (ret)
+ return ret;
return 0;
}
@@ -3494,7 +3550,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
&stripe_size, chunk_offset, alloc_profile);
- BUG_ON(ret);
+ if (ret)
+ return ret;
sys_chunk_offset = chunk_offset + chunk_size;
@@ -3505,10 +3562,12 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
&sys_chunk_size, &sys_stripe_size,
sys_chunk_offset, alloc_profile);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
ret = btrfs_add_device(trans, fs_info->chunk_root, device);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
/*
* Modifying chunk tree needs allocating new blocks from both
@@ -3518,13 +3577,20 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
*/
ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
chunk_size, stripe_size);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
ret = __finish_chunk_alloc(trans, extent_root, sys_map,
sys_chunk_offset, sys_chunk_size,
sys_stripe_size);
- BUG_ON(ret);
+ if (ret)
+ goto abort;
+
return 0;
+
+abort:
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
}
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
@@ -3875,7 +3941,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
do_div(length, map->num_stripes);
buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
- BUG_ON(!buf);
+ BUG_ON(!buf); /* -ENOMEM */
for (i = 0; i < map->num_stripes; i++) {
if (devid && map->stripes[i].dev->devid != devid)
@@ -3968,7 +4034,7 @@ struct async_sched {
* This will add one bio to the pending list for a device and make sure
* the work struct is scheduled.
*/
-static noinline int schedule_bio(struct btrfs_root *root,
+static noinline void schedule_bio(struct btrfs_root *root,
struct btrfs_device *device,
int rw, struct bio *bio)
{
@@ -3980,7 +4046,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
bio_get(bio);
btrfsic_submit_bio(rw, bio);
bio_put(bio);
- return 0;
+ return;
}
/*
@@ -4014,7 +4080,6 @@ static noinline int schedule_bio(struct btrfs_root *root,
if (should_queue)
btrfs_queue_worker(&root->fs_info->submit_workers,
&device->work);
- return 0;
}
int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
@@ -4037,7 +4102,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
mirror_num);
- BUG_ON(ret);
+ if (ret) /* -ENOMEM */
+ return ret;
total_devs = bbio->num_stripes;
if (map_length < length) {
@@ -4056,7 +4122,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
while (dev_nr < total_devs) {
if (dev_nr < total_devs - 1) {
bio = bio_clone(first_bio, GFP_NOFS);
- BUG_ON(!bio);
+ BUG_ON(!bio); /* -ENOMEM */
} else {
bio = first_bio;
}
@@ -4210,13 +4276,13 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
write_lock(&map_tree->map_tree.lock);
ret = add_extent_mapping(&map_tree->map_tree, em);
write_unlock(&map_tree->map_tree.lock);
- BUG_ON(ret);
+ BUG_ON(ret); /* Tree corruption */
free_extent_map(em);
return 0;
}
-static int fill_device_from_item(struct extent_buffer *leaf,
+static void fill_device_from_item(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item,
struct btrfs_device *device)
{
@@ -4233,8 +4299,6 @@ static int fill_device_from_item(struct extent_buffer *leaf,
ptr = (unsigned long)btrfs_device_uuid(dev_item);
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
-
- return 0;
}
static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
@@ -4385,7 +4449,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
* to silence the warning eg. on PowerPC 64.
*/
if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
- SetPageUptodate(sb->first_page);
+ SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 19ac95048b88..bb6b03f97aaa 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -260,12 +260,12 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
int btrfs_add_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_root *root, char *device_path);
-int btrfs_cleanup_fs_uuids(void);
+void btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
diff --git a/fs/buffer.c b/fs/buffer.c
index 19d8eb7fdc81..1a30db77af32 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,7 +41,6 @@
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
-#include <linux/cleancache.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
@@ -231,55 +230,6 @@ out:
return ret;
}
-/* If invalidate_buffers() will trash dirty buffers, it means some kind
- of fs corruption is going on. Trashing dirty data always imply losing
- information that was supposed to be just stored on the physical layer
- by the user.
-
- Thus invalidate_buffers in general usage is not allwowed to trash
- dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
- be preserved. These buffers are simply skipped.
-
- We also skip buffers which are still in use. For example this can
- happen if a userspace program is reading the block device.
-
- NOTE: In the case where the user removed a removable-media-disk even if
- there's still dirty data not synced on disk (due a bug in the device driver
- or due an error of the user), by not destroying the dirty buffers we could
- generate corruption also on the next media inserted, thus a parameter is
- necessary to handle this case in the most safe way possible (trying
- to not corrupt also the new disk inserted with the data belonging to
- the old now corrupted disk). Also for the ramdisk the natural thing
- to do in order to release the ramdisk memory is to destroy dirty buffers.
-
- These are two special cases. Normal usage imply the device driver
- to issue a sync on the device (without waiting I/O completion) and
- then an invalidate_buffers call that doesn't trash dirty buffers.
-
- For handling cache coherency with the blkdev pagecache the 'update' case
- is been introduced. It is needed to re-read from disk any pinned
- buffer. NOTE: re-reading from disk is destructive so we can do it only
- when we assume nobody is changing the buffercache under our I/O and when
- we think the disk contains more recent information than the buffercache.
- The update == 1 pass marks the buffers we need to update, the update == 2
- pass does the actual I/O. */
-void invalidate_bdev(struct block_device *bdev)
-{
- struct address_space *mapping = bdev->bd_inode->i_mapping;
-
- if (mapping->nrpages == 0)
- return;
-
- invalidate_bh_lrus();
- lru_add_drain_all(); /* make sure all lru add caches are flushed */
- invalidate_mapping_pages(mapping, 0, -1);
- /* 99% of the time, we don't need to flush the cleancache on the bdev.
- * But, for the strange corners, lets be cautious
- */
- cleancache_flush_inode(mapping);
-}
-EXPORT_SYMBOL(invalidate_bdev);
-
/*
* Kick the writeback threads then try to free up some ZONE_NORMAL memory.
*/
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 1064805e653b..67bef6d01484 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -11,7 +11,6 @@
#include <linux/slab.h>
#include <linux/mount.h>
-#include <linux/buffer_head.h>
#include "internal.h"
#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8b53193e4f7c..620daad201db 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -641,10 +641,10 @@ static int __cap_is_valid(struct ceph_cap *cap)
unsigned long ttl;
u32 gen;
- spin_lock(&cap->session->s_cap_lock);
+ spin_lock(&cap->session->s_gen_ttl_lock);
gen = cap->session->s_cap_gen;
ttl = cap->session->s_cap_ttl;
- spin_unlock(&cap->session->s_cap_lock);
+ spin_unlock(&cap->session->s_gen_ttl_lock);
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
dout("__cap_is_valid %p cap %p issued %s "
@@ -928,7 +928,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime,
u64 time_warp_seq,
- uid_t uid, gid_t gid, mode_t mode,
+ uid_t uid, gid_t gid, umode_t mode,
u64 xattr_version,
struct ceph_buffer *xattrs_buf,
u64 follows)
@@ -1078,7 +1078,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
u64 size, max_size;
struct timespec mtime, atime;
int wake = 0;
- mode_t mode;
+ umode_t mode;
uid_t uid;
gid_t gid;
struct ceph_mds_session *session;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 98954003a8d3..3e8094be4604 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -666,7 +666,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
}
static int ceph_mknod(struct inode *dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -676,7 +676,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
- dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
+ dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
dir, dentry, mode, rdev);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
if (IS_ERR(req)) {
@@ -699,7 +699,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
return err;
}
-static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
+static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
dout("create in dir %p dentry %p name '%.*s'\n",
@@ -753,7 +753,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
return err;
}
-static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -767,7 +767,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
dentry->d_name.len, dentry->d_name.name, dentry);
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
- dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
+ dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
op = CEPH_MDS_OP_MKDIR;
} else {
goto out;
@@ -870,7 +870,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
dout("unlink/rmdir dir %p dn %p inode %p\n",
dir, dentry, inode);
- op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
+ op = S_ISDIR(dentry->d_inode->i_mode) ?
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
} else
goto out;
@@ -973,12 +973,12 @@ static int dentry_lease_is_valid(struct dentry *dentry)
spin_lock(&dentry->d_lock);
di = ceph_dentry(dentry);
- if (di && di->lease_session) {
+ if (di->lease_session) {
s = di->lease_session;
- spin_lock(&s->s_cap_lock);
+ spin_lock(&s->s_gen_ttl_lock);
gen = s->s_cap_gen;
ttl = s->s_cap_ttl;
- spin_unlock(&s->s_cap_lock);
+ spin_unlock(&s->s_gen_ttl_lock);
if (di->lease_gen == gen &&
time_before(jiffies, dentry->d_time) &&
@@ -1072,13 +1072,11 @@ static void ceph_d_release(struct dentry *dentry)
struct ceph_dentry_info *di = ceph_dentry(dentry);
dout("d_release %p\n", dentry);
- if (di) {
- ceph_dentry_lru_del(dentry);
- if (di->lease_session)
- ceph_put_mds_session(di->lease_session);
- kmem_cache_free(ceph_dentry_cachep, di);
- dentry->d_fsdata = NULL;
- }
+ ceph_dentry_lru_del(dentry);
+ if (di->lease_session)
+ ceph_put_mds_session(di->lease_session);
+ kmem_cache_free(ceph_dentry_cachep, di);
+ dentry->d_fsdata = NULL;
}
static int ceph_snapdir_d_revalidate(struct dentry *dentry,
@@ -1096,17 +1094,36 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
*/
void ceph_dir_set_complete(struct inode *inode)
{
- /* not yet implemented */
+ struct dentry *dentry = d_find_any_alias(inode);
+
+ if (dentry && ceph_dentry(dentry) &&
+ ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) {
+ dout(" marking %p (%p) complete\n", inode, dentry);
+ set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+ }
+ dput(dentry);
}
void ceph_dir_clear_complete(struct inode *inode)
{
- /* not yet implemented */
+ struct dentry *dentry = d_find_any_alias(inode);
+
+ if (dentry && ceph_dentry(dentry)) {
+ dout(" marking %p (%p) complete\n", inode, dentry);
+ set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+ }
+ dput(dentry);
}
bool ceph_dir_test_complete(struct inode *inode)
{
- /* not yet implemented */
+ struct dentry *dentry = d_find_any_alias(inode);
+
+ if (dentry && ceph_dentry(dentry)) {
+ dout(" marking %p (%p) NOT complete\n", inode, dentry);
+ clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+ }
+ dput(dentry);
return false;
}
@@ -1220,6 +1237,7 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
do {
ceph_mdsc_get_request(req);
spin_unlock(&ci->i_unsafe_lock);
+
dout("dir_fsync %p wait on tid %llu (until %llu)\n",
inode, req->r_tid, last_tid);
if (req->r_timeout) {
@@ -1232,9 +1250,9 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
} else {
wait_for_completion(&req->r_safe_completion);
}
- spin_lock(&ci->i_unsafe_lock);
ceph_mdsc_put_request(req);
+ spin_lock(&ci->i_unsafe_lock);
if (ret || list_empty(head))
break;
req = list_entry(head->next,
@@ -1259,13 +1277,11 @@ void ceph_dentry_lru_add(struct dentry *dn)
dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name);
- if (di) {
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_add_tail(&di->lru, &mdsc->dentry_lru);
- mdsc->num_dentry++;
- spin_unlock(&mdsc->dentry_lru_lock);
- }
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_add_tail(&di->lru, &mdsc->dentry_lru);
+ mdsc->num_dentry++;
+ spin_unlock(&mdsc->dentry_lru_lock);
}
void ceph_dentry_lru_touch(struct dentry *dn)
@@ -1275,12 +1291,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
dn->d_name.len, dn->d_name.name, di->offset);
- if (di) {
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_move_tail(&di->lru, &mdsc->dentry_lru);
- spin_unlock(&mdsc->dentry_lru_lock);
- }
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_move_tail(&di->lru, &mdsc->dentry_lru);
+ spin_unlock(&mdsc->dentry_lru_lock);
}
void ceph_dentry_lru_del(struct dentry *dn)
@@ -1290,13 +1304,11 @@ void ceph_dentry_lru_del(struct dentry *dn)
dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name);
- if (di) {
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_del_init(&di->lru);
- mdsc->num_dentry--;
- spin_unlock(&mdsc->dentry_lru_lock);
- }
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_del_init(&di->lru);
+ mdsc->num_dentry--;
+ spin_unlock(&mdsc->dentry_lru_lock);
}
/*
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9fbcdecaaccd..fbb2a643ef10 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -56,9 +56,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
return -EINVAL;
spin_lock(&dentry->d_lock);
- parent = dget(dentry->d_parent);
- spin_unlock(&dentry->d_lock);
-
+ parent = dentry->d_parent;
if (*max_len >= connected_handle_length) {
dout("encode_fh %p connectable\n", dentry);
cfh->ino = ceph_ino(dentry->d_inode);
@@ -81,7 +79,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
*max_len = handle_length;
type = 255;
}
- dput(parent);
+ spin_unlock(&dentry->d_lock);
return type;
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 87fb132fb330..2c489378b4cd 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -384,7 +384,6 @@ static void ceph_i_callback(struct rcu_head *head)
struct inode *inode = container_of(head, struct inode, i_rcu);
struct ceph_inode_info *ci = ceph_inode(inode);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(ceph_inode_cachep, ci);
}
@@ -851,11 +850,12 @@ static void ceph_set_dentry_offset(struct dentry *dn)
{
struct dentry *dir = dn->d_parent;
struct inode *inode = dir->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_info *ci;
struct ceph_dentry_info *di;
BUG_ON(!inode);
+ ci = ceph_inode(inode);
di = ceph_dentry(dn);
spin_lock(&ci->i_ceph_lock);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6203d805eb45..866e8d7ca37d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -262,6 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg,
/* trace */
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
+ ceph_decode_need(&p, end, len, bad);
err = parse_reply_info_trace(&p, p+len, info, features);
if (err < 0)
goto out_bad;
@@ -270,6 +271,7 @@ static int parse_reply_info(struct ceph_msg *msg,
/* extra */
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
+ ceph_decode_need(&p, end, len, bad);
err = parse_reply_info_extra(&p, p+len, info, features);
if (err < 0)
goto out_bad;
@@ -398,9 +400,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
s->s_con.peer_name.num = cpu_to_le64(mds);
- spin_lock_init(&s->s_cap_lock);
+ spin_lock_init(&s->s_gen_ttl_lock);
s->s_cap_gen = 0;
s->s_cap_ttl = 0;
+
+ spin_lock_init(&s->s_cap_lock);
s->s_renew_requested = 0;
s->s_renew_seq = 0;
INIT_LIST_HEAD(&s->s_caps);
@@ -2326,10 +2330,10 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_STALE:
pr_info("mds%d caps went stale, renewing\n",
session->s_mds);
- spin_lock(&session->s_cap_lock);
+ spin_lock(&session->s_gen_ttl_lock);
session->s_cap_gen++;
session->s_cap_ttl = 0;
- spin_unlock(&session->s_cap_lock);
+ spin_unlock(&session->s_gen_ttl_lock);
send_renew_caps(mdsc, session);
break;
@@ -2772,7 +2776,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
di = ceph_dentry(dentry);
switch (h->action) {
case CEPH_MDS_LEASE_REVOKE:
- if (di && di->lease_session == session) {
+ if (di->lease_session == session) {
if (ceph_seq_cmp(di->lease_seq, seq) > 0)
h->seq = cpu_to_le32(di->lease_seq);
__ceph_mdsc_drop_dentry_lease(dentry);
@@ -2781,7 +2785,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
break;
case CEPH_MDS_LEASE_RENEW:
- if (di && di->lease_session == session &&
+ if (di->lease_session == session &&
di->lease_gen == session->s_cap_gen &&
di->lease_renew_from &&
di->lease_renew_after == 0) {
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index a50ca0e39475..8c7c04ebb595 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -117,10 +117,13 @@ struct ceph_mds_session {
void *s_authorizer_buf, *s_authorizer_reply_buf;
size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
- /* protected by s_cap_lock */
- spinlock_t s_cap_lock;
+ /* protected by s_gen_ttl_lock */
+ spinlock_t s_gen_ttl_lock;
u32 s_cap_gen; /* inc each time we get mds stale msg */
unsigned long s_cap_ttl; /* when session caps expire */
+
+ /* protected by s_cap_lock */
+ spinlock_t s_cap_lock;
struct list_head s_caps; /* all caps issued by this session */
int s_nr_caps, s_trim_caps;
int s_num_cap_releases;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index b48f15f101a0..00de2c9568cd 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -131,6 +131,8 @@ enum {
Opt_rbytes,
Opt_norbytes,
Opt_noasyncreaddir,
+ Opt_dcache,
+ Opt_nodcache,
Opt_ino32,
};
@@ -152,6 +154,8 @@ static match_table_t fsopt_tokens = {
{Opt_rbytes, "rbytes"},
{Opt_norbytes, "norbytes"},
{Opt_noasyncreaddir, "noasyncreaddir"},
+ {Opt_dcache, "dcache"},
+ {Opt_nodcache, "nodcache"},
{Opt_ino32, "ino32"},
{-1, NULL}
};
@@ -231,6 +235,12 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_noasyncreaddir:
fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
break;
+ case Opt_dcache:
+ fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
+ break;
+ case Opt_nodcache:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
+ break;
case Opt_ino32:
fsopt->flags |= CEPH_MOUNT_OPT_INO32;
break;
@@ -341,11 +351,11 @@ out:
/**
* ceph_show_options - Show mount options in /proc/mounts
* @m: seq_file to write to
- * @mnt: mount descriptor
+ * @root: root of that (sub)tree
*/
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int ceph_show_options(struct seq_file *m, struct dentry *root)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
struct ceph_mount_options *fsopt = fsc->mount_options;
struct ceph_options *opt = fsc->client->options;
@@ -377,6 +387,10 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
seq_puts(m, ",norbytes");
if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
seq_puts(m, ",noasyncreaddir");
+ if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE)
+ seq_puts(m, ",dcache");
+ else
+ seq_puts(m, ",nodcache");
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
@@ -636,19 +650,26 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err == 0) {
+ struct inode *inode = req->r_target_inode;
+ req->r_target_inode = NULL;
dout("open_root_inode success\n");
- if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
+ if (ceph_ino(inode) == CEPH_INO_ROOT &&
fsc->sb->s_root == NULL) {
- root = d_alloc_root(req->r_target_inode);
- ceph_init_dentry(root);
+ root = d_alloc_root(inode);
+ if (!root) {
+ iput(inode);
+ root = ERR_PTR(-ENOMEM);
+ goto out;
+ }
} else {
- root = d_obtain_alias(req->r_target_inode);
+ root = d_obtain_alias(inode);
}
- req->r_target_inode = NULL;
+ ceph_init_dentry(root);
dout("open_root_inode success, root dentry is %p\n", root);
} else {
root = ERR_PTR(err);
}
+out:
ceph_mdsc_put_request(req);
return root;
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index edcbf3774a56..1421f3d875a2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -28,6 +28,7 @@
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
+#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
@@ -136,7 +137,7 @@ struct ceph_cap_snap {
int issued, dirty;
struct ceph_snap_context *context;
- mode_t mode;
+ umode_t mode;
uid_t uid;
gid_t gid;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a5e36e4488a7..a76f697303d9 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -111,8 +111,10 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
}
static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
+ { true, "ceph.file.layout", ceph_vxattrcb_layout},
+ /* The following extended attribute name is deprecated */
{ true, "ceph.layout", ceph_vxattrcb_layout},
- { NULL, NULL }
+ { true, NULL, NULL }
};
static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
@@ -818,6 +820,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
int issued;
int err;
+ int required_blob_size;
int dirty;
if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -833,14 +836,34 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
return -EOPNOTSUPP;
}
+ err = -ENOMEM;
spin_lock(&ci->i_ceph_lock);
__build_xattrs(inode);
+retry:
issued = __ceph_caps_issued(ci, NULL);
dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
if (!(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync;
+ required_blob_size = __get_required_blob_size(ci, 0, 0);
+
+ if (!ci->i_xattrs.prealloc_blob ||
+ required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+ struct ceph_buffer *blob;
+
+ spin_unlock(&ci->i_ceph_lock);
+ dout(" preaallocating new blob size=%d\n", required_blob_size);
+ blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
+ if (!blob)
+ goto out;
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_xattrs.prealloc_blob)
+ ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+ ci->i_xattrs.prealloc_blob = blob;
+ goto retry;
+ }
+
err = __remove_xattr_by_name(ceph_inode(inode), name);
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
ci->i_xattrs.dirty = true;
@@ -853,6 +876,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
do_sync:
spin_unlock(&ci->i_ceph_lock);
err = ceph_send_removexattr(dentry, name);
+out:
return err;
}
diff --git a/fs/char_dev.c b/fs/char_dev.c
index dca9e5e0f73b..3f152b92a94a 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -272,7 +272,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
cd = __register_chrdev_region(major, baseminor, count, name);
if (IS_ERR(cd))
return PTR_ERR(cd);
-
+
cdev = cdev_alloc();
if (!cdev)
goto out2;
@@ -280,7 +280,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
cdev->owner = fops->owner;
cdev->ops = fops;
kobject_set_name(&cdev->kobj, "%s", name);
-
+
err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
if (err)
goto out;
@@ -405,7 +405,7 @@ static int chrdev_open(struct inode *inode, struct file *filp)
goto out_cdev_put;
if (filp->f_op->open) {
- ret = filp->f_op->open(inode,filp);
+ ret = filp->f_op->open(inode, filp);
if (ret)
goto out_cdev_put;
}
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index f66cc1625150..2b243af70aa3 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -139,8 +139,7 @@ config CIFS_DFS_UPCALL
points. If unsure, say N.
config CIFS_FSCACHE
- bool "Provide CIFS client caching support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "Provide CIFS client caching support"
depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
help
Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
@@ -148,8 +147,8 @@ config CIFS_FSCACHE
manager. If unsure, say N.
config CIFS_ACL
- bool "Provide CIFS ACL support (EXPERIMENTAL)"
- depends on EXPERIMENTAL && CIFS_XATTR && KEYS
+ bool "Provide CIFS ACL support"
+ depends on CIFS_XATTR && KEYS
help
Allows to fetch CIFS/NTFS ACL from the server. The DACL blob
is handed over to the application/caller.
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 84e8c0724704..24b3dfc05282 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -676,14 +676,23 @@ static ssize_t cifs_multiuser_mount_proc_write(struct file *file,
{
char c;
int rc;
+ static bool warned;
rc = get_user(c, buffer);
if (rc)
return rc;
if (c == '0' || c == 'n' || c == 'N')
multiuser_mount = 0;
- else if (c == '1' || c == 'y' || c == 'Y')
+ else if (c == '1' || c == 'y' || c == 'Y') {
multiuser_mount = 1;
+ if (!warned) {
+ warned = true;
+ printk(KERN_WARNING "CIFS VFS: The legacy multiuser "
+ "mount code is scheduled to be deprecated in "
+ "3.5. Please switch to using the multiuser "
+ "mount option.");
+ }
+ }
return count;
}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 500d65859279..c865bfdfe819 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -59,8 +59,8 @@ struct cifs_sb_info {
gid_t mnt_gid;
uid_t mnt_backupuid;
gid_t mnt_backupgid;
- mode_t mnt_file_mode;
- mode_t mnt_dir_mode;
+ umode_t mnt_file_mode;
+ umode_t mnt_dir_mode;
unsigned int mnt_cifs_flags;
char *mountdata; /* options received at mount time or via DFS refs */
struct backing_dev_info bdi;
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 2272fd5fe5b7..e622863b292f 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -113,9 +113,11 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
MAX_MECH_STR_LEN +
UID_KEY_LEN + (sizeof(uid_t) * 2) +
CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
- USER_KEY_LEN + strlen(sesInfo->user_name) +
PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
+ if (sesInfo->user_name)
+ desc_len += USER_KEY_LEN + strlen(sesInfo->user_name);
+
spnego_key = ERR_PTR(-ENOMEM);
description = kzalloc(desc_len, GFP_KERNEL);
if (description == NULL)
@@ -152,8 +154,10 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
dp = description + strlen(description);
sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
- dp = description + strlen(description);
- sprintf(dp, ";user=%s", sesInfo->user_name);
+ if (sesInfo->user_name) {
+ dp = description + strlen(description);
+ sprintf(dp, ";user=%s", sesInfo->user_name);
+ }
dp = description + strlen(description);
sprintf(dp, ";pid=0x%x", current->pid);
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 1b2e180b018d..fbb9da951843 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -27,17 +27,17 @@
#include "cifs_debug.h"
/*
- * cifs_ucs2_bytes - how long will a string be after conversion?
- * @ucs - pointer to input string
+ * cifs_utf16_bytes - how long will a string be after conversion?
+ * @utf16 - pointer to input string
* @maxbytes - don't go past this many bytes of input string
* @codepage - destination codepage
*
- * Walk a ucs2le string and return the number of bytes that the string will
+ * Walk a utf16le string and return the number of bytes that the string will
* be after being converted to the given charset, not including any null
* termination required. Don't walk past maxbytes in the source buffer.
*/
int
-cifs_ucs2_bytes(const __le16 *from, int maxbytes,
+cifs_utf16_bytes(const __le16 *from, int maxbytes,
const struct nls_table *codepage)
{
int i;
@@ -122,7 +122,7 @@ cp_convert:
}
/*
- * cifs_from_ucs2 - convert utf16le string to local charset
+ * cifs_from_utf16 - convert utf16le string to local charset
* @to - destination buffer
* @from - source buffer
* @tolen - destination buffer size (in bytes)
@@ -130,7 +130,7 @@ cp_convert:
* @codepage - codepage to which characters should be converted
* @mapchar - should characters be remapped according to the mapchars option?
*
- * Convert a little-endian ucs2le string (as sent by the server) to a string
+ * Convert a little-endian utf16le string (as sent by the server) to a string
* in the provided codepage. The tolen and fromlen parameters are to ensure
* that the code doesn't walk off of the end of the buffer (which is always
* a danger if the alignment of the source buffer is off). The destination
@@ -139,12 +139,12 @@ cp_convert:
* null terminator).
*
* Note that some windows versions actually send multiword UTF-16 characters
- * instead of straight UCS-2. The linux nls routines however aren't able to
+ * instead of straight UTF16-2. The linux nls routines however aren't able to
* deal with those characters properly. In the event that we get some of
* those characters, they won't be translated properly.
*/
int
-cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
+cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
const struct nls_table *codepage, bool mapchar)
{
int i, charlen, safelen;
@@ -190,13 +190,13 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
}
/*
- * NAME: cifs_strtoUCS()
+ * NAME: cifs_strtoUTF16()
*
* FUNCTION: Convert character string to unicode string
*
*/
int
-cifs_strtoUCS(__le16 *to, const char *from, int len,
+cifs_strtoUTF16(__le16 *to, const char *from, int len,
const struct nls_table *codepage)
{
int charlen;
@@ -206,7 +206,7 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
charlen = codepage->char2uni(from, len, &wchar_to);
if (charlen < 1) {
- cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
+ cERROR(1, "strtoUTF16: char2uni of 0x%x returned %d",
*from, charlen);
/* A question mark */
wchar_to = 0x003f;
@@ -220,7 +220,8 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
}
/*
- * cifs_strndup_from_ucs - copy a string from wire format to the local codepage
+ * cifs_strndup_from_utf16 - copy a string from wire format to the local
+ * codepage
* @src - source string
* @maxlen - don't walk past this many bytes in the source string
* @is_unicode - is this a unicode string?
@@ -231,19 +232,19 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
* error.
*/
char *
-cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
- const struct nls_table *codepage)
+cifs_strndup_from_utf16(const char *src, const int maxlen,
+ const bool is_unicode, const struct nls_table *codepage)
{
int len;
char *dst;
if (is_unicode) {
- len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage);
+ len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage);
len += nls_nullsize(codepage);
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return NULL;
- cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage,
+ cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
false);
} else {
len = strnlen(src, maxlen);
@@ -264,7 +265,7 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
* names are little endian 16 bit Unicode on the wire
*/
int
-cifsConvertToUCS(__le16 *target, const char *source, int srclen,
+cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
const struct nls_table *cp, int mapChars)
{
int i, j, charlen;
@@ -273,7 +274,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,
wchar_t tmp;
if (!mapChars)
- return cifs_strtoUCS(target, source, PATH_MAX, cp);
+ return cifs_strtoUTF16(target, source, PATH_MAX, cp);
for (i = 0, j = 0; i < srclen; j++) {
src_char = source[i];
@@ -281,7 +282,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,
switch (src_char) {
case 0:
put_unaligned(0, &target[j]);
- goto ctoUCS_out;
+ goto ctoUTF16_out;
case ':':
dst_char = cpu_to_le16(UNI_COLON);
break;
@@ -326,7 +327,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,
put_unaligned(dst_char, &target[j]);
}
-ctoUCS_out:
+ctoUTF16_out:
return i;
}
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 6d02fd560566..a513a546700b 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -74,16 +74,16 @@ extern const struct UniCaseRange CifsUniLowerRange[];
#endif /* UNIUPR_NOLOWER */
#ifdef __KERNEL__
-int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
- const struct nls_table *codepage, bool mapchar);
-int cifs_ucs2_bytes(const __le16 *from, int maxbytes,
- const struct nls_table *codepage);
-int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
-char *cifs_strndup_from_ucs(const char *src, const int maxlen,
- const bool is_unicode,
- const struct nls_table *codepage);
-extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
- const struct nls_table *cp, int mapChars);
+int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
+ const struct nls_table *codepage, bool mapchar);
+int cifs_utf16_bytes(const __le16 *from, int maxbytes,
+ const struct nls_table *codepage);
+int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *);
+char *cifs_strndup_from_utf16(const char *src, const int maxlen,
+ const bool is_unicode,
+ const struct nls_table *codepage);
+extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen,
+ const struct nls_table *cp, int mapChars);
#endif
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 72ddf23ef6f7..c1b254487388 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -909,6 +909,8 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
umode_t group_mask = S_IRWXG;
umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
+ if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *))
+ return;
ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
GFP_KERNEL);
if (!ppace) {
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 5d9b9acc5fce..63c460e503b6 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -327,7 +327,7 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
attrptr->length = cpu_to_le16(2 * dlen);
blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
- cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
+ cifs_strtoUTF16((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
return 0;
}
@@ -376,7 +376,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
kmalloc(attrsize + 1, GFP_KERNEL);
if (!ses->domainName)
return -ENOMEM;
- cifs_from_ucs2(ses->domainName,
+ cifs_from_utf16(ses->domainName,
(__le16 *)blobptr, attrsize, attrsize,
nls_cp, false);
break;
@@ -420,15 +420,20 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
}
/* convert ses->user_name to unicode and uppercase */
- len = strlen(ses->user_name);
+ len = ses->user_name ? strlen(ses->user_name) : 0;
user = kmalloc(2 + (len * 2), GFP_KERNEL);
if (user == NULL) {
cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
rc = -ENOMEM;
return rc;
}
- len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp);
- UniStrupr(user);
+
+ if (len) {
+ len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp);
+ UniStrupr(user);
+ } else {
+ memset(user, '\0', 2);
+ }
rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
(char *)user, 2 * len);
@@ -448,8 +453,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
rc = -ENOMEM;
return rc;
}
- len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
- nls_cp);
+ len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len,
+ nls_cp);
rc =
crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
(char *)domain, 2 * len);
@@ -468,7 +473,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
rc = -ENOMEM;
return rc;
}
- len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+ len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len,
nls_cp);
rc =
crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8f1fe324162b..b1fd382d1952 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -343,9 +343,9 @@ cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
* ones are.
*/
static int
-cifs_show_options(struct seq_file *s, struct vfsmount *m)
+cifs_show_options(struct seq_file *s, struct dentry *root)
{
- struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct sockaddr *srcaddr;
srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
@@ -393,7 +393,7 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
cifs_show_address(s, tcon->ses->server);
if (!tcon->unix_ext)
- seq_printf(s, ",file_mode=0%o,dir_mode=0%o",
+ seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho",
cifs_sb->mnt_file_mode,
cifs_sb->mnt_dir_mode);
if (tcon->seal)
@@ -430,7 +430,7 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
seq_printf(s, ",cifsacl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
seq_printf(s, ",dynperm");
- if (m->mnt_sb->s_flags & MS_POSIXACL)
+ if (root->d_sb->s_flags & MS_POSIXACL)
seq_printf(s, ",acl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
seq_printf(s, ",mfsymlinks");
@@ -488,7 +488,7 @@ static void cifs_umount_begin(struct super_block *sb)
}
#ifdef CONFIG_CIFS_STATS2
-static int cifs_show_stats(struct seq_file *s, struct vfsmount *mnt)
+static int cifs_show_stats(struct seq_file *s, struct dentry *root)
{
/* BB FIXME */
return 0;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 30ff56005d8f..fe5ecf1b422a 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -44,14 +44,14 @@ extern const struct address_space_operations cifs_addr_ops_smallbuf;
/* Functions related to inodes */
extern const struct inode_operations cifs_dir_inode_ops;
extern struct inode *cifs_root_iget(struct super_block *);
-extern int cifs_create(struct inode *, struct dentry *, int,
+extern int cifs_create(struct inode *, struct dentry *, umode_t,
struct nameidata *);
extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
struct nameidata *);
extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
-extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t);
-extern int cifs_mkdir(struct inode *, struct dentry *, int);
+extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+extern int cifs_mkdir(struct inode *, struct dentry *, umode_t);
extern int cifs_rmdir(struct inode *, struct dentry *);
extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
struct dentry *);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 8238aa13e01c..76e7d8b6da17 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -169,8 +169,8 @@ struct smb_vol {
gid_t linux_gid;
uid_t backupuid;
gid_t backupgid;
- mode_t file_mode;
- mode_t dir_mode;
+ umode_t file_mode;
+ umode_t dir_mode;
unsigned secFlg;
bool retry:1;
bool intr:1;
@@ -879,6 +879,8 @@ require use of the stronger protocol */
#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */
#endif /* UPCALL */
#else /* do not allow weak pw hash */
+#define CIFSSEC_MUST_LANMAN 0
+#define CIFSSEC_MUST_PLNTXT 0
#ifdef CONFIG_CIFS_UPCALL
#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */
#else
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 6600aa2d2ef3..8b7794c31591 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -821,8 +821,8 @@ PsxDelete:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB add path length overrun check */
@@ -893,8 +893,8 @@ DelFileRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->fileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->fileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
@@ -938,8 +938,8 @@ RmDirRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, dirName,
- PATH_MAX, nls_codepage, remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, dirName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
@@ -981,8 +981,8 @@ MkDirRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, name,
- PATH_MAX, nls_codepage, remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
@@ -1030,8 +1030,8 @@ PsxCreat:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, name,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, name,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -1197,8 +1197,8 @@ OldOpenRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
count = 1; /* account for one byte pad to word boundary */
name_len =
- cifsConvertToUCS((__le16 *) (pSMB->fileName + 1),
- fileName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
+ fileName, PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
@@ -1304,8 +1304,8 @@ openRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
count = 1; /* account for one byte pad to word boundary */
name_len =
- cifsConvertToUCS((__le16 *) (pSMB->fileName + 1),
- fileName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
+ fileName, PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->NameLength = cpu_to_le16(name_len);
@@ -2649,16 +2649,16 @@ renameRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->OldFileName[name_len] = 0x04; /* pad */
/* protocol requires ASCII signature byte on Unicode string */
pSMB->OldFileName[name_len + 1] = 0x00;
name_len2 =
- cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
- toName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+ toName, PATH_MAX, nls_codepage, remap);
name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
name_len2 *= 2; /* convert to bytes */
} else { /* BB improve the check for buffer overruns BB */
@@ -2738,10 +2738,12 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon,
/* unicode only call */
if (target_name == NULL) {
sprintf(dummy_string, "cifs%x", pSMB->hdr.Mid);
- len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name,
+ len_of_str =
+ cifsConvertToUTF16((__le16 *)rename_info->target_name,
dummy_string, 24, nls_codepage, remap);
} else {
- len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name,
+ len_of_str =
+ cifsConvertToUTF16((__le16 *)rename_info->target_name,
target_name, PATH_MAX, nls_codepage,
remap);
}
@@ -2795,17 +2797,17 @@ copyRetry:
pSMB->Flags = cpu_to_le16(flags & COPY_TREE);
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->OldFileName,
- fromName, PATH_MAX, nls_codepage,
- remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName,
+ fromName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->OldFileName[name_len] = 0x04; /* pad */
/* protocol requires ASCII signature byte on Unicode string */
pSMB->OldFileName[name_len + 1] = 0x00;
name_len2 =
- cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
- toName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+ toName, PATH_MAX, nls_codepage, remap);
name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
name_len2 *= 2; /* convert to bytes */
} else { /* BB improve the check for buffer overruns BB */
@@ -2861,9 +2863,9 @@ createSymLinkRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifs_strtoUCS((__le16 *) pSMB->FileName, fromName, PATH_MAX
- /* find define for this maxpathcomponent */
- , nls_codepage);
+ cifs_strtoUTF16((__le16 *) pSMB->FileName, fromName,
+ /* find define for this maxpathcomponent */
+ PATH_MAX, nls_codepage);
name_len++; /* trailing null */
name_len *= 2;
@@ -2885,9 +2887,9 @@ createSymLinkRetry:
data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len_target =
- cifs_strtoUCS((__le16 *) data_offset, toName, PATH_MAX
- /* find define for this maxpathcomponent */
- , nls_codepage);
+ cifs_strtoUTF16((__le16 *) data_offset, toName, PATH_MAX
+ /* find define for this maxpathcomponent */
+ , nls_codepage);
name_len_target++; /* trailing null */
name_len_target *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -2949,8 +2951,8 @@ createHardLinkRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->FileName, toName,
- PATH_MAX, nls_codepage, remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->FileName, toName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
@@ -2972,8 +2974,8 @@ createHardLinkRetry:
data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len_target =
- cifsConvertToUCS((__le16 *) data_offset, fromName, PATH_MAX,
- nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) data_offset, fromName,
+ PATH_MAX, nls_codepage, remap);
name_len_target++; /* trailing null */
name_len_target *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -3042,8 +3044,8 @@ winCreateHardLinkRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
@@ -3051,8 +3053,8 @@ winCreateHardLinkRetry:
pSMB->OldFileName[name_len] = 0x04;
pSMB->OldFileName[name_len + 1] = 0x00; /* pad */
name_len2 =
- cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
- toName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+ toName, PATH_MAX, nls_codepage, remap);
name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
name_len2 *= 2; /* convert to bytes */
} else { /* BB improve the check for buffer overruns BB */
@@ -3108,8 +3110,8 @@ querySymLinkRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifs_strtoUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage);
+ cifs_strtoUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -3166,8 +3168,8 @@ querySymLinkRetry:
is_unicode = false;
/* BB FIXME investigate remapping reserved chars here */
- *symlinkinfo = cifs_strndup_from_ucs(data_start, count,
- is_unicode, nls_codepage);
+ *symlinkinfo = cifs_strndup_from_utf16(data_start,
+ count, is_unicode, nls_codepage);
if (!*symlinkinfo)
rc = -ENOMEM;
}
@@ -3450,8 +3452,9 @@ queryAclRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName,
+ searchName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->FileName[name_len] = 0;
@@ -3537,8 +3540,8 @@ setAclRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -3948,8 +3951,9 @@ QInfRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName,
+ searchName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else {
@@ -4086,8 +4090,8 @@ QPathInfoRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -4255,8 +4259,8 @@ UnixQPathInfoRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -4344,8 +4348,8 @@ findFirstRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
/* We can not add the asterik earlier in case
it got remapped to 0xF03A as if it were part of the
directory name instead of a wildcard */
@@ -4656,8 +4660,9 @@ GetInodeNumberRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName,
+ searchName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -4794,9 +4799,9 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
rc = -ENOMEM;
goto parse_DFS_referrals_exit;
}
- cifsConvertToUCS((__le16 *) tmp, searchName,
- PATH_MAX, nls_codepage, remap);
- node->path_consumed = cifs_ucs2_bytes(tmp,
+ cifsConvertToUTF16((__le16 *) tmp, searchName,
+ PATH_MAX, nls_codepage, remap);
+ node->path_consumed = cifs_utf16_bytes(tmp,
le16_to_cpu(pSMBr->PathConsumed),
nls_codepage);
kfree(tmp);
@@ -4809,8 +4814,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
/* copy DfsPath */
temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset);
max_len = data_end - temp;
- node->path_name = cifs_strndup_from_ucs(temp, max_len,
- is_unicode, nls_codepage);
+ node->path_name = cifs_strndup_from_utf16(temp, max_len,
+ is_unicode, nls_codepage);
if (!node->path_name) {
rc = -ENOMEM;
goto parse_DFS_referrals_exit;
@@ -4819,8 +4824,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
/* copy link target UNC */
temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset);
max_len = data_end - temp;
- node->node_name = cifs_strndup_from_ucs(temp, max_len,
- is_unicode, nls_codepage);
+ node->node_name = cifs_strndup_from_utf16(temp, max_len,
+ is_unicode, nls_codepage);
if (!node->node_name)
rc = -ENOMEM;
}
@@ -4873,8 +4878,9 @@ getDFSRetry:
if (ses->capabilities & CAP_UNICODE) {
pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;
name_len =
- cifsConvertToUCS((__le16 *) pSMB->RequestFileName,
- searchName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->RequestFileName,
+ searchName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -5506,8 +5512,8 @@ SetEOFRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -5796,8 +5802,8 @@ SetTimesRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -5877,8 +5883,8 @@ SetAttrLgcyRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- ConvertToUCS((__le16 *) pSMB->fileName, fileName,
- PATH_MAX, nls_codepage);
+ ConvertToUTF16((__le16 *) pSMB->fileName, fileName,
+ PATH_MAX, nls_codepage);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -6030,8 +6036,8 @@ setPermsRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -6123,8 +6129,8 @@ QAllEAsRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
list_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
list_len++; /* trailing null */
list_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -6301,8 +6307,8 @@ SetEARetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f3670cf72587..602f77c304c9 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -38,6 +38,7 @@
#include <asm/processor.h>
#include <linux/inet.h>
#include <linux/module.h>
+#include <keys/user-type.h>
#include <net/ipv6.h>
#include "cifspdu.h"
#include "cifsglob.h"
@@ -225,74 +226,90 @@ static int check2ndT2(struct smb_hdr *pSMB)
static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
{
- struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
+ struct smb_t2_rsp *pSMBs = (struct smb_t2_rsp *)psecond;
struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB;
- char *data_area_of_target;
- char *data_area_of_buf2;
+ char *data_area_of_tgt;
+ char *data_area_of_src;
int remaining;
- unsigned int byte_count, total_in_buf;
- __u16 total_data_size, total_in_buf2;
+ unsigned int byte_count, total_in_tgt;
+ __u16 tgt_total_cnt, src_total_cnt, total_in_src;
- total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
+ src_total_cnt = get_unaligned_le16(&pSMBs->t2_rsp.TotalDataCount);
+ tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
- if (total_data_size !=
- get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
- cFYI(1, "total data size of primary and secondary t2 differ");
+ if (tgt_total_cnt != src_total_cnt)
+ cFYI(1, "total data count of primary and secondary t2 differ "
+ "source=%hu target=%hu", src_total_cnt, tgt_total_cnt);
- total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
+ total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
- remaining = total_data_size - total_in_buf;
+ remaining = tgt_total_cnt - total_in_tgt;
- if (remaining < 0)
+ if (remaining < 0) {
+ cFYI(1, "Server sent too much data. tgt_total_cnt=%hu "
+ "total_in_tgt=%hu", tgt_total_cnt, total_in_tgt);
return -EPROTO;
+ }
- if (remaining == 0) /* nothing to do, ignore */
+ if (remaining == 0) {
+ /* nothing to do, ignore */
+ cFYI(1, "no more data remains");
return 0;
+ }
- total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
- if (remaining < total_in_buf2) {
+ total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount);
+ if (remaining < total_in_src)
cFYI(1, "transact2 2nd response contains too much data");
- }
/* find end of first SMB data area */
- data_area_of_target = (char *)&pSMBt->hdr.Protocol +
+ data_area_of_tgt = (char *)&pSMBt->hdr.Protocol +
get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
- /* validate target area */
- data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
- get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
+ /* validate target area */
+ data_area_of_src = (char *)&pSMBs->hdr.Protocol +
+ get_unaligned_le16(&pSMBs->t2_rsp.DataOffset);
- data_area_of_target += total_in_buf;
+ data_area_of_tgt += total_in_tgt;
- /* copy second buffer into end of first buffer */
- total_in_buf += total_in_buf2;
+ total_in_tgt += total_in_src;
/* is the result too big for the field? */
- if (total_in_buf > USHRT_MAX)
+ if (total_in_tgt > USHRT_MAX) {
+ cFYI(1, "coalesced DataCount too large (%u)", total_in_tgt);
return -EPROTO;
- put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
+ }
+ put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount);
/* fix up the BCC */
byte_count = get_bcc(pTargetSMB);
- byte_count += total_in_buf2;
+ byte_count += total_in_src;
/* is the result too big for the field? */
- if (byte_count > USHRT_MAX)
+ if (byte_count > USHRT_MAX) {
+ cFYI(1, "coalesced BCC too large (%u)", byte_count);
return -EPROTO;
+ }
put_bcc(byte_count, pTargetSMB);
byte_count = be32_to_cpu(pTargetSMB->smb_buf_length);
- byte_count += total_in_buf2;
+ byte_count += total_in_src;
/* don't allow buffer to overflow */
- if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)
+ if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+ cFYI(1, "coalesced BCC exceeds buffer size (%u)", byte_count);
return -ENOBUFS;
+ }
pTargetSMB->smb_buf_length = cpu_to_be32(byte_count);
- memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
+ /* copy second buffer into end of first buffer */
+ memcpy(data_area_of_tgt, data_area_of_src, total_in_src);
- if (remaining == total_in_buf2) {
- cFYI(1, "found the last secondary response");
- return 0; /* we are done */
- } else /* more responses to go */
+ if (remaining != total_in_src) {
+ /* more responses to go */
+ cFYI(1, "waiting for more secondary responses");
return 1;
+ }
+
+ /* we are done */
+ cFYI(1, "found the last secondary response");
+ return 0;
}
static void
@@ -756,10 +773,11 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
cifs_dump_mem("Bad SMB: ", buf,
min_t(unsigned int, server->total_read, 48));
- if (mid)
- handle_mid(mid, server, smb_buffer, length);
+ if (!mid)
+ return length;
- return length;
+ handle_mid(mid, server, smb_buffer, length);
+ return 0;
}
static int
@@ -1578,11 +1596,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
}
- if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
- cERROR(1, "Multiuser mounts currently require krb5 "
- "authentication!");
+#ifndef CONFIG_KEYS
+ /* Muliuser mounts require CONFIG_KEYS support */
+ if (vol->multiuser) {
+ cERROR(1, "Multiuser mounts require kernels with "
+ "CONFIG_KEYS enabled.");
goto cifs_parse_mount_err;
}
+#endif
if (vol->UNCip == NULL)
vol->UNCip = &vol->UNC[2];
@@ -1981,10 +2002,16 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
return 0;
break;
default:
+ /* NULL username means anonymous session */
+ if (ses->user_name == NULL) {
+ if (!vol->nullauth)
+ return 0;
+ break;
+ }
+
/* anything else takes username/password */
- if (ses->user_name == NULL)
- return 0;
- if (strncmp(ses->user_name, vol->username,
+ if (strncmp(ses->user_name,
+ vol->username ? vol->username : "",
MAX_USERNAME_SIZE))
return 0;
if (strlen(vol->username) != 0 &&
@@ -2039,6 +2066,132 @@ cifs_put_smb_ses(struct cifs_ses *ses)
cifs_put_tcp_session(server);
}
+#ifdef CONFIG_KEYS
+
+/* strlen("cifs:a:") + INET6_ADDRSTRLEN + 1 */
+#define CIFSCREDS_DESC_SIZE (7 + INET6_ADDRSTRLEN + 1)
+
+/* Populate username and pw fields from keyring if possible */
+static int
+cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
+{
+ int rc = 0;
+ char *desc, *delim, *payload;
+ ssize_t len;
+ struct key *key;
+ struct TCP_Server_Info *server = ses->server;
+ struct sockaddr_in *sa;
+ struct sockaddr_in6 *sa6;
+ struct user_key_payload *upayload;
+
+ desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL);
+ if (!desc)
+ return -ENOMEM;
+
+ /* try to find an address key first */
+ switch (server->dstaddr.ss_family) {
+ case AF_INET:
+ sa = (struct sockaddr_in *)&server->dstaddr;
+ sprintf(desc, "cifs:a:%pI4", &sa->sin_addr.s_addr);
+ break;
+ case AF_INET6:
+ sa6 = (struct sockaddr_in6 *)&server->dstaddr;
+ sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr);
+ break;
+ default:
+ cFYI(1, "Bad ss_family (%hu)", server->dstaddr.ss_family);
+ rc = -EINVAL;
+ goto out_err;
+ }
+
+ cFYI(1, "%s: desc=%s", __func__, desc);
+ key = request_key(&key_type_logon, desc, "");
+ if (IS_ERR(key)) {
+ if (!ses->domainName) {
+ cFYI(1, "domainName is NULL");
+ rc = PTR_ERR(key);
+ goto out_err;
+ }
+
+ /* didn't work, try to find a domain key */
+ sprintf(desc, "cifs:d:%s", ses->domainName);
+ cFYI(1, "%s: desc=%s", __func__, desc);
+ key = request_key(&key_type_logon, desc, "");
+ if (IS_ERR(key)) {
+ rc = PTR_ERR(key);
+ goto out_err;
+ }
+ }
+
+ down_read(&key->sem);
+ upayload = key->payload.data;
+ if (IS_ERR_OR_NULL(upayload)) {
+ rc = upayload ? PTR_ERR(upayload) : -EINVAL;
+ goto out_key_put;
+ }
+
+ /* find first : in payload */
+ payload = (char *)upayload->data;
+ delim = strnchr(payload, upayload->datalen, ':');
+ cFYI(1, "payload=%s", payload);
+ if (!delim) {
+ cFYI(1, "Unable to find ':' in payload (datalen=%d)",
+ upayload->datalen);
+ rc = -EINVAL;
+ goto out_key_put;
+ }
+
+ len = delim - payload;
+ if (len > MAX_USERNAME_SIZE || len <= 0) {
+ cFYI(1, "Bad value from username search (len=%zd)", len);
+ rc = -EINVAL;
+ goto out_key_put;
+ }
+
+ vol->username = kstrndup(payload, len, GFP_KERNEL);
+ if (!vol->username) {
+ cFYI(1, "Unable to allocate %zd bytes for username", len);
+ rc = -ENOMEM;
+ goto out_key_put;
+ }
+ cFYI(1, "%s: username=%s", __func__, vol->username);
+
+ len = key->datalen - (len + 1);
+ if (len > MAX_PASSWORD_SIZE || len <= 0) {
+ cFYI(1, "Bad len for password search (len=%zd)", len);
+ rc = -EINVAL;
+ kfree(vol->username);
+ vol->username = NULL;
+ goto out_key_put;
+ }
+
+ ++delim;
+ vol->password = kstrndup(delim, len, GFP_KERNEL);
+ if (!vol->password) {
+ cFYI(1, "Unable to allocate %zd bytes for password", len);
+ rc = -ENOMEM;
+ kfree(vol->username);
+ vol->username = NULL;
+ goto out_key_put;
+ }
+
+out_key_put:
+ up_read(&key->sem);
+ key_put(key);
+out_err:
+ kfree(desc);
+ cFYI(1, "%s: returning %d", __func__, rc);
+ return rc;
+}
+#else /* ! CONFIG_KEYS */
+static inline int
+cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
+ struct cifs_ses *ses __attribute__((unused)))
+{
+ return -ENOSYS;
+}
+#endif /* CONFIG_KEYS */
+
static bool warned_on_ntlm; /* globals init to false automatically */
static struct cifs_ses *
@@ -2819,7 +2972,7 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->mnt_backupgid = pvolume_info->backupgid;
cifs_sb->mnt_file_mode = pvolume_info->file_mode;
cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
- cFYI(1, "file mode: 0x%x dir mode: 0x%x",
+ cFYI(1, "file mode: 0x%hx dir mode: 0x%hx",
cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
cifs_sb->actimeo = pvolume_info->actimeo;
@@ -2914,18 +3067,33 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
/*
- * Windows only supports a max of 60k reads. Default to that when posix
- * extensions aren't in force.
+ * Windows only supports a max of 60kb reads and 65535 byte writes. Default to
+ * those values when posix extensions aren't in force. In actuality here, we
+ * use 65536 to allow for a write that is a multiple of 4k. Most servers seem
+ * to be ok with the extra byte even though Windows doesn't send writes that
+ * are that large.
+ *
+ * Citation:
+ *
+ * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx
*/
#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
+#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536)
static unsigned int
cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
{
__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
struct TCP_Server_Info *server = tcon->ses->server;
- unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize :
- CIFS_DEFAULT_IOSIZE;
+ unsigned int wsize;
+
+ /* start with specified wsize, or default */
+ if (pvolume_info->wsize)
+ wsize = pvolume_info->wsize;
+ else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
+ wsize = CIFS_DEFAULT_IOSIZE;
+ else
+ wsize = CIFS_DEFAULT_NON_POSIX_WSIZE;
/* can server support 24-bit write sizes? (via UNIX extensions) */
if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
@@ -3136,10 +3304,9 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
return -EINVAL;
if (volume_info->nullauth) {
- cFYI(1, "null user");
- volume_info->username = kzalloc(1, GFP_KERNEL);
- if (volume_info->username == NULL)
- return -ENOMEM;
+ cFYI(1, "Anonymous login");
+ kfree(volume_info->username);
+ volume_info->username = NULL;
} else if (volume_info->username) {
/* BB fixme parse for domain name here */
cFYI(1, "Username: %s", volume_info->username);
@@ -3478,7 +3645,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
if (ses->capabilities & CAP_UNICODE) {
smb_buffer->Flags2 |= SMBFLG2_UNICODE;
length =
- cifs_strtoUCS((__le16 *) bcc_ptr, tree,
+ cifs_strtoUTF16((__le16 *) bcc_ptr, tree,
6 /* max utf8 char length in bytes */ *
(/* server len*/ + 256 /* share len */), nls_codepage);
bcc_ptr += 2 * length; /* convert num 16 bit words to bytes */
@@ -3533,7 +3700,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
/* mostly informational -- no need to fail on error here */
kfree(tcon->nativeFileSystem);
- tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr,
+ tcon->nativeFileSystem = cifs_strndup_from_utf16(bcc_ptr,
bytes_left, is_unicode,
nls_codepage);
@@ -3657,25 +3824,43 @@ int cifs_setup_session(unsigned int xid, struct cifs_ses *ses,
return rc;
}
+static int
+cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
+{
+ switch (ses->server->secType) {
+ case Kerberos:
+ vol->secFlg = CIFSSEC_MUST_KRB5;
+ return 0;
+ case NTLMv2:
+ vol->secFlg = CIFSSEC_MUST_NTLMV2;
+ break;
+ case NTLM:
+ vol->secFlg = CIFSSEC_MUST_NTLM;
+ break;
+ case RawNTLMSSP:
+ vol->secFlg = CIFSSEC_MUST_NTLMSSP;
+ break;
+ case LANMAN:
+ vol->secFlg = CIFSSEC_MUST_LANMAN;
+ break;
+ }
+
+ return cifs_set_cifscreds(vol, ses);
+}
+
static struct cifs_tcon *
cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
{
+ int rc;
struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
struct cifs_ses *ses;
struct cifs_tcon *tcon = NULL;
struct smb_vol *vol_info;
- char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */
- /* We used to have this as MAX_USERNAME which is */
- /* way too big now (256 instead of 32) */
vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
- if (vol_info == NULL) {
- tcon = ERR_PTR(-ENOMEM);
- goto out;
- }
+ if (vol_info == NULL)
+ return ERR_PTR(-ENOMEM);
- snprintf(username, sizeof(username), "krb50x%x", fsuid);
- vol_info->username = username;
vol_info->local_nls = cifs_sb->local_nls;
vol_info->linux_uid = fsuid;
vol_info->cred_uid = fsuid;
@@ -3685,8 +3870,11 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
vol_info->local_lease = master_tcon->local_lease;
vol_info->no_linux_ext = !master_tcon->unix_ext;
- /* FIXME: allow for other secFlg settings */
- vol_info->secFlg = CIFSSEC_MUST_KRB5;
+ rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
+ if (rc) {
+ tcon = ERR_PTR(rc);
+ goto out;
+ }
/* get a reference for the same TCP session */
spin_lock(&cifs_tcp_ses_lock);
@@ -3709,6 +3897,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
if (ses->capabilities & CAP_UNIX)
reset_cifs_unix_caps(0, tcon, NULL, vol_info);
out:
+ kfree(vol_info->username);
+ kfree(vol_info->password);
kfree(vol_info);
return tcon;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index d7eeb9d3ed6f..bc7e24420ac0 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -136,7 +136,7 @@ cifs_bp_rename_retry:
/* Inode operations in similar order to how they appear in Linux file fs.h */
int
-cifs_create(struct inode *inode, struct dentry *direntry, int mode,
+cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
struct nameidata *nd)
{
int rc = -ENOENT;
@@ -355,7 +355,7 @@ cifs_create_out:
return rc;
}
-int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
+int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
dev_t device_number)
{
int rc = -EPERM;
@@ -492,7 +492,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
{
int xid;
int rc = 0; /* to get around spurious gcc warning, set to zero here */
- __u32 oplock = 0;
+ __u32 oplock = enable_oplocks ? REQ_OPLOCK : 0;
__u16 fileHandle = 0;
bool posix_open = false;
struct cifs_sb_info *cifs_sb;
@@ -584,10 +584,26 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
* If either that or op not supported returned, follow
* the normal lookup.
*/
- if ((rc == 0) || (rc == -ENOENT))
+ switch (rc) {
+ case 0:
+ /*
+ * The server may allow us to open things like
+ * FIFOs, but the client isn't set up to deal
+ * with that. If it's not a regular file, just
+ * close it and proceed as if it were a normal
+ * lookup.
+ */
+ if (newInode && !S_ISREG(newInode->i_mode)) {
+ CIFSSMBClose(xid, pTcon, fileHandle);
+ break;
+ }
+ case -ENOENT:
posix_open = true;
- else if ((rc == -EINVAL) || (rc != -EOPNOTSUPP))
+ case -EOPNOTSUPP:
+ break;
+ default:
pTcon->broken_posix_open = true;
+ }
}
if (!posix_open)
rc = cifs_get_inode_info_unix(&newInode, full_path,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 4dd9283885e7..5e64748a2917 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -920,16 +920,26 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
for (lockp = &inode->i_flock; *lockp != NULL; \
lockp = &(*lockp)->fl_next)
+struct lock_to_push {
+ struct list_head llist;
+ __u64 offset;
+ __u64 length;
+ __u32 pid;
+ __u16 netfid;
+ __u8 type;
+};
+
static int
cifs_push_posix_locks(struct cifsFileInfo *cfile)
{
struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct file_lock *flock, **before;
- struct cifsLockInfo *lck, *tmp;
+ unsigned int count = 0, i = 0;
int rc = 0, xid, type;
+ struct list_head locks_to_send, *el;
+ struct lock_to_push *lck, *tmp;
__u64 length;
- struct list_head locks_to_send;
xid = GetXid();
@@ -940,29 +950,55 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
return rc;
}
+ lock_flocks();
+ cifs_for_each_lock(cfile->dentry->d_inode, before) {
+ if ((*before)->fl_flags & FL_POSIX)
+ count++;
+ }
+ unlock_flocks();
+
INIT_LIST_HEAD(&locks_to_send);
+ /*
+ * Allocating count locks is enough because no locks can be added to
+ * the list while we are holding cinode->lock_mutex that protects
+ * locking operations of this inode.
+ */
+ for (; i < count; i++) {
+ lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL);
+ if (!lck) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+ list_add_tail(&lck->llist, &locks_to_send);
+ }
+
+ i = 0;
+ el = locks_to_send.next;
lock_flocks();
cifs_for_each_lock(cfile->dentry->d_inode, before) {
+ if (el == &locks_to_send) {
+ /* something is really wrong */
+ cERROR(1, "Can't push all brlocks!");
+ break;
+ }
flock = *before;
+ if ((flock->fl_flags & FL_POSIX) == 0)
+ continue;
length = 1 + flock->fl_end - flock->fl_start;
if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
type = CIFS_RDLCK;
else
type = CIFS_WRLCK;
-
- lck = cifs_lock_init(flock->fl_start, length, type,
- cfile->netfid);
- if (!lck) {
- rc = -ENOMEM;
- goto send_locks;
- }
+ lck = list_entry(el, struct lock_to_push, llist);
lck->pid = flock->fl_pid;
-
- list_add_tail(&lck->llist, &locks_to_send);
+ lck->netfid = cfile->netfid;
+ lck->length = length;
+ lck->type = type;
+ lck->offset = flock->fl_start;
+ i++;
+ el = el->next;
}
-
-send_locks:
unlock_flocks();
list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
@@ -979,11 +1015,18 @@ send_locks:
kfree(lck);
}
+out:
cinode->can_cache_brlcks = false;
mutex_unlock(&cinode->lock_mutex);
FreeXid(xid);
return rc;
+err_out:
+ list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
+ list_del(&lck->llist);
+ kfree(lck);
+ }
+ goto out;
}
static int
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index e851d5b8931e..745da3d0653e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -534,6 +534,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
fattr->cf_dtype = DT_DIR;
+ /*
+ * Server can return wrong NumberOfLinks value for directories
+ * when Unix extensions are disabled - fake it.
+ */
+ fattr->cf_nlink = 2;
} else {
fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
fattr->cf_dtype = DT_REG;
@@ -541,9 +546,9 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
/* clear write bits if ATTR_READONLY is set */
if (fattr->cf_cifsattrs & ATTR_READONLY)
fattr->cf_mode &= ~(S_IWUGO);
- }
- fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+ fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+ }
fattr->cf_uid = cifs_sb->mnt_uid;
fattr->cf_gid = cifs_sb->mnt_gid;
@@ -1264,7 +1269,7 @@ unlink_out:
return rc;
}
-int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
+int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
{
int rc = 0, tmprc;
int xid;
@@ -1275,7 +1280,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
struct inode *newinode = NULL;
struct cifs_fattr fattr;
- cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
+ cFYI(1, "In cifs_mkdir, mode = 0x%hx inode = 0x%p", mode, inode);
cifs_sb = CIFS_SB(inode->i_sb);
tlink = cifs_sb_tlink(cifs_sb);
@@ -1322,7 +1327,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
}
/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need
to set uid/gid */
- inc_nlink(inode);
cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
cifs_fill_uniqueid(inode->i_sb, &fattr);
@@ -1355,7 +1359,6 @@ mkdir_retry_old:
d_drop(direntry);
} else {
mkdir_get_info:
- inc_nlink(inode);
if (pTcon->unix_ext)
rc = cifs_get_inode_info_unix(&newinode, full_path,
inode->i_sb, xid);
@@ -1436,6 +1439,11 @@ mkdir_get_info:
}
}
mkdir_out:
+ /*
+ * Force revalidate to get parent dir info when needed since cached
+ * attributes are invalid now.
+ */
+ CIFS_I(inode)->time = 0;
kfree(full_path);
FreeXid(xid);
cifs_put_tlink(tlink);
@@ -1475,7 +1483,6 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
cifs_put_tlink(tlink);
if (!rc) {
- drop_nlink(inode);
spin_lock(&direntry->d_inode->i_lock);
i_size_write(direntry->d_inode, 0);
clear_nlink(direntry->d_inode);
@@ -1483,12 +1490,15 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
}
cifsInode = CIFS_I(direntry->d_inode);
- cifsInode->time = 0; /* force revalidate to go get info when
- needed */
+ /* force revalidate to go get info when needed */
+ cifsInode->time = 0;
cifsInode = CIFS_I(inode);
- cifsInode->time = 0; /* force revalidate to get parent dir info
- since cached search results now invalid */
+ /*
+ * Force revalidate to get parent dir info when needed since cached
+ * attributes are invalid now.
+ */
+ cifsInode->time = 0;
direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
current_fs_time(inode->i_sb);
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index a090bbe6ee29..e2bbc683e018 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -647,10 +647,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
name.name = scratch_buf;
name.len =
- cifs_from_ucs2((char *)name.name, (__le16 *)de.name,
- UNICODE_NAME_MAX,
- min(de.namelen, (size_t)max_len), nlt,
- cifs_sb->mnt_cifs_flags &
+ cifs_from_utf16((char *)name.name, (__le16 *)de.name,
+ UNICODE_NAME_MAX,
+ min_t(size_t, de.namelen,
+ (size_t)max_len), nlt,
+ cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
name.len -= nls_nullsize(nlt);
} else {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 4ec3ee9d72cc..551d0c2b9736 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -167,16 +167,16 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
int bytes_ret = 0;
/* Copy OS version */
- bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32,
- nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *)bcc_ptr, "Linux version ", 32,
+ nls_cp);
bcc_ptr += 2 * bytes_ret;
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release,
- 32, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, init_utsname()->release,
+ 32, nls_cp);
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* trailing null */
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
- 32, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
+ 32, nls_cp);
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* trailing null */
@@ -197,8 +197,8 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
*(bcc_ptr+1) = 0;
bytes_ret = 0;
} else
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->domainName,
- 256, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName,
+ 256, nls_cp);
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* account for null terminator */
@@ -226,8 +226,8 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
*bcc_ptr = 0;
*(bcc_ptr+1) = 0;
} else {
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->user_name,
- MAX_USERNAME_SIZE, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name,
+ MAX_USERNAME_SIZE, nls_cp);
}
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* account for null termination */
@@ -246,16 +246,15 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
/* copy user */
/* BB what about null user mounts - check that we do this BB */
/* copy user */
- if (ses->user_name != NULL)
+ if (ses->user_name != NULL) {
strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE);
+ bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE);
+ }
/* else null user mount */
-
- bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE);
*bcc_ptr = 0;
bcc_ptr++; /* account for null termination */
/* copy domain */
-
if (ses->domainName != NULL) {
strncpy(bcc_ptr, ses->domainName, 256);
bcc_ptr += strnlen(ses->domainName, 256);
@@ -287,7 +286,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
cFYI(1, "bleft %d", bleft);
kfree(ses->serverOS);
- ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+ ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
cFYI(1, "serverOS=%s", ses->serverOS);
len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
data += len;
@@ -296,7 +295,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
return;
kfree(ses->serverNOS);
- ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+ ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
cFYI(1, "serverNOS=%s", ses->serverNOS);
len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
data += len;
@@ -305,7 +304,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
return;
kfree(ses->serverDomain);
- ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+ ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
cFYI(1, "serverDomain=%s", ses->serverDomain);
return;
@@ -395,6 +394,10 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
+ if (tioffset > blob_len || tioffset + tilen > blob_len) {
+ cERROR(1, "tioffset + tilen too high %u + %u", tioffset, tilen);
+ return -EINVAL;
+ }
if (tilen) {
ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
if (!ses->auth_key.response) {
@@ -502,8 +505,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
tmp += 2;
} else {
int len;
- len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
- MAX_USERNAME_SIZE, nls_cp);
+ len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName,
+ MAX_USERNAME_SIZE, nls_cp);
len *= 2; /* unicode is 2 bytes each */
sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
sec_blob->DomainName.Length = cpu_to_le16(len);
@@ -518,8 +521,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
tmp += 2;
} else {
int len;
- len = cifs_strtoUCS((__le16 *)tmp, ses->user_name,
- MAX_USERNAME_SIZE, nls_cp);
+ len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name,
+ MAX_USERNAME_SIZE, nls_cp);
len *= 2; /* unicode is 2 bytes each */
sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
sec_blob->UserName.Length = cpu_to_le16(len);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 80d850881938..d5cd9aa7eacc 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -213,7 +213,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
/* Password cannot be longer than 128 characters */
if (passwd) /* Password must be converted to NT unicode */
- len = cifs_strtoUCS(wpwd, passwd, 128, codepage);
+ len = cifs_strtoUTF16(wpwd, passwd, 128, codepage);
else {
len = 0;
*wpwd = 0; /* Ensure string is null terminated */
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 45f07c46f3ed..10d92cf57ab6 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -105,7 +105,6 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
struct cifs_tcon *pTcon;
struct super_block *sb;
char *full_path;
- struct cifs_ntsd *pacl;
if (direntry == NULL)
return -EIO;
@@ -164,23 +163,24 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
} else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
+#ifdef CONFIG_CIFS_ACL
+ struct cifs_ntsd *pacl;
pacl = kmalloc(value_size, GFP_KERNEL);
if (!pacl) {
cFYI(1, "%s: Can't allocate memory for ACL",
__func__);
rc = -ENOMEM;
} else {
-#ifdef CONFIG_CIFS_ACL
memcpy(pacl, ea_value, value_size);
rc = set_cifs_acl(pacl, value_size,
direntry->d_inode, full_path, CIFS_ACL_DACL);
if (rc == 0) /* force revalidate of the inode */
CIFS_I(direntry->d_inode)->time = 0;
kfree(pacl);
+ }
#else
cFYI(1, "Set CIFS ACL not supported yet");
#endif /* CONFIG_CIFS_ACL */
- }
} else {
int temp;
temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 6475877b0763..911cf30d057d 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -88,24 +88,21 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
- link the two up if this is needed
- fill in the attributes
*/
-int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_block *sb)
+struct inode *coda_cnode_make(struct CodaFid *fid, struct super_block *sb)
{
struct coda_vattr attr;
+ struct inode *inode;
int error;
/* We get inode numbers from Venus -- see venus source */
error = venus_getattr(sb, fid, &attr);
- if ( error ) {
- *inode = NULL;
- return error;
- }
+ if (error)
+ return ERR_PTR(error);
- *inode = coda_iget(sb, fid, &attr);
- if ( IS_ERR(*inode) ) {
+ inode = coda_iget(sb, fid, &attr);
+ if (IS_ERR(inode))
printk("coda_cnode_make: coda_iget failed\n");
- return PTR_ERR(*inode);
- }
- return 0;
+ return inode;
}
@@ -156,19 +153,16 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb)
}
/* the CONTROL inode is made without asking attributes from Venus */
-int coda_cnode_makectl(struct inode **inode, struct super_block *sb)
+struct inode *coda_cnode_makectl(struct super_block *sb)
{
- int error = -ENOMEM;
-
- *inode = new_inode(sb);
- if (*inode) {
- (*inode)->i_ino = CTL_INO;
- (*inode)->i_op = &coda_ioctl_inode_operations;
- (*inode)->i_fop = &coda_ioctl_operations;
- (*inode)->i_mode = 0444;
- error = 0;
+ struct inode *inode = new_inode(sb);
+ if (inode) {
+ inode->i_ino = CTL_INO;
+ inode->i_op = &coda_ioctl_inode_operations;
+ inode->i_fop = &coda_ioctl_operations;
+ inode->i_mode = 0444;
+ return inode;
}
-
- return error;
+ return ERR_PTR(-ENOMEM);
}
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
index e35071b1de0e..b24fdfd8a3f0 100644
--- a/fs/coda/coda_fs_i.h
+++ b/fs/coda/coda_fs_i.h
@@ -49,9 +49,9 @@ struct coda_file_info {
#define C_DYING 0x4 /* from venus (which died) */
#define C_PURGE 0x8
-int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *);
+struct inode *coda_cnode_make(struct CodaFid *, struct super_block *);
struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
-int coda_cnode_makectl(struct inode **inode, struct super_block *sb);
+struct inode *coda_cnode_makectl(struct super_block *sb);
struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 28e7e135cfab..177515829062 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -30,14 +30,14 @@
#include "coda_int.h"
/* dir inode-ops */
-static int coda_create(struct inode *dir, struct dentry *new, int mode, struct nameidata *nd);
+static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, struct nameidata *nd);
static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, struct nameidata *nd);
static int coda_link(struct dentry *old_dentry, struct inode *dir_inode,
struct dentry *entry);
static int coda_unlink(struct inode *dir_inode, struct dentry *entry);
static int coda_symlink(struct inode *dir_inode, struct dentry *entry,
const char *symname);
-static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, int mode);
+static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, umode_t mode);
static int coda_rmdir(struct inode *dir_inode, struct dentry *entry);
static int coda_rename(struct inode *old_inode, struct dentry *old_dentry,
struct inode *new_inode, struct dentry *new_dentry);
@@ -96,12 +96,11 @@ const struct file_operations coda_dir_operations = {
/* access routines: lookup, readlink, permission */
static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd)
{
- struct inode *inode = NULL;
- struct CodaFid resfid = { { 0, } };
- int type = 0;
- int error = 0;
+ struct super_block *sb = dir->i_sb;
const char *name = entry->d_name.name;
size_t length = entry->d_name.len;
+ struct inode *inode;
+ int type = 0;
if (length > CODA_MAXNAMLEN) {
printk(KERN_ERR "name too long: lookup, %s (%*s)\n",
@@ -111,23 +110,21 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
/* control object, create inode on the fly */
if (coda_isroot(dir) && coda_iscontrol(name, length)) {
- error = coda_cnode_makectl(&inode, dir->i_sb);
+ inode = coda_cnode_makectl(sb);
type = CODA_NOCACHE;
- goto exit;
+ } else {
+ struct CodaFid fid = { { 0, } };
+ int error = venus_lookup(sb, coda_i2f(dir), name, length,
+ &type, &fid);
+ inode = !error ? coda_cnode_make(&fid, sb) : ERR_PTR(error);
}
- error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
- &type, &resfid);
- if (!error)
- error = coda_cnode_make(&inode, &resfid, dir->i_sb);
-
- if (error && error != -ENOENT)
- return ERR_PTR(error);
-
-exit:
- if (inode && (type & CODA_NOCACHE))
+ if (!IS_ERR(inode) && (type & CODA_NOCACHE))
coda_flag_inode(inode, C_VATTR | C_PURGE);
+ if (inode == ERR_PTR(-ENOENT))
+ inode = NULL;
+
return d_splice_alias(inode, entry);
}
@@ -191,7 +188,7 @@ static inline void coda_dir_drop_nlink(struct inode *dir)
}
/* creation routines: create, mknod, mkdir, link, symlink */
-static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd)
+static int coda_create(struct inode *dir, struct dentry *de, umode_t mode, struct nameidata *nd)
{
int error;
const char *name=de->d_name.name;
@@ -223,7 +220,7 @@ err_out:
return error;
}
-static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
+static int coda_mkdir(struct inode *dir, struct dentry *de, umode_t mode)
{
struct inode *inode;
struct coda_vattr attrs;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 871b27715465..5e2e1b3f068d 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -58,7 +58,6 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
static void coda_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(coda_inode_cachep, ITOC(inode));
}
@@ -205,10 +204,12 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
printk("coda_read_super: rootfid is %s\n", coda_f2s(&fid));
/* make root inode */
- error = coda_cnode_make(&root, &fid, sb);
- if ( error || !root ) {
- printk("Failure of coda_cnode_make for root: error %d\n", error);
- goto error;
+ root = coda_cnode_make(&fid, sb);
+ if (IS_ERR(root)) {
+ error = PTR_ERR(root);
+ printk("Failure of coda_cnode_make for root: error %d\n", error);
+ root = NULL;
+ goto error;
}
printk("coda_read_super: rootinode is %ld dev %s\n",
diff --git a/fs/compat.c b/fs/compat.c
index c98787536bb8..07880bae28a9 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -131,41 +131,35 @@ asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_tim
static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
{
- compat_ino_t ino = stat->ino;
- typeof(ubuf->st_uid) uid = 0;
- typeof(ubuf->st_gid) gid = 0;
- int err;
+ struct compat_stat tmp;
- SET_UID(uid, stat->uid);
- SET_GID(gid, stat->gid);
+ if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+ return -EOVERFLOW;
- if ((u64) stat->size > MAX_NON_LFS ||
- !old_valid_dev(stat->dev) ||
- !old_valid_dev(stat->rdev))
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.st_dev = old_encode_dev(stat->dev);
+ tmp.st_ino = stat->ino;
+ if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
- if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino)
+ tmp.st_mode = stat->mode;
+ tmp.st_nlink = stat->nlink;
+ if (tmp.st_nlink != stat->nlink)
return -EOVERFLOW;
-
- if (clear_user(ubuf, sizeof(*ubuf)))
- return -EFAULT;
-
- err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev);
- err |= __put_user(ino, &ubuf->st_ino);
- err |= __put_user(stat->mode, &ubuf->st_mode);
- err |= __put_user(stat->nlink, &ubuf->st_nlink);
- err |= __put_user(uid, &ubuf->st_uid);
- err |= __put_user(gid, &ubuf->st_gid);
- err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev);
- err |= __put_user(stat->size, &ubuf->st_size);
- err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime);
- err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec);
- err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime);
- err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec);
- err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime);
- err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec);
- err |= __put_user(stat->blksize, &ubuf->st_blksize);
- err |= __put_user(stat->blocks, &ubuf->st_blocks);
- return err;
+ SET_UID(tmp.st_uid, stat->uid);
+ SET_GID(tmp.st_gid, stat->gid);
+ tmp.st_rdev = old_encode_dev(stat->rdev);
+ if ((u64) stat->size > MAX_NON_LFS)
+ return -EOVERFLOW;
+ tmp.st_size = stat->size;
+ tmp.st_atime = stat->atime.tv_sec;
+ tmp.st_atime_nsec = stat->atime.tv_nsec;
+ tmp.st_mtime = stat->mtime.tv_sec;
+ tmp.st_mtime_nsec = stat->mtime.tv_nsec;
+ tmp.st_ctime = stat->ctime.tv_sec;
+ tmp.st_ctime_nsec = stat->ctime.tv_nsec;
+ tmp.st_blocks = stat->blocks;
+ tmp.st_blksize = stat->blksize;
+ return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
asmlinkage long compat_sys_newstat(const char __user * filename,
@@ -342,16 +336,9 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
*/
asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
{
- struct super_block *sb;
struct compat_ustat tmp;
struct kstatfs sbuf;
- int err;
-
- sb = user_get_super(new_decode_dev(dev));
- if (!sb)
- return -EINVAL;
- err = statfs_by_dentry(sb->s_root, &sbuf);
- drop_super(sb);
+ int err = vfs_ustat(new_decode_dev(dev), &sbuf);
if (err)
return err;
@@ -1288,7 +1275,7 @@ compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
* O_LARGEFILE flag.
*/
asmlinkage long
-compat_sys_open(const char __user *filename, int flags, int mode)
+compat_sys_open(const char __user *filename, int flags, umode_t mode)
{
return do_sys_open(AT_FDCWD, filename, flags, mode);
}
@@ -1298,7 +1285,7 @@ compat_sys_open(const char __user *filename, int flags, int mode)
* O_LARGEFILE flag.
*/
asmlinkage long
-compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode)
+compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, umode_t mode)
{
return do_sys_open(dfd, filename, flags, mode);
}
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 51352de88ef1..a26bea10e81b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -105,6 +105,7 @@
#include <linux/hiddev.h>
+#define __DVB_CORE__
#include <linux/dvb/audio.h>
#include <linux/dvb/dmx.h>
#include <linux/dvb/frontend.h>
@@ -1506,35 +1507,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
return -ENOIOCTLCMD;
}
-static void compat_ioctl_error(struct file *filp, unsigned int fd,
- unsigned int cmd, unsigned long arg)
-{
- char buf[10];
- char *fn = "?";
- char *path;
-
- /* find the name of the device. */
- path = (char *)__get_free_page(GFP_KERNEL);
- if (path) {
- fn = d_path(&filp->f_path, path, PAGE_SIZE);
- if (IS_ERR(fn))
- fn = "?";
- }
-
- sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK);
- if (!isprint(buf[1]))
- sprintf(buf, "%02x", buf[1]);
- compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
- "cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n",
- current->comm, current->pid,
- (int)fd, (unsigned int)cmd, buf,
- (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
- (unsigned int)arg, fn);
-
- if (path)
- free_page((unsigned long)path);
-}
-
static int compat_ioctl_check_table(unsigned int xcmd)
{
int i;
@@ -1621,13 +1593,8 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
goto found_handler;
error = do_ioctl_trans(fd, cmd, arg, filp);
- if (error == -ENOIOCTLCMD) {
- static int count;
-
- if (++count <= 50)
- compat_ioctl_error(filp, fd, cmd, arg);
- error = -EINVAL;
- }
+ if (error == -ENOIOCTLCMD)
+ error = -ENOTTY;
goto out_fput;
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 82bda8fdfc1c..ede857d20a04 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -63,8 +63,8 @@ extern struct kmem_cache *configfs_dir_cachep;
extern int configfs_is_root(struct config_item *item);
-extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *);
-extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *);
+extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *));
extern int configfs_inode_init(void);
extern void configfs_inode_exit(void);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9a37a9b6de3a..5ddd7ebd9dcd 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -311,8 +311,8 @@ static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
if (item->ci_parent)
parent = item->ci_parent->ci_dentry;
- else if (configfs_mount && configfs_mount->mnt_sb)
- parent = configfs_mount->mnt_sb->s_root;
+ else if (configfs_mount)
+ parent = configfs_mount->mnt_root;
else
return -EFAULT;
@@ -1170,7 +1170,7 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
}
EXPORT_SYMBOL(configfs_undepend_item);
-static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int ret = 0;
int module_got = 0;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 9d8715c45f25..3ee36d418863 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -116,7 +116,7 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
return error;
}
-static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
+static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
{
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -132,7 +132,7 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
inode->i_ctime = iattr->ia_ctime;
}
-struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
+struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent * sd)
{
struct inode * inode = new_inode(configfs_sb);
if (inode) {
@@ -185,7 +185,7 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
#endif /* CONFIG_LOCKDEP */
-int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
+int configfs_create(struct dentry * dentry, umode_t mode, int (*init)(struct inode *))
{
int error = 0;
struct inode * inode = NULL;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 739fb59bcdc2..a2ee8f9f5a38 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -20,7 +20,6 @@
#include <linux/cramfs_fs.h>
#include <linux/slab.h>
#include <linux/cramfs_fs_sb.h>
-#include <linux/buffer_head.h>
#include <linux/vfs.h>
#include <linux/mutex.h>
@@ -378,7 +377,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
unsigned long nextoffset;
char *name;
ino_t ino;
- mode_t mode;
+ umode_t mode;
int namelen, error;
mutex_lock(&read_mutex);
diff --git a/fs/dcache.c b/fs/dcache.c
index 89509b5a090e..bcbdb33fcc20 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -38,6 +38,7 @@
#include <linux/prefetch.h>
#include <linux/ratelimit.h>
#include "internal.h"
+#include "mount.h"
/*
* Usage:
@@ -103,7 +104,7 @@ static unsigned int d_hash_shift __read_mostly;
static struct hlist_bl_head *dentry_hashtable __read_mostly;
-static inline struct hlist_bl_head *d_hash(struct dentry *parent,
+static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
unsigned long hash)
{
hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
@@ -136,6 +137,26 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
}
#endif
+/*
+ * Compare 2 name strings, return 0 if they match, otherwise non-zero.
+ * The strings are both count bytes long, and count is non-zero.
+ */
+static inline int dentry_cmp(const unsigned char *cs, size_t scount,
+ const unsigned char *ct, size_t tcount)
+{
+ if (scount != tcount)
+ return 1;
+
+ do {
+ if (*cs != *ct)
+ return 1;
+ cs++;
+ ct++;
+ tcount--;
+ } while (tcount);
+ return 0;
+}
+
static void __d_free(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
@@ -242,6 +263,7 @@ static void dentry_lru_add(struct dentry *dentry)
static void __dentry_lru_del(struct dentry *dentry)
{
list_del_init(&dentry->d_lru);
+ dentry->d_flags &= ~DCACHE_SHRINK_LIST;
dentry->d_sb->s_nr_dentry_unused--;
dentry_stat.nr_unused--;
}
@@ -275,15 +297,15 @@ static void dentry_lru_prune(struct dentry *dentry)
}
}
-static void dentry_lru_move_tail(struct dentry *dentry)
+static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
{
spin_lock(&dcache_lru_lock);
if (list_empty(&dentry->d_lru)) {
- list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+ list_add_tail(&dentry->d_lru, list);
dentry->d_sb->s_nr_dentry_unused++;
dentry_stat.nr_unused++;
} else {
- list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+ list_move_tail(&dentry->d_lru, list);
}
spin_unlock(&dcache_lru_lock);
}
@@ -769,14 +791,18 @@ static void shrink_dentry_list(struct list_head *list)
}
/**
- * __shrink_dcache_sb - shrink the dentry LRU on a given superblock
- * @sb: superblock to shrink dentry LRU.
- * @count: number of entries to prune
- * @flags: flags to control the dentry processing
+ * prune_dcache_sb - shrink the dcache
+ * @sb: superblock
+ * @count: number of entries to try to free
*
- * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
+ * Attempt to shrink the superblock dcache LRU by @count entries. This is
+ * done when we need more memory an called from the superblock shrinker
+ * function.
+ *
+ * This function may fail to free any resources if all the dentries are in
+ * use.
*/
-static void __shrink_dcache_sb(struct super_block *sb, int count, int flags)
+void prune_dcache_sb(struct super_block *sb, int count)
{
struct dentry *dentry;
LIST_HEAD(referenced);
@@ -795,18 +821,13 @@ relock:
goto relock;
}
- /*
- * If we are honouring the DCACHE_REFERENCED flag and the
- * dentry has this flag set, don't free it. Clear the flag
- * and put it back on the LRU.
- */
- if (flags & DCACHE_REFERENCED &&
- dentry->d_flags & DCACHE_REFERENCED) {
+ if (dentry->d_flags & DCACHE_REFERENCED) {
dentry->d_flags &= ~DCACHE_REFERENCED;
list_move(&dentry->d_lru, &referenced);
spin_unlock(&dentry->d_lock);
} else {
list_move_tail(&dentry->d_lru, &tmp);
+ dentry->d_flags |= DCACHE_SHRINK_LIST;
spin_unlock(&dentry->d_lock);
if (!--count)
break;
@@ -821,23 +842,6 @@ relock:
}
/**
- * prune_dcache_sb - shrink the dcache
- * @sb: superblock
- * @nr_to_scan: number of entries to try to free
- *
- * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
- * done when we need more memory an called from the superblock shrinker
- * function.
- *
- * This function may fail to free any resources if all the dentries are in
- * use.
- */
-void prune_dcache_sb(struct super_block *sb, int nr_to_scan)
-{
- __shrink_dcache_sb(sb, nr_to_scan, DCACHE_REFERENCED);
-}
-
-/**
* shrink_dcache_sb - shrink dcache for a superblock
* @sb: superblock
*
@@ -1091,7 +1095,7 @@ EXPORT_SYMBOL(have_submounts);
* drop the lock and return early due to latency
* constraints.
*/
-static int select_parent(struct dentry * parent)
+static int select_parent(struct dentry *parent, struct list_head *dispose)
{
struct dentry *this_parent;
struct list_head *next;
@@ -1113,17 +1117,21 @@ resume:
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- /*
- * move only zero ref count dentries to the end
- * of the unused list for prune_dcache
+ /*
+ * move only zero ref count dentries to the dispose list.
+ *
+ * Those which are presently on the shrink list, being processed
+ * by shrink_dentry_list(), shouldn't be moved. Otherwise the
+ * loop in shrink_dcache_parent() might not make any progress
+ * and loop forever.
*/
- if (!dentry->d_count) {
- dentry_lru_move_tail(dentry);
- found++;
- } else {
+ if (dentry->d_count) {
dentry_lru_del(dentry);
+ } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
+ dentry_lru_move_list(dentry, dispose);
+ dentry->d_flags |= DCACHE_SHRINK_LIST;
+ found++;
}
-
/*
* We can return to the caller if we have found some (this
* ensures forward progress). We'll be coming back to find
@@ -1180,14 +1188,13 @@ rename_retry:
*
* Prune the dcache to remove unused children of the parent dentry.
*/
-
void shrink_dcache_parent(struct dentry * parent)
{
- struct super_block *sb = parent->d_sb;
+ LIST_HEAD(dispose);
int found;
- while ((found = select_parent(parent)) != 0)
- __shrink_dcache_sb(sb, found, 0);
+ while ((found = select_parent(parent, &dispose)) != 0)
+ shrink_dentry_list(&dispose);
}
EXPORT_SYMBOL(shrink_dcache_parent);
@@ -1460,6 +1467,23 @@ struct dentry * d_alloc_root(struct inode * root_inode)
}
EXPORT_SYMBOL(d_alloc_root);
+struct dentry *d_make_root(struct inode *root_inode)
+{
+ struct dentry *res = NULL;
+
+ if (root_inode) {
+ static const struct qstr name = { .name = "/", .len = 1 };
+
+ res = __d_alloc(root_inode->i_sb, &name);
+ if (res)
+ d_instantiate(res, root_inode);
+ else
+ iput(root_inode);
+ }
+ return res;
+}
+EXPORT_SYMBOL(d_make_root);
+
static struct dentry * __d_find_any_alias(struct inode *inode)
{
struct dentry *alias;
@@ -1471,7 +1495,14 @@ static struct dentry * __d_find_any_alias(struct inode *inode)
return alias;
}
-static struct dentry * d_find_any_alias(struct inode *inode)
+/**
+ * d_find_any_alias - find any alias for a given inode
+ * @inode: inode to find an alias for
+ *
+ * If any aliases exist for the given inode, take and return a
+ * reference for one of them. If no aliases exist, return %NULL.
+ */
+struct dentry *d_find_any_alias(struct inode *inode)
{
struct dentry *de;
@@ -1480,7 +1511,7 @@ static struct dentry * d_find_any_alias(struct inode *inode)
spin_unlock(&inode->i_lock);
return de;
}
-
+EXPORT_SYMBOL(d_find_any_alias);
/**
* d_obtain_alias - find or allocate a dentry for a given inode
@@ -1706,8 +1737,9 @@ EXPORT_SYMBOL(d_add_ci);
* child is looked up. Thus, an interlocking stepping of sequence lock checks
* is formed, giving integrity down the path walk.
*/
-struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
- unsigned *seq, struct inode **inode)
+struct dentry *__d_lookup_rcu(const struct dentry *parent,
+ const struct qstr *name,
+ unsigned *seqp, struct inode **inode)
{
unsigned int len = name->len;
unsigned int hash = name->hash;
@@ -1737,6 +1769,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
* See Documentation/filesystems/path-lookup.txt for more details.
*/
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
+ unsigned seq;
struct inode *i;
const char *tname;
int tlen;
@@ -1745,7 +1778,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
continue;
seqretry:
- *seq = read_seqcount_begin(&dentry->d_seq);
+ seq = read_seqcount_begin(&dentry->d_seq);
if (dentry->d_parent != parent)
continue;
if (d_unhashed(dentry))
@@ -1760,7 +1793,7 @@ seqretry:
* edge of memory when walking. If we could load this
* atomically some other way, we could drop this check.
*/
- if (read_seqcount_retry(&dentry->d_seq, *seq))
+ if (read_seqcount_retry(&dentry->d_seq, seq))
goto seqretry;
if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
if (parent->d_op->d_compare(parent, *inode,
@@ -1777,6 +1810,7 @@ seqretry:
* order to do anything useful with the returned dentry
* anyway.
*/
+ *seqp = seq;
*inode = i;
return dentry;
}
@@ -2451,6 +2485,7 @@ static int prepend_path(const struct path *path,
{
struct dentry *dentry = path->dentry;
struct vfsmount *vfsmnt = path->mnt;
+ struct mount *mnt = real_mount(vfsmnt);
bool slash = false;
int error = 0;
@@ -2460,11 +2495,11 @@ static int prepend_path(const struct path *path,
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
/* Global root? */
- if (vfsmnt->mnt_parent == vfsmnt) {
+ if (!mnt_has_parent(mnt))
goto global_root;
- }
- dentry = vfsmnt->mnt_mountpoint;
- vfsmnt = vfsmnt->mnt_parent;
+ dentry = mnt->mnt_mountpoint;
+ mnt = mnt->mnt_parent;
+ vfsmnt = &mnt->mnt;
continue;
}
parent = dentry->d_parent;
@@ -2501,7 +2536,7 @@ global_root:
if (!slash)
error = prepend(buffer, buflen, "/", 1);
if (!error)
- error = vfsmnt->mnt_ns ? 1 : 2;
+ error = real_mount(vfsmnt)->mnt_ns ? 1 : 2;
goto out;
}
@@ -2853,31 +2888,6 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
return result;
}
-int path_is_under(struct path *path1, struct path *path2)
-{
- struct vfsmount *mnt = path1->mnt;
- struct dentry *dentry = path1->dentry;
- int res;
-
- br_read_lock(vfsmount_lock);
- if (mnt != path2->mnt) {
- for (;;) {
- if (mnt->mnt_parent == mnt) {
- br_read_unlock(vfsmount_lock);
- return 0;
- }
- if (mnt->mnt_parent == path2->mnt)
- break;
- mnt = mnt->mnt_parent;
- }
- dentry = mnt->mnt_mountpoint;
- }
- res = is_subdir(dentry, path2->dentry);
- br_read_unlock(vfsmount_lock);
- return res;
-}
-EXPORT_SYMBOL(path_is_under);
-
void d_genocide(struct dentry *root)
{
struct dentry *this_parent;
@@ -2981,7 +2991,7 @@ __setup("dhash_entries=", set_dhash_entries);
static void __init dcache_init_early(void)
{
- int loop;
+ unsigned int loop;
/* If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
@@ -2999,13 +3009,13 @@ static void __init dcache_init_early(void)
&d_hash_mask,
0);
- for (loop = 0; loop < (1 << d_hash_shift); loop++)
+ for (loop = 0; loop < (1U << d_hash_shift); loop++)
INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
}
static void __init dcache_init(void)
{
- int loop;
+ unsigned int loop;
/*
* A constructor could be added for stable state like the lists,
@@ -3029,7 +3039,7 @@ static void __init dcache_init(void)
&d_hash_mask,
0);
- for (loop = 0; loop < (1 << d_hash_shift); loop++)
+ for (loop = 0; loop < (1U << d_hash_shift); loop++)
INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
}
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 90f76575c056..ef023eef0464 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -15,9 +15,11 @@
#include <linux/module.h>
#include <linux/fs.h>
+#include <linux/seq_file.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/debugfs.h>
+#include <linux/io.h>
static ssize_t default_read_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -95,7 +97,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
* %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
* code.
*/
-struct dentry *debugfs_create_u8(const char *name, mode_t mode,
+struct dentry *debugfs_create_u8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
/* if there are no write bits set, make read only */
@@ -147,7 +149,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
* %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
* code.
*/
-struct dentry *debugfs_create_u16(const char *name, mode_t mode,
+struct dentry *debugfs_create_u16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
/* if there are no write bits set, make read only */
@@ -199,7 +201,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
* %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
* code.
*/
-struct dentry *debugfs_create_u32(const char *name, mode_t mode,
+struct dentry *debugfs_create_u32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
/* if there are no write bits set, make read only */
@@ -252,7 +254,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
* %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
* code.
*/
-struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+struct dentry *debugfs_create_u64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
/* if there are no write bits set, make read only */
@@ -298,7 +300,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+struct dentry *debugfs_create_x8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
/* if there are no write bits set, make read only */
@@ -322,7 +324,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+struct dentry *debugfs_create_x16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
/* if there are no write bits set, make read only */
@@ -346,7 +348,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+struct dentry *debugfs_create_x32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
/* if there are no write bits set, make read only */
@@ -370,7 +372,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32);
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_x64(const char *name, mode_t mode,
+struct dentry *debugfs_create_x64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
return debugfs_create_file(name, mode, parent, value, &fops_x64);
@@ -401,7 +403,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
struct dentry *parent, size_t *value)
{
return debugfs_create_file(name, mode, parent, value, &fops_size_t);
@@ -473,7 +475,7 @@ static const struct file_operations fops_bool = {
* %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
* code.
*/
-struct dentry *debugfs_create_bool(const char *name, mode_t mode,
+struct dentry *debugfs_create_bool(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
return debugfs_create_file(name, mode, parent, value, &fops_bool);
@@ -518,10 +520,103 @@ static const struct file_operations fops_blob = {
* %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
* code.
*/
-struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+struct dentry *debugfs_create_blob(const char *name, umode_t mode,
struct dentry *parent,
struct debugfs_blob_wrapper *blob)
{
return debugfs_create_file(name, mode, parent, blob, &fops_blob);
}
EXPORT_SYMBOL_GPL(debugfs_create_blob);
+
+#ifdef CONFIG_HAS_IOMEM
+
+/*
+ * The regset32 stuff is used to print 32-bit registers using the
+ * seq_file utilities. We offer printing a register set in an already-opened
+ * sequential file or create a debugfs file that only prints a regset32.
+ */
+
+/**
+ * debugfs_print_regs32 - use seq_print to describe a set of registers
+ * @s: the seq_file structure being used to generate output
+ * @regs: an array if struct debugfs_reg32 structures
+ * @nregs: the length of the above array
+ * @base: the base address to be used in reading the registers
+ * @prefix: a string to be prefixed to every output line
+ *
+ * This function outputs a text block describing the current values of
+ * some 32-bit hardware registers. It is meant to be used within debugfs
+ * files based on seq_file that need to show registers, intermixed with other
+ * information. The prefix argument may be used to specify a leading string,
+ * because some peripherals have several blocks of identical registers,
+ * for example configuration of dma channels
+ */
+int debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
+ int nregs, void __iomem *base, char *prefix)
+{
+ int i, ret = 0;
+
+ for (i = 0; i < nregs; i++, regs++) {
+ if (prefix)
+ ret += seq_printf(s, "%s", prefix);
+ ret += seq_printf(s, "%s = 0x%08x\n", regs->name,
+ readl(base + regs->offset));
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(debugfs_print_regs32);
+
+static int debugfs_show_regset32(struct seq_file *s, void *data)
+{
+ struct debugfs_regset32 *regset = s->private;
+
+ debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, "");
+ return 0;
+}
+
+static int debugfs_open_regset32(struct inode *inode, struct file *file)
+{
+ return single_open(file, debugfs_show_regset32, inode->i_private);
+}
+
+static const struct file_operations fops_regset32 = {
+ .open = debugfs_open_regset32,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/**
+ * debugfs_create_regset32 - create a debugfs file that returns register values
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @regset: a pointer to a struct debugfs_regset32, which contains a pointer
+ * to an array of register definitions, the array size and the base
+ * address where the register bank is to be found.
+ *
+ * This function creates a file in debugfs with the given name that reports
+ * the names and values of a set of 32-bit registers. If the @mode variable
+ * is so set it can be read from. Writing is not supported.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned. It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_regset32(const char *name, mode_t mode,
+ struct dentry *parent,
+ struct debugfs_regset32 *regset)
+{
+ return debugfs_create_file(name, mode, parent, regset, &fops_regset32);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_regset32);
+
+#endif /* CONFIG_HAS_IOMEM */
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index f3a257d7a985..956d5ddddf6e 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -30,7 +30,7 @@ static struct vfsmount *debugfs_mount;
static int debugfs_mount_count;
static bool debugfs_registered;
-static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev,
+static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev,
void *data, const struct file_operations *fops)
{
@@ -69,7 +69,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
/* SMP-safe */
static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
- int mode, dev_t dev, void *data,
+ umode_t mode, dev_t dev, void *data,
const struct file_operations *fops)
{
struct inode *inode;
@@ -87,7 +87,7 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
return error;
}
-static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode,
+static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode,
void *data, const struct file_operations *fops)
{
int res;
@@ -101,14 +101,14 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode,
return res;
}
-static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode,
+static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode,
void *data, const struct file_operations *fops)
{
mode = (mode & S_IALLUGO) | S_IFLNK;
return debugfs_mknod(dir, dentry, mode, 0, data, fops);
}
-static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode,
+static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
void *data, const struct file_operations *fops)
{
int res;
@@ -146,7 +146,7 @@ static struct file_system_type debug_fs_type = {
.kill_sb = kill_litter_super,
};
-static int debugfs_create_by_name(const char *name, mode_t mode,
+static int debugfs_create_by_name(const char *name, umode_t mode,
struct dentry *parent,
struct dentry **dentry,
void *data,
@@ -160,7 +160,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
* have around.
*/
if (!parent)
- parent = debugfs_mount->mnt_sb->s_root;
+ parent = debugfs_mount->mnt_root;
*dentry = NULL;
mutex_lock(&parent->d_inode->i_mutex);
@@ -214,7 +214,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
*/
-struct dentry *debugfs_create_file(const char *name, mode_t mode,
+struct dentry *debugfs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops)
{
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index d5d5297efe97..c4e2a58a2e82 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -246,9 +246,9 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
return err;
}
-static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int devpts_show_options(struct seq_file *seq, struct dentry *root)
{
- struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb);
+ struct pts_fs_info *fsi = DEVPTS_SB(root->d_sb);
struct pts_mount_opts *opts = &fsi->mount_opts;
if (opts->setuid)
@@ -301,7 +301,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
inode = new_inode(s);
if (!inode)
- goto free_fsi;
+ goto fail;
inode->i_ino = 1;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
@@ -316,8 +316,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
printk(KERN_ERR "devpts: get root dentry failed\n");
iput(inode);
-free_fsi:
- kfree(s->s_fs_info);
fail:
return -ENOMEM;
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d740ab67ff6e..f4aadd15b613 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -36,6 +36,7 @@
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <linux/atomic.h>
+#include <linux/prefetch.h>
/*
* How many user pages to map in one call to get_user_pages(). This determines
@@ -172,7 +173,7 @@ void inode_dio_wait(struct inode *inode)
if (atomic_read(&inode->i_dio_count))
__inode_dio_wait(inode);
}
-EXPORT_SYMBOL_GPL(inode_dio_wait);
+EXPORT_SYMBOL(inode_dio_wait);
/*
* inode_dio_done - signal finish of a direct I/O requests
@@ -186,7 +187,7 @@ void inode_dio_done(struct inode *inode)
if (atomic_dec_and_test(&inode->i_dio_count))
wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
}
-EXPORT_SYMBOL_GPL(inode_dio_done);
+EXPORT_SYMBOL(inode_dio_done);
/*
* How many pages are in the queue?
@@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
{
int ret;
sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
+ sector_t fs_endblk; /* Into file, in filesystem-sized blocks */
unsigned long fs_count; /* Number of filesystem-sized blocks */
- unsigned long dio_count;/* Number of dio_block-sized blocks */
- unsigned long blkmask;
int create;
/*
@@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
if (ret == 0) {
BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
fs_startblk = sdio->block_in_file >> sdio->blkfactor;
- dio_count = sdio->final_block_in_request - sdio->block_in_file;
- fs_count = dio_count >> sdio->blkfactor;
- blkmask = (1 << sdio->blkfactor) - 1;
- if (dio_count & blkmask)
- fs_count++;
+ fs_endblk = (sdio->final_block_in_request - 1) >>
+ sdio->blkfactor;
+ fs_count = fs_endblk - fs_startblk + 1;
map_bh->b_state = 0;
map_bh->b_size = fs_count << dio->inode->i_blkbits;
@@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio)
* individual fields and will generate much worse code. This is important
* for the whole file.
*/
-ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+static inline ssize_t
+do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
@@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
size_t size;
unsigned long addr;
unsigned blkbits = inode->i_blkbits;
- unsigned bdev_blkbits = 0;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
loff_t end = offset;
@@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (rw & WRITE)
rw = WRITE_ODIRECT;
- if (bdev)
- bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
+ /*
+ * Avoid references to bdev if not absolutely needed to give
+ * the early prefetch in the caller enough time.
+ */
if (offset & blocksize_mask) {
if (bdev)
- blkbits = bdev_blkbits;
+ blkbits = blksize_bits(bdev_logical_block_size(bdev));
blocksize_mask = (1 << blkbits) - 1;
if (offset & blocksize_mask)
goto out;
@@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
addr = (unsigned long)iov[seg].iov_base;
size = iov[seg].iov_len;
end += size;
- if ((addr & blocksize_mask) || (size & blocksize_mask)) {
+ if (unlikely((addr & blocksize_mask) ||
+ (size & blocksize_mask))) {
if (bdev)
- blkbits = bdev_blkbits;
+ blkbits = blksize_bits(
+ bdev_logical_block_size(bdev));
blocksize_mask = (1 << blkbits) - 1;
- if ((addr & blocksize_mask) || (size & blocksize_mask))
+ if ((addr & blocksize_mask) || (size & blocksize_mask))
goto out;
}
}
@@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
out:
return retval;
}
+
+ssize_t
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+ struct block_device *bdev, const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+ dio_submit_t submit_io, int flags)
+{
+ /*
+ * The block device state is needed in the end to finally
+ * submit everything. Since it's likely to be cache cold
+ * prefetch it here as first thing to hide some of the
+ * latency.
+ *
+ * Attempt to prefetch the pieces we likely need later.
+ */
+ prefetch(&bdev->bd_disk->part_tbl);
+ prefetch(bdev->bd_queue);
+ prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
+
+ return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+ nr_segs, get_block, end_io,
+ submit_io, flags);
+}
+
EXPORT_SYMBOL(__blockdev_direct_IO);
static __init int dio_init(void)
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 6cf72fcc0d0c..e7e327d43fa5 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/in.h>
#include <linux/in6.h>
+#include <linux/dlmconstants.h>
#include <net/ipv6.h>
#include <net/sock.h>
@@ -36,6 +37,7 @@
static struct config_group *space_list;
static struct config_group *comm_list;
static struct dlm_comm *local_comm;
+static uint32_t dlm_comm_count;
struct dlm_clusters;
struct dlm_cluster;
@@ -103,6 +105,8 @@ struct dlm_cluster {
unsigned int cl_timewarn_cs;
unsigned int cl_waitwarn_us;
unsigned int cl_new_rsb_count;
+ unsigned int cl_recover_callbacks;
+ char cl_cluster_name[DLM_LOCKSPACE_LEN];
};
enum {
@@ -118,6 +122,8 @@ enum {
CLUSTER_ATTR_TIMEWARN_CS,
CLUSTER_ATTR_WAITWARN_US,
CLUSTER_ATTR_NEW_RSB_COUNT,
+ CLUSTER_ATTR_RECOVER_CALLBACKS,
+ CLUSTER_ATTR_CLUSTER_NAME,
};
struct cluster_attribute {
@@ -126,6 +132,27 @@ struct cluster_attribute {
ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
};
+static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
+{
+ return sprintf(buf, "%s\n", cl->cl_cluster_name);
+}
+
+static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
+ const char *buf, size_t len)
+{
+ strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN);
+ strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN);
+ return len;
+}
+
+static struct cluster_attribute cluster_attr_cluster_name = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "cluster_name",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = cluster_cluster_name_read,
+ .store = cluster_cluster_name_write,
+};
+
static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
int *info_field, int check_zero,
const char *buf, size_t len)
@@ -171,6 +198,7 @@ CLUSTER_ATTR(protocol, 0);
CLUSTER_ATTR(timewarn_cs, 1);
CLUSTER_ATTR(waitwarn_us, 0);
CLUSTER_ATTR(new_rsb_count, 0);
+CLUSTER_ATTR(recover_callbacks, 0);
static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -185,6 +213,8 @@ static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
[CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
[CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr,
+ [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr,
+ [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr,
NULL,
};
@@ -293,6 +323,7 @@ struct dlm_comms {
struct dlm_comm {
struct config_item item;
+ int seq;
int nodeid;
int local;
int addr_count;
@@ -309,6 +340,7 @@ struct dlm_node {
int nodeid;
int weight;
int new;
+ int comm_seq; /* copy of cm->seq when nd->nodeid is set */
};
static struct configfs_group_operations clusters_ops = {
@@ -455,6 +487,9 @@ static struct config_group *make_cluster(struct config_group *g,
cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count;
+ cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks;
+ memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name,
+ DLM_LOCKSPACE_LEN);
space_list = &sps->ss_group;
comm_list = &cms->cs_group;
@@ -558,6 +593,11 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
return ERR_PTR(-ENOMEM);
config_item_init_type_name(&cm->item, name, &comm_type);
+
+ cm->seq = dlm_comm_count++;
+ if (!cm->seq)
+ cm->seq = dlm_comm_count++;
+
cm->nodeid = -1;
cm->local = 0;
cm->addr_count = 0;
@@ -801,7 +841,10 @@ static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
size_t len)
{
+ uint32_t seq = 0;
nd->nodeid = simple_strtol(buf, NULL, 0);
+ dlm_comm_seq(nd->nodeid, &seq);
+ nd->comm_seq = seq;
return len;
}
@@ -908,13 +951,13 @@ static void put_comm(struct dlm_comm *cm)
}
/* caller must free mem */
-int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
- int **new_out, int *new_count_out)
+int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
+ int *count_out)
{
struct dlm_space *sp;
struct dlm_node *nd;
- int i = 0, rv = 0, ids_count = 0, new_count = 0;
- int *ids, *new;
+ struct dlm_config_node *nodes, *node;
+ int rv, count;
sp = get_space(lsname);
if (!sp)
@@ -927,73 +970,42 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
goto out;
}
- ids_count = sp->members_count;
+ count = sp->members_count;
- ids = kcalloc(ids_count, sizeof(int), GFP_NOFS);
- if (!ids) {
+ nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS);
+ if (!nodes) {
rv = -ENOMEM;
goto out;
}
+ node = nodes;
list_for_each_entry(nd, &sp->members, list) {
- ids[i++] = nd->nodeid;
- if (nd->new)
- new_count++;
- }
-
- if (ids_count != i)
- printk(KERN_ERR "dlm: bad nodeid count %d %d\n", ids_count, i);
-
- if (!new_count)
- goto out_ids;
+ node->nodeid = nd->nodeid;
+ node->weight = nd->weight;
+ node->new = nd->new;
+ node->comm_seq = nd->comm_seq;
+ node++;
- new = kcalloc(new_count, sizeof(int), GFP_NOFS);
- if (!new) {
- kfree(ids);
- rv = -ENOMEM;
- goto out;
+ nd->new = 0;
}
- i = 0;
- list_for_each_entry(nd, &sp->members, list) {
- if (nd->new) {
- new[i++] = nd->nodeid;
- nd->new = 0;
- }
- }
- *new_count_out = new_count;
- *new_out = new;
-
- out_ids:
- *ids_count_out = ids_count;
- *ids_out = ids;
+ *count_out = count;
+ *nodes_out = nodes;
+ rv = 0;
out:
mutex_unlock(&sp->members_lock);
put_space(sp);
return rv;
}
-int dlm_node_weight(char *lsname, int nodeid)
+int dlm_comm_seq(int nodeid, uint32_t *seq)
{
- struct dlm_space *sp;
- struct dlm_node *nd;
- int w = -EEXIST;
-
- sp = get_space(lsname);
- if (!sp)
- goto out;
-
- mutex_lock(&sp->members_lock);
- list_for_each_entry(nd, &sp->members, list) {
- if (nd->nodeid != nodeid)
- continue;
- w = nd->weight;
- break;
- }
- mutex_unlock(&sp->members_lock);
- put_space(sp);
- out:
- return w;
+ struct dlm_comm *cm = get_comm(nodeid, NULL);
+ if (!cm)
+ return -EEXIST;
+ *seq = cm->seq;
+ put_comm(cm);
+ return 0;
}
int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
@@ -1047,6 +1059,8 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
#define DEFAULT_WAITWARN_US 0
#define DEFAULT_NEW_RSB_COUNT 128
+#define DEFAULT_RECOVER_CALLBACKS 0
+#define DEFAULT_CLUSTER_NAME ""
struct dlm_config_info dlm_config = {
.ci_tcp_port = DEFAULT_TCP_PORT,
@@ -1060,6 +1074,8 @@ struct dlm_config_info dlm_config = {
.ci_protocol = DEFAULT_PROTOCOL,
.ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
.ci_waitwarn_us = DEFAULT_WAITWARN_US,
- .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT
+ .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT,
+ .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS,
+ .ci_cluster_name = DEFAULT_CLUSTER_NAME
};
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 3099d0dd26c0..9f5e3663bb0c 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,13 @@
#ifndef __CONFIG_DOT_H__
#define __CONFIG_DOT_H__
+struct dlm_config_node {
+ int nodeid;
+ int weight;
+ int new;
+ uint32_t comm_seq;
+};
+
#define DLM_MAX_ADDR_COUNT 3
struct dlm_config_info {
@@ -29,15 +36,17 @@ struct dlm_config_info {
int ci_timewarn_cs;
int ci_waitwarn_us;
int ci_new_rsb_count;
+ int ci_recover_callbacks;
+ char ci_cluster_name[DLM_LOCKSPACE_LEN];
};
extern struct dlm_config_info dlm_config;
int dlm_config_init(void);
void dlm_config_exit(void);
-int dlm_node_weight(char *lsname, int nodeid);
-int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
- int **new_out, int *new_count_out);
+int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
+ int *count_out);
+int dlm_comm_seq(int nodeid, uint32_t *seq);
int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
int dlm_our_nodeid(void);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 59779237e2b4..3dca2b39e83f 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -393,6 +393,7 @@ static const struct seq_operations format3_seq_ops;
static void *table_seq_start(struct seq_file *seq, loff_t *pos)
{
+ struct rb_node *node;
struct dlm_ls *ls = seq->private;
struct rsbtbl_iter *ri;
struct dlm_rsb *r;
@@ -418,9 +419,10 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
ri->format = 3;
spin_lock(&ls->ls_rsbtbl[bucket].lock);
- if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
- list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list,
- res_hashchain) {
+ if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) {
+ for (node = rb_first(&ls->ls_rsbtbl[bucket].keep); node;
+ node = rb_next(node)) {
+ r = rb_entry(node, struct dlm_rsb, res_hashnode);
if (!entry--) {
dlm_hold_rsb(r);
ri->rsb = r;
@@ -449,9 +451,9 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
}
spin_lock(&ls->ls_rsbtbl[bucket].lock);
- if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
- r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
- struct dlm_rsb, res_hashchain);
+ if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) {
+ node = rb_first(&ls->ls_rsbtbl[bucket].keep);
+ r = rb_entry(node, struct dlm_rsb, res_hashnode);
dlm_hold_rsb(r);
ri->rsb = r;
ri->bucket = bucket;
@@ -467,7 +469,7 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
{
struct dlm_ls *ls = seq->private;
struct rsbtbl_iter *ri = iter_ptr;
- struct list_head *next;
+ struct rb_node *next;
struct dlm_rsb *r, *rp;
loff_t n = *pos;
unsigned bucket;
@@ -480,10 +482,10 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
spin_lock(&ls->ls_rsbtbl[bucket].lock);
rp = ri->rsb;
- next = rp->res_hashchain.next;
+ next = rb_next(&rp->res_hashnode);
- if (next != &ls->ls_rsbtbl[bucket].list) {
- r = list_entry(next, struct dlm_rsb, res_hashchain);
+ if (next) {
+ r = rb_entry(next, struct dlm_rsb, res_hashnode);
dlm_hold_rsb(r);
ri->rsb = r;
spin_unlock(&ls->ls_rsbtbl[bucket].lock);
@@ -511,9 +513,9 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
}
spin_lock(&ls->ls_rsbtbl[bucket].lock);
- if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
- r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
- struct dlm_rsb, res_hashchain);
+ if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) {
+ next = rb_first(&ls->ls_rsbtbl[bucket].keep);
+ r = rb_entry(next, struct dlm_rsb, res_hashnode);
dlm_hold_rsb(r);
ri->rsb = r;
ri->bucket = bucket;
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 7b84c1dbc82e..83641574b016 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -290,7 +290,6 @@ int dlm_recover_directory(struct dlm_ls *ls)
out_status:
error = 0;
- dlm_set_recover_status(ls, DLM_RS_DIR);
log_debug(ls, "dlm_recover_directory %d entries", count);
out_free:
kfree(last_name);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index fe2860c02449..3a564d197e99 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -103,8 +103,8 @@ struct dlm_dirtable {
};
struct dlm_rsbtable {
- struct list_head list;
- struct list_head toss;
+ struct rb_root keep;
+ struct rb_root toss;
spinlock_t lock;
};
@@ -117,6 +117,10 @@ struct dlm_member {
struct list_head list;
int nodeid;
int weight;
+ int slot;
+ int slot_prev;
+ int comm_seq;
+ uint32_t generation;
};
/*
@@ -125,10 +129,8 @@ struct dlm_member {
struct dlm_recover {
struct list_head list;
- int *nodeids; /* nodeids of all members */
- int node_count;
- int *new; /* nodeids of new members */
- int new_count;
+ struct dlm_config_node *nodes;
+ int nodes_count;
uint64_t seq;
};
@@ -285,7 +287,10 @@ struct dlm_rsb {
unsigned long res_toss_time;
uint32_t res_first_lkid;
struct list_head res_lookup; /* lkbs waiting on first */
- struct list_head res_hashchain; /* rsbtbl */
+ union {
+ struct list_head res_hashchain;
+ struct rb_node res_hashnode; /* rsbtbl */
+ };
struct list_head res_grantqueue;
struct list_head res_convertqueue;
struct list_head res_waitqueue;
@@ -334,7 +339,9 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
/* dlm_header is first element of all structs sent between nodes */
#define DLM_HEADER_MAJOR 0x00030000
-#define DLM_HEADER_MINOR 0x00000000
+#define DLM_HEADER_MINOR 0x00000001
+
+#define DLM_HEADER_SLOTS 0x00000001
#define DLM_MSG 1
#define DLM_RCOM 2
@@ -422,10 +429,34 @@ union dlm_packet {
struct dlm_rcom rcom;
};
+#define DLM_RSF_NEED_SLOTS 0x00000001
+
+/* RCOM_STATUS data */
+struct rcom_status {
+ __le32 rs_flags;
+ __le32 rs_unused1;
+ __le64 rs_unused2;
+};
+
+/* RCOM_STATUS_REPLY data */
struct rcom_config {
__le32 rf_lvblen;
__le32 rf_lsflags;
- __le64 rf_unused;
+
+ /* DLM_HEADER_SLOTS adds: */
+ __le32 rf_flags;
+ __le16 rf_our_slot;
+ __le16 rf_num_slots;
+ __le32 rf_generation;
+ __le32 rf_unused1;
+ __le64 rf_unused2;
+};
+
+struct rcom_slot {
+ __le32 ro_nodeid;
+ __le16 ro_slot;
+ __le16 ro_unused1;
+ __le64 ro_unused2;
};
struct rcom_lock {
@@ -452,6 +483,7 @@ struct dlm_ls {
struct list_head ls_list; /* list of lockspaces */
dlm_lockspace_t *ls_local_handle;
uint32_t ls_global_id; /* global unique lockspace ID */
+ uint32_t ls_generation;
uint32_t ls_exflags;
int ls_lvblen;
int ls_count; /* refcount of processes in
@@ -490,6 +522,11 @@ struct dlm_ls {
int ls_total_weight;
int *ls_node_array;
+ int ls_slot;
+ int ls_num_slots;
+ int ls_slots_size;
+ struct dlm_slot *ls_slots;
+
struct dlm_rsb ls_stub_rsb; /* for returning errors */
struct dlm_lkb ls_stub_lkb; /* for returning errors */
struct dlm_message ls_stub_ms; /* for faking a reply */
@@ -537,6 +574,9 @@ struct dlm_ls {
struct list_head ls_root_list; /* root resources */
struct rw_semaphore ls_root_sem; /* protect root_list */
+ const struct dlm_lockspace_ops *ls_ops;
+ void *ls_ops_arg;
+
int ls_namelen;
char ls_name[1];
};
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 83b5e32514e1..d47183043c59 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -56,6 +56,7 @@
L: receive_xxxx_reply() <- R: send_xxxx_reply()
*/
#include <linux/types.h>
+#include <linux/rbtree.h>
#include <linux/slab.h>
#include "dlm_internal.h"
#include <linux/dlm_device.h>
@@ -380,6 +381,8 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain);
list_del(&r->res_hashchain);
+ /* Convert the empty list_head to a NULL rb_node for tree usage: */
+ memset(&r->res_hashnode, 0, sizeof(struct rb_node));
ls->ls_new_rsb_count--;
spin_unlock(&ls->ls_new_rsb_spin);
@@ -388,7 +391,6 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
memcpy(r->res_name, name, len);
mutex_init(&r->res_mutex);
- INIT_LIST_HEAD(&r->res_hashchain);
INIT_LIST_HEAD(&r->res_lookup);
INIT_LIST_HEAD(&r->res_grantqueue);
INIT_LIST_HEAD(&r->res_convertqueue);
@@ -400,14 +402,31 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
return 0;
}
-static int search_rsb_list(struct list_head *head, char *name, int len,
+static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen)
+{
+ char maxname[DLM_RESNAME_MAXLEN];
+
+ memset(maxname, 0, DLM_RESNAME_MAXLEN);
+ memcpy(maxname, name, nlen);
+ return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN);
+}
+
+static int search_rsb_tree(struct rb_root *tree, char *name, int len,
unsigned int flags, struct dlm_rsb **r_ret)
{
+ struct rb_node *node = tree->rb_node;
struct dlm_rsb *r;
int error = 0;
-
- list_for_each_entry(r, head, res_hashchain) {
- if (len == r->res_length && !memcmp(name, r->res_name, len))
+ int rc;
+
+ while (node) {
+ r = rb_entry(node, struct dlm_rsb, res_hashnode);
+ rc = rsb_cmp(r, name, len);
+ if (rc < 0)
+ node = node->rb_left;
+ else if (rc > 0)
+ node = node->rb_right;
+ else
goto found;
}
*r_ret = NULL;
@@ -420,22 +439,54 @@ static int search_rsb_list(struct list_head *head, char *name, int len,
return error;
}
+static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree)
+{
+ struct rb_node **newn = &tree->rb_node;
+ struct rb_node *parent = NULL;
+ int rc;
+
+ while (*newn) {
+ struct dlm_rsb *cur = rb_entry(*newn, struct dlm_rsb,
+ res_hashnode);
+
+ parent = *newn;
+ rc = rsb_cmp(cur, rsb->res_name, rsb->res_length);
+ if (rc < 0)
+ newn = &parent->rb_left;
+ else if (rc > 0)
+ newn = &parent->rb_right;
+ else {
+ log_print("rsb_insert match");
+ dlm_dump_rsb(rsb);
+ dlm_dump_rsb(cur);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&rsb->res_hashnode, parent, newn);
+ rb_insert_color(&rsb->res_hashnode, tree);
+ return 0;
+}
+
static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
unsigned int flags, struct dlm_rsb **r_ret)
{
struct dlm_rsb *r;
int error;
- error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
+ error = search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, flags, &r);
if (!error) {
kref_get(&r->res_ref);
goto out;
}
- error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
+ error = search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
if (error)
goto out;
- list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
+ rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
+ error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
+ if (error)
+ return error;
if (dlm_no_directory(ls))
goto out;
@@ -527,8 +578,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
nodeid = 0;
r->res_nodeid = nodeid;
}
- list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
- error = 0;
+ error = rsb_insert(r, &ls->ls_rsbtbl[bucket].keep);
out_unlock:
spin_unlock(&ls->ls_rsbtbl[bucket].lock);
out:
@@ -556,7 +606,8 @@ static void toss_rsb(struct kref *kref)
DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
kref_init(&r->res_ref);
- list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
+ rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep);
+ rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss);
r->res_toss_time = jiffies;
if (r->res_lvbptr) {
dlm_free_lvb(r->res_lvbptr);
@@ -1082,19 +1133,19 @@ static void dir_remove(struct dlm_rsb *r)
r->res_name, r->res_length);
}
-/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
- found since they are in order of newest to oldest? */
+/* FIXME: make this more efficient */
static int shrink_bucket(struct dlm_ls *ls, int b)
{
+ struct rb_node *n;
struct dlm_rsb *r;
int count = 0, found;
for (;;) {
found = 0;
spin_lock(&ls->ls_rsbtbl[b].lock);
- list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
- res_hashchain) {
+ for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = rb_next(n)) {
+ r = rb_entry(n, struct dlm_rsb, res_hashnode);
if (!time_after_eq(jiffies, r->res_toss_time +
dlm_config.ci_toss_secs * HZ))
continue;
@@ -1108,7 +1159,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
}
if (kref_put(&r->res_ref, kill_rsb)) {
- list_del(&r->res_hashchain);
+ rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
spin_unlock(&ls->ls_rsbtbl[b].lock);
if (is_master(r))
@@ -4441,10 +4492,12 @@ int dlm_purge_locks(struct dlm_ls *ls)
static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
{
+ struct rb_node *n;
struct dlm_rsb *r, *r_ret = NULL;
spin_lock(&ls->ls_rsbtbl[bucket].lock);
- list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
+ for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) {
+ r = rb_entry(n, struct dlm_rsb, res_hashnode);
if (!rsb_flag(r, RSB_LOCKS_PURGED))
continue;
hold_rsb(r);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index a1d8f1af144b..a1ea25face82 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -386,12 +386,15 @@ static void threads_stop(void)
dlm_lowcomms_stop();
}
-static int new_lockspace(const char *name, int namelen, void **lockspace,
- uint32_t flags, int lvblen)
+static int new_lockspace(const char *name, const char *cluster,
+ uint32_t flags, int lvblen,
+ const struct dlm_lockspace_ops *ops, void *ops_arg,
+ int *ops_result, dlm_lockspace_t **lockspace)
{
struct dlm_ls *ls;
int i, size, error;
int do_unreg = 0;
+ int namelen = strlen(name);
if (namelen > DLM_LOCKSPACE_LEN)
return -EINVAL;
@@ -403,8 +406,24 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
return -EINVAL;
if (!dlm_user_daemon_available()) {
- module_put(THIS_MODULE);
- return -EUNATCH;
+ log_print("dlm user daemon not available");
+ error = -EUNATCH;
+ goto out;
+ }
+
+ if (ops && ops_result) {
+ if (!dlm_config.ci_recover_callbacks)
+ *ops_result = -EOPNOTSUPP;
+ else
+ *ops_result = 0;
+ }
+
+ if (dlm_config.ci_recover_callbacks && cluster &&
+ strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) {
+ log_print("dlm cluster name %s mismatch %s",
+ dlm_config.ci_cluster_name, cluster);
+ error = -EBADR;
+ goto out;
}
error = 0;
@@ -442,6 +461,11 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
ls->ls_flags = 0;
ls->ls_scan_time = jiffies;
+ if (ops && dlm_config.ci_recover_callbacks) {
+ ls->ls_ops = ops;
+ ls->ls_ops_arg = ops_arg;
+ }
+
if (flags & DLM_LSFL_TIMEWARN)
set_bit(LSFL_TIMEWARN, &ls->ls_flags);
@@ -457,8 +481,8 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
if (!ls->ls_rsbtbl)
goto out_lsfree;
for (i = 0; i < size; i++) {
- INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
- INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
+ ls->ls_rsbtbl[i].keep.rb_node = NULL;
+ ls->ls_rsbtbl[i].toss.rb_node = NULL;
spin_lock_init(&ls->ls_rsbtbl[i].lock);
}
@@ -525,6 +549,11 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
if (!ls->ls_recover_buf)
goto out_dirfree;
+ ls->ls_slot = 0;
+ ls->ls_num_slots = 0;
+ ls->ls_slots_size = 0;
+ ls->ls_slots = NULL;
+
INIT_LIST_HEAD(&ls->ls_recover_list);
spin_lock_init(&ls->ls_recover_list_lock);
ls->ls_recover_list_count = 0;
@@ -614,8 +643,10 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
return error;
}
-int dlm_new_lockspace(const char *name, int namelen, void **lockspace,
- uint32_t flags, int lvblen)
+int dlm_new_lockspace(const char *name, const char *cluster,
+ uint32_t flags, int lvblen,
+ const struct dlm_lockspace_ops *ops, void *ops_arg,
+ int *ops_result, dlm_lockspace_t **lockspace)
{
int error = 0;
@@ -625,7 +656,8 @@ int dlm_new_lockspace(const char *name, int namelen, void **lockspace,
if (error)
goto out;
- error = new_lockspace(name, namelen, lockspace, flags, lvblen);
+ error = new_lockspace(name, cluster, flags, lvblen, ops, ops_arg,
+ ops_result, lockspace);
if (!error)
ls_count++;
if (error > 0)
@@ -685,7 +717,7 @@ static int lockspace_busy(struct dlm_ls *ls, int force)
static int release_lockspace(struct dlm_ls *ls, int force)
{
struct dlm_rsb *rsb;
- struct list_head *head;
+ struct rb_node *n;
int i, busy, rv;
busy = lockspace_busy(ls, force);
@@ -746,20 +778,15 @@ static int release_lockspace(struct dlm_ls *ls, int force)
*/
for (i = 0; i < ls->ls_rsbtbl_size; i++) {
- head = &ls->ls_rsbtbl[i].list;
- while (!list_empty(head)) {
- rsb = list_entry(head->next, struct dlm_rsb,
- res_hashchain);
-
- list_del(&rsb->res_hashchain);
+ while ((n = rb_first(&ls->ls_rsbtbl[i].keep))) {
+ rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
+ rb_erase(n, &ls->ls_rsbtbl[i].keep);
dlm_free_rsb(rsb);
}
- head = &ls->ls_rsbtbl[i].toss;
- while (!list_empty(head)) {
- rsb = list_entry(head->next, struct dlm_rsb,
- res_hashchain);
- list_del(&rsb->res_hashchain);
+ while ((n = rb_first(&ls->ls_rsbtbl[i].toss))) {
+ rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
+ rb_erase(n, &ls->ls_rsbtbl[i].toss);
dlm_free_rsb(rsb);
}
}
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 990626e7da80..0b3109ee4257 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -281,7 +281,7 @@ static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
} else {
struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
- ipv6_addr_copy(&ret6->sin6_addr, &in6->sin6_addr);
+ ret6->sin6_addr = in6->sin6_addr;
}
return 0;
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index b12532e553f8..862640a36d5c 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
/******************************************************************************
*******************************************************************************
**
-** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,280 @@
#include "config.h"
#include "lowcomms.h"
+int dlm_slots_version(struct dlm_header *h)
+{
+ if ((h->h_version & 0x0000FFFF) < DLM_HEADER_SLOTS)
+ return 0;
+ return 1;
+}
+
+void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
+ struct dlm_member *memb)
+{
+ struct rcom_config *rf = (struct rcom_config *)rc->rc_buf;
+
+ if (!dlm_slots_version(&rc->rc_header))
+ return;
+
+ memb->slot = le16_to_cpu(rf->rf_our_slot);
+ memb->generation = le32_to_cpu(rf->rf_generation);
+}
+
+void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+ struct dlm_slot *slot;
+ struct rcom_slot *ro;
+ int i;
+
+ ro = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
+
+ /* ls_slots array is sparse, but not rcom_slots */
+
+ for (i = 0; i < ls->ls_slots_size; i++) {
+ slot = &ls->ls_slots[i];
+ if (!slot->nodeid)
+ continue;
+ ro->ro_nodeid = cpu_to_le32(slot->nodeid);
+ ro->ro_slot = cpu_to_le16(slot->slot);
+ ro++;
+ }
+}
+
+#define SLOT_DEBUG_LINE 128
+
+static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
+ struct rcom_slot *ro0, struct dlm_slot *array,
+ int array_size)
+{
+ char line[SLOT_DEBUG_LINE];
+ int len = SLOT_DEBUG_LINE - 1;
+ int pos = 0;
+ int ret, i;
+
+ if (!dlm_config.ci_log_debug)
+ return;
+
+ memset(line, 0, sizeof(line));
+
+ if (array) {
+ for (i = 0; i < array_size; i++) {
+ if (!array[i].nodeid)
+ continue;
+
+ ret = snprintf(line + pos, len - pos, " %d:%d",
+ array[i].slot, array[i].nodeid);
+ if (ret >= len - pos)
+ break;
+ pos += ret;
+ }
+ } else if (ro0) {
+ for (i = 0; i < num_slots; i++) {
+ ret = snprintf(line + pos, len - pos, " %d:%d",
+ ro0[i].ro_slot, ro0[i].ro_nodeid);
+ if (ret >= len - pos)
+ break;
+ pos += ret;
+ }
+ }
+
+ log_debug(ls, "generation %u slots %d%s", gen, num_slots, line);
+}
+
+int dlm_slots_copy_in(struct dlm_ls *ls)
+{
+ struct dlm_member *memb;
+ struct dlm_rcom *rc = ls->ls_recover_buf;
+ struct rcom_config *rf = (struct rcom_config *)rc->rc_buf;
+ struct rcom_slot *ro0, *ro;
+ int our_nodeid = dlm_our_nodeid();
+ int i, num_slots;
+ uint32_t gen;
+
+ if (!dlm_slots_version(&rc->rc_header))
+ return -1;
+
+ gen = le32_to_cpu(rf->rf_generation);
+ if (gen <= ls->ls_generation) {
+ log_error(ls, "dlm_slots_copy_in gen %u old %u",
+ gen, ls->ls_generation);
+ }
+ ls->ls_generation = gen;
+
+ num_slots = le16_to_cpu(rf->rf_num_slots);
+ if (!num_slots)
+ return -1;
+
+ ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
+
+ for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
+ ro->ro_nodeid = le32_to_cpu(ro->ro_nodeid);
+ ro->ro_slot = le16_to_cpu(ro->ro_slot);
+ }
+
+ log_debug_slots(ls, gen, num_slots, ro0, NULL, 0);
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
+ if (ro->ro_nodeid != memb->nodeid)
+ continue;
+ memb->slot = ro->ro_slot;
+ memb->slot_prev = memb->slot;
+ break;
+ }
+
+ if (memb->nodeid == our_nodeid) {
+ if (ls->ls_slot && ls->ls_slot != memb->slot) {
+ log_error(ls, "dlm_slots_copy_in our slot "
+ "changed %d %d", ls->ls_slot,
+ memb->slot);
+ return -1;
+ }
+
+ if (!ls->ls_slot)
+ ls->ls_slot = memb->slot;
+ }
+
+ if (!memb->slot) {
+ log_error(ls, "dlm_slots_copy_in nodeid %d no slot",
+ memb->nodeid);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/* for any nodes that do not support slots, we will not have set memb->slot
+ in wait_status_all(), so memb->slot will remain -1, and we will not
+ assign slots or set ls_num_slots here */
+
+int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
+ struct dlm_slot **slots_out, uint32_t *gen_out)
+{
+ struct dlm_member *memb;
+ struct dlm_slot *array;
+ int our_nodeid = dlm_our_nodeid();
+ int array_size, max_slots, i;
+ int need = 0;
+ int max = 0;
+ int num = 0;
+ uint32_t gen = 0;
+
+ /* our own memb struct will have slot -1 gen 0 */
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ if (memb->nodeid == our_nodeid) {
+ memb->slot = ls->ls_slot;
+ memb->generation = ls->ls_generation;
+ break;
+ }
+ }
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ if (memb->generation > gen)
+ gen = memb->generation;
+
+ /* node doesn't support slots */
+
+ if (memb->slot == -1)
+ return -1;
+
+ /* node needs a slot assigned */
+
+ if (!memb->slot)
+ need++;
+
+ /* node has a slot assigned */
+
+ num++;
+
+ if (!max || max < memb->slot)
+ max = memb->slot;
+
+ /* sanity check, once slot is assigned it shouldn't change */
+
+ if (memb->slot_prev && memb->slot && memb->slot_prev != memb->slot) {
+ log_error(ls, "nodeid %d slot changed %d %d",
+ memb->nodeid, memb->slot_prev, memb->slot);
+ return -1;
+ }
+ memb->slot_prev = memb->slot;
+ }
+
+ array_size = max + need;
+
+ array = kzalloc(array_size * sizeof(struct dlm_slot), GFP_NOFS);
+ if (!array)
+ return -ENOMEM;
+
+ num = 0;
+
+ /* fill in slots (offsets) that are used */
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ if (!memb->slot)
+ continue;
+
+ if (memb->slot > array_size) {
+ log_error(ls, "invalid slot number %d", memb->slot);
+ kfree(array);
+ return -1;
+ }
+
+ array[memb->slot - 1].nodeid = memb->nodeid;
+ array[memb->slot - 1].slot = memb->slot;
+ num++;
+ }
+
+ /* assign new slots from unused offsets */
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ if (memb->slot)
+ continue;
+
+ for (i = 0; i < array_size; i++) {
+ if (array[i].nodeid)
+ continue;
+
+ memb->slot = i + 1;
+ memb->slot_prev = memb->slot;
+ array[i].nodeid = memb->nodeid;
+ array[i].slot = memb->slot;
+ num++;
+
+ if (!ls->ls_slot && memb->nodeid == our_nodeid)
+ ls->ls_slot = memb->slot;
+ break;
+ }
+
+ if (!memb->slot) {
+ log_error(ls, "no free slot found");
+ kfree(array);
+ return -1;
+ }
+ }
+
+ gen++;
+
+ log_debug_slots(ls, gen, num, NULL, array, array_size);
+
+ max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) -
+ sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
+
+ if (num > max_slots) {
+ log_error(ls, "num_slots %d exceeds max_slots %d",
+ num, max_slots);
+ kfree(array);
+ return -1;
+ }
+
+ *gen_out = gen;
+ *slots_out = array;
+ *slots_size = array_size;
+ *num_slots = num;
+ return 0;
+}
+
static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
{
struct dlm_member *memb = NULL;
@@ -43,59 +317,51 @@ static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
}
}
-static int dlm_add_member(struct dlm_ls *ls, int nodeid)
+static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node)
{
struct dlm_member *memb;
- int w, error;
+ int error;
memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
if (!memb)
return -ENOMEM;
- w = dlm_node_weight(ls->ls_name, nodeid);
- if (w < 0) {
- kfree(memb);
- return w;
- }
-
- error = dlm_lowcomms_connect_node(nodeid);
+ error = dlm_lowcomms_connect_node(node->nodeid);
if (error < 0) {
kfree(memb);
return error;
}
- memb->nodeid = nodeid;
- memb->weight = w;
+ memb->nodeid = node->nodeid;
+ memb->weight = node->weight;
+ memb->comm_seq = node->comm_seq;
add_ordered_member(ls, memb);
ls->ls_num_nodes++;
return 0;
}
-static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
-{
- list_move(&memb->list, &ls->ls_nodes_gone);
- ls->ls_num_nodes--;
-}
-
-int dlm_is_member(struct dlm_ls *ls, int nodeid)
+static struct dlm_member *find_memb(struct list_head *head, int nodeid)
{
struct dlm_member *memb;
- list_for_each_entry(memb, &ls->ls_nodes, list) {
+ list_for_each_entry(memb, head, list) {
if (memb->nodeid == nodeid)
- return 1;
+ return memb;
}
+ return NULL;
+}
+
+int dlm_is_member(struct dlm_ls *ls, int nodeid)
+{
+ if (find_memb(&ls->ls_nodes, nodeid))
+ return 1;
return 0;
}
int dlm_is_removed(struct dlm_ls *ls, int nodeid)
{
- struct dlm_member *memb;
-
- list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
- if (memb->nodeid == nodeid)
- return 1;
- }
+ if (find_memb(&ls->ls_nodes_gone, nodeid))
+ return 1;
return 0;
}
@@ -176,7 +442,7 @@ static int ping_members(struct dlm_ls *ls)
error = dlm_recovery_stopped(ls);
if (error)
break;
- error = dlm_rcom_status(ls, memb->nodeid);
+ error = dlm_rcom_status(ls, memb->nodeid, 0);
if (error)
break;
}
@@ -186,10 +452,88 @@ static int ping_members(struct dlm_ls *ls)
return error;
}
+static void dlm_lsop_recover_prep(struct dlm_ls *ls)
+{
+ if (!ls->ls_ops || !ls->ls_ops->recover_prep)
+ return;
+ ls->ls_ops->recover_prep(ls->ls_ops_arg);
+}
+
+static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb)
+{
+ struct dlm_slot slot;
+ uint32_t seq;
+ int error;
+
+ if (!ls->ls_ops || !ls->ls_ops->recover_slot)
+ return;
+
+ /* if there is no comms connection with this node
+ or the present comms connection is newer
+ than the one when this member was added, then
+ we consider the node to have failed (versus
+ being removed due to dlm_release_lockspace) */
+
+ error = dlm_comm_seq(memb->nodeid, &seq);
+
+ if (!error && seq == memb->comm_seq)
+ return;
+
+ slot.nodeid = memb->nodeid;
+ slot.slot = memb->slot;
+
+ ls->ls_ops->recover_slot(ls->ls_ops_arg, &slot);
+}
+
+void dlm_lsop_recover_done(struct dlm_ls *ls)
+{
+ struct dlm_member *memb;
+ struct dlm_slot *slots;
+ int i, num;
+
+ if (!ls->ls_ops || !ls->ls_ops->recover_done)
+ return;
+
+ num = ls->ls_num_nodes;
+
+ slots = kzalloc(num * sizeof(struct dlm_slot), GFP_KERNEL);
+ if (!slots)
+ return;
+
+ i = 0;
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ if (i == num) {
+ log_error(ls, "dlm_lsop_recover_done bad num %d", num);
+ goto out;
+ }
+ slots[i].nodeid = memb->nodeid;
+ slots[i].slot = memb->slot;
+ i++;
+ }
+
+ ls->ls_ops->recover_done(ls->ls_ops_arg, slots, num,
+ ls->ls_slot, ls->ls_generation);
+ out:
+ kfree(slots);
+}
+
+static struct dlm_config_node *find_config_node(struct dlm_recover *rv,
+ int nodeid)
+{
+ int i;
+
+ for (i = 0; i < rv->nodes_count; i++) {
+ if (rv->nodes[i].nodeid == nodeid)
+ return &rv->nodes[i];
+ }
+ return NULL;
+}
+
int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
{
struct dlm_member *memb, *safe;
- int i, error, found, pos = 0, neg = 0, low = -1;
+ struct dlm_config_node *node;
+ int i, error, neg = 0, low = -1;
/* previously removed members that we've not finished removing need to
count as a negative change so the "neg" recovery steps will happen */
@@ -202,46 +546,32 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
/* move departed members from ls_nodes to ls_nodes_gone */
list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
- found = 0;
- for (i = 0; i < rv->node_count; i++) {
- if (memb->nodeid == rv->nodeids[i]) {
- found = 1;
- break;
- }
- }
+ node = find_config_node(rv, memb->nodeid);
+ if (node && !node->new)
+ continue;
- if (!found) {
- neg++;
- dlm_remove_member(ls, memb);
+ if (!node) {
log_debug(ls, "remove member %d", memb->nodeid);
+ } else {
+ /* removed and re-added */
+ log_debug(ls, "remove member %d comm_seq %u %u",
+ memb->nodeid, memb->comm_seq, node->comm_seq);
}
- }
-
- /* Add an entry to ls_nodes_gone for members that were removed and
- then added again, so that previous state for these nodes will be
- cleared during recovery. */
-
- for (i = 0; i < rv->new_count; i++) {
- if (!dlm_is_member(ls, rv->new[i]))
- continue;
- log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
- memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
- if (!memb)
- return -ENOMEM;
- memb->nodeid = rv->new[i];
- list_add_tail(&memb->list, &ls->ls_nodes_gone);
neg++;
+ list_move(&memb->list, &ls->ls_nodes_gone);
+ ls->ls_num_nodes--;
+ dlm_lsop_recover_slot(ls, memb);
}
/* add new members to ls_nodes */
- for (i = 0; i < rv->node_count; i++) {
- if (dlm_is_member(ls, rv->nodeids[i]))
+ for (i = 0; i < rv->nodes_count; i++) {
+ node = &rv->nodes[i];
+ if (dlm_is_member(ls, node->nodeid))
continue;
- dlm_add_member(ls, rv->nodeids[i]);
- pos++;
- log_debug(ls, "add member %d", rv->nodeids[i]);
+ dlm_add_member(ls, node);
+ log_debug(ls, "add member %d", node->nodeid);
}
list_for_each_entry(memb, &ls->ls_nodes, list) {
@@ -251,7 +581,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
ls->ls_low_nodeid = low;
make_member_array(ls);
- dlm_set_recover_status(ls, DLM_RS_NODES);
*neg_out = neg;
error = ping_members(ls);
@@ -261,12 +590,8 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
ls->ls_members_result = error;
complete(&ls->ls_members_done);
}
- if (error)
- goto out;
- error = dlm_recover_members_wait(ls);
- out:
- log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error);
+ log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
return error;
}
@@ -327,26 +652,35 @@ int dlm_ls_stop(struct dlm_ls *ls)
*/
dlm_recoverd_suspend(ls);
+
+ spin_lock(&ls->ls_recover_lock);
+ kfree(ls->ls_slots);
+ ls->ls_slots = NULL;
+ ls->ls_num_slots = 0;
+ ls->ls_slots_size = 0;
ls->ls_recover_status = 0;
+ spin_unlock(&ls->ls_recover_lock);
+
dlm_recoverd_resume(ls);
if (!ls->ls_recover_begin)
ls->ls_recover_begin = jiffies;
+
+ dlm_lsop_recover_prep(ls);
return 0;
}
int dlm_ls_start(struct dlm_ls *ls)
{
struct dlm_recover *rv = NULL, *rv_old;
- int *ids = NULL, *new = NULL;
- int error, ids_count = 0, new_count = 0;
+ struct dlm_config_node *nodes;
+ int error, count;
rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
if (!rv)
return -ENOMEM;
- error = dlm_nodeid_list(ls->ls_name, &ids, &ids_count,
- &new, &new_count);
+ error = dlm_config_nodes(ls->ls_name, &nodes, &count);
if (error < 0)
goto fail;
@@ -361,10 +695,8 @@ int dlm_ls_start(struct dlm_ls *ls)
goto fail;
}
- rv->nodeids = ids;
- rv->node_count = ids_count;
- rv->new = new;
- rv->new_count = new_count;
+ rv->nodes = nodes;
+ rv->nodes_count = count;
rv->seq = ++ls->ls_recover_seq;
rv_old = ls->ls_recover_args;
ls->ls_recover_args = rv;
@@ -372,9 +704,8 @@ int dlm_ls_start(struct dlm_ls *ls)
if (rv_old) {
log_error(ls, "unused recovery %llx %d",
- (unsigned long long)rv_old->seq, rv_old->node_count);
- kfree(rv_old->nodeids);
- kfree(rv_old->new);
+ (unsigned long long)rv_old->seq, rv_old->nodes_count);
+ kfree(rv_old->nodes);
kfree(rv_old);
}
@@ -383,8 +714,7 @@ int dlm_ls_start(struct dlm_ls *ls)
fail:
kfree(rv);
- kfree(ids);
- kfree(new);
+ kfree(nodes);
return error;
}
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
index 7a26fca1e0b5..3deb70661c69 100644
--- a/fs/dlm/member.h
+++ b/fs/dlm/member.h
@@ -1,7 +1,7 @@
/******************************************************************************
*******************************************************************************
**
-** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -20,6 +20,14 @@ void dlm_clear_members_gone(struct dlm_ls *ls);
int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
int dlm_is_removed(struct dlm_ls *ls, int nodeid);
int dlm_is_member(struct dlm_ls *ls, int nodeid);
+int dlm_slots_version(struct dlm_header *h);
+void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
+ struct dlm_member *memb);
+void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_slots_copy_in(struct dlm_ls *ls);
+int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
+ struct dlm_slot **slots_out, uint32_t *gen_out);
+void dlm_lsop_recover_done(struct dlm_ls *ls);
#endif /* __MEMBER_DOT_H__ */
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index f10a50f24e8f..ac5c616c9696 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -23,6 +23,7 @@
#include "memory.h"
#include "lock.h"
#include "util.h"
+#include "member.h"
static int rcom_response(struct dlm_ls *ls)
@@ -72,20 +73,30 @@ static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
dlm_lowcomms_commit_buffer(mh);
}
+static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs,
+ uint32_t flags)
+{
+ rs->rs_flags = cpu_to_le32(flags);
+}
+
/* When replying to a status request, a node also sends back its
configuration values. The requesting node then checks that the remote
node is configured the same way as itself. */
-static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
+static void set_rcom_config(struct dlm_ls *ls, struct rcom_config *rf,
+ uint32_t num_slots)
{
rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen);
rf->rf_lsflags = cpu_to_le32(ls->ls_exflags);
+
+ rf->rf_our_slot = cpu_to_le16(ls->ls_slot);
+ rf->rf_num_slots = cpu_to_le16(num_slots);
+ rf->rf_generation = cpu_to_le32(ls->ls_generation);
}
-static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
+static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
{
struct rcom_config *rf = (struct rcom_config *) rc->rc_buf;
- size_t conf_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_config);
if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) {
log_error(ls, "version mismatch: %x nodeid %d: %x",
@@ -94,12 +105,6 @@ static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
return -EPROTO;
}
- if (rc->rc_header.h_length < conf_size) {
- log_error(ls, "config too short: %d nodeid %d",
- rc->rc_header.h_length, nodeid);
- return -EPROTO;
- }
-
if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen ||
le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) {
log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
@@ -127,7 +132,18 @@ static void disallow_sync_reply(struct dlm_ls *ls)
spin_unlock(&ls->ls_rcom_spin);
}
-int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
+/*
+ * low nodeid gathers one slot value at a time from each node.
+ * it sets need_slots=0, and saves rf_our_slot returned from each
+ * rcom_config.
+ *
+ * other nodes gather all slot values at once from the low nodeid.
+ * they set need_slots=1, and ignore the rf_our_slot returned from each
+ * rcom_config. they use the rf_num_slots returned from the low
+ * node's rcom_config.
+ */
+
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
@@ -141,10 +157,13 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
goto out;
}
- error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
+ error = create_rcom(ls, nodeid, DLM_RCOM_STATUS,
+ sizeof(struct rcom_status), &rc, &mh);
if (error)
goto out;
+ set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags);
+
allow_sync_reply(ls, &rc->rc_id);
memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
@@ -161,8 +180,11 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
/* we pretend the remote lockspace exists with 0 status */
log_debug(ls, "remote node %d not ready", nodeid);
rc->rc_result = 0;
- } else
- error = check_config(ls, rc, nodeid);
+ error = 0;
+ } else {
+ error = check_rcom_config(ls, rc, nodeid);
+ }
+
/* the caller looks at rc_result for the remote recovery status */
out:
return error;
@@ -172,17 +194,60 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
- int error, nodeid = rc_in->rc_header.h_nodeid;
+ struct rcom_status *rs;
+ uint32_t status;
+ int nodeid = rc_in->rc_header.h_nodeid;
+ int len = sizeof(struct rcom_config);
+ int num_slots = 0;
+ int error;
+
+ if (!dlm_slots_version(&rc_in->rc_header)) {
+ status = dlm_recover_status(ls);
+ goto do_create;
+ }
+
+ rs = (struct rcom_status *)rc_in->rc_buf;
+ if (!(rs->rs_flags & DLM_RSF_NEED_SLOTS)) {
+ status = dlm_recover_status(ls);
+ goto do_create;
+ }
+
+ spin_lock(&ls->ls_recover_lock);
+ status = ls->ls_recover_status;
+ num_slots = ls->ls_num_slots;
+ spin_unlock(&ls->ls_recover_lock);
+ len += num_slots * sizeof(struct rcom_slot);
+
+ do_create:
error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
- sizeof(struct rcom_config), &rc, &mh);
+ len, &rc, &mh);
if (error)
return;
+
rc->rc_id = rc_in->rc_id;
rc->rc_seq_reply = rc_in->rc_seq;
- rc->rc_result = dlm_recover_status(ls);
- make_config(ls, (struct rcom_config *) rc->rc_buf);
+ rc->rc_result = status;
+
+ set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots);
+
+ if (!num_slots)
+ goto do_send;
+
+ spin_lock(&ls->ls_recover_lock);
+ if (ls->ls_num_slots != num_slots) {
+ spin_unlock(&ls->ls_recover_lock);
+ log_debug(ls, "receive_rcom_status num_slots %d to %d",
+ num_slots, ls->ls_num_slots);
+ rc->rc_result = 0;
+ set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, 0);
+ goto do_send;
+ }
+
+ dlm_slots_copy_out(ls, rc);
+ spin_unlock(&ls->ls_recover_lock);
+ do_send:
send_rcom(ls, mh, rc);
}
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
index b09abd29ba38..206723ab744d 100644
--- a/fs/dlm/rcom.h
+++ b/fs/dlm/rcom.h
@@ -14,7 +14,7 @@
#ifndef __RCOM_DOT_H__
#define __RCOM_DOT_H__
-int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags);
int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 14638235f7b2..34d5adf1fce7 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -85,14 +85,20 @@ uint32_t dlm_recover_status(struct dlm_ls *ls)
return status;
}
+static void _set_recover_status(struct dlm_ls *ls, uint32_t status)
+{
+ ls->ls_recover_status |= status;
+}
+
void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
{
spin_lock(&ls->ls_recover_lock);
- ls->ls_recover_status |= status;
+ _set_recover_status(ls, status);
spin_unlock(&ls->ls_recover_lock);
}
-static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
+static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
+ int save_slots)
{
struct dlm_rcom *rc = ls->ls_recover_buf;
struct dlm_member *memb;
@@ -106,10 +112,13 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
goto out;
}
- error = dlm_rcom_status(ls, memb->nodeid);
+ error = dlm_rcom_status(ls, memb->nodeid, 0);
if (error)
goto out;
+ if (save_slots)
+ dlm_slot_save(ls, rc, memb);
+
if (rc->rc_result & wait_status)
break;
if (delay < 1000)
@@ -121,7 +130,8 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
return error;
}
-static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
+static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status,
+ uint32_t status_flags)
{
struct dlm_rcom *rc = ls->ls_recover_buf;
int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
@@ -132,7 +142,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
goto out;
}
- error = dlm_rcom_status(ls, nodeid);
+ error = dlm_rcom_status(ls, nodeid, status_flags);
if (error)
break;
@@ -152,18 +162,56 @@ static int wait_status(struct dlm_ls *ls, uint32_t status)
int error;
if (ls->ls_low_nodeid == dlm_our_nodeid()) {
- error = wait_status_all(ls, status);
+ error = wait_status_all(ls, status, 0);
if (!error)
dlm_set_recover_status(ls, status_all);
} else
- error = wait_status_low(ls, status_all);
+ error = wait_status_low(ls, status_all, 0);
return error;
}
int dlm_recover_members_wait(struct dlm_ls *ls)
{
- return wait_status(ls, DLM_RS_NODES);
+ struct dlm_member *memb;
+ struct dlm_slot *slots;
+ int num_slots, slots_size;
+ int error, rv;
+ uint32_t gen;
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ memb->slot = -1;
+ memb->generation = 0;
+ }
+
+ if (ls->ls_low_nodeid == dlm_our_nodeid()) {
+ error = wait_status_all(ls, DLM_RS_NODES, 1);
+ if (error)
+ goto out;
+
+ /* slots array is sparse, slots_size may be > num_slots */
+
+ rv = dlm_slots_assign(ls, &num_slots, &slots_size, &slots, &gen);
+ if (!rv) {
+ spin_lock(&ls->ls_recover_lock);
+ _set_recover_status(ls, DLM_RS_NODES_ALL);
+ ls->ls_num_slots = num_slots;
+ ls->ls_slots_size = slots_size;
+ ls->ls_slots = slots;
+ ls->ls_generation = gen;
+ spin_unlock(&ls->ls_recover_lock);
+ } else {
+ dlm_set_recover_status(ls, DLM_RS_NODES_ALL);
+ }
+ } else {
+ error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS);
+ if (error)
+ goto out;
+
+ dlm_slots_copy_in(ls);
+ }
+ out:
+ return error;
}
int dlm_recover_directory_wait(struct dlm_ls *ls)
@@ -542,8 +590,6 @@ int dlm_recover_locks(struct dlm_ls *ls)
out:
if (error)
recover_list_clear(ls);
- else
- dlm_set_recover_status(ls, DLM_RS_LOCKS);
return error;
}
@@ -715,6 +761,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
int dlm_create_root_list(struct dlm_ls *ls)
{
+ struct rb_node *n;
struct dlm_rsb *r;
int i, error = 0;
@@ -727,7 +774,8 @@ int dlm_create_root_list(struct dlm_ls *ls)
for (i = 0; i < ls->ls_rsbtbl_size; i++) {
spin_lock(&ls->ls_rsbtbl[i].lock);
- list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
+ for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) {
+ r = rb_entry(n, struct dlm_rsb, res_hashnode);
list_add(&r->res_root_list, &ls->ls_root_list);
dlm_hold_rsb(r);
}
@@ -741,7 +789,8 @@ int dlm_create_root_list(struct dlm_ls *ls)
continue;
}
- list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) {
+ for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = rb_next(n)) {
+ r = rb_entry(n, struct dlm_rsb, res_hashnode);
list_add(&r->res_root_list, &ls->ls_root_list);
dlm_hold_rsb(r);
}
@@ -771,16 +820,18 @@ void dlm_release_root_list(struct dlm_ls *ls)
void dlm_clear_toss_list(struct dlm_ls *ls)
{
- struct dlm_rsb *r, *safe;
+ struct rb_node *n, *next;
+ struct dlm_rsb *rsb;
int i;
for (i = 0; i < ls->ls_rsbtbl_size; i++) {
spin_lock(&ls->ls_rsbtbl[i].lock);
- list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
- res_hashchain) {
- if (dlm_no_directory(ls) || !is_master(r)) {
- list_del(&r->res_hashchain);
- dlm_free_rsb(r);
+ for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = next) {
+ next = rb_next(n);;
+ rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
+ if (dlm_no_directory(ls) || !is_master(rsb)) {
+ rb_erase(n, &ls->ls_rsbtbl[i].toss);
+ dlm_free_rsb(rsb);
}
}
spin_unlock(&ls->ls_rsbtbl[i].lock);
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 774da3cf92c6..3780caf7ae0c 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -54,7 +54,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
unsigned long start;
int error, neg = 0;
- log_debug(ls, "recover %llx", (unsigned long long)rv->seq);
+ log_debug(ls, "dlm_recover %llx", (unsigned long long)rv->seq);
mutex_lock(&ls->ls_recoverd_active);
@@ -76,14 +76,22 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
/*
* Add or remove nodes from the lockspace's ls_nodes list.
- * Also waits for all nodes to complete dlm_recover_members.
*/
error = dlm_recover_members(ls, rv, &neg);
if (error) {
- log_debug(ls, "recover_members failed %d", error);
+ log_debug(ls, "dlm_recover_members error %d", error);
goto fail;
}
+
+ dlm_set_recover_status(ls, DLM_RS_NODES);
+
+ error = dlm_recover_members_wait(ls);
+ if (error) {
+ log_debug(ls, "dlm_recover_members_wait error %d", error);
+ goto fail;
+ }
+
start = jiffies;
/*
@@ -93,17 +101,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_directory(ls);
if (error) {
- log_debug(ls, "recover_directory failed %d", error);
+ log_debug(ls, "dlm_recover_directory error %d", error);
goto fail;
}
- /*
- * Wait for all nodes to complete directory rebuild.
- */
+ dlm_set_recover_status(ls, DLM_RS_DIR);
error = dlm_recover_directory_wait(ls);
if (error) {
- log_debug(ls, "recover_directory_wait failed %d", error);
+ log_debug(ls, "dlm_recover_directory_wait error %d", error);
goto fail;
}
@@ -133,7 +139,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_masters(ls);
if (error) {
- log_debug(ls, "recover_masters failed %d", error);
+ log_debug(ls, "dlm_recover_masters error %d", error);
goto fail;
}
@@ -143,13 +149,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_locks(ls);
if (error) {
- log_debug(ls, "recover_locks failed %d", error);
+ log_debug(ls, "dlm_recover_locks error %d", error);
goto fail;
}
+ dlm_set_recover_status(ls, DLM_RS_LOCKS);
+
error = dlm_recover_locks_wait(ls);
if (error) {
- log_debug(ls, "recover_locks_wait failed %d", error);
+ log_debug(ls, "dlm_recover_locks_wait error %d", error);
goto fail;
}
@@ -170,7 +178,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_locks_wait(ls);
if (error) {
- log_debug(ls, "recover_locks_wait failed %d", error);
+ log_debug(ls, "dlm_recover_locks_wait error %d", error);
goto fail;
}
}
@@ -186,9 +194,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
dlm_purge_requestqueue(ls);
dlm_set_recover_status(ls, DLM_RS_DONE);
+
error = dlm_recover_done_wait(ls);
if (error) {
- log_debug(ls, "recover_done_wait failed %d", error);
+ log_debug(ls, "dlm_recover_done_wait error %d", error);
goto fail;
}
@@ -200,34 +209,35 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = enable_locking(ls, rv->seq);
if (error) {
- log_debug(ls, "enable_locking failed %d", error);
+ log_debug(ls, "enable_locking error %d", error);
goto fail;
}
error = dlm_process_requestqueue(ls);
if (error) {
- log_debug(ls, "process_requestqueue failed %d", error);
+ log_debug(ls, "dlm_process_requestqueue error %d", error);
goto fail;
}
error = dlm_recover_waiters_post(ls);
if (error) {
- log_debug(ls, "recover_waiters_post failed %d", error);
+ log_debug(ls, "dlm_recover_waiters_post error %d", error);
goto fail;
}
dlm_grant_after_purge(ls);
- log_debug(ls, "recover %llx done: %u ms",
- (unsigned long long)rv->seq,
+ log_debug(ls, "dlm_recover %llx generation %u done: %u ms",
+ (unsigned long long)rv->seq, ls->ls_generation,
jiffies_to_msecs(jiffies - start));
mutex_unlock(&ls->ls_recoverd_active);
+ dlm_lsop_recover_done(ls);
return 0;
fail:
dlm_release_root_list(ls);
- log_debug(ls, "recover %llx error %d",
+ log_debug(ls, "dlm_recover %llx error %d",
(unsigned long long)rv->seq, error);
mutex_unlock(&ls->ls_recoverd_active);
return error;
@@ -250,8 +260,7 @@ static void do_ls_recovery(struct dlm_ls *ls)
if (rv) {
ls_recover(ls, rv);
- kfree(rv->nodeids);
- kfree(rv->new);
+ kfree(rv->nodes);
kfree(rv);
}
}
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index d8ea60756403..eb4ed9ba3098 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -392,8 +392,9 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- error = dlm_new_lockspace(params->name, strlen(params->name),
- &lockspace, params->flags, DLM_USER_LVB_LEN);
+ error = dlm_new_lockspace(params->name, NULL, params->flags,
+ DLM_USER_LVB_LEN, NULL, NULL, NULL,
+ &lockspace);
if (error)
return error;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 2a834255c75d..ea9931281557 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -417,17 +417,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
(unsigned long long)(extent_base + extent_offset), rc);
goto out;
}
- if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Encrypting extent "
- "with iv:\n");
- ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
- ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
- "encryption:\n");
- ecryptfs_dump_hex((char *)
- (page_address(page)
- + (extent_offset * crypt_stat->extent_size)),
- 8);
- }
rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0,
page, (extent_offset
* crypt_stat->extent_size),
@@ -440,14 +429,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
goto out;
}
rc = 0;
- if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
- "rc = [%d]\n",
- (unsigned long long)(extent_base + extent_offset), rc);
- ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
- "encryption:\n");
- ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
- }
out:
return rc;
}
@@ -543,17 +524,6 @@ static int ecryptfs_decrypt_extent(struct page *page,
(unsigned long long)(extent_base + extent_offset), rc);
goto out;
}
- if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Decrypting extent "
- "with iv:\n");
- ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
- ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
- "decryption:\n");
- ecryptfs_dump_hex((char *)
- (page_address(enc_extent_page)
- + (extent_offset * crypt_stat->extent_size)),
- 8);
- }
rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
(extent_offset
* crypt_stat->extent_size),
@@ -567,16 +537,6 @@ static int ecryptfs_decrypt_extent(struct page *page,
goto out;
}
rc = 0;
- if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
- "rc = [%d]\n",
- (unsigned long long)(extent_base + extent_offset), rc);
- ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
- "decryption:\n");
- ecryptfs_dump_hex((char *)(page_address(page)
- + (extent_offset
- * crypt_stat->extent_size)), 8);
- }
out:
return rc;
}
@@ -1590,8 +1550,8 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
*/
int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
{
- int rc = 0;
- char *page_virt = NULL;
+ int rc;
+ char *page_virt;
struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
struct ecryptfs_crypt_stat *crypt_stat =
&ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
@@ -1616,11 +1576,13 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
ecryptfs_dentry,
ECRYPTFS_VALIDATE_HEADER_SIZE);
if (rc) {
+ /* metadata is not in the file header, so try xattrs */
memset(page_virt, 0, PAGE_CACHE_SIZE);
rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
if (rc) {
printk(KERN_DEBUG "Valid eCryptfs headers not found in "
- "file header region or xattr region\n");
+ "file header region or xattr region, inode %lu\n",
+ ecryptfs_inode->i_ino);
rc = -EINVAL;
goto out;
}
@@ -1629,7 +1591,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
ECRYPTFS_DONT_VALIDATE_HEADER_SIZE);
if (rc) {
printk(KERN_DEBUG "Valid eCryptfs headers not found in "
- "file xattr region either\n");
+ "file xattr region either, inode %lu\n",
+ ecryptfs_inode->i_ino);
rc = -EINVAL;
}
if (crypt_stat->mount_crypt_stat->flags
@@ -1640,7 +1603,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
"crypto metadata only in the extended attribute "
"region, but eCryptfs was mounted without "
"xattr support enabled. eCryptfs will not treat "
- "this like an encrypted file.\n");
+ "this like an encrypted file, inode %lu\n",
+ ecryptfs_inode->i_ino);
rc = -EINVAL;
}
}
@@ -2026,6 +1990,17 @@ out:
return;
}
+static size_t ecryptfs_max_decoded_size(size_t encoded_size)
+{
+ /* Not exact; conservatively long. Every block of 4
+ * encoded characters decodes into a block of 3
+ * decoded characters. This segment of code provides
+ * the caller with the maximum amount of allocated
+ * space that @dst will need to point to in a
+ * subsequent call. */
+ return ((encoded_size + 1) * 3) / 4;
+}
+
/**
* ecryptfs_decode_from_filename
* @dst: If NULL, this function only sets @dst_size and returns. If
@@ -2044,13 +2019,7 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
size_t dst_byte_offset = 0;
if (dst == NULL) {
- /* Not exact; conservatively long. Every block of 4
- * encoded characters decodes into a block of 3
- * decoded characters. This segment of code provides
- * the caller with the maximum amount of allocated
- * space that @dst will need to point to in a
- * subsequent call. */
- (*dst_size) = (((src_size + 1) * 3) / 4);
+ (*dst_size) = ecryptfs_max_decoded_size(src_size);
goto out;
}
while (src_byte_offset < src_size) {
@@ -2275,3 +2244,52 @@ out_free:
out:
return rc;
}
+
+#define ENC_NAME_MAX_BLOCKLEN_8_OR_16 143
+
+int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+ struct blkcipher_desc desc;
+ struct mutex *tfm_mutex;
+ size_t cipher_blocksize;
+ int rc;
+
+ if (!(mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) {
+ (*namelen) = lower_namelen;
+ return 0;
+ }
+
+ rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex,
+ mount_crypt_stat->global_default_fn_cipher_name);
+ if (unlikely(rc)) {
+ (*namelen) = 0;
+ return rc;
+ }
+
+ mutex_lock(tfm_mutex);
+ cipher_blocksize = crypto_blkcipher_blocksize(desc.tfm);
+ mutex_unlock(tfm_mutex);
+
+ /* Return an exact amount for the common cases */
+ if (lower_namelen == NAME_MAX
+ && (cipher_blocksize == 8 || cipher_blocksize == 16)) {
+ (*namelen) = ENC_NAME_MAX_BLOCKLEN_8_OR_16;
+ return 0;
+ }
+
+ /* Return a safe estimate for the uncommon cases */
+ (*namelen) = lower_namelen;
+ (*namelen) -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+ /* Since this is the max decoded size, subtract 1 "decoded block" len */
+ (*namelen) = ecryptfs_max_decoded_size(*namelen) - 3;
+ (*namelen) -= ECRYPTFS_TAG_70_MAX_METADATA_SIZE;
+ (*namelen) -= ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES;
+ /* Worst case is that the filename is padded nearly a full block size */
+ (*namelen) -= cipher_blocksize - 1;
+
+ if ((*namelen) < 0)
+ (*namelen) = 0;
+
+ return 0;
+}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a9f29b12fbf2..867b64c5d84f 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -151,12 +151,21 @@ ecryptfs_get_key_payload_data(struct key *key)
* dentry name */
#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
* metadata */
+#define ECRYPTFS_MIN_PKT_LEN_SIZE 1 /* Min size to specify packet length */
+#define ECRYPTFS_MAX_PKT_LEN_SIZE 2 /* Pass at least this many bytes to
+ * ecryptfs_parse_packet_length() and
+ * ecryptfs_write_packet_length()
+ */
/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
* ECRYPTFS_MAX_IV_BYTES */
#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
#define MD5_DIGEST_SIZE 16
#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
+#define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \
+ + ECRYPTFS_SIG_SIZE + 1 + 1)
+#define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \
+ + ECRYPTFS_SIG_SIZE + 1 + 1)
#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED."
#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23
#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED."
@@ -696,6 +705,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
size_t *packet_size,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
char *data, size_t max_packet_size);
+int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
loff_t offset);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 32f90a3ae63e..ab35b113003b 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -144,24 +144,6 @@ static int ecryptfs_interpose(struct dentry *lower_dentry,
}
/**
- * ecryptfs_create_underlying_file
- * @lower_dir_inode: inode of the parent in the lower fs of the new file
- * @dentry: New file's dentry
- * @mode: The mode of the new file
- *
- * Creates the file in the lower file system.
- *
- * Returns zero on success; non-zero on error condition
- */
-static int
-ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
- struct dentry *dentry, int mode)
-{
- struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- return vfs_create(lower_dir_inode, lower_dentry, mode, NULL);
-}
-
-/**
* ecryptfs_do_create
* @directory_inode: inode of the new file's dentry's parent in ecryptfs
* @ecryptfs_dentry: New file's dentry in ecryptfs
@@ -176,7 +158,7 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
*/
static struct inode *
ecryptfs_do_create(struct inode *directory_inode,
- struct dentry *ecryptfs_dentry, int mode)
+ struct dentry *ecryptfs_dentry, umode_t mode)
{
int rc;
struct dentry *lower_dentry;
@@ -191,8 +173,7 @@ ecryptfs_do_create(struct inode *directory_inode,
inode = ERR_CAST(lower_dir_dentry);
goto out;
}
- rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode,
- ecryptfs_dentry, mode);
+ rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, NULL);
if (rc) {
printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
"rc = [%d]\n", __func__, rc);
@@ -267,7 +248,7 @@ out:
*/
static int
ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
- int mode, struct nameidata *nd)
+ umode_t mode, struct nameidata *nd)
{
struct inode *ecryptfs_inode;
int rc;
@@ -559,7 +540,7 @@ out_lock:
return rc;
}
-static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int rc;
struct dentry *lower_dentry;
@@ -607,7 +588,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
}
static int
-ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
int rc;
struct dentry *lower_dentry;
@@ -841,18 +822,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
size_t num_zeros = (PAGE_CACHE_SIZE
- (ia->ia_size & ~PAGE_CACHE_MASK));
-
- /*
- * XXX(truncate) this should really happen at the begginning
- * of ->setattr. But the code is too messy to that as part
- * of a larger patch. ecryptfs is also totally missing out
- * on the inode_change_ok check at the beginning of
- * ->setattr while would include this.
- */
- rc = inode_newsize_ok(inode, ia->ia_size);
- if (rc)
- goto out;
-
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
truncate_setsize(inode, ia->ia_size);
lower_ia->ia_size = ia->ia_size;
@@ -902,6 +871,28 @@ out:
return rc;
}
+static int ecryptfs_inode_newsize_ok(struct inode *inode, loff_t offset)
+{
+ struct ecryptfs_crypt_stat *crypt_stat;
+ loff_t lower_oldsize, lower_newsize;
+
+ crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+ lower_oldsize = upper_size_to_lower_size(crypt_stat,
+ i_size_read(inode));
+ lower_newsize = upper_size_to_lower_size(crypt_stat, offset);
+ if (lower_newsize > lower_oldsize) {
+ /*
+ * The eCryptfs inode and the new *lower* size are mixed here
+ * because we may not have the lower i_mutex held and/or it may
+ * not be appropriate to call inode_newsize_ok() with inodes
+ * from other filesystems.
+ */
+ return inode_newsize_ok(inode, lower_newsize);
+ }
+
+ return 0;
+}
+
/**
* ecryptfs_truncate
* @dentry: The ecryptfs layer dentry
@@ -918,6 +909,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
struct iattr lower_ia = { .ia_valid = 0 };
int rc;
+ rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length);
+ if (rc)
+ return rc;
+
rc = truncate_upper(dentry, &ia, &lower_ia);
if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
@@ -997,6 +992,16 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
}
}
mutex_unlock(&crypt_stat->cs_mutex);
+
+ rc = inode_change_ok(inode, ia);
+ if (rc)
+ goto out;
+ if (ia->ia_valid & ATTR_SIZE) {
+ rc = ecryptfs_inode_newsize_ok(inode, ia->ia_size);
+ if (rc)
+ goto out;
+ }
+
if (S_ISREG(inode->i_mode)) {
rc = filemap_write_and_wait(inode->i_mapping);
if (rc)
@@ -1080,6 +1085,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
}
rc = vfs_setxattr(lower_dentry, name, value, size, flags);
+ if (!rc)
+ fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode);
out:
return rc;
}
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index ac1ad48c2376..2333203a120b 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -109,7 +109,7 @@ int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
(*size) += ((unsigned char)(data[1]) + 192);
(*length_size) = 2;
} else if (data[0] == 255) {
- /* Five-byte length; we're not supposed to see this */
+ /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */
ecryptfs_printk(KERN_ERR, "Five-byte packet length not "
"supported\n");
rc = -EINVAL;
@@ -126,7 +126,7 @@ out:
/**
* ecryptfs_write_packet_length
* @dest: The byte array target into which to write the length. Must
- * have at least 5 bytes allocated.
+ * have at least ECRYPTFS_MAX_PKT_LEN_SIZE bytes allocated.
* @size: The length to write.
* @packet_size_length: The number of bytes used to encode the packet
* length is written to this address.
@@ -146,6 +146,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
dest[1] = ((size - 192) % 256);
(*packet_size_length) = 2;
} else {
+ /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */
rc = -EINVAL;
ecryptfs_printk(KERN_WARNING,
"Unsupported packet size: [%zd]\n", size);
@@ -678,10 +679,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
* Octets N3-N4: Block-aligned encrypted filename
* - Consists of a minimum number of random characters, a \0
* separator, and then the filename */
- s->max_packet_size = (1 /* Tag 70 identifier */
- + 3 /* Max Tag 70 packet size */
- + ECRYPTFS_SIG_SIZE /* FNEK sig */
- + 1 /* Cipher identifier */
+ s->max_packet_size = (ECRYPTFS_TAG_70_MAX_METADATA_SIZE
+ s->block_aligned_filename_size);
if (dest == NULL) {
(*packet_size) = s->max_packet_size;
@@ -933,10 +931,10 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
goto out;
}
s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
- if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) {
+ if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) {
printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
"at least [%d]\n", __func__, max_packet_size,
- (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1));
+ ECRYPTFS_TAG_70_MIN_METADATA_SIZE);
rc = -EINVAL;
goto out;
}
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 940a82e63dc3..3a06f4043df4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -218,6 +218,29 @@ out_unlock:
return rc;
}
+/*
+ * miscdevfs packet format:
+ * Octet 0: Type
+ * Octets 1-4: network byte order msg_ctx->counter
+ * Octets 5-N0: Size of struct ecryptfs_message to follow
+ * Octets N0-N1: struct ecryptfs_message (including data)
+ *
+ * Octets 5-N1 not written if the packet type does not include a message
+ */
+#define PKT_TYPE_SIZE 1
+#define PKT_CTR_SIZE 4
+#define MIN_NON_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE)
+#define MIN_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE \
+ + ECRYPTFS_MIN_PKT_LEN_SIZE)
+/* 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES comes from tag 65 packet format */
+#define MAX_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE \
+ + ECRYPTFS_MAX_PKT_LEN_SIZE \
+ + sizeof(struct ecryptfs_message) \
+ + 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES)
+#define PKT_TYPE_OFFSET 0
+#define PKT_CTR_OFFSET PKT_TYPE_SIZE
+#define PKT_LEN_OFFSET (PKT_TYPE_SIZE + PKT_CTR_SIZE)
+
/**
* ecryptfs_miscdev_read - format and send message from queue
* @file: fs/ecryptfs/euid miscdevfs handle (ignored)
@@ -237,7 +260,7 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
struct ecryptfs_daemon *daemon;
struct ecryptfs_msg_ctx *msg_ctx;
size_t packet_length_size;
- char packet_length[3];
+ char packet_length[ECRYPTFS_MAX_PKT_LEN_SIZE];
size_t i;
size_t total_length;
uid_t euid = current_euid();
@@ -305,15 +328,8 @@ check_list:
packet_length_size = 0;
msg_ctx->msg_size = 0;
}
- /* miscdevfs packet format:
- * Octet 0: Type
- * Octets 1-4: network byte order msg_ctx->counter
- * Octets 5-N0: Size of struct ecryptfs_message to follow
- * Octets N0-N1: struct ecryptfs_message (including data)
- *
- * Octets 5-N1 not written if the packet type does not
- * include a message */
- total_length = (1 + 4 + packet_length_size + msg_ctx->msg_size);
+ total_length = (PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_length_size
+ + msg_ctx->msg_size);
if (count < total_length) {
rc = 0;
printk(KERN_WARNING "%s: Only given user buffer of "
@@ -324,9 +340,10 @@ check_list:
rc = -EFAULT;
if (put_user(msg_ctx->type, buf))
goto out_unlock_msg_ctx;
- if (put_user(cpu_to_be32(msg_ctx->counter), (__be32 __user *)(buf + 1)))
+ if (put_user(cpu_to_be32(msg_ctx->counter),
+ (__be32 __user *)(&buf[PKT_CTR_OFFSET])))
goto out_unlock_msg_ctx;
- i = 5;
+ i = PKT_TYPE_SIZE + PKT_CTR_SIZE;
if (msg_ctx->msg) {
if (copy_to_user(&buf[i], packet_length, packet_length_size))
goto out_unlock_msg_ctx;
@@ -391,12 +408,6 @@ out:
* @count: Amount of data in @buf
* @ppos: Pointer to offset in file (ignored)
*
- * miscdevfs packet format:
- * Octet 0: Type
- * Octets 1-4: network byte order msg_ctx->counter (0's for non-response)
- * Octets 5-N0: Size of struct ecryptfs_message to follow
- * Octets N0-N1: struct ecryptfs_message (including data)
- *
* Returns the number of bytes read from @buf
*/
static ssize_t
@@ -405,60 +416,78 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
{
__be32 counter_nbo;
u32 seq;
- size_t packet_size, packet_size_length, i;
- ssize_t sz = 0;
+ size_t packet_size, packet_size_length;
char *data;
uid_t euid = current_euid();
- int rc;
+ unsigned char packet_size_peek[ECRYPTFS_MAX_PKT_LEN_SIZE];
+ ssize_t rc;
- if (count == 0)
- goto out;
+ if (count == 0) {
+ return 0;
+ } else if (count == MIN_NON_MSG_PKT_SIZE) {
+ /* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */
+ goto memdup;
+ } else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) {
+ printk(KERN_WARNING "%s: Acceptable packet size range is "
+ "[%d-%zu], but amount of data written is [%zu].",
+ __func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(packet_size_peek, &buf[PKT_LEN_OFFSET],
+ sizeof(packet_size_peek))) {
+ printk(KERN_WARNING "%s: Error while inspecting packet size\n",
+ __func__);
+ return -EFAULT;
+ }
+ rc = ecryptfs_parse_packet_length(packet_size_peek, &packet_size,
+ &packet_size_length);
+ if (rc) {
+ printk(KERN_WARNING "%s: Error parsing packet length; "
+ "rc = [%zd]\n", __func__, rc);
+ return rc;
+ }
+
+ if ((PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_size_length + packet_size)
+ != count) {
+ printk(KERN_WARNING "%s: Invalid packet size [%zu]\n", __func__,
+ packet_size);
+ return -EINVAL;
+ }
+
+memdup:
data = memdup_user(buf, count);
if (IS_ERR(data)) {
printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",
__func__, PTR_ERR(data));
- goto out;
+ return PTR_ERR(data);
}
- sz = count;
- i = 0;
- switch (data[i++]) {
+ switch (data[PKT_TYPE_OFFSET]) {
case ECRYPTFS_MSG_RESPONSE:
- if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) {
+ if (count < (MIN_MSG_PKT_SIZE
+ + sizeof(struct ecryptfs_message))) {
printk(KERN_WARNING "%s: Minimum acceptable packet "
"size is [%zd], but amount of data written is "
"only [%zd]. Discarding response packet.\n",
__func__,
- (1 + 4 + 1 + sizeof(struct ecryptfs_message)),
- count);
+ (MIN_MSG_PKT_SIZE
+ + sizeof(struct ecryptfs_message)), count);
+ rc = -EINVAL;
goto out_free;
}
- memcpy(&counter_nbo, &data[i], 4);
+ memcpy(&counter_nbo, &data[PKT_CTR_OFFSET], PKT_CTR_SIZE);
seq = be32_to_cpu(counter_nbo);
- i += 4;
- rc = ecryptfs_parse_packet_length(&data[i], &packet_size,
- &packet_size_length);
+ rc = ecryptfs_miscdev_response(
+ &data[PKT_LEN_OFFSET + packet_size_length],
+ packet_size, euid, current_user_ns(),
+ task_pid(current), seq);
if (rc) {
- printk(KERN_WARNING "%s: Error parsing packet length; "
- "rc = [%d]\n", __func__, rc);
- goto out_free;
- }
- i += packet_size_length;
- if ((1 + 4 + packet_size_length + packet_size) != count) {
- printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])"
- " + packet_size([%zd]))([%zd]) != "
- "count([%zd]). Invalid packet format.\n",
- __func__, packet_size_length, packet_size,
- (1 + packet_size_length + packet_size), count);
- goto out_free;
- }
- rc = ecryptfs_miscdev_response(&data[i], packet_size,
- euid, current_user_ns(),
- task_pid(current), seq);
- if (rc)
printk(KERN_WARNING "%s: Failed to deliver miscdev "
- "response to requesting operation; rc = [%d]\n",
+ "response to requesting operation; rc = [%zd]\n",
__func__, rc);
+ goto out_free;
+ }
break;
case ECRYPTFS_MSG_HELO:
case ECRYPTFS_MSG_QUIT:
@@ -467,12 +496,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
ecryptfs_printk(KERN_WARNING, "Dropping miscdev "
"message of unrecognized type [%d]\n",
data[0]);
- break;
+ rc = -EINVAL;
+ goto out_free;
}
+ rc = count;
out_free:
kfree(data);
-out:
- return sz;
+ return rc;
}
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 6a44148c5fb9..a46b3a8fee1e 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -57,6 +57,10 @@ struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
* @page: Page that is locked before this call is made
*
* Returns zero on success; non-zero otherwise
+ *
+ * This is where we encrypt the data and pass the encrypted data to
+ * the lower filesystem. In OpenPGP-compatible mode, we operate on
+ * entire underlying packets.
*/
static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
{
@@ -146,7 +150,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
/* This is a header extent */
char *page_virt;
- page_virt = kmap_atomic(page, KM_USER0);
+ page_virt = kmap_atomic(page);
memset(page_virt, 0, PAGE_CACHE_SIZE);
/* TODO: Support more than one header extent */
if (view_extent_num == 0) {
@@ -159,7 +163,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
crypt_stat,
&written);
}
- kunmap_atomic(page_virt, KM_USER0);
+ kunmap_atomic(page_virt);
flush_dcache_page(page);
if (rc) {
printk(KERN_ERR "%s: Error reading xattr "
@@ -481,10 +485,6 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
* @copied: The amount of data copied
* @page: The eCryptfs page
* @fsdata: The fsdata (unused)
- *
- * This is where we encrypt the data and pass the encrypted data to
- * the lower filesystem. In OpenPGP-compatible mode, we operate on
- * entire underlying packets.
*/
static int ecryptfs_write_end(struct file *file,
struct address_space *mapping,
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 3745f7c2b9c2..b2a34a192f4f 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -130,13 +130,18 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
- size_t total_remaining_bytes = ((offset + size) - pos);
+ loff_t total_remaining_bytes = ((offset + size) - pos);
+
+ if (fatal_signal_pending(current)) {
+ rc = -EINTR;
+ break;
+ }
if (num_bytes > total_remaining_bytes)
num_bytes = total_remaining_bytes;
if (pos < offset) {
/* remaining zeros to write, up to destination offset */
- size_t total_remaining_zeros = (offset - pos);
+ loff_t total_remaining_zeros = (offset - pos);
if (num_bytes > total_remaining_zeros)
num_bytes = total_remaining_zeros;
@@ -151,7 +156,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
ecryptfs_page_idx, rc);
goto out;
}
- ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
+ ecryptfs_page_virt = kmap_atomic(ecryptfs_page);
/*
* pos: where we're now writing, offset: where the request was
@@ -174,7 +179,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
(data + data_offset), num_bytes);
data_offset += num_bytes;
}
- kunmap_atomic(ecryptfs_page_virt, KM_USER0);
+ kunmap_atomic(ecryptfs_page_virt);
flush_dcache_page(ecryptfs_page);
SetPageUptodate(ecryptfs_page);
unlock_page(ecryptfs_page);
@@ -193,15 +198,19 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
}
pos += num_bytes;
}
- if ((offset + size) > ecryptfs_file_size) {
- i_size_write(ecryptfs_inode, (offset + size));
+ if (pos > ecryptfs_file_size) {
+ i_size_write(ecryptfs_inode, pos);
if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) {
- rc = ecryptfs_write_inode_size_to_metadata(
+ int rc2;
+
+ rc2 = ecryptfs_write_inode_size_to_metadata(
ecryptfs_inode);
- if (rc) {
+ if (rc2) {
printk(KERN_ERR "Problem with "
"ecryptfs_write_inode_size_to_metadata; "
- "rc = [%d]\n", rc);
+ "rc = [%d]\n", rc2);
+ if (!rc)
+ rc = rc2;
goto out;
}
}
@@ -273,76 +282,3 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
flush_dcache_page(page_for_ecryptfs);
return rc;
}
-
-#if 0
-/**
- * ecryptfs_read
- * @data: The virtual address into which to write the data read (and
- * possibly decrypted) from the lower file
- * @offset: The offset in the decrypted view of the file from which to
- * read into @data
- * @size: The number of bytes to read into @data
- * @ecryptfs_file: The eCryptfs file from which to read
- *
- * Read an arbitrary amount of data from an arbitrary location in the
- * eCryptfs page cache. This is done on an extent-by-extent basis;
- * individual extents are decrypted and read from the lower page
- * cache (via VFS reads). This function takes care of all the
- * address translation to locations in the lower filesystem.
- *
- * Returns zero on success; non-zero otherwise
- */
-int ecryptfs_read(char *data, loff_t offset, size_t size,
- struct file *ecryptfs_file)
-{
- struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
- struct page *ecryptfs_page;
- char *ecryptfs_page_virt;
- loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
- loff_t data_offset = 0;
- loff_t pos;
- int rc = 0;
-
- if ((offset + size) > ecryptfs_file_size) {
- rc = -EINVAL;
- printk(KERN_ERR "%s: Attempt to read data past the end of the "
- "file; offset = [%lld]; size = [%td]; "
- "ecryptfs_file_size = [%lld]\n",
- __func__, offset, size, ecryptfs_file_size);
- goto out;
- }
- pos = offset;
- while (pos < (offset + size)) {
- pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
- size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
- size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
- size_t total_remaining_bytes = ((offset + size) - pos);
-
- if (num_bytes > total_remaining_bytes)
- num_bytes = total_remaining_bytes;
- ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
- ecryptfs_page_idx);
- if (IS_ERR(ecryptfs_page)) {
- rc = PTR_ERR(ecryptfs_page);
- printk(KERN_ERR "%s: Error getting page at "
- "index [%ld] from eCryptfs inode "
- "mapping; rc = [%d]\n", __func__,
- ecryptfs_page_idx, rc);
- goto out;
- }
- ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
- memcpy((data + data_offset),
- ((char *)ecryptfs_page_virt + start_offset_in_page),
- num_bytes);
- kunmap_atomic(ecryptfs_page_virt, KM_USER0);
- flush_dcache_page(ecryptfs_page);
- SetPageUptodate(ecryptfs_page);
- unlock_page(ecryptfs_page);
- page_cache_release(ecryptfs_page);
- pos += num_bytes;
- data_offset += num_bytes;
- }
-out:
- return rc;
-}
-#endif /* 0 */
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index dbd52d40df4c..cf152823bbf4 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -30,6 +30,8 @@
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/crypto.h>
+#include <linux/statfs.h>
+#include <linux/magic.h>
#include "ecryptfs_kernel.h"
struct kmem_cache *ecryptfs_inode_info_cache;
@@ -69,7 +71,6 @@ static void ecryptfs_i_callback(struct rcu_head *head)
struct ecryptfs_inode_info *inode_info;
inode_info = ecryptfs_inode_to_private(inode);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
}
@@ -103,10 +104,20 @@ static void ecryptfs_destroy_inode(struct inode *inode)
static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ int rc;
if (!lower_dentry->d_sb->s_op->statfs)
return -ENOSYS;
- return lower_dentry->d_sb->s_op->statfs(lower_dentry, buf);
+
+ rc = lower_dentry->d_sb->s_op->statfs(lower_dentry, buf);
+ if (rc)
+ return rc;
+
+ buf->f_type = ECRYPTFS_SUPER_MAGIC;
+ rc = ecryptfs_set_f_namelen(&buf->f_namelen, buf->f_namelen,
+ &ecryptfs_superblock_to_private(dentry->d_sb)->mount_crypt_stat);
+
+ return rc;
}
/**
@@ -132,9 +143,9 @@ static void ecryptfs_evict_inode(struct inode *inode)
* Prints the mount options for a given superblock.
* Returns zero; does not fail.
*/
-static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int ecryptfs_show_options(struct seq_file *m, struct dentry *root)
{
- struct super_block *sb = mnt->mnt_sb;
+ struct super_block *sb = root->d_sb;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
&ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
struct ecryptfs_global_auth_tok *walker;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 0f31acb0131c..981106429a9f 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -68,7 +68,6 @@ static struct inode *efs_alloc_inode(struct super_block *sb)
static void efs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 828e750af23a..4d9d3a45e356 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -197,6 +197,12 @@ struct eventpoll {
/* The user that created the eventpoll descriptor */
struct user_struct *user;
+
+ struct file *file;
+
+ /* used to optimize loop detection check */
+ int visited;
+ struct list_head visited_list_link;
};
/* Wait structure used by the poll hooks */
@@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly;
/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;
+/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
+static LIST_HEAD(visited_list);
+
+/*
+ * List of files with newly added links, where we may need to limit the number
+ * of emanating paths. Protected by the epmutex.
+ */
+static LIST_HEAD(tfile_check_list);
+
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
@@ -276,6 +291,12 @@ ctl_table epoll_table[] = {
};
#endif /* CONFIG_SYSCTL */
+static const struct file_operations eventpoll_fops;
+
+static inline int is_file_epoll(struct file *f)
+{
+ return f->f_op == &eventpoll_fops;
+}
/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -299,6 +320,11 @@ static inline int ep_is_linked(struct list_head *p)
return !list_empty(p);
}
+static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
+{
+ return container_of(p, struct eppoll_entry, wait);
+}
+
/* Get the "struct epitem" from a wait queue pointer */
static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
{
@@ -446,6 +472,18 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
put_cpu();
}
+static void ep_remove_wait_queue(struct eppoll_entry *pwq)
+{
+ wait_queue_head_t *whead;
+
+ rcu_read_lock();
+ /* If it is cleared by POLLFREE, it should be rcu-safe */
+ whead = rcu_dereference(pwq->whead);
+ if (whead)
+ remove_wait_queue(whead, &pwq->wait);
+ rcu_read_unlock();
+}
+
/*
* This function unregisters poll callbacks from the associated file
* descriptor. Must be called with "mtx" held (or "epmutex" if called from
@@ -460,7 +498,7 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
list_del(&pwq->llink);
- remove_wait_queue(pwq->whead, &pwq->wait);
+ ep_remove_wait_queue(pwq);
kmem_cache_free(pwq_cache, pwq);
}
}
@@ -711,12 +749,6 @@ static const struct file_operations eventpoll_fops = {
.llseek = noop_llseek,
};
-/* Fast test to see if the file is an eventpoll file */
-static inline int is_file_epoll(struct file *f)
-{
- return f->f_op == &eventpoll_fops;
-}
-
/*
* This is called from eventpoll_release() to unlink files from the eventpoll
* interface. We need to have this facility to cleanup correctly files that are
@@ -827,6 +859,17 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
+ if ((unsigned long)key & POLLFREE) {
+ ep_pwq_from_wait(wait)->whead = NULL;
+ /*
+ * whead = NULL above can race with ep_remove_wait_queue()
+ * which can do another remove_wait_queue() after us, so we
+ * can't use __remove_wait_queue(). whead->lock is held by
+ * the caller.
+ */
+ list_del_init(&wait->task_list);
+ }
+
spin_lock_irqsave(&ep->lock, flags);
/*
@@ -926,6 +969,103 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
rb_insert_color(&epi->rbn, &ep->rbr);
}
+
+
+#define PATH_ARR_SIZE 5
+/*
+ * These are the number paths of length 1 to 5, that we are allowing to emanate
+ * from a single file of interest. For example, we allow 1000 paths of length
+ * 1, to emanate from each file of interest. This essentially represents the
+ * potential wakeup paths, which need to be limited in order to avoid massive
+ * uncontrolled wakeup storms. The common use case should be a single ep which
+ * is connected to n file sources. In this case each file source has 1 path
+ * of length 1. Thus, the numbers below should be more than sufficient. These
+ * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
+ * and delete can't add additional paths. Protected by the epmutex.
+ */
+static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
+static int path_count[PATH_ARR_SIZE];
+
+static int path_count_inc(int nests)
+{
+ /* Allow an arbitrary number of depth 1 paths */
+ if (nests == 0)
+ return 0;
+
+ if (++path_count[nests] > path_limits[nests])
+ return -1;
+ return 0;
+}
+
+static void path_count_init(void)
+{
+ int i;
+
+ for (i = 0; i < PATH_ARR_SIZE; i++)
+ path_count[i] = 0;
+}
+
+static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
+{
+ int error = 0;
+ struct file *file = priv;
+ struct file *child_file;
+ struct epitem *epi;
+
+ list_for_each_entry(epi, &file->f_ep_links, fllink) {
+ child_file = epi->ep->file;
+ if (is_file_epoll(child_file)) {
+ if (list_empty(&child_file->f_ep_links)) {
+ if (path_count_inc(call_nests)) {
+ error = -1;
+ break;
+ }
+ } else {
+ error = ep_call_nested(&poll_loop_ncalls,
+ EP_MAX_NESTS,
+ reverse_path_check_proc,
+ child_file, child_file,
+ current);
+ }
+ if (error != 0)
+ break;
+ } else {
+ printk(KERN_ERR "reverse_path_check_proc: "
+ "file is not an ep!\n");
+ }
+ }
+ return error;
+}
+
+/**
+ * reverse_path_check - The tfile_check_list is list of file *, which have
+ * links that are proposed to be newly added. We need to
+ * make sure that those added links don't add too many
+ * paths such that we will spend all our time waking up
+ * eventpoll objects.
+ *
+ * Returns: Returns zero if the proposed links don't create too many paths,
+ * -1 otherwise.
+ */
+static int reverse_path_check(void)
+{
+ int length = 0;
+ int error = 0;
+ struct file *current_file;
+
+ /* let's call this for all tfiles */
+ list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
+ length++;
+ path_count_init();
+ error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ reverse_path_check_proc, current_file,
+ current_file, current);
+ if (error)
+ break;
+ }
+ return error;
+}
+
/*
* Must be called with "mtx" held.
*/
@@ -987,6 +1127,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
*/
ep_rbtree_insert(ep, epi);
+ /* now check if we've created too many backpaths */
+ error = -EINVAL;
+ if (reverse_path_check())
+ goto error_remove_epi;
+
/* We have to drop the new item inside our item list to keep track of it */
spin_lock_irqsave(&ep->lock, flags);
@@ -1011,6 +1156,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
return 0;
+error_remove_epi:
+ spin_lock(&tfile->f_lock);
+ if (ep_is_linked(&epi->fllink))
+ list_del_init(&epi->fllink);
+ spin_unlock(&tfile->f_lock);
+
+ rb_erase(&epi->rbn, &ep->rbr);
+
error_unregister:
ep_unregister_pollwait(ep, epi);
@@ -1275,18 +1428,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
int error = 0;
struct file *file = priv;
struct eventpoll *ep = file->private_data;
+ struct eventpoll *ep_tovisit;
struct rb_node *rbp;
struct epitem *epi;
mutex_lock_nested(&ep->mtx, call_nests + 1);
+ ep->visited = 1;
+ list_add(&ep->visited_list_link, &visited_list);
for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (unlikely(is_file_epoll(epi->ffd.file))) {
+ ep_tovisit = epi->ffd.file->private_data;
+ if (ep_tovisit->visited)
+ continue;
error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
- ep_loop_check_proc, epi->ffd.file,
- epi->ffd.file->private_data, current);
+ ep_loop_check_proc, epi->ffd.file,
+ ep_tovisit, current);
if (error != 0)
break;
+ } else {
+ /*
+ * If we've reached a file that is not associated with
+ * an ep, then we need to check if the newly added
+ * links are going to add too many wakeup paths. We do
+ * this by adding it to the tfile_check_list, if it's
+ * not already there, and calling reverse_path_check()
+ * during ep_insert().
+ */
+ if (list_empty(&epi->ffd.file->f_tfile_llink))
+ list_add(&epi->ffd.file->f_tfile_llink,
+ &tfile_check_list);
}
}
mutex_unlock(&ep->mtx);
@@ -1307,8 +1478,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
*/
static int ep_loop_check(struct eventpoll *ep, struct file *file)
{
- return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ int ret;
+ struct eventpoll *ep_cur, *ep_next;
+
+ ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
ep_loop_check_proc, file, ep, current);
+ /* clear visited list */
+ list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
+ visited_list_link) {
+ ep_cur->visited = 0;
+ list_del(&ep_cur->visited_list_link);
+ }
+ return ret;
+}
+
+static void clear_tfile_check_list(void)
+{
+ struct file *file;
+
+ /* first clear the tfile_check_list */
+ while (!list_empty(&tfile_check_list)) {
+ file = list_first_entry(&tfile_check_list, struct file,
+ f_tfile_llink);
+ list_del_init(&file->f_tfile_llink);
+ }
+ INIT_LIST_HEAD(&tfile_check_list);
}
/*
@@ -1316,8 +1510,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
*/
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
- int error;
+ int error, fd;
struct eventpoll *ep = NULL;
+ struct file *file;
/* Check the EPOLL_* constant for consistency. */
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
@@ -1334,11 +1529,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
- error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
+ fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
+ if (fd < 0) {
+ error = fd;
+ goto out_free_ep;
+ }
+ file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
O_RDWR | (flags & O_CLOEXEC));
- if (error < 0)
- ep_free(ep);
-
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto out_free_fd;
+ }
+ fd_install(fd, file);
+ ep->file = file;
+ return fd;
+
+out_free_fd:
+ put_unused_fd(fd);
+out_free_ep:
+ ep_free(ep);
return error;
}
@@ -1404,21 +1613,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
/*
* When we insert an epoll file descriptor, inside another epoll file
* descriptor, there is the change of creating closed loops, which are
- * better be handled here, than in more critical paths.
+ * better be handled here, than in more critical paths. While we are
+ * checking for loops we also determine the list of files reachable
+ * and hang them on the tfile_check_list, so we can check that we
+ * haven't created too many possible wakeup paths.
*
- * We hold epmutex across the loop check and the insert in this case, in
- * order to prevent two separate inserts from racing and each doing the
- * insert "at the same time" such that ep_loop_check passes on both
- * before either one does the insert, thereby creating a cycle.
+ * We need to hold the epmutex across both ep_insert and ep_remove
+ * b/c we want to make sure we are looking at a coherent view of
+ * epoll network.
*/
- if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
+ if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
mutex_lock(&epmutex);
did_lock_epmutex = 1;
- error = -ELOOP;
- if (ep_loop_check(ep, tfile) != 0)
- goto error_tgt_fput;
}
-
+ if (op == EPOLL_CTL_ADD) {
+ if (is_file_epoll(tfile)) {
+ error = -ELOOP;
+ if (ep_loop_check(ep, tfile) != 0)
+ goto error_tgt_fput;
+ } else
+ list_add(&tfile->f_tfile_llink, &tfile_check_list);
+ }
mutex_lock_nested(&ep->mtx, 0);
@@ -1437,6 +1652,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
error = ep_insert(ep, &epds, tfile, fd);
} else
error = -EEXIST;
+ clear_tfile_check_list();
break;
case EPOLL_CTL_DEL:
if (epi)
@@ -1455,7 +1671,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
mutex_unlock(&ep->mtx);
error_tgt_fput:
- if (unlikely(did_lock_epmutex))
+ if (did_lock_epmutex)
mutex_unlock(&epmutex);
fput(tfile);
diff --git a/fs/exec.c b/fs/exec.c
index 36254645b7cc..153dee14fe55 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,6 +59,8 @@
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
+
+#include <trace/events/task.h>
#include "internal.h"
int core_uses_pid;
@@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf)
{
task_lock(tsk);
+ trace_task_rename(tsk, buf);
+
/*
* Threads may access current->comm without holding
* the task lock, so write the string carefully.
@@ -1067,6 +1071,21 @@ void set_task_comm(struct task_struct *tsk, char *buf)
perf_event_comm(tsk);
}
+static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
+{
+ int i, ch;
+
+ /* Copies the binary name from after last slash */
+ for (i = 0; (ch = *(fn++)) != '\0';) {
+ if (ch == '/')
+ i = 0; /* overwrite what we wrote */
+ else
+ if (i < len - 1)
+ tcomm[i++] = ch;
+ }
+ tcomm[i] = '\0';
+}
+
int flush_old_exec(struct linux_binprm * bprm)
{
int retval;
@@ -1081,6 +1100,7 @@ int flush_old_exec(struct linux_binprm * bprm)
set_mm_exe_file(bprm->mm, bprm->file);
+ filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
/*
* Release all of the old mmap stuff
*/
@@ -1112,10 +1132,6 @@ EXPORT_SYMBOL(would_dump);
void setup_new_exec(struct linux_binprm * bprm)
{
- int i, ch;
- const char *name;
- char tcomm[sizeof(current->comm)];
-
arch_pick_mmap_layout(current->mm);
/* This is the point of no return */
@@ -1126,18 +1142,7 @@ void setup_new_exec(struct linux_binprm * bprm)
else
set_dumpable(current->mm, suid_dumpable);
- name = bprm->filename;
-
- /* Copies the binary name from after last slash */
- for (i=0; (ch = *(name++)) != '\0';) {
- if (ch == '/')
- i = 0; /* overwrite what we wrote */
- else
- if (i < (sizeof(tcomm) - 1))
- tcomm[i++] = ch;
- }
- tcomm[i] = '\0';
- set_task_comm(current, tcomm);
+ set_task_comm(current, bprm->tcomm);
/* Set the new mm task size. We have to do that late because it may
* depend on TIF_32BIT which is only updated in flush_thread() on
@@ -1225,7 +1230,7 @@ EXPORT_SYMBOL(install_exec_creds);
* - the caller must hold ->cred_guard_mutex to protect against
* PTRACE_ATTACH
*/
-int check_unsafe_exec(struct linux_binprm *bprm)
+static int check_unsafe_exec(struct linux_binprm *bprm)
{
struct task_struct *p = current, *t;
unsigned n_fs;
@@ -1910,7 +1915,6 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
{
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
- struct completion *vfork_done;
int core_waiters = -EBUSY;
init_completion(&core_state->startup);
@@ -1922,22 +1926,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
core_waiters = zap_threads(tsk, mm, core_state, exit_code);
up_write(&mm->mmap_sem);
- if (unlikely(core_waiters < 0))
- goto fail;
-
- /*
- * Make sure nobody is waiting for us to release the VM,
- * otherwise we can deadlock when we wait on each other
- */
- vfork_done = tsk->vfork_done;
- if (vfork_done) {
- tsk->vfork_done = NULL;
- complete(vfork_done);
- }
-
- if (core_waiters)
+ if (core_waiters > 0)
wait_for_completion(&core_state->startup);
-fail:
+
return core_waiters;
}
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index da42f32c49be..86194b2f799d 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,14 +1,3 @@
-# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
-# for every ORE user we do it like this. Any user should add itself here
-# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
-# selected here, and we default to "ON". So in effect it is like been
-# selected by any of the users.
-config ORE
- tristate
- depends on EXOFS_FS || PNFS_OBJLAYOUT
- select ASYNC_XOR
- default SCSI_OSD_ULD
-
config EXOFS_FS
tristate "exofs: OSD based file system support"
depends on SCSI_OSD_ULD
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore
new file mode 100644
index 000000000000..1ca7fb7b6ba8
--- /dev/null
+++ b/fs/exofs/Kconfig.ore
@@ -0,0 +1,12 @@
+# ORE - Objects Raid Engine (libore.ko)
+#
+# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
+# for every ORE user we do it like this. Any user should add itself here
+# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
+# selected here, and we default to "ON". So in effect it is like been
+# selected by any of the users.
+config ORE
+ tristate
+ depends on EXOFS_FS || PNFS_OBJLAYOUT
+ select ASYNC_XOR
+ default SCSI_OSD_ULD
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index d0941c6a1f72..80405836ba6e 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -234,7 +234,7 @@ static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
static inline
void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
{
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 51f4b4c40f09..ca9d49665ef6 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -154,7 +154,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata);
extern struct inode *exofs_iget(struct super_block *, unsigned long);
-struct inode *exofs_new_inode(struct inode *, int);
+struct inode *exofs_new_inode(struct inode *, umode_t);
extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
extern void exofs_evict_inode(struct inode *);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f6dbf7768ce6..ea5e1f97806a 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1276,7 +1276,7 @@ static void create_done(struct ore_io_state *ios, void *p)
/*
* Set up a new inode and create an object for it on the OSD
*/
-struct inode *exofs_new_inode(struct inode *dir, int mode)
+struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct exofs_sb_info *sbi = sb->s_fs_info;
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index b54c43775f17..9dbf0c301030 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -59,7 +59,7 @@ static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
return d_splice_alias(inode, dentry);
}
-static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
+static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
struct inode *inode = exofs_new_inode(dir, mode);
@@ -74,7 +74,7 @@ static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
return err;
}
-static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
struct inode *inode;
@@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
return exofs_add_nondir(dentry, inode);
}
-static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
int err = -EMLINK;
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index d271ad837202..49cf230554a2 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -266,7 +266,7 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
/* first/last seg is split */
num_raid_units += layout->group_width;
- sgs_per_dev = div_u64(num_raid_units, data_devs);
+ sgs_per_dev = div_u64(num_raid_units, data_devs) + 2;
} else {
/* For Writes add parity pages array. */
max_par_pages = num_raid_units * pages_in_unit *
@@ -445,10 +445,10 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
u64 residual = ios->reading ?
or->in.residual : or->out.residual;
u64 offset = (ios->offset + ios->length) - residual;
- struct ore_dev *od = ios->oc->ods[
- per_dev->dev - ios->oc->first_dev];
+ unsigned dev = per_dev->dev - ios->oc->first_dev;
+ struct ore_dev *od = ios->oc->ods[dev];
- on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri,
+ on_dev_error(ios, od, dev, osi.osd_err_pri,
offset, residual);
}
if (osi.osd_err_pri >= acumulated_osd_err) {
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 29c47e5c4a86..d222c77cfa1b 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -328,8 +328,8 @@ static int _alloc_read_4_write(struct ore_io_state *ios)
/* @si contains info of the to-be-inserted page. Update of @si should be
* maintained by caller. Specificaly si->dev, si->obj_offset, ...
*/
-static int _add_to_read_4_write(struct ore_io_state *ios,
- struct ore_striping_info *si, struct page *page)
+static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si,
+ struct page *page, unsigned pg_len)
{
struct request_queue *q;
struct ore_per_dev_state *per_dev;
@@ -366,17 +366,60 @@ static int _add_to_read_4_write(struct ore_io_state *ios,
_ore_add_sg_seg(per_dev, gap, true);
}
q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
- added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0);
- if (unlikely(added_len != PAGE_SIZE)) {
+ added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len,
+ si->obj_offset % PAGE_SIZE);
+ if (unlikely(added_len != pg_len)) {
ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
per_dev->bio->bi_vcnt);
return -ENOMEM;
}
- per_dev->length += PAGE_SIZE;
+ per_dev->length += pg_len;
return 0;
}
+/* read the beginning of an unaligned first page */
+static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page)
+{
+ struct ore_striping_info si;
+ unsigned pg_len;
+
+ ore_calc_stripe_info(ios->layout, ios->offset, 0, &si);
+
+ pg_len = si.obj_offset % PAGE_SIZE;
+ si.obj_offset -= pg_len;
+
+ ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n",
+ _LLU(si.obj_offset), pg_len, page->index, si.dev);
+
+ return _add_to_r4w(ios, &si, page, pg_len);
+}
+
+/* read the end of an incomplete last page */
+static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset)
+{
+ struct ore_striping_info si;
+ struct page *page;
+ unsigned pg_len, p, c;
+
+ ore_calc_stripe_info(ios->layout, *offset, 0, &si);
+
+ p = si.unit_off / PAGE_SIZE;
+ c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
+ ios->layout->mirrors_p1, si.par_dev, si.dev);
+ page = ios->sp2d->_1p_stripes[p].pages[c];
+
+ pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE);
+ *offset += pg_len;
+
+ ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n",
+ p, c, _LLU(*offset), pg_len, si.dev, si.par_dev);
+
+ BUG_ON(!page);
+
+ return _add_to_r4w(ios, &si, page, pg_len);
+}
+
static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
{
struct bio_vec *bv;
@@ -444,9 +487,13 @@ static int _read_4_write(struct ore_io_state *ios)
struct page **pp = &_1ps->pages[c];
bool uptodate;
- if (*pp)
+ if (*pp) {
+ if (ios->offset % PAGE_SIZE)
+ /* Read the remainder of the page */
+ _add_to_r4w_first_page(ios, *pp);
/* to-be-written pages start here */
goto read_last_stripe;
+ }
*pp = ios->r4w->get_page(ios->private, offset,
&uptodate);
@@ -454,7 +501,7 @@ static int _read_4_write(struct ore_io_state *ios)
return -ENOMEM;
if (!uptodate)
- _add_to_read_4_write(ios, &read_si, *pp);
+ _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE);
/* Mark read-pages to be cache_released */
_1ps->page_is_read[c] = true;
@@ -465,8 +512,11 @@ static int _read_4_write(struct ore_io_state *ios)
}
read_last_stripe:
- offset = ios->offset + (ios->length + PAGE_SIZE - 1) /
- PAGE_SIZE * PAGE_SIZE;
+ offset = ios->offset + ios->length;
+ if (offset % PAGE_SIZE)
+ _add_to_r4w_last_page(ios, &offset);
+ /* offset will be aligned to next page */
+
last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
* bytes_in_stripe;
if (offset == last_stripe_end) /* Optimize for the aligned case */
@@ -503,7 +553,7 @@ read_last_stripe:
/* Mark read-pages to be cache_released */
_1ps->page_is_read[c] = true;
if (!uptodate)
- _add_to_read_4_write(ios, &read_si, page);
+ _add_to_r4w(ios, &read_si, page, PAGE_SIZE);
}
offset += PAGE_SIZE;
@@ -551,7 +601,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
unsigned cur_len)
{
if (ios->reading) {
- BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev);
+ if (per_dev->cur_sg >= ios->sgs_per_dev) {
+ ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" ,
+ per_dev->cur_sg, ios->sgs_per_dev);
+ return -ENOMEM;
+ }
_ore_add_sg_seg(per_dev, cur_len, true);
} else {
struct __stripe_pages_2d *sp2d = ios->sp2d;
@@ -612,8 +666,6 @@ int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
return -ENOMEM;
}
- BUG_ON(ios->offset % PAGE_SIZE);
-
/* Round io down to last full strip */
first_stripe = div_u64(ios->offset, stripe_size);
last_stripe = div_u64(ios->offset + ios->length, stripe_size);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index e6085ec192d6..d22cd168c6ee 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -166,7 +166,6 @@ static struct inode *exofs_alloc_inode(struct super_block *sb)
static void exofs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
}
@@ -839,6 +838,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
if (ret) {
EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
+ dput(sb->s_root);
+ sb->s_root = NULL;
goto free_sbi;
}
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 47cda410b548..d37df352d324 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -279,7 +279,7 @@ static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = {
static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
{
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
else
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 9a4e5e206d08..75ad433c6691 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -110,7 +110,7 @@ extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
/* ialloc.c */
-extern struct inode * ext2_new_inode (struct inode *, int, const struct qstr *);
+extern struct inode * ext2_new_inode (struct inode *, umode_t, const struct qstr *);
extern void ext2_free_inode (struct inode *);
extern unsigned long ext2_count_free_inodes (struct super_block *);
extern void ext2_check_inodes_bitmap (struct super_block *);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index c4e81dfb74ba..8b15cf8cef37 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -429,7 +429,7 @@ found:
return group;
}
-struct inode *ext2_new_inode(struct inode *dir, int mode,
+struct inode *ext2_new_inode(struct inode *dir, umode_t mode,
const struct qstr *qstr)
{
struct super_block *sb;
@@ -573,8 +573,11 @@ got:
inode->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);
if (insert_inode_locked(inode) < 0) {
- err = -EINVAL;
- goto fail_drop;
+ ext2_error(sb, "ext2_new_inode",
+ "inode number already in use - inode=%lu",
+ (unsigned long) ino);
+ err = -EIO;
+ goto fail;
}
dquot_initialize(inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 91a6945af6d8..740cad8dcd8d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -26,7 +26,6 @@
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
-#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
@@ -36,10 +35,6 @@
#include "acl.h"
#include "xip.h"
-MODULE_AUTHOR("Remy Card and others");
-MODULE_DESCRIPTION("Second Extended Filesystem");
-MODULE_LICENSE("GPL");
-
static int __ext2_write_inode(struct inode *inode, int do_sync);
/*
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index f81e250ac5c4..2de655f5d625 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -35,7 +35,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case EXT2_IOC_SETFLAGS: {
unsigned int oldflags;
- ret = mnt_want_write(filp->f_path.mnt);
+ ret = mnt_want_write_file(filp);
if (ret)
return ret;
@@ -77,31 +77,41 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
flags = flags & EXT2_FL_USER_MODIFIABLE;
flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE;
ei->i_flags = flags;
- mutex_unlock(&inode->i_mutex);
ext2_set_inode_flags(inode);
inode->i_ctime = CURRENT_TIME_SEC;
+ mutex_unlock(&inode->i_mutex);
+
mark_inode_dirty(inode);
setflags_out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return ret;
}
case EXT2_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *) arg);
- case EXT2_IOC_SETVERSION:
+ case EXT2_IOC_SETVERSION: {
+ __u32 generation;
+
if (!inode_owner_or_capable(inode))
return -EPERM;
- ret = mnt_want_write(filp->f_path.mnt);
+ ret = mnt_want_write_file(filp);
if (ret)
return ret;
- if (get_user(inode->i_generation, (int __user *) arg)) {
+ if (get_user(generation, (int __user *) arg)) {
ret = -EFAULT;
- } else {
- inode->i_ctime = CURRENT_TIME_SEC;
- mark_inode_dirty(inode);
+ goto setversion_out;
}
- mnt_drop_write(filp->f_path.mnt);
+
+ mutex_lock(&inode->i_mutex);
+ inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_generation = generation;
+ mutex_unlock(&inode->i_mutex);
+
+ mark_inode_dirty(inode);
+setversion_out:
+ mnt_drop_write_file(filp);
return ret;
+ }
case EXT2_IOC_GETRSVSZ:
if (test_opt(inode->i_sb, RESERVATION)
&& S_ISREG(inode->i_mode)
@@ -121,7 +131,7 @@ setflags_out:
if (get_user(rsv_window_size, (int __user *)arg))
return -EFAULT;
- ret = mnt_want_write(filp->f_path.mnt);
+ ret = mnt_want_write_file(filp);
if (ret)
return ret;
@@ -145,7 +155,7 @@ setflags_out:
rsv->rsv_goal_size = rsv_window_size;
}
mutex_unlock(&ei->truncate_mutex);
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return 0;
}
default:
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 761fde807fc9..080419814bae 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -94,7 +94,7 @@ struct dentry *ext2_get_parent(struct dentry *child)
* If the create succeeds, we fill in the inode information
* with d_instantiate().
*/
-static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd)
+static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, struct nameidata *nd)
{
struct inode *inode;
@@ -119,7 +119,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st
return ext2_add_nondir(dentry, inode);
}
-static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev)
+static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct inode * inode;
int err;
@@ -214,7 +214,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
return err;
}
-static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
{
struct inode * inode;
int err = -EMLINK;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index bd8ac164a3bf..0090595beb28 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -173,7 +173,6 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
static void ext2_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
}
@@ -211,9 +210,9 @@ static void destroy_inodecache(void)
kmem_cache_destroy(ext2_inode_cachep);
}
-static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int ext2_show_options(struct seq_file *seq, struct dentry *root)
{
- struct super_block *sb = vfs->mnt_sb;
+ struct super_block *sb = root->d_sb;
struct ext2_sb_info *sbi = EXT2_SB(sb);
struct ext2_super_block *es = sbi->s_es;
unsigned long def_mount_opts;
@@ -1521,5 +1520,8 @@ static void __exit exit_ext2_fs(void)
exit_ext2_xattr();
}
+MODULE_AUTHOR("Remy Card and others");
+MODULE_DESCRIPTION("Second Extended Filesystem");
+MODULE_LICENSE("GPL");
module_init(init_ext2_fs)
module_exit(exit_ext2_fs)
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index d27b71f1d183..6dcafc7efdfd 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -54,7 +54,6 @@
*/
#include <linux/buffer_head.h>
-#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/mbcache.h>
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index c922adc8ef41..be7a8d02c9a7 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -3,7 +3,6 @@
* Handler for storing security labels as extended attributes.
*/
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/fs.h>
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 667e46a8d62d..2989467d3595 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -5,7 +5,6 @@
* Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/module.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/fs.h>
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 099d20f47163..f470e44c4b8d 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -6,7 +6,6 @@
*/
#include <linux/init.h>
-#include <linux/module.h>
#include <linux/string.h>
#include "ext2.h"
#include "xattr.h"
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 5c866e06e7ab..1cde28438014 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -371,7 +371,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
* group to find a free inode.
*/
struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
- const struct qstr *qstr, int mode)
+ const struct qstr *qstr, umode_t mode)
{
struct super_block *sb;
struct buffer_head *bitmap_bh = NULL;
@@ -525,8 +525,12 @@ got:
if (IS_DIRSYNC(inode))
handle->h_sync = 1;
if (insert_inode_locked(inode) < 0) {
- err = -EINVAL;
- goto fail_drop;
+ /*
+ * Likely a bitmap corruption causing inode to be allocated
+ * twice.
+ */
+ err = -EIO;
+ goto fail;
}
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 85fe655fe3e0..2d0afeca0b47 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -22,7 +22,6 @@
* Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
*/
-#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/ext3_jbd.h>
@@ -223,8 +222,12 @@ void ext3_evict_inode (struct inode *inode)
*
* Note that directories do not have this problem because they don't
* use page cache.
+ *
+ * The s_journal check handles the case when ext3_get_journal() fails
+ * and puts the journal inode.
*/
if (inode->i_nlink && ext3_should_journal_data(inode) &&
+ EXT3_SB(inode->i_sb)->s_journal &&
(S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
@@ -1132,9 +1135,11 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
bh = ext3_getblk(handle, inode, block, create, err);
if (!bh)
return bh;
- if (buffer_uptodate(bh))
+ if (bh_uptodate_or_lock(bh))
return bh;
- ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
+ get_bh(bh);
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
@@ -1617,7 +1622,13 @@ static int ext3_ordered_writepage(struct page *page,
int err;
J_ASSERT(PageLocked(page));
- WARN_ON_ONCE(IS_RDONLY(inode));
+ /*
+ * We don't want to warn for emergency remount. The condition is
+ * ordered to avoid dereferencing inode->i_sb in non-error case to
+ * avoid slow-downs.
+ */
+ WARN_ON_ONCE(IS_RDONLY(inode) &&
+ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
/*
* We give up here if we're reentered, because it might be for a
@@ -1692,7 +1703,13 @@ static int ext3_writeback_writepage(struct page *page,
int err;
J_ASSERT(PageLocked(page));
- WARN_ON_ONCE(IS_RDONLY(inode));
+ /*
+ * We don't want to warn for emergency remount. The condition is
+ * ordered to avoid dereferencing inode->i_sb in non-error case to
+ * avoid slow-downs.
+ */
+ WARN_ON_ONCE(IS_RDONLY(inode) &&
+ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
if (ext3_journal_current_handle())
goto out_fail;
@@ -1735,7 +1752,13 @@ static int ext3_journalled_writepage(struct page *page,
int err;
J_ASSERT(PageLocked(page));
- WARN_ON_ONCE(IS_RDONLY(inode));
+ /*
+ * We don't want to warn for emergency remount. The condition is
+ * ordered to avoid dereferencing inode->i_sb in non-error case to
+ * avoid slow-downs.
+ */
+ WARN_ON_ONCE(IS_RDONLY(inode) &&
+ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
if (ext3_journal_current_handle())
goto no_write;
@@ -2064,12 +2087,10 @@ static int ext3_block_truncate_page(struct inode *inode, loff_t from)
if (PageUptodate(page))
set_buffer_uptodate(bh);
- if (!buffer_uptodate(bh)) {
- err = -EIO;
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
+ if (!bh_uptodate_or_lock(bh)) {
+ err = bh_submit_read(bh);
/* Uhhuh. Read error. Complain and punt. */
- if (!buffer_uptodate(bh))
+ if (err)
goto unlock;
}
@@ -2490,7 +2511,7 @@ int ext3_can_truncate(struct inode *inode)
* transaction, and VFS/VM ensures that ext3_truncate() cannot run
* simultaneously on behalf of the same inode.
*
- * As we work through the truncate and commmit bits of it to the journal there
+ * As we work through the truncate and commit bits of it to the journal there
* is one core, guiding principle: the file's tree must always be consistent on
* disk. We must be able to restart the truncate after a crash.
*
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index ba1b54e23cae..4af574ce4a46 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -44,7 +44,7 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
@@ -110,7 +110,7 @@ flags_err:
err = ext3_change_inode_journal_flag(inode, jflag);
flags_out:
mutex_unlock(&inode->i_mutex);
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
case EXT3_IOC_GETVERSION:
@@ -126,7 +126,7 @@ flags_out:
if (!inode_owner_or_capable(inode))
return -EPERM;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
if (get_user(generation, (int __user *) arg)) {
@@ -134,10 +134,11 @@ flags_out:
goto setversion_out;
}
+ mutex_lock(&inode->i_mutex);
handle = ext3_journal_start(inode, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
- goto setversion_out;
+ goto unlock_out;
}
err = ext3_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
@@ -146,8 +147,11 @@ flags_out:
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
}
ext3_journal_stop(handle);
+
+unlock_out:
+ mutex_unlock(&inode->i_mutex);
setversion_out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
case EXT3_IOC_GETRSVSZ:
@@ -164,7 +168,7 @@ setversion_out:
if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
return -ENOTTY;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
@@ -195,7 +199,7 @@ setversion_out:
}
mutex_unlock(&ei->truncate_mutex);
setrsvsz_out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
case EXT3_IOC_GROUP_EXTEND: {
@@ -206,7 +210,7 @@ setrsvsz_out:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
@@ -221,7 +225,7 @@ setrsvsz_out:
if (err == 0)
err = err2;
group_extend_out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
case EXT3_IOC_GROUP_ADD: {
@@ -232,7 +236,7 @@ group_extend_out:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
@@ -249,7 +253,7 @@ group_extend_out:
if (err == 0)
err = err2;
group_add_out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
case FITRIM: {
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 642dc6d66dfd..e8e211795e9f 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -921,9 +921,12 @@ restart:
num++;
bh = ext3_getblk(NULL, dir, b++, 0, &err);
bh_use[ra_max] = bh;
- if (bh)
- ll_rw_block(READ | REQ_META | REQ_PRIO,
- 1, &bh);
+ if (bh && !bh_uptodate_or_lock(bh)) {
+ get_bh(bh);
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READ | REQ_META | REQ_PRIO,
+ bh);
+ }
}
}
if ((bh = bh_use[ra_ptr++]) == NULL)
@@ -1698,7 +1701,7 @@ static int ext3_add_nondir(handle_t *handle,
* If the create succeeds, we fill in the inode information
* with d_instantiate().
*/
-static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
+static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode,
struct nameidata *nd)
{
handle_t *handle;
@@ -1732,7 +1735,7 @@ retry:
}
static int ext3_mknod (struct inode * dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
handle_t *handle;
struct inode *inode;
@@ -1768,7 +1771,7 @@ retry:
return err;
}
-static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
{
handle_t *handle;
struct inode * inode;
@@ -2272,7 +2275,7 @@ retry:
err = PTR_ERR(handle);
goto err_drop_inode;
}
- inc_nlink(inode);
+ set_nlink(inode, 1);
err = ext3_orphan_del(handle, inode);
if (err) {
ext3_journal_stop(handle);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 922d289aeeb3..726c7ef6cdf1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -511,7 +511,6 @@ static int ext3_drop_inode(struct inode *inode)
static void ext3_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
}
@@ -611,9 +610,9 @@ static char *data_mode_string(unsigned long mode)
* - it's set to a non-default value OR
* - if the per-sb default is different from the global default
*/
-static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int ext3_show_options(struct seq_file *seq, struct dentry *root)
{
- struct super_block *sb = vfs->mnt_sb;
+ struct super_block *sb = root->d_sb;
struct ext3_sb_info *sbi = EXT3_SB(sb);
struct ext3_super_block *es = sbi->s_es;
unsigned long def_mount_opts;
@@ -2060,9 +2059,10 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
ext3_orphan_cleanup(sb, es);
EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
- if (needs_recovery)
+ if (needs_recovery) {
+ ext3_mark_recovery_complete(sb, es);
ext3_msg(sb, KERN_INFO, "recovery complete");
- ext3_mark_recovery_complete(sb, es);
+ }
ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode",
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
@@ -2230,11 +2230,11 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
goto out_bdev;
}
journal->j_private = sb;
- ll_rw_block(READ, 1, &journal->j_sb_buffer);
- wait_on_buffer(journal->j_sb_buffer);
- if (!buffer_uptodate(journal->j_sb_buffer)) {
- ext3_msg(sb, KERN_ERR, "I/O error on journal device");
- goto out_journal;
+ if (!bh_uptodate_or_lock(journal->j_sb_buffer)) {
+ if (bh_submit_read(journal->j_sb_buffer)) {
+ ext3_msg(sb, KERN_ERR, "I/O error on journal device");
+ goto out_journal;
+ }
}
if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
ext3_msg(sb, KERN_ERR,
@@ -2910,7 +2910,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
return -EINVAL;
/* Quotafile not on the same filesystem? */
- if (path->mnt->mnt_sb != sb)
+ if (path->dentry->d_sb != sb)
return -EXDEV;
/* Journaling quota? */
if (EXT3_SB(sb)->s_qf_names[type]) {
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3c218b8a51d4..ea26f2acab94 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -3,7 +3,6 @@
* Handler for storing security labels as extended attributes.
*/
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/fs.h>
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index dc8edda9ffe0..2526a8829de8 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -5,7 +5,6 @@
* Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/module.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/fs.h>
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 7a321974d584..b32e473a1e33 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -5,7 +5,6 @@
* Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/ext3_jbd.h>
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 12ccacda44e0..f9e2cd8cf711 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -23,6 +23,8 @@
#include <trace/events/ext4.h>
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+ ext4_group_t block_group);
/*
* balloc.c contains the blocks allocation and deallocation routines
*/
@@ -668,7 +670,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
* This function returns the number of file system metadata clusters at
* the beginning of a block group, including the reserved gdt blocks.
*/
-unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
ext4_group_t block_group)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 8efb2f0a3447..3f11656bd72e 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -13,7 +13,6 @@
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
-#include <linux/module.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5b0e26a1272d..513004fc3d84 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -511,6 +511,14 @@ struct ext4_new_group_data {
__u32 free_blocks_count;
};
+/* Indexes used to index group tables in ext4_new_group_data */
+enum {
+ BLOCK_BITMAP = 0, /* block bitmap */
+ INODE_BITMAP, /* inode bitmap */
+ INODE_TABLE, /* inode tables */
+ GROUP_TABLE_COUNT,
+};
+
/*
* Flags used by ext4_map_blocks()
*/
@@ -575,6 +583,7 @@ struct ext4_new_group_data {
/* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
+#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -957,12 +966,13 @@ struct ext4_inode_info {
#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \
EXT4_MOUNT2_##opt)
-#define ext4_set_bit __test_and_set_bit_le
+#define ext4_test_and_set_bit __test_and_set_bit_le
+#define ext4_set_bit __set_bit_le
#define ext4_set_bit_atomic ext2_set_bit_atomic
-#define ext4_clear_bit __test_and_clear_bit_le
+#define ext4_test_and_clear_bit __test_and_clear_bit_le
+#define ext4_clear_bit __clear_bit_le
#define ext4_clear_bit_atomic ext2_clear_bit_atomic
#define ext4_test_bit test_bit_le
-#define ext4_find_first_zero_bit find_first_zero_bit_le
#define ext4_find_next_zero_bit find_next_zero_bit_le
#define ext4_find_next_bit find_next_bit_le
@@ -1397,6 +1407,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200
+#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1409,6 +1420,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
+#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1790,8 +1803,6 @@ extern void ext4_init_block_bitmap(struct super_block *sb,
extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp);
-extern unsigned ext4_num_base_meta_clusters(struct super_block *sb,
- ext4_group_t block_group);
extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp);
@@ -1819,7 +1830,7 @@ extern int ext4fs_dirhash(const char *name, int len, struct
dx_hash_info *hinfo);
/* ialloc.c */
-extern struct inode *ext4_new_inode(handle_t *, struct inode *, int,
+extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t,
const struct qstr *qstr, __u32 goal,
uid_t *owner);
extern void ext4_free_inode(handle_t *, struct inode *);
@@ -1880,16 +1891,9 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_block_truncate_page(handle_t *handle,
- struct address_space *mapping, loff_t from);
-extern int ext4_block_zero_page_range(handle_t *handle,
- struct address_space *mapping, loff_t from, loff_t length);
extern int ext4_discard_partial_page_buffers(handle_t *handle,
struct address_space *mapping, loff_t from,
loff_t length, int flags);
-extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
- struct inode *inode, struct page *page, loff_t from,
- loff_t length, int flags);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1924,6 +1928,7 @@ extern int ext4_group_add(struct super_block *sb,
extern int ext4_group_extend(struct super_block *sb,
struct ext4_super_block *es,
ext4_fsblk_t n_blocks_count);
+extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
/* super.c */
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 607b1557d292..74f23c292e1b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -29,7 +29,6 @@
* - smart tree reduction
*/
-#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
@@ -3281,6 +3280,9 @@ static int ext4_find_delalloc_range(struct inode *inode,
ext4_lblk_t i, pg_lblk;
pgoff_t index;
+ if (!test_opt(inode->i_sb, DELALLOC))
+ return 0;
+
/* reverse search wont work if fs block size is less than page size */
if (inode->i_blkbits < PAGE_CACHE_SHIFT)
search_hint_reverse = 0;
@@ -3453,8 +3455,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
int err = 0;
ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
- ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
- "block %llu, max_blocks %u, flags %d, allocated %u",
+ ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
+ "block %llu, max_blocks %u, flags %x, allocated %u\n",
inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
flags, allocated);
ext4_ext_show_leaf(inode, path);
@@ -3625,7 +3627,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
ext4_lblk_t ex_cluster_start, ex_cluster_end;
- ext4_lblk_t rr_cluster_start, rr_cluster_end;
+ ext4_lblk_t rr_cluster_start;
ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
@@ -3636,7 +3638,6 @@ static int get_implied_cluster_alloc(struct super_block *sb,
/* The requested region passed into ext4_map_blocks() */
rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
- rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
if ((rr_cluster_start == ex_cluster_end) ||
(rr_cluster_start == ex_cluster_start)) {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 00beb4f9cc4f..25d8c9781ad9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -252,7 +252,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
fatal = ext4_journal_get_write_access(handle, bh2);
}
ext4_lock_group(sb, block_group);
- cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+ cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
if (fatal || !cleared) {
ext4_unlock_group(sb, block_group);
goto out;
@@ -351,14 +351,14 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
*/
static int find_group_orlov(struct super_block *sb, struct inode *parent,
- ext4_group_t *group, int mode,
+ ext4_group_t *group, umode_t mode,
const struct qstr *qstr)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t real_ngroups = ext4_get_groups_count(sb);
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
- unsigned int freei, avefreei;
+ unsigned int freei, avefreei, grp_free;
ext4_fsblk_t freeb, avefreec;
unsigned int ndirs;
int max_dirs, min_inodes;
@@ -477,8 +477,8 @@ fallback_retry:
for (i = 0; i < ngroups; i++) {
grp = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, grp, NULL);
- if (desc && ext4_free_inodes_count(sb, desc) &&
- ext4_free_inodes_count(sb, desc) >= avefreei) {
+ grp_free = ext4_free_inodes_count(sb, desc);
+ if (desc && grp_free && grp_free >= avefreei) {
*group = grp;
return 0;
}
@@ -497,7 +497,7 @@ fallback_retry:
}
static int find_group_other(struct super_block *sb, struct inode *parent,
- ext4_group_t *group, int mode)
+ ext4_group_t *group, umode_t mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
@@ -602,7 +602,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
*/
static int ext4_claim_inode(struct super_block *sb,
struct buffer_head *inode_bitmap_bh,
- unsigned long ino, ext4_group_t group, int mode)
+ unsigned long ino, ext4_group_t group, umode_t mode)
{
int free = 0, retval = 0, count;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -618,7 +618,7 @@ static int ext4_claim_inode(struct super_block *sb,
*/
down_read(&grp->alloc_sem);
ext4_lock_group(sb, group);
- if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
+ if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) {
/* not a free inode */
retval = 1;
goto err_ret;
@@ -690,7 +690,7 @@ err_ret:
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
-struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
+struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
const struct qstr *qstr, __u32 goal, uid_t *owner)
{
struct super_block *sb;
@@ -885,8 +885,12 @@ got:
if (IS_DIRSYNC(inode))
ext4_handle_sync(handle);
if (insert_inode_locked(inode) < 0) {
- err = -EINVAL;
- goto fail_drop;
+ /*
+ * Likely a bitmap corruption causing inode to be allocated
+ * twice.
+ */
+ err = -EIO;
+ goto fail;
}
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 3cfc73fbca8e..830e1b2bf145 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -20,7 +20,6 @@
* (sct@redhat.com), 1993, 1998
*/
-#include <linux/module.h>
#include "ext4_jbd2.h"
#include "truncate.h"
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 92655fd89657..feaa82fe629d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -18,7 +18,6 @@
* Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
*/
-#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
@@ -72,6 +71,9 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
+static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
+ struct inode *inode, struct page *page, loff_t from,
+ loff_t length, int flags);
/*
* Test whether an inode is a fast symlink.
@@ -1881,7 +1883,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
* a[0] = 'a';
* truncate(f, 4096);
* we have in the page first buffer_head mapped via page_mkwrite call back
- * but other bufer_heads would be unmapped but dirty(dirty done via the
+ * but other buffer_heads would be unmapped but dirty (dirty done via the
* do_wp_page). So writepage should write the first block. If we modify
* the mmap area beyond 1024 we will again get a page_fault and the
* page_mkwrite callback will do the block allocation and mark the
@@ -2760,7 +2762,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
if (!io_end || !size)
goto out;
- ext_debug("ext4_end_io_dio(): io_end 0x%p"
+ ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
iocb->private, io_end->inode->i_ino, iocb, offset,
size);
@@ -3161,7 +3163,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle,
*
* Returns zero on sucess or negative on failure.
*/
-int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
+static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
struct inode *inode, struct page *page, loff_t from,
loff_t length, int flags)
{
@@ -3301,126 +3303,6 @@ next:
return err;
}
-/*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
- */
-int ext4_block_truncate_page(handle_t *handle,
- struct address_space *mapping, loff_t from)
-{
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned length;
- unsigned blocksize;
- struct inode *inode = mapping->host;
-
- blocksize = inode->i_sb->s_blocksize;
- length = blocksize - (offset & (blocksize - 1));
-
- return ext4_block_zero_page_range(handle, mapping, from, length);
-}
-
-/*
- * ext4_block_zero_page_range() zeros out a mapping of length 'length'
- * starting from file offset 'from'. The range to be zero'd must
- * be contained with in one block. If the specified range exceeds
- * the end of the block it will be shortened to end of the block
- * that cooresponds to 'from'
- */
-int ext4_block_zero_page_range(handle_t *handle,
- struct address_space *mapping, loff_t from, loff_t length)
-{
- ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned blocksize, max, pos;
- ext4_lblk_t iblock;
- struct inode *inode = mapping->host;
- struct buffer_head *bh;
- struct page *page;
- int err = 0;
-
- page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
- if (!page)
- return -ENOMEM;
-
- blocksize = inode->i_sb->s_blocksize;
- max = blocksize - (offset & (blocksize - 1));
-
- /*
- * correct length if it does not fall between
- * 'from' and the end of the block
- */
- if (length > max || length < 0)
- length = max;
-
- iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
-
- /* Find the buffer that contains "offset" */
- bh = page_buffers(page);
- pos = blocksize;
- while (offset >= pos) {
- bh = bh->b_this_page;
- iblock++;
- pos += blocksize;
- }
-
- err = 0;
- if (buffer_freed(bh)) {
- BUFFER_TRACE(bh, "freed: skip");
- goto unlock;
- }
-
- if (!buffer_mapped(bh)) {
- BUFFER_TRACE(bh, "unmapped");
- ext4_get_block(inode, iblock, bh, 0);
- /* unmapped? It's a hole - nothing to do */
- if (!buffer_mapped(bh)) {
- BUFFER_TRACE(bh, "still unmapped");
- goto unlock;
- }
- }
-
- /* Ok, it's mapped. Make sure it's up-to-date */
- if (PageUptodate(page))
- set_buffer_uptodate(bh);
-
- if (!buffer_uptodate(bh)) {
- err = -EIO;
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
- /* Uhhuh. Read error. Complain and punt. */
- if (!buffer_uptodate(bh))
- goto unlock;
- }
-
- if (ext4_should_journal_data(inode)) {
- BUFFER_TRACE(bh, "get write access");
- err = ext4_journal_get_write_access(handle, bh);
- if (err)
- goto unlock;
- }
-
- zero_user(page, offset, length);
-
- BUFFER_TRACE(bh, "zeroed end of block");
-
- err = 0;
- if (ext4_should_journal_data(inode)) {
- err = ext4_handle_dirty_metadata(handle, inode, bh);
- } else
- mark_buffer_dirty(bh);
-
-unlock:
- unlock_page(page);
- page_cache_release(page);
- return err;
-}
-
int ext4_can_truncate(struct inode *inode)
{
if (S_ISREG(inode->i_mode))
@@ -3469,7 +3351,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
* transaction, and VFS/VM ensures that ext4_truncate() cannot run
* simultaneously on behalf of the same inode.
*
- * As we work through the truncate and commmit bits of it to the journal there
+ * As we work through the truncate and commit bits of it to the journal there
* is one core, guiding principle: the file's tree must always be consistent on
* disk. We must be able to restart the truncate after a crash.
*
@@ -4647,9 +4529,19 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return 0;
if (is_journal_aborted(journal))
return -EROFS;
+ /* We have to allocate physical blocks for delalloc blocks
+ * before flushing journal. otherwise delalloc blocks can not
+ * be allocated any more. even more truncate on delalloc blocks
+ * could trigger BUG by flushing delalloc blocks in journal.
+ * There is no delalloc block in non-journal data mode.
+ */
+ if (val && test_opt(inode->i_sb, DELALLOC)) {
+ err = ext4_alloc_da_blocks(inode);
+ if (err < 0)
+ return err;
+ }
jbd2_journal_lock_updates(journal);
- jbd2_journal_flush(journal);
/*
* OK, there are no updates running now, and all cached data is
@@ -4661,8 +4553,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (val)
ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
- else
+ else {
+ jbd2_journal_flush(journal);
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+ }
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a56796814d6a..6eee25591b81 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -18,6 +18,8 @@
#include "ext4_jbd2.h"
#include "ext4.h"
+#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
+
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = filp->f_dentry->d_inode;
@@ -45,7 +47,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
@@ -134,7 +136,7 @@ flags_err:
err = ext4_ext_migrate(inode);
flags_out:
mutex_unlock(&inode->i_mutex);
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
case EXT4_IOC_GETVERSION:
@@ -150,7 +152,7 @@ flags_out:
if (!inode_owner_or_capable(inode))
return -EPERM;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
if (get_user(generation, (int __user *) arg)) {
@@ -158,10 +160,11 @@ flags_out:
goto setversion_out;
}
+ mutex_lock(&inode->i_mutex);
handle = ext4_journal_start(inode, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
- goto setversion_out;
+ goto unlock_out;
}
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
@@ -170,8 +173,11 @@ flags_out:
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
ext4_journal_stop(handle);
+
+unlock_out:
+ mutex_unlock(&inode->i_mutex);
setversion_out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
case EXT4_IOC_GROUP_EXTEND: {
@@ -182,19 +188,22 @@ setversion_out:
if (err)
return err;
- if (get_user(n_blocks_count, (__u32 __user *)arg))
- return -EFAULT;
+ if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+ err = -EFAULT;
+ goto group_extend_out;
+ }
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not supported with bigalloc");
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ goto group_extend_out;
}
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
- return err;
+ goto group_extend_out;
err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
@@ -204,9 +213,9 @@ setversion_out:
}
if (err == 0)
err = err2;
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
+group_extend_out:
ext4_resize_end(sb);
-
return err;
}
@@ -240,15 +249,14 @@ setversion_out:
return -EOPNOTSUPP;
}
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
goto mext_out;
err = ext4_move_extents(filp, donor_filp, me.orig_start,
me.donor_start, me.len, &me.moved_len);
+ mnt_drop_write_file(filp);
mnt_drop_write(filp->f_path.mnt);
- if (me.moved_len > 0)
- file_remove_suid(donor_filp);
if (copy_to_user((struct move_extent __user *)arg,
&me, sizeof(me)))
@@ -267,19 +275,22 @@ mext_out:
return err;
if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
- sizeof(input)))
- return -EFAULT;
+ sizeof(input))) {
+ err = -EFAULT;
+ goto group_add_out;
+ }
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not supported with bigalloc");
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ goto group_add_out;
}
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
- return err;
+ goto group_add_out;
err = ext4_group_add(sb, &input);
if (EXT4_SB(sb)->s_journal) {
@@ -289,9 +300,9 @@ mext_out:
}
if (err == 0)
err = err2;
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
+group_add_out:
ext4_resize_end(sb);
-
return err;
}
@@ -301,7 +312,7 @@ mext_out:
if (!inode_owner_or_capable(inode))
return -EACCES;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
/*
@@ -313,7 +324,7 @@ mext_out:
mutex_lock(&(inode->i_mutex));
err = ext4_ext_migrate(inode);
mutex_unlock(&(inode->i_mutex));
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
@@ -323,11 +334,65 @@ mext_out:
if (!inode_owner_or_capable(inode))
return -EACCES;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
err = ext4_alloc_da_blocks(inode);
+ mnt_drop_write_file(filp);
+ return err;
+ }
+
+ case EXT4_IOC_RESIZE_FS: {
+ ext4_fsblk_t n_blocks_count;
+ struct super_block *sb = inode->i_sb;
+ int err = 0, err2 = 0;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not (yet) supported with bigalloc");
+ return -EOPNOTSUPP;
+ }
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_META_BG)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not (yet) supported with meta_bg");
+ return -EOPNOTSUPP;
+ }
+
+ if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
+ sizeof(__u64))) {
+ return -EFAULT;
+ }
+
+ if (n_blocks_count > MAX_32_NUM &&
+ !EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_64BIT)) {
+ ext4_msg(sb, KERN_ERR,
+ "File system only supports 32-bit block numbers");
+ return -EOPNOTSUPP;
+ }
+
+ err = ext4_resize_begin(sb);
+ if (err)
+ return err;
+
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ goto resizefs_out;
+
+ err = ext4_resize_fs(sb, n_blocks_count);
+ if (EXT4_SB(sb)->s_journal) {
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ }
+ if (err == 0)
+ err = err2;
mnt_drop_write(filp->f_path.mnt);
+resizefs_out:
+ ext4_resize_end(sb);
return err;
}
@@ -429,6 +494,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
}
case EXT4_IOC_MOVE_EXT:
case FITRIM:
+ case EXT4_IOC_RESIZE_FS:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e2d8be8f28bf..cb990b21c698 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3671,7 +3671,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
ext4_group_t group;
ext4_grpblk_t bit;
- trace_ext4_mb_release_group_pa(pa);
+ trace_ext4_mb_release_group_pa(sb, pa);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 16ac228dbec6..e7d6bb0acfa6 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -12,7 +12,6 @@
*
*/
-#include <linux/module.h>
#include <linux/slab.h>
#include "ext4_jbd2.h"
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index aa4c782c9dd7..2043f482375d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1736,7 +1736,7 @@ static int ext4_add_nondir(handle_t *handle,
* If the create succeeds, we fill in the inode information
* with d_instantiate().
*/
-static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
+static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
handle_t *handle;
@@ -1770,7 +1770,7 @@ retry:
}
static int ext4_mknod(struct inode *dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
handle_t *handle;
struct inode *inode;
@@ -1806,7 +1806,7 @@ retry:
return err;
}
-static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
handle_t *handle;
struct inode *inode;
@@ -2315,7 +2315,7 @@ retry:
err = PTR_ERR(handle);
goto err_drop_inode;
}
- inc_nlink(inode);
+ set_nlink(inode, 1);
err = ext4_orphan_del(handle, inode);
if (err) {
ext4_journal_stop(handle);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7e106c810c62..475851896518 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -6,7 +6,6 @@
* Written by Theodore Ts'o, 2010.
*/
-#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 996780ab4f4e..f9d948f0eb86 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -134,6 +134,172 @@ static int verify_group_input(struct super_block *sb,
return err;
}
+/*
+ * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex
+ * group each time.
+ */
+struct ext4_new_flex_group_data {
+ struct ext4_new_group_data *groups; /* new_group_data for groups
+ in the flex group */
+ __u16 *bg_flags; /* block group flags of groups
+ in @groups */
+ ext4_group_t count; /* number of groups in @groups
+ */
+};
+
+/*
+ * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of
+ * @flexbg_size.
+ *
+ * Returns NULL on failure otherwise address of the allocated structure.
+ */
+static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
+{
+ struct ext4_new_flex_group_data *flex_gd;
+
+ flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
+ if (flex_gd == NULL)
+ goto out3;
+
+ flex_gd->count = flexbg_size;
+
+ flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) *
+ flexbg_size, GFP_NOFS);
+ if (flex_gd->groups == NULL)
+ goto out2;
+
+ flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS);
+ if (flex_gd->bg_flags == NULL)
+ goto out1;
+
+ return flex_gd;
+
+out1:
+ kfree(flex_gd->groups);
+out2:
+ kfree(flex_gd);
+out3:
+ return NULL;
+}
+
+static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
+{
+ kfree(flex_gd->bg_flags);
+ kfree(flex_gd->groups);
+ kfree(flex_gd);
+}
+
+/*
+ * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps
+ * and inode tables for a flex group.
+ *
+ * This function is used by 64bit-resize. Note that this function allocates
+ * group tables from the 1st group of groups contained by @flexgd, which may
+ * be a partial of a flex group.
+ *
+ * @sb: super block of fs to which the groups belongs
+ */
+static void ext4_alloc_group_tables(struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd,
+ int flexbg_size)
+{
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ ext4_fsblk_t start_blk;
+ ext4_fsblk_t last_blk;
+ ext4_group_t src_group;
+ ext4_group_t bb_index = 0;
+ ext4_group_t ib_index = 0;
+ ext4_group_t it_index = 0;
+ ext4_group_t group;
+ ext4_group_t last_group;
+ unsigned overhead;
+
+ BUG_ON(flex_gd->count == 0 || group_data == NULL);
+
+ src_group = group_data[0].group;
+ last_group = src_group + flex_gd->count - 1;
+
+ BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) !=
+ (last_group & ~(flexbg_size - 1))));
+next_group:
+ group = group_data[0].group;
+ start_blk = ext4_group_first_block_no(sb, src_group);
+ last_blk = start_blk + group_data[src_group - group].blocks_count;
+
+ overhead = ext4_bg_has_super(sb, src_group) ?
+ (1 + ext4_bg_num_gdb(sb, src_group) +
+ le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+
+ start_blk += overhead;
+
+ BUG_ON(src_group >= group_data[0].group + flex_gd->count);
+ /* We collect contiguous blocks as much as possible. */
+ src_group++;
+ for (; src_group <= last_group; src_group++)
+ if (!ext4_bg_has_super(sb, src_group))
+ last_blk += group_data[src_group - group].blocks_count;
+ else
+ break;
+
+ /* Allocate block bitmaps */
+ for (; bb_index < flex_gd->count; bb_index++) {
+ if (start_blk >= last_blk)
+ goto next_group;
+ group_data[bb_index].block_bitmap = start_blk++;
+ ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+ group -= group_data[0].group;
+ group_data[group].free_blocks_count--;
+ if (flexbg_size > 1)
+ flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+ }
+
+ /* Allocate inode bitmaps */
+ for (; ib_index < flex_gd->count; ib_index++) {
+ if (start_blk >= last_blk)
+ goto next_group;
+ group_data[ib_index].inode_bitmap = start_blk++;
+ ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+ group -= group_data[0].group;
+ group_data[group].free_blocks_count--;
+ if (flexbg_size > 1)
+ flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+ }
+
+ /* Allocate inode tables */
+ for (; it_index < flex_gd->count; it_index++) {
+ if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
+ goto next_group;
+ group_data[it_index].inode_table = start_blk;
+ ext4_get_group_no_and_offset(sb, start_blk, &group, NULL);
+ group -= group_data[0].group;
+ group_data[group].free_blocks_count -=
+ EXT4_SB(sb)->s_itb_per_group;
+ if (flexbg_size > 1)
+ flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+
+ start_blk += EXT4_SB(sb)->s_itb_per_group;
+ }
+
+ if (test_opt(sb, DEBUG)) {
+ int i;
+ group = group_data[0].group;
+
+ printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
+ "%d groups, flexbg size is %d:\n", flex_gd->count,
+ flexbg_size);
+
+ for (i = 0; i < flex_gd->count; i++) {
+ printk(KERN_DEBUG "adding %s group %u: %u "
+ "blocks (%d free)\n",
+ ext4_bg_has_super(sb, group + i) ? "normal" :
+ "no-super", group + i,
+ group_data[i].blocks_count,
+ group_data[i].free_blocks_count);
+ }
+ }
+}
+
static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
ext4_fsblk_t blk)
{
@@ -179,131 +345,250 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh)
}
/*
- * Set up the block and inode bitmaps, and the inode table for the new group.
+ * set_flexbg_block_bitmap() mark @count blocks starting from @block used.
+ *
+ * Helper function for ext4_setup_new_group_blocks() which set .
+ *
+ * @sb: super block
+ * @handle: journal handle
+ * @flex_gd: flex group data
+ */
+static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
+ struct ext4_new_flex_group_data *flex_gd,
+ ext4_fsblk_t block, ext4_group_t count)
+{
+ ext4_group_t count2;
+
+ ext4_debug("mark blocks [%llu/%u] used\n", block, count);
+ for (count2 = count; count > 0; count -= count2, block += count2) {
+ ext4_fsblk_t start;
+ struct buffer_head *bh;
+ ext4_group_t group;
+ int err;
+
+ ext4_get_group_no_and_offset(sb, block, &group, NULL);
+ start = ext4_group_first_block_no(sb, group);
+ group -= flex_gd->groups[0].group;
+
+ count2 = sb->s_blocksize * 8 - (block - start);
+ if (count2 > count)
+ count2 = count;
+
+ if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) {
+ BUG_ON(flex_gd->count > 1);
+ continue;
+ }
+
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
+ return err;
+
+ bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
+ if (!bh)
+ return -EIO;
+
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ return err;
+ ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block,
+ block - start, count2);
+ ext4_set_bits(bh->b_data, block - start, count2);
+
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (unlikely(err))
+ return err;
+ brelse(bh);
+ }
+
+ return 0;
+}
+
+/*
+ * Set up the block and inode bitmaps, and the inode table for the new groups.
* This doesn't need to be part of the main transaction, since we are only
* changing blocks outside the actual filesystem. We still do journaling to
* ensure the recovery is correct in case of a failure just after resize.
* If any part of this fails, we simply abort the resize.
+ *
+ * setup_new_flex_group_blocks handles a flex group as follow:
+ * 1. copy super block and GDT, and initialize group tables if necessary.
+ * In this step, we only set bits in blocks bitmaps for blocks taken by
+ * super block and GDT.
+ * 2. allocate group tables in block bitmaps, that is, set bits in block
+ * bitmap for blocks taken by group tables.
*/
-static int setup_new_group_blocks(struct super_block *sb,
- struct ext4_new_group_data *input)
+static int setup_new_flex_group_blocks(struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd)
{
+ int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group};
+ ext4_fsblk_t start;
+ ext4_fsblk_t block;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group);
- int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
- le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
- unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group);
- struct buffer_head *bh;
+ struct ext4_super_block *es = sbi->s_es;
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ __u16 *bg_flags = flex_gd->bg_flags;
handle_t *handle;
- ext4_fsblk_t block;
- ext4_grpblk_t bit;
- int i;
- int err = 0, err2;
+ ext4_group_t group, count;
+ struct buffer_head *bh = NULL;
+ int reserved_gdb, i, j, err = 0, err2;
+
+ BUG_ON(!flex_gd->count || !group_data ||
+ group_data[0].group != sbi->s_groups_count);
+
+ reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
/* This transaction may be extended/restarted along the way */
handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
-
if (IS_ERR(handle))
return PTR_ERR(handle);
- BUG_ON(input->group != sbi->s_groups_count);
+ group = group_data[0].group;
+ for (i = 0; i < flex_gd->count; i++, group++) {
+ unsigned long gdblocks;
- /* Copy all of the GDT blocks into the backup in this group */
- for (i = 0, bit = 1, block = start + 1;
- i < gdblocks; i++, block++, bit++) {
- struct buffer_head *gdb;
+ gdblocks = ext4_bg_num_gdb(sb, group);
+ start = ext4_group_first_block_no(sb, group);
- ext4_debug("update backup group %#04llx (+%d)\n", block, bit);
- err = extend_or_restart_transaction(handle, 1);
- if (err)
- goto exit_journal;
+ /* Copy all of the GDT blocks into the backup in this group */
+ for (j = 0, block = start + 1; j < gdblocks; j++, block++) {
+ struct buffer_head *gdb;
- gdb = sb_getblk(sb, block);
- if (!gdb) {
- err = -EIO;
- goto exit_journal;
- }
- if ((err = ext4_journal_get_write_access(handle, gdb))) {
+ ext4_debug("update backup group %#04llx\n", block);
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
+ goto out;
+
+ gdb = sb_getblk(sb, block);
+ if (!gdb) {
+ err = -EIO;
+ goto out;
+ }
+
+ err = ext4_journal_get_write_access(handle, gdb);
+ if (err) {
+ brelse(gdb);
+ goto out;
+ }
+ memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
+ gdb->b_size);
+ set_buffer_uptodate(gdb);
+
+ err = ext4_handle_dirty_metadata(handle, NULL, gdb);
+ if (unlikely(err)) {
+ brelse(gdb);
+ goto out;
+ }
brelse(gdb);
- goto exit_journal;
}
- memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
- set_buffer_uptodate(gdb);
- err = ext4_handle_dirty_metadata(handle, NULL, gdb);
- if (unlikely(err)) {
- brelse(gdb);
- goto exit_journal;
+
+ /* Zero out all of the reserved backup group descriptor
+ * table blocks
+ */
+ if (ext4_bg_has_super(sb, group)) {
+ err = sb_issue_zeroout(sb, gdblocks + start + 1,
+ reserved_gdb, GFP_NOFS);
+ if (err)
+ goto out;
}
- brelse(gdb);
- }
- /* Zero out all of the reserved backup group descriptor table blocks */
- ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
- block, sbi->s_itb_per_group);
- err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
- GFP_NOFS);
- if (err)
- goto exit_journal;
+ /* Initialize group tables of the grop @group */
+ if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
+ goto handle_bb;
- err = extend_or_restart_transaction(handle, 2);
- if (err)
- goto exit_journal;
+ /* Zero out all of the inode table blocks */
+ block = group_data[i].inode_table;
+ ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
+ block, sbi->s_itb_per_group);
+ err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group,
+ GFP_NOFS);
+ if (err)
+ goto out;
- bh = bclean(handle, sb, input->block_bitmap);
- if (IS_ERR(bh)) {
- err = PTR_ERR(bh);
- goto exit_journal;
- }
+handle_bb:
+ if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT)
+ goto handle_ib;
- if (ext4_bg_has_super(sb, input->group)) {
- ext4_debug("mark backup group tables %#04llx (+0)\n", start);
- ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1);
- }
+ /* Initialize block bitmap of the @group */
+ block = group_data[i].block_bitmap;
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
+ goto out;
- ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
- input->block_bitmap - start);
- ext4_set_bit(input->block_bitmap - start, bh->b_data);
- ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap,
- input->inode_bitmap - start);
- ext4_set_bit(input->inode_bitmap - start, bh->b_data);
-
- /* Zero out all of the inode table blocks */
- block = input->inode_table;
- ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
- block, sbi->s_itb_per_group);
- err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
- if (err)
- goto exit_bh;
- ext4_set_bits(bh->b_data, input->inode_table - start,
- sbi->s_itb_per_group);
+ bh = bclean(handle, sb, block);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ goto out;
+ }
+ if (ext4_bg_has_super(sb, group)) {
+ ext4_debug("mark backup superblock %#04llx (+0)\n",
+ start);
+ ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb +
+ 1);
+ }
+ ext4_mark_bitmap_end(group_data[i].blocks_count,
+ sb->s_blocksize * 8, bh->b_data);
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out;
+ brelse(bh);
+handle_ib:
+ if (bg_flags[i] & EXT4_BG_INODE_UNINIT)
+ continue;
- ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
- bh->b_data);
- err = ext4_handle_dirty_metadata(handle, NULL, bh);
- if (unlikely(err)) {
- ext4_std_error(sb, err);
- goto exit_bh;
+ /* Initialize inode bitmap of the @group */
+ block = group_data[i].inode_bitmap;
+ err = extend_or_restart_transaction(handle, 1);
+ if (err)
+ goto out;
+ /* Mark unused entries in inode bitmap used */
+ bh = bclean(handle, sb, block);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ goto out;
+ }
+
+ ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
+ sb->s_blocksize * 8, bh->b_data);
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (err)
+ goto out;
+ brelse(bh);
}
- brelse(bh);
- /* Mark unused entries in inode bitmap used */
- ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
- input->inode_bitmap, input->inode_bitmap - start);
- if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
- err = PTR_ERR(bh);
- goto exit_journal;
+ bh = NULL;
+
+ /* Mark group tables in block bitmap */
+ for (j = 0; j < GROUP_TABLE_COUNT; j++) {
+ count = group_table_count[j];
+ start = (&group_data[0].block_bitmap)[j];
+ block = start;
+ for (i = 1; i < flex_gd->count; i++) {
+ block += group_table_count[j];
+ if (block == (&group_data[i].block_bitmap)[j]) {
+ count += group_table_count[j];
+ continue;
+ }
+ err = set_flexbg_block_bitmap(sb, handle,
+ flex_gd, start, count);
+ if (err)
+ goto out;
+ count = group_table_count[j];
+ start = group_data[i].block_bitmap;
+ block = start;
+ }
+
+ if (count) {
+ err = set_flexbg_block_bitmap(sb, handle,
+ flex_gd, start, count);
+ if (err)
+ goto out;
+ }
}
- ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
- bh->b_data);
- err = ext4_handle_dirty_metadata(handle, NULL, bh);
- if (unlikely(err))
- ext4_std_error(sb, err);
-exit_bh:
+out:
brelse(bh);
-
-exit_journal:
- if ((err2 = ext4_journal_stop(handle)) && !err)
+ err2 = ext4_journal_stop(handle);
+ if (err2 && !err)
err = err2;
return err;
@@ -351,10 +636,10 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
* groups in current filesystem that have BACKUPS, or -ve error code.
*/
static int verify_reserved_gdb(struct super_block *sb,
+ ext4_group_t end,
struct buffer_head *primary)
{
const ext4_fsblk_t blk = primary->b_blocknr;
- const ext4_group_t end = EXT4_SB(sb)->s_groups_count;
unsigned three = 1;
unsigned five = 5;
unsigned seven = 7;
@@ -429,7 +714,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
if (!gdb_bh)
return -EIO;
- gdbackups = verify_reserved_gdb(sb, gdb_bh);
+ gdbackups = verify_reserved_gdb(sb, group, gdb_bh);
if (gdbackups < 0) {
err = gdbackups;
goto exit_bh;
@@ -592,7 +877,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
err = -EIO;
goto exit_bh;
}
- if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
+ gdbackups = verify_reserved_gdb(sb, group, primary[res]);
+ if (gdbackups < 0) {
brelse(primary[res]);
err = gdbackups;
goto exit_bh;
@@ -735,6 +1021,348 @@ exit_err:
}
}
+/*
+ * ext4_add_new_descs() adds @count group descriptor of groups
+ * starting at @group
+ *
+ * @handle: journal handle
+ * @sb: super block
+ * @group: the group no. of the first group desc to be added
+ * @resize_inode: the resize inode
+ * @count: number of group descriptors to be added
+ */
+static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
+ ext4_group_t group, struct inode *resize_inode,
+ ext4_group_t count)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ struct buffer_head *gdb_bh;
+ int i, gdb_off, gdb_num, err = 0;
+
+ for (i = 0; i < count; i++, group++) {
+ int reserved_gdb = ext4_bg_has_super(sb, group) ?
+ le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
+
+ gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+ gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+ /*
+ * We will only either add reserved group blocks to a backup group
+ * or remove reserved blocks for the first group in a new group block.
+ * Doing both would be mean more complex code, and sane people don't
+ * use non-sparse filesystems anymore. This is already checked above.
+ */
+ if (gdb_off) {
+ gdb_bh = sbi->s_group_desc[gdb_num];
+ err = ext4_journal_get_write_access(handle, gdb_bh);
+
+ if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
+ err = reserve_backup_gdb(handle, resize_inode, group);
+ } else
+ err = add_new_gdb(handle, resize_inode, group);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+/*
+ * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg
+ */
+static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd)
+{
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ struct ext4_group_desc *gdp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *gdb_bh;
+ ext4_group_t group;
+ __u16 *bg_flags = flex_gd->bg_flags;
+ int i, gdb_off, gdb_num, err = 0;
+
+
+ for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) {
+ group = group_data->group;
+
+ gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+ gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+ /*
+ * get_write_access() has been called on gdb_bh by ext4_add_new_desc().
+ */
+ gdb_bh = sbi->s_group_desc[gdb_num];
+ /* Update group descriptor block for new group */
+ gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data +
+ gdb_off * EXT4_DESC_SIZE(sb));
+
+ memset(gdp, 0, EXT4_DESC_SIZE(sb));
+ ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
+ ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
+ ext4_inode_table_set(sb, gdp, group_data->inode_table);
+ ext4_free_group_clusters_set(sb, gdp,
+ EXT4_B2C(sbi, group_data->free_blocks_count));
+ ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+ gdp->bg_flags = cpu_to_le16(*bg_flags);
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+
+ err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
+ if (unlikely(err)) {
+ ext4_std_error(sb, err);
+ break;
+ }
+
+ /*
+ * We can allocate memory for mb_alloc based on the new group
+ * descriptor
+ */
+ err = ext4_mb_add_groupinfo(sb, group, gdp);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+/*
+ * ext4_update_super() updates the super block so that the newly added
+ * groups can be seen by the filesystem.
+ *
+ * @sb: super block
+ * @flex_gd: new added groups
+ */
+static void ext4_update_super(struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd)
+{
+ ext4_fsblk_t blocks_count = 0;
+ ext4_fsblk_t free_blocks = 0;
+ ext4_fsblk_t reserved_blocks = 0;
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int i;
+
+ BUG_ON(flex_gd->count == 0 || group_data == NULL);
+ /*
+ * Make the new blocks and inodes valid next. We do this before
+ * increasing the group count so that once the group is enabled,
+ * all of its blocks and inodes are already valid.
+ *
+ * We always allocate group-by-group, then block-by-block or
+ * inode-by-inode within a group, so enabling these
+ * blocks/inodes before the group is live won't actually let us
+ * allocate the new space yet.
+ */
+ for (i = 0; i < flex_gd->count; i++) {
+ blocks_count += group_data[i].blocks_count;
+ free_blocks += group_data[i].free_blocks_count;
+ }
+
+ reserved_blocks = ext4_r_blocks_count(es) * 100;
+ do_div(reserved_blocks, ext4_blocks_count(es));
+ reserved_blocks *= blocks_count;
+ do_div(reserved_blocks, 100);
+
+ ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
+ le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+ flex_gd->count);
+
+ /*
+ * We need to protect s_groups_count against other CPUs seeing
+ * inconsistent state in the superblock.
+ *
+ * The precise rules we use are:
+ *
+ * * Writers must perform a smp_wmb() after updating all
+ * dependent data and before modifying the groups count
+ *
+ * * Readers must perform an smp_rmb() after reading the groups
+ * count and before reading any dependent data.
+ *
+ * NB. These rules can be relaxed when checking the group count
+ * while freeing data, as we can only allocate from a block
+ * group after serialising against the group count, and we can
+ * only then free after serialising in turn against that
+ * allocation.
+ */
+ smp_wmb();
+
+ /* Update the global fs size fields */
+ sbi->s_groups_count += flex_gd->count;
+
+ /* Update the reserved block counts only once the new group is
+ * active. */
+ ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
+ reserved_blocks);
+
+ /* Update the free space counts */
+ percpu_counter_add(&sbi->s_freeclusters_counter,
+ EXT4_B2C(sbi, free_blocks));
+ percpu_counter_add(&sbi->s_freeinodes_counter,
+ EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group;
+ flex_group = ext4_flex_group(sbi, group_data[0].group);
+ atomic_add(EXT4_B2C(sbi, free_blocks),
+ &sbi->s_flex_groups[flex_group].free_clusters);
+ atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
+ &sbi->s_flex_groups[flex_group].free_inodes);
+ }
+
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG "EXT4-fs: added group %u:"
+ "%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
+ blocks_count, free_blocks, reserved_blocks);
+}
+
+/* Add a flex group to an fs. Ensure we handle all possible error conditions
+ * _before_ we start modifying the filesystem, because we cannot abort the
+ * transaction and not have it write the data to disk.
+ */
+static int ext4_flex_group_add(struct super_block *sb,
+ struct inode *resize_inode,
+ struct ext4_new_flex_group_data *flex_gd)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ ext4_fsblk_t o_blocks_count;
+ ext4_grpblk_t last;
+ ext4_group_t group;
+ handle_t *handle;
+ unsigned reserved_gdb;
+ int err = 0, err2 = 0, credit;
+
+ BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags);
+
+ reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+ o_blocks_count = ext4_blocks_count(es);
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+ BUG_ON(last);
+
+ err = setup_new_flex_group_blocks(sb, flex_gd);
+ if (err)
+ goto exit;
+ /*
+ * We will always be modifying at least the superblock and GDT
+ * block. If we are adding a group past the last current GDT block,
+ * we will also modify the inode and the dindirect block. If we
+ * are adding a group with superblock/GDT backups we will also
+ * modify each of the reserved GDT dindirect blocks.
+ */
+ credit = flex_gd->count * 4 + reserved_gdb;
+ handle = ext4_journal_start_sb(sb, credit);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto exit;
+ }
+
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err)
+ goto exit_journal;
+
+ group = flex_gd->groups[0].group;
+ BUG_ON(group != EXT4_SB(sb)->s_groups_count);
+ err = ext4_add_new_descs(handle, sb, group,
+ resize_inode, flex_gd->count);
+ if (err)
+ goto exit_journal;
+
+ err = ext4_setup_new_descs(handle, sb, flex_gd);
+ if (err)
+ goto exit_journal;
+
+ ext4_update_super(sb, flex_gd);
+
+ err = ext4_handle_dirty_super(handle, sb);
+
+exit_journal:
+ err2 = ext4_journal_stop(handle);
+ if (!err)
+ err = err2;
+
+ if (!err) {
+ int i;
+ update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
+ sizeof(struct ext4_super_block));
+ for (i = 0; i < flex_gd->count; i++, group++) {
+ struct buffer_head *gdb_bh;
+ int gdb_num;
+ gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb);
+ gdb_bh = sbi->s_group_desc[gdb_num];
+ update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
+ gdb_bh->b_size);
+ }
+ }
+exit:
+ return err;
+}
+
+static int ext4_setup_next_flex_gd(struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd,
+ ext4_fsblk_t n_blocks_count,
+ unsigned long flexbg_size)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ ext4_fsblk_t o_blocks_count;
+ ext4_group_t n_group;
+ ext4_group_t group;
+ ext4_group_t last_group;
+ ext4_grpblk_t last;
+ ext4_grpblk_t blocks_per_group;
+ unsigned long i;
+
+ blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
+
+ o_blocks_count = ext4_blocks_count(es);
+
+ if (o_blocks_count == n_blocks_count)
+ return 0;
+
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+ BUG_ON(last);
+ ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);
+
+ last_group = group | (flexbg_size - 1);
+ if (last_group > n_group)
+ last_group = n_group;
+
+ flex_gd->count = last_group - group + 1;
+
+ for (i = 0; i < flex_gd->count; i++) {
+ int overhead;
+
+ group_data[i].group = group + i;
+ group_data[i].blocks_count = blocks_per_group;
+ overhead = ext4_bg_has_super(sb, group + i) ?
+ (1 + ext4_bg_num_gdb(sb, group + i) +
+ le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+ group_data[i].free_blocks_count = blocks_per_group - overhead;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
+ EXT4_BG_INODE_UNINIT;
+ else
+ flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
+ }
+
+ if (last_group == n_group &&
+ EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ /* We need to initialize block bitmap of last group. */
+ flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
+
+ if ((last_group == n_group) && (last != blocks_per_group - 1)) {
+ group_data[i - 1].blocks_count = last + 1;
+ group_data[i - 1].free_blocks_count -= blocks_per_group-
+ last - 1;
+ }
+
+ return 1;
+}
+
/* Add group descriptor data to an existing or new group descriptor block.
* Ensure we handle all possible error conditions _before_ we start modifying
* the filesystem, because we cannot abort the transaction and not have it
@@ -750,16 +1378,15 @@ exit_err:
*/
int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
{
+ struct ext4_new_flex_group_data flex_gd;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
- struct buffer_head *primary = NULL;
- struct ext4_group_desc *gdp;
struct inode *inode = NULL;
- handle_t *handle;
int gdb_off, gdb_num;
- int err, err2;
+ int err;
+ __u16 bg_flags = 0;
gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
@@ -798,175 +1425,69 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
}
- if ((err = verify_group_input(sb, input)))
- goto exit_put;
+ err = verify_group_input(sb, input);
+ if (err)
+ goto out;
- if ((err = setup_new_group_blocks(sb, input)))
- goto exit_put;
+ flex_gd.count = 1;
+ flex_gd.groups = input;
+ flex_gd.bg_flags = &bg_flags;
+ err = ext4_flex_group_add(sb, inode, &flex_gd);
+out:
+ iput(inode);
+ return err;
+} /* ext4_group_add */
- /*
- * We will always be modifying at least the superblock and a GDT
- * block. If we are adding a group past the last current GDT block,
- * we will also modify the inode and the dindirect block. If we
- * are adding a group with superblock/GDT backups we will also
- * modify each of the reserved GDT dindirect blocks.
+/*
+ * extend a group without checking assuming that checking has been done.
+ */
+static int ext4_group_extend_no_check(struct super_block *sb,
+ ext4_fsblk_t o_blocks_count, ext4_grpblk_t add)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ handle_t *handle;
+ int err = 0, err2;
+
+ /* We will update the superblock, one block bitmap, and
+ * one group descriptor via ext4_group_add_blocks().
*/
- handle = ext4_journal_start_sb(sb,
- ext4_bg_has_super(sb, input->group) ?
- 3 + reserved_gdb : 4);
+ handle = ext4_journal_start_sb(sb, 3);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
- goto exit_put;
+ ext4_warning(sb, "error %d on journal start", err);
+ return err;
}
- if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
- goto exit_journal;
-
- /*
- * We will only either add reserved group blocks to a backup group
- * or remove reserved blocks for the first group in a new group block.
- * Doing both would be mean more complex code, and sane people don't
- * use non-sparse filesystems anymore. This is already checked above.
- */
- if (gdb_off) {
- primary = sbi->s_group_desc[gdb_num];
- if ((err = ext4_journal_get_write_access(handle, primary)))
- goto exit_journal;
-
- if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) {
- err = reserve_backup_gdb(handle, inode, input->group);
- if (err)
- goto exit_journal;
- }
- } else {
- /*
- * Note that we can access new group descriptor block safely
- * only if add_new_gdb() succeeds.
- */
- err = add_new_gdb(handle, inode, input->group);
- if (err)
- goto exit_journal;
- primary = sbi->s_group_desc[gdb_num];
+ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ if (err) {
+ ext4_warning(sb, "error %d on journal write access", err);
+ goto errout;
}
- /*
- * OK, now we've set up the new group. Time to make it active.
- *
- * so we have to be safe wrt. concurrent accesses the group
- * data. So we need to be careful to set all of the relevant
- * group descriptor data etc. *before* we enable the group.
- *
- * The key field here is sbi->s_groups_count: as long as
- * that retains its old value, nobody is going to access the new
- * group.
- *
- * So first we update all the descriptor metadata for the new
- * group; then we update the total disk blocks count; then we
- * update the groups count to enable the group; then finally we
- * update the free space counts so that the system can start
- * using the new disk blocks.
- */
-
- /* Update group descriptor block for new group */
- gdp = (struct ext4_group_desc *)((char *)primary->b_data +
- gdb_off * EXT4_DESC_SIZE(sb));
-
- memset(gdp, 0, EXT4_DESC_SIZE(sb));
- ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
- ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
- ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
- ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count);
- ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
- gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
-
- /*
- * We can allocate memory for mb_alloc based on the new group
- * descriptor
- */
- err = ext4_mb_add_groupinfo(sb, input->group, gdp);
+ ext4_blocks_count_set(es, o_blocks_count + add);
+ ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
+ o_blocks_count + add);
+ /* We add the blocks to the bitmap and set the group need init bit */
+ err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
if (err)
- goto exit_journal;
-
- /*
- * Make the new blocks and inodes valid next. We do this before
- * increasing the group count so that once the group is enabled,
- * all of its blocks and inodes are already valid.
- *
- * We always allocate group-by-group, then block-by-block or
- * inode-by-inode within a group, so enabling these
- * blocks/inodes before the group is live won't actually let us
- * allocate the new space yet.
- */
- ext4_blocks_count_set(es, ext4_blocks_count(es) +
- input->blocks_count);
- le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb));
-
- /*
- * We need to protect s_groups_count against other CPUs seeing
- * inconsistent state in the superblock.
- *
- * The precise rules we use are:
- *
- * * Writers must perform a smp_wmb() after updating all dependent
- * data and before modifying the groups count
- *
- * * Readers must perform an smp_rmb() after reading the groups count
- * and before reading any dependent data.
- *
- * NB. These rules can be relaxed when checking the group count
- * while freeing data, as we can only allocate from a block
- * group after serialising against the group count, and we can
- * only then free after serialising in turn against that
- * allocation.
- */
- smp_wmb();
-
- /* Update the global fs size fields */
- sbi->s_groups_count++;
-
- err = ext4_handle_dirty_metadata(handle, NULL, primary);
- if (unlikely(err)) {
- ext4_std_error(sb, err);
- goto exit_journal;
- }
-
- /* Update the reserved block counts only once the new group is
- * active. */
- ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
- input->reserved_blocks);
-
- /* Update the free space counts */
- percpu_counter_add(&sbi->s_freeclusters_counter,
- EXT4_B2C(sbi, input->free_blocks_count));
- percpu_counter_add(&sbi->s_freeinodes_counter,
- EXT4_INODES_PER_GROUP(sb));
-
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
- sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group;
- flex_group = ext4_flex_group(sbi, input->group);
- atomic_add(EXT4_B2C(sbi, input->free_blocks_count),
- &sbi->s_flex_groups[flex_group].free_clusters);
- atomic_add(EXT4_INODES_PER_GROUP(sb),
- &sbi->s_flex_groups[flex_group].free_inodes);
- }
-
+ goto errout;
ext4_handle_dirty_super(handle, sb);
-
-exit_journal:
- if ((err2 = ext4_journal_stop(handle)) && !err)
+ ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
+ o_blocks_count + add);
+errout:
+ err2 = ext4_journal_stop(handle);
+ if (err2 && !err)
err = err2;
- if (!err && primary) {
- update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
+
+ if (!err) {
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
+ "blocks\n", ext4_blocks_count(es));
+ update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
sizeof(struct ext4_super_block));
- update_backups(sb, primary->b_blocknr, primary->b_data,
- primary->b_size);
}
-exit_put:
- iput(inode);
return err;
-} /* ext4_group_add */
+}
/*
* Extend the filesystem to the new number of blocks specified. This entry
@@ -985,8 +1506,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
ext4_grpblk_t last;
ext4_grpblk_t add;
struct buffer_head *bh;
- handle_t *handle;
- int err, err2;
+ int err;
ext4_group_t group;
o_blocks_count = ext4_blocks_count(es);
@@ -1042,42 +1562,119 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
}
brelse(bh);
- /* We will update the superblock, one block bitmap, and
- * one group descriptor via ext4_free_blocks().
- */
- handle = ext4_journal_start_sb(sb, 3);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- ext4_warning(sb, "error %d on journal start", err);
- goto exit_put;
+ err = ext4_group_extend_no_check(sb, o_blocks_count, add);
+ return err;
+} /* ext4_group_extend */
+
+/*
+ * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count
+ *
+ * @sb: super block of the fs to be resized
+ * @n_blocks_count: the number of blocks resides in the resized fs
+ */
+int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
+{
+ struct ext4_new_flex_group_data *flex_gd = NULL;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ struct buffer_head *bh;
+ struct inode *resize_inode;
+ ext4_fsblk_t o_blocks_count;
+ ext4_group_t o_group;
+ ext4_group_t n_group;
+ ext4_grpblk_t offset;
+ unsigned long n_desc_blocks;
+ unsigned long o_desc_blocks;
+ unsigned long desc_blocks;
+ int err = 0, flexbg_size = 1;
+
+ o_blocks_count = ext4_blocks_count(es);
+
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu "
+ "upto %llu blocks\n", o_blocks_count, n_blocks_count);
+
+ if (n_blocks_count < o_blocks_count) {
+ /* On-line shrinking not supported */
+ ext4_warning(sb, "can't shrink FS - resize aborted");
+ return -EINVAL;
}
- if ((err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh))) {
- ext4_warning(sb, "error %d on journal write access", err);
- ext4_journal_stop(handle);
- goto exit_put;
+ if (n_blocks_count == o_blocks_count)
+ /* Nothing need to do */
+ return 0;
+
+ ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset);
+
+ n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
+ EXT4_DESC_PER_BLOCK(sb);
+ o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+ EXT4_DESC_PER_BLOCK(sb);
+ desc_blocks = n_desc_blocks - o_desc_blocks;
+
+ if (desc_blocks &&
+ (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) ||
+ le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) {
+ ext4_warning(sb, "No reserved GDT blocks, can't resize");
+ return -EPERM;
}
- ext4_blocks_count_set(es, o_blocks_count + add);
- ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
- o_blocks_count + add);
- /* We add the blocks to the bitmap and set the group need init bit */
- err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
- ext4_handle_dirty_super(handle, sb);
- ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
- o_blocks_count + add);
- err2 = ext4_journal_stop(handle);
- if (!err && err2)
- err = err2;
- if (err)
- goto exit_put;
+ resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+ if (IS_ERR(resize_inode)) {
+ ext4_warning(sb, "Error opening resize inode");
+ return PTR_ERR(resize_inode);
+ }
+ /* See if the device is actually as big as what was requested */
+ bh = sb_bread(sb, n_blocks_count - 1);
+ if (!bh) {
+ ext4_warning(sb, "can't read last block, resize aborted");
+ return -ENOSPC;
+ }
+ brelse(bh);
+
+ if (offset != 0) {
+ /* extend the last group */
+ ext4_grpblk_t add;
+ add = EXT4_BLOCKS_PER_GROUP(sb) - offset;
+ err = ext4_group_extend_no_check(sb, o_blocks_count, add);
+ if (err)
+ goto out;
+ }
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ es->s_log_groups_per_flex)
+ flexbg_size = 1 << es->s_log_groups_per_flex;
+
+ o_blocks_count = ext4_blocks_count(es);
+ if (o_blocks_count == n_blocks_count)
+ goto out;
+
+ flex_gd = alloc_flex_gd(flexbg_size);
+ if (flex_gd == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Add flex groups. Note that a regular group is a
+ * flex group with 1 group.
+ */
+ while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
+ flexbg_size)) {
+ ext4_alloc_group_tables(sb, flex_gd, flexbg_size);
+ err = ext4_flex_group_add(sb, resize_inode, flex_gd);
+ if (unlikely(err))
+ break;
+ }
+
+out:
+ if (flex_gd)
+ free_flex_gd(flex_gd);
+
+ iput(resize_inode);
if (test_opt(sb, DEBUG))
- printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
- ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block));
-exit_put:
+ printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu "
+ "upto %llu blocks\n", o_blocks_count, n_blocks_count);
return err;
-} /* ext4_group_extend */
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3e1329e2f826..502c61fd7392 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -930,7 +930,6 @@ static int ext4_drop_inode(struct inode *inode)
static void ext4_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
}
@@ -1033,11 +1032,11 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
* - it's set to a non-default value OR
* - if the per-sb default is different from the global default
*/
-static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int ext4_show_options(struct seq_file *seq, struct dentry *root)
{
int def_errors;
unsigned long def_mount_opts;
- struct super_block *sb = vfs->mnt_sb;
+ struct super_block *sb = root->d_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
@@ -1096,7 +1095,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
}
if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
seq_printf(seq, ",max_batch_time=%u",
- (unsigned) sbi->s_min_batch_time);
+ (unsigned) sbi->s_max_batch_time);
}
/*
@@ -2006,17 +2005,16 @@ static int ext4_fill_flex_info(struct super_block *sb)
struct ext4_group_desc *gdp = NULL;
ext4_group_t flex_group_count;
ext4_group_t flex_group;
- int groups_per_flex = 0;
+ unsigned int groups_per_flex = 0;
size_t size;
int i;
sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
- groups_per_flex = 1 << sbi->s_log_groups_per_flex;
-
- if (groups_per_flex < 2) {
+ if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
sbi->s_log_groups_per_flex = 0;
return 1;
}
+ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
/* We allocate both existing and potentially added groups */
flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
@@ -2883,8 +2881,7 @@ cont_thread:
}
mutex_unlock(&eli->li_list_mtx);
- if (freezing(current))
- refrigerator();
+ try_to_freeze();
cur = jiffies;
if ((time_after_eq(cur, next_wakeup)) ||
@@ -3508,7 +3505,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* of the filesystem.
*/
if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: first data"
+ ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
"block %u is beyond end of filesystem (%llu)",
le32_to_cpu(es->s_first_data_block),
ext4_blocks_count(es));
@@ -3735,10 +3732,12 @@ no_journal:
}
if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
+ iput(root);
goto failed_mount4;
}
sb->s_root = d_alloc_root(root);
if (!sb->s_root) {
+ iput(root);
ext4_msg(sb, KERN_ERR, "get root dentry failed");
ret = -ENOMEM;
goto failed_mount4;
@@ -3775,7 +3774,7 @@ no_journal:
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system "
"zone (%d)", err);
- goto failed_mount4;
+ goto failed_mount4a;
}
ext4_ext_init(sb);
@@ -3832,13 +3831,14 @@ cantfind_ext4:
failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
- ext4_ext_release(sb);
-failed_mount5:
ext4_mb_release(sb);
+failed_mount5:
+ ext4_ext_release(sb);
ext4_release_system_zone(sb);
-failed_mount4:
- iput(root);
+failed_mount4a:
+ dput(sb->s_root);
sb->s_root = NULL;
+failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
failed_mount_wq:
@@ -4782,7 +4782,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
return -EINVAL;
/* Quotafile not on the same filesystem? */
- if (path->mnt->mnt_sb != sb)
+ if (path->dentry->d_sb != sb)
return -EXDEV;
/* Journaling quota? */
if (EXT4_SB(sb)->s_qf_names[type]) {
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 34e4350dd4d9..d2a200624af5 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -3,7 +3,6 @@
* Handler for storing security labels as extended attributes.
*/
-#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/security.h>
@@ -48,8 +47,9 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *fs_info)
+static int
+ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
const struct xattr *xattr;
handle_t *handle = fs_info;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 37e6ebca2cc3..95f1f4ab59a4 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -5,7 +5,6 @@
* Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/module.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/fs.h>
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 98c375352d0e..0edb7611ffbe 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -5,7 +5,6 @@
* Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/
-#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
#include "ext4_jbd2.h"
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 1510a4d51990..66994f316e18 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -141,7 +141,7 @@ static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
static inline int fat_mode_can_hold_ro(struct inode *inode)
{
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
- mode_t mask;
+ umode_t mask;
if (S_ISDIR(inode->i_mode)) {
if (!sbi->options.rodir)
@@ -156,8 +156,8 @@ static inline int fat_mode_can_hold_ro(struct inode *inode)
}
/* Convert attribute bits and a mask to the UNIX mode. */
-static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
- u8 attrs, mode_t mode)
+static inline umode_t fat_make_mode(struct msdos_sb_info *sbi,
+ u8 attrs, umode_t mode)
{
if (attrs & ATTR_RO && !((attrs & ATTR_DIR) && !sbi->options.rodir))
mode &= ~S_IWUGO;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index c118acf16e43..a71fe3715ee8 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -44,7 +44,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
goto out;
mutex_lock(&inode->i_mutex);
- err = mnt_want_write(file->f_path.mnt);
+ err = mnt_want_write_file(file);
if (err)
goto out_unlock_inode;
@@ -108,7 +108,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
fat_save_attrs(inode, attr);
mark_inode_dirty(inode);
out_drop_write:
- mnt_drop_write(file->f_path.mnt);
+ mnt_drop_write_file(file);
out_unlock_inode:
mutex_unlock(&inode->i_mutex);
out:
@@ -314,7 +314,7 @@ EXPORT_SYMBOL_GPL(fat_getattr);
static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
struct inode *inode, umode_t *mode_ptr)
{
- mode_t mask, perm;
+ umode_t mask, perm;
/*
* Note, the basic check is already done by a caller of
@@ -351,7 +351,7 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
{
- mode_t allow_utime = sbi->options.allow_utime;
+ umode_t allow_utime = sbi->options.allow_utime;
if (current_fsuid() != inode->i_uid) {
if (in_group_p(inode->i_gid))
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 808cac7edcfb..3ab841054d53 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -518,7 +518,6 @@ static struct inode *fat_alloc_inode(struct super_block *sb)
static void fat_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
}
@@ -672,7 +671,7 @@ int fat_sync_inode(struct inode *inode)
EXPORT_SYMBOL_GPL(fat_sync_inode);
-static int fat_show_options(struct seq_file *m, struct vfsmount *mnt);
+static int fat_show_options(struct seq_file *m, struct dentry *root);
static const struct super_operations fat_sops = {
.alloc_inode = fat_alloc_inode,
.destroy_inode = fat_destroy_inode,
@@ -811,9 +810,9 @@ static const struct export_operations fat_export_ops = {
.get_parent = fat_get_parent,
};
-static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int fat_show_options(struct seq_file *m, struct dentry *root)
{
- struct msdos_sb_info *sbi = MSDOS_SB(mnt->mnt_sb);
+ struct msdos_sb_info *sbi = MSDOS_SB(root->d_sb);
struct fat_mount_options *opts = &sbi->options;
int isvfat = opts->isvfat;
@@ -898,7 +897,7 @@ enum {
Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
- Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
+ Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err,
};
@@ -928,17 +927,17 @@ static const match_table_t fat_tokens = {
{Opt_err_panic, "errors=panic"},
{Opt_err_ro, "errors=remount-ro"},
{Opt_discard, "discard"},
- {Opt_obsolate, "conv=binary"},
- {Opt_obsolate, "conv=text"},
- {Opt_obsolate, "conv=auto"},
- {Opt_obsolate, "conv=b"},
- {Opt_obsolate, "conv=t"},
- {Opt_obsolate, "conv=a"},
- {Opt_obsolate, "fat=%u"},
- {Opt_obsolate, "blocksize=%u"},
- {Opt_obsolate, "cvf_format=%20s"},
- {Opt_obsolate, "cvf_options=%100s"},
- {Opt_obsolate, "posix"},
+ {Opt_obsolete, "conv=binary"},
+ {Opt_obsolete, "conv=text"},
+ {Opt_obsolete, "conv=auto"},
+ {Opt_obsolete, "conv=b"},
+ {Opt_obsolete, "conv=t"},
+ {Opt_obsolete, "conv=a"},
+ {Opt_obsolete, "fat=%u"},
+ {Opt_obsolete, "blocksize=%u"},
+ {Opt_obsolete, "cvf_format=%20s"},
+ {Opt_obsolete, "cvf_options=%100s"},
+ {Opt_obsolete, "posix"},
{Opt_err, NULL},
};
static const match_table_t msdos_tokens = {
@@ -1170,7 +1169,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
break;
/* obsolete mount options */
- case Opt_obsolate:
+ case Opt_obsolete:
fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, "
"not supported now", p);
break;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 216b419f30e2..c5938c9084b9 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -264,7 +264,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
}
/***** Create a file */
-static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
+static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
struct super_block *sb = dir->i_sb;
@@ -346,7 +346,7 @@ out:
}
/***** Make a directory */
-static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index a87a65663c25..a81eb2367d39 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -512,7 +512,8 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
int charlen;
if (utf8) {
- *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
+ *outlen = utf8s_to_utf16s(name, len, UTF16_HOST_ENDIAN,
+ (wchar_t *) outname, FAT_LFN_LEN + 2);
if (*outlen < 0)
return *outlen;
else if (*outlen > FAT_LFN_LEN)
@@ -781,7 +782,7 @@ error:
return ERR_PTR(err);
}
-static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
+static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
struct super_block *sb = dir->i_sb;
@@ -870,7 +871,7 @@ out:
return err;
}
-static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct inode *inode;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 6b088641f5bf..a48e4a139be1 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -10,6 +10,7 @@
#include <linux/personality.h>
#include <asm/uaccess.h>
#include "internal.h"
+#include "mount.h"
static long do_sys_name_to_handle(struct path *path,
struct file_handle __user *ufh,
@@ -24,8 +25,8 @@ static long do_sys_name_to_handle(struct path *path,
* We need t make sure wether the file system
* support decoding of the file handle
*/
- if (!path->mnt->mnt_sb->s_export_op ||
- !path->mnt->mnt_sb->s_export_op->fh_to_dentry)
+ if (!path->dentry->d_sb->s_export_op ||
+ !path->dentry->d_sb->s_export_op->fh_to_dentry)
return -EOPNOTSUPP;
if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
@@ -66,7 +67,8 @@ static long do_sys_name_to_handle(struct path *path,
} else
retval = 0;
/* copy the mount id */
- if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) ||
+ if (copy_to_user(mnt_id, &real_mount(path->mnt)->mnt_id,
+ sizeof(*mnt_id)) ||
copy_to_user(ufh, handle,
sizeof(struct file_handle) + handle_bytes))
retval = -EFAULT;
diff --git a/fs/file_table.c b/fs/file_table.c
index c322794f7360..20002e39754d 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -474,29 +474,6 @@ void file_sb_list_del(struct file *file)
#endif
-int fs_may_remount_ro(struct super_block *sb)
-{
- struct file *file;
- /* Check that no files are currently opened for writing. */
- lg_global_lock(files_lglock);
- do_file_list_for_each_entry(sb, file) {
- struct inode *inode = file->f_path.dentry->d_inode;
-
- /* File with pending delete? */
- if (inode->i_nlink == 0)
- goto too_bad;
-
- /* Writeable file? */
- if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
- goto too_bad;
- } while_file_list_for_each_entry;
- lg_global_unlock(files_lglock);
- return 1; /* Tis' cool bro. */
-too_bad:
- lg_global_unlock(files_lglock);
- return 0;
-}
-
/**
* mark_files_ro - mark all files read-only
* @sb: superblock in question
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 0845f84f2a5f..96f24286667a 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -74,7 +74,6 @@ int register_filesystem(struct file_system_type * fs)
BUG_ON(strchr(fs->name, '.'));
if (fs->next)
return -EBUSY;
- INIT_LIST_HEAD(&fs->fs_supers);
write_lock(&file_systems_lock);
p = find_filesystem(fs->name, strlen(fs->name));
if (*p)
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 7b2af5abe2fa..cf9ef918a2a9 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -187,10 +187,10 @@ vxfs_stiget(struct super_block *sbp, ino_t ino)
* vxfs_transmod returns a Linux mode_t for a given
* VxFS inode structure.
*/
-static __inline__ mode_t
+static __inline__ umode_t
vxfs_transmod(struct vxfs_inode_info *vip)
{
- mode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK;
+ umode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK;
if (VXFS_ISFIFO(vip))
ret |= S_IFIFO;
@@ -340,7 +340,6 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
static void vxfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(vxfs_inode_cachep, inode->i_private);
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 517f211a3bd4..5b4a9362d5aa 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -20,16 +20,21 @@
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
#include <linux/tracepoint.h>
#include "internal.h"
/*
+ * 4MB minimal write chunk size
+ */
+#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
+
+/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*/
struct wb_writeback_work {
@@ -48,14 +53,6 @@ struct wb_writeback_work {
};
/*
- * Include the creation of the trace points after defining the
- * wb_writeback_work structure so that the definition remains local to this
- * file.
- */
-#define CREATE_TRACE_POINTS
-#include <trace/events/writeback.h>
-
-/*
* We don't actually have pdflush, but this one is exported though /proc...
*/
int nr_pdflush_threads;
@@ -87,6 +84,14 @@ static inline struct inode *wb_inode(struct list_head *head)
return list_entry(head, struct inode, i_wb_list);
}
+/*
+ * Include the creation of the trace points after defining the
+ * wb_writeback_work structure and inline functions so that the definition
+ * remains local to this file.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/writeback.h>
+
/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
{
@@ -743,11 +748,17 @@ static long wb_writeback(struct bdi_writeback *wb,
if (work->for_background && !over_bground_thresh(wb->bdi))
break;
+ /*
+ * Kupdate and background works are special and we want to
+ * include all inodes that need writing. Livelock avoidance is
+ * handled by these works yielding to any other work so we are
+ * safe.
+ */
if (work->for_kupdate) {
oldest_jif = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
- work->older_than_this = &oldest_jif;
- }
+ } else if (work->for_background)
+ oldest_jif = jiffies;
trace_writeback_start(wb->bdi, work);
if (list_empty(&wb->b_io))
@@ -937,7 +948,7 @@ int bdi_writeback_thread(void *data)
trace_writeback_thread_start(bdi);
- while (!kthread_should_stop()) {
+ while (!kthread_freezable_should_stop(NULL)) {
/*
* Remove own delayed wake-up timer, since we are already awake
* and we'll take care of the preriodic write-back.
@@ -967,8 +978,6 @@ int bdi_writeback_thread(void *data)
*/
schedule();
}
-
- try_to_freeze();
}
/* Flush any work that raced with us exiting */
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 2aaf3eaaf13d..5f3368ab0fa9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1378,7 +1378,59 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = -ENOENT;
if (fc->sb)
- err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
+ err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
+ up_read(&fc->killsb);
+ kfree(buf);
+ return err;
+
+err:
+ kfree(buf);
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_delete_out outarg;
+ int err = -ENOMEM;
+ char *buf;
+ struct qstr name;
+
+ buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
+ err = -EINVAL;
+ if (size < sizeof(outarg))
+ goto err;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto err;
+
+ err = -ENAMETOOLONG;
+ if (outarg.namelen > FUSE_NAME_MAX)
+ goto err;
+
+ err = -EINVAL;
+ if (size != sizeof(outarg) + outarg.namelen + 1)
+ goto err;
+
+ name.name = buf;
+ name.len = outarg.namelen;
+ err = fuse_copy_one(cs, buf, outarg.namelen + 1);
+ if (err)
+ goto err;
+ fuse_copy_finish(cs);
+ buf[outarg.namelen] = 0;
+ name.hash = full_name_hash(name.name, name.len);
+
+ down_read(&fc->killsb);
+ err = -ENOENT;
+ if (fc->sb)
+ err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
+ outarg.child, &name);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -1597,6 +1649,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
case FUSE_NOTIFY_RETRIEVE:
return fuse_notify_retrieve(fc, size, cs);
+ case FUSE_NOTIFY_DELETE:
+ return fuse_notify_delete(fc, size, cs);
+
default:
fuse_copy_finish(cs);
return -EINVAL;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 9f63e493a9b6..206632887bb4 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -369,8 +369,8 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
* If the filesystem doesn't support this, then fall back to separate
* 'mknod' + 'open' requests.
*/
-static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
- struct nameidata *nd)
+static int fuse_create_open(struct inode *dir, struct dentry *entry,
+ umode_t mode, struct nameidata *nd)
{
int err;
struct inode *inode;
@@ -480,7 +480,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
*/
static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
struct inode *dir, struct dentry *entry,
- int mode)
+ umode_t mode)
{
struct fuse_entry_out outarg;
struct inode *inode;
@@ -547,7 +547,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
return err;
}
-static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
+static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
dev_t rdev)
{
struct fuse_mknod_in inarg;
@@ -573,7 +573,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
return create_new_entry(fc, req, dir, entry, mode);
}
-static int fuse_create(struct inode *dir, struct dentry *entry, int mode,
+static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
struct nameidata *nd)
{
if (nd) {
@@ -585,7 +585,7 @@ static int fuse_create(struct inode *dir, struct dentry *entry, int mode,
return fuse_mknod(dir, entry, mode, 0);
}
-static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
+static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
{
struct fuse_mkdir_in inarg;
struct fuse_conn *fc = get_fuse_conn(dir);
@@ -868,7 +868,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
}
int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
- struct qstr *name)
+ u64 child_nodeid, struct qstr *name)
{
int err = -ENOTDIR;
struct inode *parent;
@@ -895,8 +895,36 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
fuse_invalidate_attr(parent);
fuse_invalidate_entry(entry);
+
+ if (child_nodeid != 0 && entry->d_inode) {
+ mutex_lock(&entry->d_inode->i_mutex);
+ if (get_node_id(entry->d_inode) != child_nodeid) {
+ err = -ENOENT;
+ goto badentry;
+ }
+ if (d_mountpoint(entry)) {
+ err = -EBUSY;
+ goto badentry;
+ }
+ if (S_ISDIR(entry->d_inode->i_mode)) {
+ shrink_dcache_parent(entry);
+ if (!simple_empty(entry)) {
+ err = -ENOTEMPTY;
+ goto badentry;
+ }
+ entry->d_inode->i_flags |= S_DEAD;
+ }
+ dont_mount(entry);
+ clear_nlink(entry->d_inode);
+ err = 0;
+ badentry:
+ mutex_unlock(&entry->d_inode->i_mutex);
+ if (!err)
+ d_delete(entry);
+ } else {
+ err = 0;
+ }
dput(entry);
- err = 0;
unlock:
mutex_unlock(&parent->i_mutex);
@@ -1182,6 +1210,30 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
return fuse_fsync_common(file, start, end, datasync, 1);
}
+static long fuse_dir_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
+
+ /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */
+ if (fc->minor < 18)
+ return -ENOTTY;
+
+ return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR);
+}
+
+static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
+
+ if (fc->minor < 18)
+ return -ENOTTY;
+
+ return fuse_ioctl_common(file, cmd, arg,
+ FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
+}
+
static bool update_mtime(unsigned ivalid)
{
/* Always update if mtime is explicitly set */
@@ -1596,6 +1648,8 @@ static const struct file_operations fuse_dir_operations = {
.open = fuse_dir_open,
.release = fuse_dir_release,
.fsync = fuse_dir_fsync,
+ .unlocked_ioctl = fuse_dir_ioctl,
+ .compat_ioctl = fuse_dir_compat_ioctl,
};
static const struct inode_operations fuse_common_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0c84100acd44..4a199fd93fbd 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1555,48 +1555,16 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
loff_t retval;
struct inode *inode = file->f_path.dentry->d_inode;
- mutex_lock(&inode->i_mutex);
- if (origin != SEEK_CUR && origin != SEEK_SET) {
- retval = fuse_update_attributes(inode, NULL, file, NULL);
- if (retval)
- goto exit;
- }
+ /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
+ if (origin == SEEK_CUR || origin == SEEK_SET)
+ return generic_file_llseek(file, offset, origin);
- switch (origin) {
- case SEEK_END:
- offset += i_size_read(inode);
- break;
- case SEEK_CUR:
- if (offset == 0) {
- retval = file->f_pos;
- goto exit;
- }
- offset += file->f_pos;
- break;
- case SEEK_DATA:
- if (offset >= i_size_read(inode)) {
- retval = -ENXIO;
- goto exit;
- }
- break;
- case SEEK_HOLE:
- if (offset >= i_size_read(inode)) {
- retval = -ENXIO;
- goto exit;
- }
- offset = i_size_read(inode);
- break;
- }
- retval = -EINVAL;
- if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
- retval = offset;
- }
-exit:
+ mutex_lock(&inode->i_mutex);
+ retval = fuse_update_attributes(inode, NULL, file, NULL);
+ if (!retval)
+ retval = generic_file_llseek(file, offset, origin);
mutex_unlock(&inode->i_mutex);
+
return retval;
}
@@ -1808,7 +1776,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
err = -ENOMEM;
- pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
+ pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
if (!pages || !iov_page)
goto out;
@@ -1958,8 +1926,8 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
}
EXPORT_SYMBOL_GPL(fuse_do_ioctl);
-static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
- unsigned long arg, unsigned int flags)
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+ unsigned long arg, unsigned int flags)
{
struct inode *inode = file->f_dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1976,13 +1944,13 @@ static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
static long fuse_file_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
- return fuse_file_ioctl_common(file, cmd, arg, 0);
+ return fuse_ioctl_common(file, cmd, arg, 0);
}
static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
- return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
+ return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
}
/*
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index cf6db0a93219..572cefc78012 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -80,7 +80,7 @@ struct fuse_inode {
/** The sticky bit in inode->i_mode may have been removed, so
preserve the original mode */
- mode_t orig_i_mode;
+ umode_t orig_i_mode;
/** Version of last attribute change */
u64 attr_version;
@@ -755,9 +755,15 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
/**
* File-system tells the kernel to invalidate parent attributes and
* the dentry matching parent/name.
+ *
+ * If the child_nodeid is non-zero and:
+ * - matches the inode number for the dentry matching parent/name,
+ * - is not a mount point
+ * - is a file or oan empty directory
+ * then the dentry is unhashed (d_delete()).
*/
int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
- struct qstr *name);
+ u64 child_nodeid, struct qstr *name);
int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
bool isdir);
@@ -765,6 +771,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
size_t count, loff_t *ppos, int write);
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
unsigned int flags);
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+ unsigned long arg, unsigned int flags);
unsigned fuse_file_poll(struct file *file, poll_table *wait);
int fuse_dev_release(struct inode *inode, struct file *file);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index aa83109b9431..64cf8d07393e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -107,7 +107,6 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
static void fuse_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(fuse_inode_cachep, inode);
}
@@ -498,9 +497,10 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
return 1;
}
-static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int fuse_show_options(struct seq_file *m, struct dentry *root)
{
- struct fuse_conn *fc = get_fuse_conn_super(mnt->mnt_sb);
+ struct super_block *sb = root->d_sb;
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
seq_printf(m, ",user_id=%u", fc->user_id);
seq_printf(m, ",group_id=%u", fc->group_id);
@@ -510,9 +510,8 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
seq_puts(m, ",allow_other");
if (fc->max_read != ~0)
seq_printf(m, ",max_read=%u", fc->max_read);
- if (mnt->mnt_sb->s_bdev &&
- mnt->mnt_sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
- seq_printf(m, ",blksize=%lu", mnt->mnt_sb->s_blocksize);
+ if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
+ seq_printf(m, ",blksize=%lu", sb->s_blocksize);
return 0;
}
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 65978d7885c8..230eb0f005b6 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -38,8 +38,9 @@ static const char *gfs2_acl_name(int type)
return NULL;
}
-static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
+struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
{
+ struct gfs2_inode *ip = GFS2_I(inode);
struct posix_acl *acl;
const char *name;
char *data;
@@ -67,11 +68,6 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
return acl;
}
-struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
-{
- return gfs2_acl_get(GFS2_I(inode), type);
-}
-
static int gfs2_set_mode(struct inode *inode, umode_t mode)
{
int error = 0;
@@ -125,7 +121,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
if (S_ISLNK(inode->i_mode))
return 0;
- acl = gfs2_acl_get(dip, ACL_TYPE_DEFAULT);
+ acl = gfs2_get_acl(&dip->i_inode, ACL_TYPE_DEFAULT);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (!acl) {
@@ -166,7 +162,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
unsigned int len;
int error;
- acl = gfs2_acl_get(ip, ACL_TYPE_ACCESS);
+ acl = gfs2_get_acl(&ip->i_inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (!acl)
@@ -216,7 +212,7 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
if (type < 0)
return type;
- acl = gfs2_acl_get(GFS2_I(inode), type);
+ acl = gfs2_get_acl(inode, type);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 4858e1fed8b1..501e5cba09b3 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -615,7 +615,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
int alloc_required;
int error = 0;
- struct gfs2_alloc *al = NULL;
+ struct gfs2_qadata *qa = NULL;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
struct page *page;
@@ -639,8 +639,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
if (alloc_required) {
- al = gfs2_alloc_get(ip);
- if (!al) {
+ qa = gfs2_qadata_get(ip);
+ if (!qa) {
error = -ENOMEM;
goto out_unlock;
}
@@ -649,8 +649,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
if (error)
goto out_alloc_put;
- al->al_requested = data_blocks + ind_blocks;
- error = gfs2_inplace_reserve(ip);
+ error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks);
if (error)
goto out_qunlock;
}
@@ -711,7 +710,7 @@ out_trans_fail:
out_qunlock:
gfs2_quota_unlock(ip);
out_alloc_put:
- gfs2_alloc_put(ip);
+ gfs2_qadata_put(ip);
}
out_unlock:
if (&ip->i_inode == sdp->sd_rindex) {
@@ -848,7 +847,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
struct buffer_head *dibh;
- struct gfs2_alloc *al = ip->i_alloc;
+ struct gfs2_qadata *qa = ip->i_qadata;
unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
unsigned int to = from + len;
int ret;
@@ -880,10 +879,11 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
brelse(dibh);
failed:
gfs2_trans_end(sdp);
- if (al) {
+ if (ip->i_res)
gfs2_inplace_release(ip);
+ if (qa) {
gfs2_quota_unlock(ip);
- gfs2_alloc_put(ip);
+ gfs2_qadata_put(ip);
}
if (inode == sdp->sd_rindex) {
gfs2_glock_dq(&m_ip->i_gh);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 41d494d79709..14a704015970 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -133,7 +133,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
and write it out to disk */
unsigned int n = 1;
- error = gfs2_alloc_block(ip, &block, &n);
+ error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
if (error)
goto out_brelse;
if (isdir) {
@@ -503,7 +503,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
do {
int error;
n = blks - alloced;
- error = gfs2_alloc_block(ip, &bn, &n);
+ error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
if (error)
return error;
alloced += n;
@@ -743,9 +743,6 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
else if (ip->i_depth)
revokes = sdp->sd_inptrs;
- if (error)
- return error;
-
memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
bstart = 0;
blen = 0;
@@ -1044,7 +1041,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
find_metapath(sdp, lblock, &mp, ip->i_height);
- if (!gfs2_alloc_get(ip))
+ if (!gfs2_qadata_get(ip))
return -ENOMEM;
error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
@@ -1064,7 +1061,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
gfs2_quota_unhold(ip);
out:
- gfs2_alloc_put(ip);
+ gfs2_qadata_put(ip);
return error;
}
@@ -1166,21 +1163,20 @@ static int do_grow(struct inode *inode, u64 size)
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct buffer_head *dibh;
- struct gfs2_alloc *al = NULL;
+ struct gfs2_qadata *qa = NULL;
int error;
if (gfs2_is_stuffed(ip) &&
(size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
- al = gfs2_alloc_get(ip);
- if (al == NULL)
+ qa = gfs2_qadata_get(ip);
+ if (qa == NULL)
return -ENOMEM;
error = gfs2_quota_lock_check(ip);
if (error)
goto do_grow_alloc_put;
- al->al_requested = 1;
- error = gfs2_inplace_reserve(ip);
+ error = gfs2_inplace_reserve(ip, 1);
if (error)
goto do_grow_qunlock;
}
@@ -1189,7 +1185,7 @@ static int do_grow(struct inode *inode, u64 size)
if (error)
goto do_grow_release;
- if (al) {
+ if (qa) {
error = gfs2_unstuff_dinode(ip, NULL);
if (error)
goto do_end_trans;
@@ -1208,12 +1204,12 @@ static int do_grow(struct inode *inode, u64 size)
do_end_trans:
gfs2_trans_end(sdp);
do_grow_release:
- if (al) {
+ if (qa) {
gfs2_inplace_release(ip);
do_grow_qunlock:
gfs2_quota_unlock(ip);
do_grow_alloc_put:
- gfs2_alloc_put(ip);
+ gfs2_qadata_put(ip);
}
return error;
}
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 8ccad2467cb6..c35573abd371 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -76,6 +76,8 @@
#define IS_LEAF 1 /* Hashed (leaf) directory */
#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
+#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
+
#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
@@ -821,7 +823,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
struct gfs2_dirent *dent;
struct qstr name = { .name = "", .len = 0, .hash = 0 };
- error = gfs2_alloc_block(ip, &bn, &n);
+ error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
if (error)
return NULL;
bh = gfs2_meta_new(ip->i_gl, bn);
@@ -1376,6 +1378,52 @@ out:
return error;
}
+/**
+ * gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
+ *
+ * Note: we can't calculate each index like dir_e_read can because we don't
+ * have the leaf, and therefore we don't have the depth, and therefore we
+ * don't have the length. So we have to just read enough ahead to make up
+ * for the loss of information.
+ */
+static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
+ struct file_ra_state *f_ra)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ struct buffer_head *bh;
+ u64 blocknr = 0, last;
+ unsigned count;
+
+ /* First check if we've already read-ahead for the whole range. */
+ if (index + MAX_RA_BLOCKS < f_ra->start)
+ return;
+
+ f_ra->start = max((pgoff_t)index, f_ra->start);
+ for (count = 0; count < MAX_RA_BLOCKS; count++) {
+ if (f_ra->start >= hsize) /* if exceeded the hash table */
+ break;
+
+ last = blocknr;
+ blocknr = be64_to_cpu(ip->i_hash_cache[f_ra->start]);
+ f_ra->start++;
+ if (blocknr == last)
+ continue;
+
+ bh = gfs2_getbuf(gl, blocknr, 1);
+ if (trylock_buffer(bh)) {
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ continue;
+ }
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READA | REQ_META, bh);
+ continue;
+ }
+ brelse(bh);
+ }
+}
/**
* dir_e_read - Reads the entries from a directory into a filldir buffer
@@ -1388,7 +1436,7 @@ out:
*/
static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir)
+ filldir_t filldir, struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
u32 hsize, len = 0;
@@ -1402,10 +1450,14 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
hash = gfs2_dir_offset2hash(*offset);
index = hash >> (32 - dip->i_depth);
+ if (dip->i_hash_cache == NULL)
+ f_ra->start = 0;
lp = gfs2_dir_get_hash_table(dip);
if (IS_ERR(lp))
return PTR_ERR(lp);
+ gfs2_dir_readahead(inode, hsize, index, f_ra);
+
while (index < hsize) {
error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
&copied, &depth,
@@ -1423,7 +1475,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
}
int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir)
+ filldir_t filldir, struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1437,7 +1489,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
return 0;
if (dip->i_diskflags & GFS2_DIF_EXHASH)
- return dir_e_read(inode, offset, opaque, filldir);
+ return dir_e_read(inode, offset, opaque, filldir, f_ra);
if (!gfs2_is_stuffed(dip)) {
gfs2_consist_inode(dip);
@@ -1798,7 +1850,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
if (!ht)
return -ENOMEM;
- if (!gfs2_alloc_get(dip)) {
+ if (!gfs2_qadata_get(dip)) {
error = -ENOMEM;
goto out;
}
@@ -1887,7 +1939,7 @@ out_rlist:
gfs2_rlist_free(&rlist);
gfs2_quota_unhold(dip);
out_put:
- gfs2_alloc_put(dip);
+ gfs2_qadata_put(dip);
out:
kfree(ht);
return error;
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index ff5772fbf024..98c960beab35 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -25,7 +25,7 @@ extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
const struct gfs2_inode *ip);
extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir);
+ filldir_t filldir, struct file_ra_state *f_ra);
extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
const struct gfs2_inode *nip, unsigned int new_type);
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index fe9945f2ff72..70ba891654f8 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -99,6 +99,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
struct gfs2_holder gh;
u64 offset = 0;
int error;
+ struct file_ra_state f_ra = { .start = 0 };
if (!dir)
return -EINVAL;
@@ -118,7 +119,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
if (error)
return error;
- error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
+ error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra);
gfs2_glock_dq_uninit(&gh);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index ce36a56dfeac..c5fb3597f696 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -105,7 +105,7 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
return error;
}
- error = gfs2_dir_read(dir, &offset, dirent, filldir);
+ error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra);
gfs2_glock_dq_uninit(&d_gh);
@@ -223,7 +223,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
int error;
u32 new_flags, flags;
- error = mnt_want_write(filp->f_path.mnt);
+ error = mnt_want_write_file(filp);
if (error)
return error;
@@ -285,7 +285,7 @@ out_trans_end:
out:
gfs2_glock_dq_uninit(&gh);
out_drop_write:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return error;
}
@@ -365,7 +365,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
u64 pos = page->index << PAGE_CACHE_SHIFT;
unsigned int data_blocks, ind_blocks, rblocks;
struct gfs2_holder gh;
- struct gfs2_alloc *al;
+ struct gfs2_qadata *qa;
loff_t size;
int ret;
@@ -393,16 +393,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
}
ret = -ENOMEM;
- al = gfs2_alloc_get(ip);
- if (al == NULL)
+ qa = gfs2_qadata_get(ip);
+ if (qa == NULL)
goto out_unlock;
ret = gfs2_quota_lock_check(ip);
if (ret)
goto out_alloc_put;
gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
- al->al_requested = data_blocks + ind_blocks;
- ret = gfs2_inplace_reserve(ip);
+ ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks);
if (ret)
goto out_quota_unlock;
@@ -448,7 +447,7 @@ out_trans_fail:
out_quota_unlock:
gfs2_quota_unlock(ip);
out_alloc_put:
- gfs2_alloc_put(ip);
+ gfs2_qadata_put(ip);
out_unlock:
gfs2_glock_dq(&gh);
out:
@@ -609,7 +608,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
struct inode *inode = mapping->host;
int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
struct gfs2_inode *ip = GFS2_I(inode);
- int ret, ret1 = 0;
+ int ret = 0, ret1 = 0;
if (mapping->nrpages) {
ret1 = filemap_fdatawrite_range(mapping, start, end);
@@ -750,8 +749,10 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
struct gfs2_inode *ip = GFS2_I(inode);
unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
loff_t bytes, max_bytes;
- struct gfs2_alloc *al;
+ struct gfs2_qadata *qa;
int error;
+ const loff_t pos = offset;
+ const loff_t count = len;
loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
loff_t max_chunk_size = UINT_MAX & bsize_mask;
@@ -782,8 +783,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
while (len > 0) {
if (len < bytes)
bytes = len;
- al = gfs2_alloc_get(ip);
- if (!al) {
+ qa = gfs2_qadata_get(ip);
+ if (!qa) {
error = -ENOMEM;
goto out_unlock;
}
@@ -795,8 +796,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
retry:
gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
- al->al_requested = data_blocks + ind_blocks;
- error = gfs2_inplace_reserve(ip);
+ error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks);
if (error) {
if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
bytes >>= 1;
@@ -810,7 +810,6 @@ retry:
max_bytes = bytes;
calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len,
&max_bytes, &data_blocks, &ind_blocks);
- al->al_requested = data_blocks + ind_blocks;
rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
RES_RG_HDR + gfs2_rg_blocks(ip);
@@ -832,8 +831,11 @@ retry:
offset += max_bytes;
gfs2_inplace_release(ip);
gfs2_quota_unlock(ip);
- gfs2_alloc_put(ip);
+ gfs2_qadata_put(ip);
}
+
+ if (error == 0)
+ error = generic_write_sync(file, pos, count);
goto out_unlock;
out_trans_fail:
@@ -841,7 +843,7 @@ out_trans_fail:
out_qunlock:
gfs2_quota_unlock(ip);
out_alloc_put:
- gfs2_alloc_put(ip);
+ gfs2_qadata_put(ip);
out_unlock:
gfs2_glock_dq(&ip->i_gh);
out_uninit:
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 88e8a23d0026..351a3e797789 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -167,14 +167,19 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
spin_unlock(&lru_lock);
}
-static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
+static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
{
- spin_lock(&lru_lock);
if (!list_empty(&gl->gl_lru)) {
list_del_init(&gl->gl_lru);
atomic_dec(&lru_count);
clear_bit(GLF_LRU, &gl->gl_flags);
}
+}
+
+static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
+{
+ spin_lock(&lru_lock);
+ __gfs2_glock_remove_from_lru(gl);
spin_unlock(&lru_lock);
}
@@ -217,11 +222,12 @@ void gfs2_glock_put(struct gfs2_glock *gl)
struct gfs2_sbd *sdp = gl->gl_sbd;
struct address_space *mapping = gfs2_glock2aspace(gl);
- if (atomic_dec_and_test(&gl->gl_ref)) {
+ if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) {
+ __gfs2_glock_remove_from_lru(gl);
+ spin_unlock(&lru_lock);
spin_lock_bucket(gl->gl_hash);
hlist_bl_del_rcu(&gl->gl_list);
spin_unlock_bucket(gl->gl_hash);
- gfs2_glock_remove_from_lru(gl);
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
trace_gfs2_glock_put(gl);
@@ -1353,7 +1359,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
spin_lock(&gl->gl_spin);
gl->gl_reply = ret;
- if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
+ if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
if (gfs2_should_freeze(gl)) {
set_bit(GLF_FROZEN, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2553b858a72e..307ac31df781 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -121,8 +121,11 @@ enum {
struct lm_lockops {
const char *lm_proto_name;
- int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
- void (*lm_unmount) (struct gfs2_sbd *sdp);
+ int (*lm_mount) (struct gfs2_sbd *sdp, const char *table);
+ void (*lm_first_done) (struct gfs2_sbd *sdp);
+ void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid,
+ unsigned int result);
+ void (*lm_unmount) (struct gfs2_sbd *sdp);
void (*lm_withdraw) (struct gfs2_sbd *sdp);
void (*lm_put_lock) (struct gfs2_glock *gl);
int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 7389dfdcc9ef..97742a7ea9cc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -139,8 +139,45 @@ struct gfs2_bufdata {
#define GDLM_STRNAME_BYTES 25
#define GDLM_LVB_SIZE 32
+/*
+ * ls_recover_flags:
+ *
+ * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been
+ * held by failed nodes whose journals need recovery. Those locks should
+ * only be used for journal recovery until the journal recovery is done.
+ * This is set by the dlm recover_prep callback and cleared by the
+ * gfs2_control thread when journal recovery is complete. To avoid
+ * races between recover_prep setting and gfs2_control clearing, recover_spin
+ * is held while changing this bit and reading/writing recover_block
+ * and recover_start.
+ *
+ * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used.
+ *
+ * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing
+ * recovery of all journals before allowing other nodes to mount the fs.
+ * This is cleared when FIRST_MOUNT_DONE is set.
+ *
+ * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished
+ * recovery of all journals, and now allows other nodes to mount the fs.
+ *
+ * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared
+ * BLOCK_LOCKS for the first time. The gfs2_control thread should now
+ * control clearing BLOCK_LOCKS for further recoveries.
+ *
+ * DFL_UNMOUNT: gdlm_unmount sets to keep sdp off gfs2_control_wq.
+ *
+ * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep()
+ * and recover_done(), i.e. set while recover_block == recover_start.
+ */
+
enum {
DFL_BLOCK_LOCKS = 0,
+ DFL_NO_DLM_OPS = 1,
+ DFL_FIRST_MOUNT = 2,
+ DFL_FIRST_MOUNT_DONE = 3,
+ DFL_MOUNT_DONE = 4,
+ DFL_UNMOUNT = 5,
+ DFL_DLM_RECOVERY = 6,
};
struct lm_lockname {
@@ -244,17 +281,16 @@ struct gfs2_glock {
#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
-struct gfs2_alloc {
+struct gfs2_qadata { /* quota allocation data */
/* Quota stuff */
- struct gfs2_quota_data *al_qd[2*MAXQUOTAS];
- struct gfs2_holder al_qd_ghs[2*MAXQUOTAS];
- unsigned int al_qd_num;
-
- u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */
- u32 al_alloced; /* Filled in by gfs2_alloc_*() */
+ struct gfs2_quota_data *qa_qd[2*MAXQUOTAS];
+ struct gfs2_holder qa_qd_ghs[2*MAXQUOTAS];
+ unsigned int qa_qd_num;
+};
- /* Filled in by gfs2_inplace_reserve() */
- struct gfs2_holder al_rgd_gh;
+struct gfs2_blkreserv {
+ u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
+ struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */
};
enum {
@@ -275,7 +311,8 @@ struct gfs2_inode {
struct gfs2_glock *i_gl; /* Move into i_gh? */
struct gfs2_holder i_iopen_gh;
struct gfs2_holder i_gh; /* for prepare/commit_write only */
- struct gfs2_alloc *i_alloc;
+ struct gfs2_qadata *i_qadata; /* quota allocation data */
+ struct gfs2_blkreserv *i_res; /* resource group block reservation */
struct gfs2_rgrpd *i_rgd;
u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
@@ -392,6 +429,7 @@ struct gfs2_jdesc {
#define JDF_RECOVERY 1
unsigned int jd_jid;
unsigned int jd_blocks;
+ int jd_recover_error;
};
struct gfs2_statfs_change_host {
@@ -461,6 +499,7 @@ enum {
SDF_NORECOVERY = 4,
SDF_DEMOTE = 5,
SDF_NOJOURNALID = 6,
+ SDF_RORECOVERY = 7, /* read only recovery */
};
#define GFS2_FSNAME_LEN 256
@@ -499,14 +538,26 @@ struct gfs2_sb_host {
struct lm_lockstruct {
int ls_jid;
unsigned int ls_first;
- unsigned int ls_first_done;
unsigned int ls_nodir;
const struct lm_lockops *ls_ops;
- unsigned long ls_flags;
dlm_lockspace_t *ls_dlm;
- int ls_recover_jid_done;
- int ls_recover_jid_status;
+ int ls_recover_jid_done; /* These two are deprecated, */
+ int ls_recover_jid_status; /* used previously by gfs_controld */
+
+ struct dlm_lksb ls_mounted_lksb; /* mounted_lock */
+ struct dlm_lksb ls_control_lksb; /* control_lock */
+ char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */
+ struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */
+
+ spinlock_t ls_recover_spin; /* protects following fields */
+ unsigned long ls_recover_flags; /* DFL_ */
+ uint32_t ls_recover_mount; /* gen in first recover_done cb */
+ uint32_t ls_recover_start; /* gen in last recover_done cb */
+ uint32_t ls_recover_block; /* copy recover_start in last recover_prep */
+ uint32_t ls_recover_size; /* size of recover_submit, recover_result */
+ uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */
+ uint32_t *ls_recover_result; /* result of last jid recovery */
};
struct gfs2_sbd {
@@ -544,6 +595,7 @@ struct gfs2_sbd {
wait_queue_head_t sd_glock_wait;
atomic_t sd_glock_disposal;
struct completion sd_locking_init;
+ struct delayed_work sd_control_work;
/* Inode Stuff */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index cfd4959b218c..56987460cdae 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -333,7 +333,7 @@ out:
*/
static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
- unsigned int mode)
+ umode_t mode)
{
int error;
@@ -364,7 +364,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
return 0;
}
-static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
+static void munge_mode_uid_gid(struct gfs2_inode *dip, umode_t *mode,
unsigned int *uid, unsigned int *gid)
{
if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
@@ -389,12 +389,9 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
int error;
+ int dblocks = 1;
- if (gfs2_alloc_get(dip) == NULL)
- return -ENOMEM;
-
- dip->i_alloc->al_requested = RES_DINODE;
- error = gfs2_inplace_reserve(dip);
+ error = gfs2_inplace_reserve(dip, RES_DINODE);
if (error)
goto out;
@@ -402,14 +399,13 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
if (error)
goto out_ipreserv;
- error = gfs2_alloc_di(dip, no_addr, generation);
+ error = gfs2_alloc_blocks(dip, no_addr, &dblocks, 1, generation);
gfs2_trans_end(sdp);
out_ipreserv:
gfs2_inplace_release(dip);
out:
- gfs2_alloc_put(dip);
return error;
}
@@ -447,7 +443,7 @@ static void gfs2_init_dir(struct buffer_head *dibh,
*/
static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
- const struct gfs2_inum_host *inum, unsigned int mode,
+ const struct gfs2_inum_host *inum, umode_t mode,
unsigned int uid, unsigned int gid,
const u64 *generation, dev_t dev, const char *symname,
unsigned size, struct buffer_head **bhp)
@@ -516,7 +512,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
}
static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
- unsigned int mode, const struct gfs2_inum_host *inum,
+ umode_t mode, const struct gfs2_inum_host *inum,
const u64 *generation, dev_t dev, const char *symname,
unsigned int size, struct buffer_head **bhp)
{
@@ -525,7 +521,7 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
int error;
munge_mode_uid_gid(dip, &mode, &uid, &gid);
- if (!gfs2_alloc_get(dip))
+ if (!gfs2_qadata_get(dip))
return -ENOMEM;
error = gfs2_quota_lock(dip, uid, gid);
@@ -547,7 +543,7 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
out_quota:
gfs2_quota_unlock(dip);
out:
- gfs2_alloc_put(dip);
+ gfs2_qadata_put(dip);
return error;
}
@@ -555,13 +551,13 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
- struct gfs2_alloc *al;
+ struct gfs2_qadata *qa;
int alloc_required;
struct buffer_head *dibh;
int error;
- al = gfs2_alloc_get(dip);
- if (!al)
+ qa = gfs2_qadata_get(dip);
+ if (!qa)
return -ENOMEM;
error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
@@ -576,9 +572,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
if (error)
goto fail_quota_locks;
- al->al_requested = sdp->sd_max_dirres;
-
- error = gfs2_inplace_reserve(dip);
+ error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres);
if (error)
goto fail_quota_locks;
@@ -601,9 +595,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (error)
goto fail_end_trans;
- inc_nlink(&ip->i_inode);
- if (S_ISDIR(ip->i_inode.i_mode))
- inc_nlink(&ip->i_inode);
+ set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1);
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -619,11 +611,11 @@ fail_quota_locks:
gfs2_quota_unlock(dip);
fail:
- gfs2_alloc_put(dip);
+ gfs2_qadata_put(dip);
return error;
}
-int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
void *fs_info)
{
const struct xattr *xattr;
@@ -659,7 +651,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
*/
static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
- unsigned int mode, dev_t dev, const char *symname,
+ umode_t mode, dev_t dev, const char *symname,
unsigned int size, int excl)
{
const struct qstr *name = &dentry->d_name;
@@ -728,9 +720,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
brelse(bh);
gfs2_trans_end(sdp);
- gfs2_inplace_release(dip);
+ /* Check if we reserved space in the rgrp. Function link_dinode may
+ not, depending on whether alloc is required. */
+ if (dip->i_res)
+ gfs2_inplace_release(dip);
gfs2_quota_unlock(dip);
- gfs2_alloc_put(dip);
+ gfs2_qadata_put(dip);
mark_inode_dirty(inode);
gfs2_glock_dq_uninit_m(2, ghs);
d_instantiate(dentry, inode);
@@ -760,7 +755,7 @@ fail:
*/
static int gfs2_create(struct inode *dir, struct dentry *dentry,
- int mode, struct nameidata *nd)
+ umode_t mode, struct nameidata *nd)
{
int excl = 0;
if (nd && (nd->flags & LOOKUP_EXCL))
@@ -875,8 +870,9 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
error = 0;
if (alloc_required) {
- struct gfs2_alloc *al = gfs2_alloc_get(dip);
- if (!al) {
+ struct gfs2_qadata *qa = gfs2_qadata_get(dip);
+
+ if (!qa) {
error = -ENOMEM;
goto out_gunlock;
}
@@ -885,9 +881,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (error)
goto out_alloc;
- al->al_requested = sdp->sd_max_dirres;
-
- error = gfs2_inplace_reserve(dip);
+ error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres);
if (error)
goto out_gunlock_q;
@@ -930,7 +924,7 @@ out_gunlock_q:
gfs2_quota_unlock(dip);
out_alloc:
if (alloc_required)
- gfs2_alloc_put(dip);
+ gfs2_qadata_put(dip);
out_gunlock:
gfs2_glock_dq(ghs + 1);
out_child:
@@ -1037,12 +1031,15 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
struct buffer_head *bh;
struct gfs2_holder ghs[3];
struct gfs2_rgrpd *rgd;
- int error;
+ int error = -EROFS;
gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
+ if (!rgd)
+ goto out_inodes;
+
gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
@@ -1088,12 +1085,13 @@ out_end_trans:
out_gunlock:
gfs2_glock_dq(ghs + 2);
out_rgrp:
- gfs2_holder_uninit(ghs + 2);
gfs2_glock_dq(ghs + 1);
out_child:
- gfs2_holder_uninit(ghs + 1);
gfs2_glock_dq(ghs);
out_parent:
+ gfs2_holder_uninit(ghs + 2);
+out_inodes:
+ gfs2_holder_uninit(ghs + 1);
gfs2_holder_uninit(ghs);
return error;
}
@@ -1129,7 +1127,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
* Returns: errno
*/
-static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0);
}
@@ -1143,7 +1141,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
*
*/
-static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
+static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t dev)
{
return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0);
@@ -1350,8 +1348,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
error = 0;
if (alloc_required) {
- struct gfs2_alloc *al = gfs2_alloc_get(ndip);
- if (!al) {
+ struct gfs2_qadata *qa = gfs2_qadata_get(ndip);
+
+ if (!qa) {
error = -ENOMEM;
goto out_gunlock;
}
@@ -1360,9 +1359,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
if (error)
goto out_alloc;
- al->al_requested = sdp->sd_max_dirres;
-
- error = gfs2_inplace_reserve(ndip);
+ error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres);
if (error)
goto out_gunlock_q;
@@ -1423,7 +1420,7 @@ out_gunlock_q:
gfs2_quota_unlock(ndip);
out_alloc:
if (alloc_required)
- gfs2_alloc_put(ndip);
+ gfs2_qadata_put(ndip);
out_gunlock:
while (x--) {
gfs2_glock_dq(ghs + x);
@@ -1584,7 +1581,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
ogid = ngid = NO_QUOTA_CHANGE;
- if (!gfs2_alloc_get(ip))
+ if (!gfs2_qadata_get(ip))
return -ENOMEM;
error = gfs2_quota_lock(ip, nuid, ngid);
@@ -1616,7 +1613,7 @@ out_end_trans:
out_gunlock_q:
gfs2_quota_unlock(ip);
out_alloc:
- gfs2_alloc_put(ip);
+ gfs2_qadata_put(ip);
return error;
}
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 98c80d8c2a62..8944d1e32ab5 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved.
+ * Copyright 2004-2011 Red Hat, Inc.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -11,12 +11,15 @@
#include <linux/dlm.h>
#include <linux/slab.h>
#include <linux/types.h>
+#include <linux/delay.h>
#include <linux/gfs2_ondisk.h>
#include "incore.h"
#include "glock.h"
#include "util.h"
+#include "sys.h"
+extern struct workqueue_struct *gfs2_control_wq;
static void gdlm_ast(void *arg)
{
@@ -185,34 +188,1002 @@ static void gdlm_cancel(struct gfs2_glock *gl)
dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
}
-static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname)
+/*
+ * dlm/gfs2 recovery coordination using dlm_recover callbacks
+ *
+ * 1. dlm_controld sees lockspace members change
+ * 2. dlm_controld blocks dlm-kernel locking activity
+ * 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep)
+ * 4. dlm_controld starts and finishes its own user level recovery
+ * 5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery
+ * 6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot)
+ * 7. dlm_recoverd does its own lock recovery
+ * 8. dlm_recoverd unblocks dlm-kernel locking activity
+ * 9. dlm_recoverd notifies gfs2 when done (recover_done with new generation)
+ * 10. gfs2_control updates control_lock lvb with new generation and jid bits
+ * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none)
+ * 12. gfs2_recover dequeues and recovers journals of failed nodes
+ * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result)
+ * 14. gfs2_control updates control_lock lvb jid bits for recovered journals
+ * 15. gfs2_control unblocks normal locking when all journals are recovered
+ *
+ * - failures during recovery
+ *
+ * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control
+ * clears BLOCK_LOCKS (step 15), e.g. another node fails while still
+ * recovering for a prior failure. gfs2_control needs a way to detect
+ * this so it can leave BLOCK_LOCKS set in step 15. This is managed using
+ * the recover_block and recover_start values.
+ *
+ * recover_done() provides a new lockspace generation number each time it
+ * is called (step 9). This generation number is saved as recover_start.
+ * When recover_prep() is called, it sets BLOCK_LOCKS and sets
+ * recover_block = recover_start. So, while recover_block is equal to
+ * recover_start, BLOCK_LOCKS should remain set. (recover_spin must
+ * be held around the BLOCK_LOCKS/recover_block/recover_start logic.)
+ *
+ * - more specific gfs2 steps in sequence above
+ *
+ * 3. recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start
+ * 6. recover_slot records any failed jids (maybe none)
+ * 9. recover_done sets recover_start = new generation number
+ * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids
+ * 12. gfs2_recover does journal recoveries for failed jids identified above
+ * 14. gfs2_control clears control_lock lvb bits for recovered jids
+ * 15. gfs2_control checks if recover_block == recover_start (step 3 occured
+ * again) then do nothing, otherwise if recover_start > recover_block
+ * then clear BLOCK_LOCKS.
+ *
+ * - parallel recovery steps across all nodes
+ *
+ * All nodes attempt to update the control_lock lvb with the new generation
+ * number and jid bits, but only the first to get the control_lock EX will
+ * do so; others will see that it's already done (lvb already contains new
+ * generation number.)
+ *
+ * . All nodes get the same recover_prep/recover_slot/recover_done callbacks
+ * . All nodes attempt to set control_lock lvb gen + bits for the new gen
+ * . One node gets control_lock first and writes the lvb, others see it's done
+ * . All nodes attempt to recover jids for which they see control_lock bits set
+ * . One node succeeds for a jid, and that one clears the jid bit in the lvb
+ * . All nodes will eventually see all lvb bits clear and unblock locks
+ *
+ * - is there a problem with clearing an lvb bit that should be set
+ * and missing a journal recovery?
+ *
+ * 1. jid fails
+ * 2. lvb bit set for step 1
+ * 3. jid recovered for step 1
+ * 4. jid taken again (new mount)
+ * 5. jid fails (for step 4)
+ * 6. lvb bit set for step 5 (will already be set)
+ * 7. lvb bit cleared for step 3
+ *
+ * This is not a problem because the failure in step 5 does not
+ * require recovery, because the mount in step 4 could not have
+ * progressed far enough to unblock locks and access the fs. The
+ * control_mount() function waits for all recoveries to be complete
+ * for the latest lockspace generation before ever unblocking locks
+ * and returning. The mount in step 4 waits until the recovery in
+ * step 1 is done.
+ *
+ * - special case of first mounter: first node to mount the fs
+ *
+ * The first node to mount a gfs2 fs needs to check all the journals
+ * and recover any that need recovery before other nodes are allowed
+ * to mount the fs. (Others may begin mounting, but they must wait
+ * for the first mounter to be done before taking locks on the fs
+ * or accessing the fs.) This has two parts:
+ *
+ * 1. The mounted_lock tells a node it's the first to mount the fs.
+ * Each node holds the mounted_lock in PR while it's mounted.
+ * Each node tries to acquire the mounted_lock in EX when it mounts.
+ * If a node is granted the mounted_lock EX it means there are no
+ * other mounted nodes (no PR locks exist), and it is the first mounter.
+ * The mounted_lock is demoted to PR when first recovery is done, so
+ * others will fail to get an EX lock, but will get a PR lock.
+ *
+ * 2. The control_lock blocks others in control_mount() while the first
+ * mounter is doing first mount recovery of all journals.
+ * A mounting node needs to acquire control_lock in EX mode before
+ * it can proceed. The first mounter holds control_lock in EX while doing
+ * the first mount recovery, blocking mounts from other nodes, then demotes
+ * control_lock to NL when it's done (others_may_mount/first_done),
+ * allowing other nodes to continue mounting.
+ *
+ * first mounter:
+ * control_lock EX/NOQUEUE success
+ * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters)
+ * set first=1
+ * do first mounter recovery
+ * mounted_lock EX->PR
+ * control_lock EX->NL, write lvb generation
+ *
+ * other mounter:
+ * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry)
+ * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR)
+ * mounted_lock PR/NOQUEUE success
+ * read lvb generation
+ * control_lock EX->NL
+ * set first=0
+ *
+ * - mount during recovery
+ *
+ * If a node mounts while others are doing recovery (not first mounter),
+ * the mounting node will get its initial recover_done() callback without
+ * having seen any previous failures/callbacks.
+ *
+ * It must wait for all recoveries preceding its mount to be finished
+ * before it unblocks locks. It does this by repeating the "other mounter"
+ * steps above until the lvb generation number is >= its mount generation
+ * number (from initial recover_done) and all lvb bits are clear.
+ *
+ * - control_lock lvb format
+ *
+ * 4 bytes generation number: the latest dlm lockspace generation number
+ * from recover_done callback. Indicates the jid bitmap has been updated
+ * to reflect all slot failures through that generation.
+ * 4 bytes unused.
+ * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates
+ * that jid N needs recovery.
+ */
+
+#define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */
+
+static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen,
+ char *lvb_bits)
+{
+ uint32_t gen;
+ memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE);
+ memcpy(&gen, lvb_bits, sizeof(uint32_t));
+ *lvb_gen = le32_to_cpu(gen);
+}
+
+static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,
+ char *lvb_bits)
+{
+ uint32_t gen;
+ memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE);
+ gen = cpu_to_le32(lvb_gen);
+ memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t));
+}
+
+static int all_jid_bits_clear(char *lvb)
+{
+ int i;
+ for (i = JID_BITMAP_OFFSET; i < GDLM_LVB_SIZE; i++) {
+ if (lvb[i])
+ return 0;
+ }
+ return 1;
+}
+
+static void sync_wait_cb(void *arg)
+{
+ struct lm_lockstruct *ls = arg;
+ complete(&ls->ls_sync_wait);
+}
+
+static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
int error;
- if (fsname == NULL) {
- fs_info(sdp, "no fsname found\n");
- return -EINVAL;
+ error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls);
+ if (error) {
+ fs_err(sdp, "%s lkid %x error %d\n",
+ name, lksb->sb_lkid, error);
+ return error;
+ }
+
+ wait_for_completion(&ls->ls_sync_wait);
+
+ if (lksb->sb_status != -DLM_EUNLOCK) {
+ fs_err(sdp, "%s lkid %x status %d\n",
+ name, lksb->sb_lkid, lksb->sb_status);
+ return -1;
+ }
+ return 0;
+}
+
+static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags,
+ unsigned int num, struct dlm_lksb *lksb, char *name)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char strname[GDLM_STRNAME_BYTES];
+ int error, status;
+
+ memset(strname, 0, GDLM_STRNAME_BYTES);
+ snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num);
+
+ error = dlm_lock(ls->ls_dlm, mode, lksb, flags,
+ strname, GDLM_STRNAME_BYTES - 1,
+ 0, sync_wait_cb, ls, NULL);
+ if (error) {
+ fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n",
+ name, lksb->sb_lkid, flags, mode, error);
+ return error;
+ }
+
+ wait_for_completion(&ls->ls_sync_wait);
+
+ status = lksb->sb_status;
+
+ if (status && status != -EAGAIN) {
+ fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n",
+ name, lksb->sb_lkid, flags, mode, status);
+ }
+
+ return status;
+}
+
+static int mounted_unlock(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ return sync_unlock(sdp, &ls->ls_mounted_lksb, "mounted_lock");
+}
+
+static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK,
+ &ls->ls_mounted_lksb, "mounted_lock");
+}
+
+static int control_unlock(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ return sync_unlock(sdp, &ls->ls_control_lksb, "control_lock");
+}
+
+static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK,
+ &ls->ls_control_lksb, "control_lock");
+}
+
+static void gfs2_control_func(struct work_struct *work)
+{
+ struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work);
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char lvb_bits[GDLM_LVB_SIZE];
+ uint32_t block_gen, start_gen, lvb_gen, flags;
+ int recover_set = 0;
+ int write_lvb = 0;
+ int recover_size;
+ int i, error;
+
+ spin_lock(&ls->ls_recover_spin);
+ /*
+ * No MOUNT_DONE means we're still mounting; control_mount()
+ * will set this flag, after which this thread will take over
+ * all further clearing of BLOCK_LOCKS.
+ *
+ * FIRST_MOUNT means this node is doing first mounter recovery,
+ * for which recovery control is handled by
+ * control_mount()/control_first_done(), not this thread.
+ */
+ if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
+ test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+ spin_unlock(&ls->ls_recover_spin);
+ return;
+ }
+ block_gen = ls->ls_recover_block;
+ start_gen = ls->ls_recover_start;
+ spin_unlock(&ls->ls_recover_spin);
+
+ /*
+ * Equal block_gen and start_gen implies we are between
+ * recover_prep and recover_done callbacks, which means
+ * dlm recovery is in progress and dlm locking is blocked.
+ * There's no point trying to do any work until recover_done.
+ */
+
+ if (block_gen == start_gen)
+ return;
+
+ /*
+ * Propagate recover_submit[] and recover_result[] to lvb:
+ * dlm_recoverd adds to recover_submit[] jids needing recovery
+ * gfs2_recover adds to recover_result[] journal recovery results
+ *
+ * set lvb bit for jids in recover_submit[] if the lvb has not
+ * yet been updated for the generation of the failure
+ *
+ * clear lvb bit for jids in recover_result[] if the result of
+ * the journal recovery is SUCCESS
+ */
+
+ error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+ if (error) {
+ fs_err(sdp, "control lock EX error %d\n", error);
+ return;
+ }
+
+ control_lvb_read(ls, &lvb_gen, lvb_bits);
+
+ spin_lock(&ls->ls_recover_spin);
+ if (block_gen != ls->ls_recover_block ||
+ start_gen != ls->ls_recover_start) {
+ fs_info(sdp, "recover generation %u block1 %u %u\n",
+ start_gen, block_gen, ls->ls_recover_block);
+ spin_unlock(&ls->ls_recover_spin);
+ control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
+ return;
+ }
+
+ recover_size = ls->ls_recover_size;
+
+ if (lvb_gen <= start_gen) {
+ /*
+ * Clear lvb bits for jids we've successfully recovered.
+ * Because all nodes attempt to recover failed journals,
+ * a journal can be recovered multiple times successfully
+ * in succession. Only the first will really do recovery,
+ * the others find it clean, but still report a successful
+ * recovery. So, another node may have already recovered
+ * the jid and cleared the lvb bit for it.
+ */
+ for (i = 0; i < recover_size; i++) {
+ if (ls->ls_recover_result[i] != LM_RD_SUCCESS)
+ continue;
+
+ ls->ls_recover_result[i] = 0;
+
+ if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET))
+ continue;
+
+ __clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET);
+ write_lvb = 1;
+ }
+ }
+
+ if (lvb_gen == start_gen) {
+ /*
+ * Failed slots before start_gen are already set in lvb.
+ */
+ for (i = 0; i < recover_size; i++) {
+ if (!ls->ls_recover_submit[i])
+ continue;
+ if (ls->ls_recover_submit[i] < lvb_gen)
+ ls->ls_recover_submit[i] = 0;
+ }
+ } else if (lvb_gen < start_gen) {
+ /*
+ * Failed slots before start_gen are not yet set in lvb.
+ */
+ for (i = 0; i < recover_size; i++) {
+ if (!ls->ls_recover_submit[i])
+ continue;
+ if (ls->ls_recover_submit[i] < start_gen) {
+ ls->ls_recover_submit[i] = 0;
+ __set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET);
+ }
+ }
+ /* even if there are no bits to set, we need to write the
+ latest generation to the lvb */
+ write_lvb = 1;
+ } else {
+ /*
+ * we should be getting a recover_done() for lvb_gen soon
+ */
+ }
+ spin_unlock(&ls->ls_recover_spin);
+
+ if (write_lvb) {
+ control_lvb_write(ls, start_gen, lvb_bits);
+ flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK;
+ } else {
+ flags = DLM_LKF_CONVERT;
+ }
+
+ error = control_lock(sdp, DLM_LOCK_NL, flags);
+ if (error) {
+ fs_err(sdp, "control lock NL error %d\n", error);
+ return;
+ }
+
+ /*
+ * Everyone will see jid bits set in the lvb, run gfs2_recover_set(),
+ * and clear a jid bit in the lvb if the recovery is a success.
+ * Eventually all journals will be recovered, all jid bits will
+ * be cleared in the lvb, and everyone will clear BLOCK_LOCKS.
+ */
+
+ for (i = 0; i < recover_size; i++) {
+ if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) {
+ fs_info(sdp, "recover generation %u jid %d\n",
+ start_gen, i);
+ gfs2_recover_set(sdp, i);
+ recover_set++;
+ }
+ }
+ if (recover_set)
+ return;
+
+ /*
+ * No more jid bits set in lvb, all recovery is done, unblock locks
+ * (unless a new recover_prep callback has occured blocking locks
+ * again while working above)
+ */
+
+ spin_lock(&ls->ls_recover_spin);
+ if (ls->ls_recover_block == block_gen &&
+ ls->ls_recover_start == start_gen) {
+ clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ fs_info(sdp, "recover generation %u done\n", start_gen);
+ gfs2_glock_thaw(sdp);
+ } else {
+ fs_info(sdp, "recover generation %u block2 %u %u\n",
+ start_gen, block_gen, ls->ls_recover_block);
+ spin_unlock(&ls->ls_recover_spin);
+ }
+}
+
+static int control_mount(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char lvb_bits[GDLM_LVB_SIZE];
+ uint32_t start_gen, block_gen, mount_gen, lvb_gen;
+ int mounted_mode;
+ int retries = 0;
+ int error;
+
+ memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb));
+ memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb));
+ memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE);
+ ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb;
+ init_completion(&ls->ls_sync_wait);
+
+ set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+
+ error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK);
+ if (error) {
+ fs_err(sdp, "control_mount control_lock NL error %d\n", error);
+ return error;
+ }
+
+ error = mounted_lock(sdp, DLM_LOCK_NL, 0);
+ if (error) {
+ fs_err(sdp, "control_mount mounted_lock NL error %d\n", error);
+ control_unlock(sdp);
+ return error;
+ }
+ mounted_mode = DLM_LOCK_NL;
+
+restart:
+ if (retries++ && signal_pending(current)) {
+ error = -EINTR;
+ goto fail;
+ }
+
+ /*
+ * We always start with both locks in NL. control_lock is
+ * demoted to NL below so we don't need to do it here.
+ */
+
+ if (mounted_mode != DLM_LOCK_NL) {
+ error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
+ if (error)
+ goto fail;
+ mounted_mode = DLM_LOCK_NL;
+ }
+
+ /*
+ * Other nodes need to do some work in dlm recovery and gfs2_control
+ * before the recover_done and control_lock will be ready for us below.
+ * A delay here is not required but often avoids having to retry.
+ */
+
+ msleep_interruptible(500);
+
+ /*
+ * Acquire control_lock in EX and mounted_lock in either EX or PR.
+ * control_lock lvb keeps track of any pending journal recoveries.
+ * mounted_lock indicates if any other nodes have the fs mounted.
+ */
+
+ error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK);
+ if (error == -EAGAIN) {
+ goto restart;
+ } else if (error) {
+ fs_err(sdp, "control_mount control_lock EX error %d\n", error);
+ goto fail;
+ }
+
+ error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
+ if (!error) {
+ mounted_mode = DLM_LOCK_EX;
+ goto locks_done;
+ } else if (error != -EAGAIN) {
+ fs_err(sdp, "control_mount mounted_lock EX error %d\n", error);
+ goto fail;
+ }
+
+ error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
+ if (!error) {
+ mounted_mode = DLM_LOCK_PR;
+ goto locks_done;
+ } else {
+ /* not even -EAGAIN should happen here */
+ fs_err(sdp, "control_mount mounted_lock PR error %d\n", error);
+ goto fail;
+ }
+
+locks_done:
+ /*
+ * If we got both locks above in EX, then we're the first mounter.
+ * If not, then we need to wait for the control_lock lvb to be
+ * updated by other mounted nodes to reflect our mount generation.
+ *
+ * In simple first mounter cases, first mounter will see zero lvb_gen,
+ * but in cases where all existing nodes leave/fail before mounting
+ * nodes finish control_mount, then all nodes will be mounting and
+ * lvb_gen will be non-zero.
+ */
+
+ control_lvb_read(ls, &lvb_gen, lvb_bits);
+
+ if (lvb_gen == 0xFFFFFFFF) {
+ /* special value to force mount attempts to fail */
+ fs_err(sdp, "control_mount control_lock disabled\n");
+ error = -EINVAL;
+ goto fail;
+ }
+
+ if (mounted_mode == DLM_LOCK_EX) {
+ /* first mounter, keep both EX while doing first recovery */
+ spin_lock(&ls->ls_recover_spin);
+ clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+ set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
+ set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ fs_info(sdp, "first mounter control generation %u\n", lvb_gen);
+ return 0;
+ }
+
+ error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
+ if (error)
+ goto fail;
+
+ /*
+ * We are not first mounter, now we need to wait for the control_lock
+ * lvb generation to be >= the generation from our first recover_done
+ * and all lvb bits to be clear (no pending journal recoveries.)
+ */
+
+ if (!all_jid_bits_clear(lvb_bits)) {
+ /* journals need recovery, wait until all are clear */
+ fs_info(sdp, "control_mount wait for journal recovery\n");
+ goto restart;
+ }
+
+ spin_lock(&ls->ls_recover_spin);
+ block_gen = ls->ls_recover_block;
+ start_gen = ls->ls_recover_start;
+ mount_gen = ls->ls_recover_mount;
+
+ if (lvb_gen < mount_gen) {
+ /* wait for mounted nodes to update control_lock lvb to our
+ generation, which might include new recovery bits set */
+ fs_info(sdp, "control_mount wait1 block %u start %u mount %u "
+ "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
+ lvb_gen, ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ goto restart;
+ }
+
+ if (lvb_gen != start_gen) {
+ /* wait for mounted nodes to update control_lock lvb to the
+ latest recovery generation */
+ fs_info(sdp, "control_mount wait2 block %u start %u mount %u "
+ "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
+ lvb_gen, ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ goto restart;
+ }
+
+ if (block_gen == start_gen) {
+ /* dlm recovery in progress, wait for it to finish */
+ fs_info(sdp, "control_mount wait3 block %u start %u mount %u "
+ "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
+ lvb_gen, ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ goto restart;
}
- error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm,
- DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
- (ls->ls_nodir ? DLM_LSFL_NODIR : 0),
- GDLM_LVB_SIZE);
+ clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+ set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
+ memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
+ memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
+ spin_unlock(&ls->ls_recover_spin);
+ return 0;
+
+fail:
+ mounted_unlock(sdp);
+ control_unlock(sdp);
+ return error;
+}
+
+static int dlm_recovery_wait(void *word)
+{
+ schedule();
+ return 0;
+}
+
+static int control_first_done(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char lvb_bits[GDLM_LVB_SIZE];
+ uint32_t start_gen, block_gen;
+ int error;
+
+restart:
+ spin_lock(&ls->ls_recover_spin);
+ start_gen = ls->ls_recover_start;
+ block_gen = ls->ls_recover_block;
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) ||
+ !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
+ !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+ /* sanity check, should not happen */
+ fs_err(sdp, "control_first_done start %u block %u flags %lx\n",
+ start_gen, block_gen, ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ control_unlock(sdp);
+ return -1;
+ }
+
+ if (start_gen == block_gen) {
+ /*
+ * Wait for the end of a dlm recovery cycle to switch from
+ * first mounter recovery. We can ignore any recover_slot
+ * callbacks between the recover_prep and next recover_done
+ * because we are still the first mounter and any failed nodes
+ * have not fully mounted, so they don't need recovery.
+ */
+ spin_unlock(&ls->ls_recover_spin);
+ fs_info(sdp, "control_first_done wait gen %u\n", start_gen);
+
+ wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY,
+ dlm_recovery_wait, TASK_UNINTERRUPTIBLE);
+ goto restart;
+ }
+
+ clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
+ set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags);
+ memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
+ memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
+ spin_unlock(&ls->ls_recover_spin);
+
+ memset(lvb_bits, 0, sizeof(lvb_bits));
+ control_lvb_write(ls, start_gen, lvb_bits);
+
+ error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT);
+ if (error)
+ fs_err(sdp, "control_first_done mounted PR error %d\n", error);
+
+ error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
if (error)
- printk(KERN_ERR "dlm_new_lockspace error %d", error);
+ fs_err(sdp, "control_first_done control NL error %d\n", error);
return error;
}
+/*
+ * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC)
+ * to accomodate the largest slot number. (NB dlm slot numbers start at 1,
+ * gfs2 jids start at 0, so jid = slot - 1)
+ */
+
+#define RECOVER_SIZE_INC 16
+
+static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots,
+ int num_slots)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ uint32_t *submit = NULL;
+ uint32_t *result = NULL;
+ uint32_t old_size, new_size;
+ int i, max_jid;
+
+ max_jid = 0;
+ for (i = 0; i < num_slots; i++) {
+ if (max_jid < slots[i].slot - 1)
+ max_jid = slots[i].slot - 1;
+ }
+
+ old_size = ls->ls_recover_size;
+
+ if (old_size >= max_jid + 1)
+ return 0;
+
+ new_size = old_size + RECOVER_SIZE_INC;
+
+ submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
+ result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
+ if (!submit || !result) {
+ kfree(submit);
+ kfree(result);
+ return -ENOMEM;
+ }
+
+ spin_lock(&ls->ls_recover_spin);
+ memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t));
+ memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t));
+ kfree(ls->ls_recover_submit);
+ kfree(ls->ls_recover_result);
+ ls->ls_recover_submit = submit;
+ ls->ls_recover_result = result;
+ ls->ls_recover_size = new_size;
+ spin_unlock(&ls->ls_recover_spin);
+ return 0;
+}
+
+static void free_recover_size(struct lm_lockstruct *ls)
+{
+ kfree(ls->ls_recover_submit);
+ kfree(ls->ls_recover_result);
+ ls->ls_recover_submit = NULL;
+ ls->ls_recover_result = NULL;
+ ls->ls_recover_size = 0;
+}
+
+/* dlm calls before it does lock recovery */
+
+static void gdlm_recover_prep(void *arg)
+{
+ struct gfs2_sbd *sdp = arg;
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+ spin_lock(&ls->ls_recover_spin);
+ ls->ls_recover_block = ls->ls_recover_start;
+ set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
+
+ if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
+ test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+ spin_unlock(&ls->ls_recover_spin);
+ return;
+ }
+ set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+}
+
+/* dlm calls after recover_prep has been completed on all lockspace members;
+ identifies slot/jid of failed member */
+
+static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
+{
+ struct gfs2_sbd *sdp = arg;
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ int jid = slot->slot - 1;
+
+ spin_lock(&ls->ls_recover_spin);
+ if (ls->ls_recover_size < jid + 1) {
+ fs_err(sdp, "recover_slot jid %d gen %u short size %d",
+ jid, ls->ls_recover_block, ls->ls_recover_size);
+ spin_unlock(&ls->ls_recover_spin);
+ return;
+ }
+
+ if (ls->ls_recover_submit[jid]) {
+ fs_info(sdp, "recover_slot jid %d gen %u prev %u",
+ jid, ls->ls_recover_block, ls->ls_recover_submit[jid]);
+ }
+ ls->ls_recover_submit[jid] = ls->ls_recover_block;
+ spin_unlock(&ls->ls_recover_spin);
+}
+
+/* dlm calls after recover_slot and after it completes lock recovery */
+
+static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
+ int our_slot, uint32_t generation)
+{
+ struct gfs2_sbd *sdp = arg;
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+ /* ensure the ls jid arrays are large enough */
+ set_recover_size(sdp, slots, num_slots);
+
+ spin_lock(&ls->ls_recover_spin);
+ ls->ls_recover_start = generation;
+
+ if (!ls->ls_recover_mount) {
+ ls->ls_recover_mount = generation;
+ ls->ls_jid = our_slot - 1;
+ }
+
+ if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
+ queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0);
+
+ clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY);
+ spin_unlock(&ls->ls_recover_spin);
+}
+
+/* gfs2_recover thread has a journal recovery result */
+
+static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid,
+ unsigned int result)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+ if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+ return;
+
+ /* don't care about the recovery of own journal during mount */
+ if (jid == ls->ls_jid)
+ return;
+
+ spin_lock(&ls->ls_recover_spin);
+ if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
+ spin_unlock(&ls->ls_recover_spin);
+ return;
+ }
+ if (ls->ls_recover_size < jid + 1) {
+ fs_err(sdp, "recovery_result jid %d short size %d",
+ jid, ls->ls_recover_size);
+ spin_unlock(&ls->ls_recover_spin);
+ return;
+ }
+
+ fs_info(sdp, "recover jid %d result %s\n", jid,
+ result == LM_RD_GAVEUP ? "busy" : "success");
+
+ ls->ls_recover_result[jid] = result;
+
+ /* GAVEUP means another node is recovering the journal; delay our
+ next attempt to recover it, to give the other node a chance to
+ finish before trying again */
+
+ if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
+ queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work,
+ result == LM_RD_GAVEUP ? HZ : 0);
+ spin_unlock(&ls->ls_recover_spin);
+}
+
+const struct dlm_lockspace_ops gdlm_lockspace_ops = {
+ .recover_prep = gdlm_recover_prep,
+ .recover_slot = gdlm_recover_slot,
+ .recover_done = gdlm_recover_done,
+};
+
+static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char cluster[GFS2_LOCKNAME_LEN];
+ const char *fsname;
+ uint32_t flags;
+ int error, ops_result;
+
+ /*
+ * initialize everything
+ */
+
+ INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func);
+ spin_lock_init(&ls->ls_recover_spin);
+ ls->ls_recover_flags = 0;
+ ls->ls_recover_mount = 0;
+ ls->ls_recover_start = 0;
+ ls->ls_recover_block = 0;
+ ls->ls_recover_size = 0;
+ ls->ls_recover_submit = NULL;
+ ls->ls_recover_result = NULL;
+
+ error = set_recover_size(sdp, NULL, 0);
+ if (error)
+ goto fail;
+
+ /*
+ * prepare dlm_new_lockspace args
+ */
+
+ fsname = strchr(table, ':');
+ if (!fsname) {
+ fs_info(sdp, "no fsname found\n");
+ error = -EINVAL;
+ goto fail_free;
+ }
+ memset(cluster, 0, sizeof(cluster));
+ memcpy(cluster, table, strlen(table) - strlen(fsname));
+ fsname++;
+
+ flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL;
+ if (ls->ls_nodir)
+ flags |= DLM_LSFL_NODIR;
+
+ /*
+ * create/join lockspace
+ */
+
+ error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE,
+ &gdlm_lockspace_ops, sdp, &ops_result,
+ &ls->ls_dlm);
+ if (error) {
+ fs_err(sdp, "dlm_new_lockspace error %d\n", error);
+ goto fail_free;
+ }
+
+ if (ops_result < 0) {
+ /*
+ * dlm does not support ops callbacks,
+ * old dlm_controld/gfs_controld are used, try without ops.
+ */
+ fs_info(sdp, "dlm lockspace ops not used\n");
+ free_recover_size(ls);
+ set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags);
+ return 0;
+ }
+
+ if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) {
+ fs_err(sdp, "dlm lockspace ops disallow jid preset\n");
+ error = -EINVAL;
+ goto fail_release;
+ }
+
+ /*
+ * control_mount() uses control_lock to determine first mounter,
+ * and for later mounts, waits for any recoveries to be cleared.
+ */
+
+ error = control_mount(sdp);
+ if (error) {
+ fs_err(sdp, "mount control error %d\n", error);
+ goto fail_release;
+ }
+
+ ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
+ clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
+ return 0;
+
+fail_release:
+ dlm_release_lockspace(ls->ls_dlm, 2);
+fail_free:
+ free_recover_size(ls);
+fail:
+ return error;
+}
+
+static void gdlm_first_done(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ int error;
+
+ if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+ return;
+
+ error = control_first_done(sdp);
+ if (error)
+ fs_err(sdp, "mount first_done error %d\n", error);
+}
+
static void gdlm_unmount(struct gfs2_sbd *sdp)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+ goto release;
+
+ /* wait for gfs2_control_wq to be done with this mount */
+
+ spin_lock(&ls->ls_recover_spin);
+ set_bit(DFL_UNMOUNT, &ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ flush_delayed_work_sync(&sdp->sd_control_work);
+
+ /* mounted_lock and control_lock will be purged in dlm recovery */
+release:
if (ls->ls_dlm) {
dlm_release_lockspace(ls->ls_dlm, 2);
ls->ls_dlm = NULL;
}
+
+ free_recover_size(ls);
}
static const match_table_t dlm_tokens = {
@@ -226,6 +1197,8 @@ static const match_table_t dlm_tokens = {
const struct lm_lockops gfs2_dlm_ops = {
.lm_proto_name = "lock_dlm",
.lm_mount = gdlm_mount,
+ .lm_first_done = gdlm_first_done,
+ .lm_recovery_result = gdlm_recovery_result,
.lm_unmount = gdlm_unmount,
.lm_put_lock = gdlm_put_lock,
.lm_lock = gdlm_lock,
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 598646434362..756fae9eaf8f 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -626,7 +626,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
else
- submit_bh(WRITE_FLUSH_FUA | REQ_META | REQ_PRIO, bh);
+ submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh))
@@ -951,8 +951,8 @@ int gfs2_logd(void *data)
wake_up(&sdp->sd_log_waitq);
t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
- if (freezing(current))
- refrigerator();
+
+ try_to_freeze();
do {
prepare_to_wait(&sdp->sd_logd_waitq, &wait,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 8a139ff1919f..a8d9bcd0e19c 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -28,6 +28,8 @@
#include "recovery.h"
#include "dir.h"
+struct workqueue_struct *gfs2_control_wq;
+
static struct shrinker qd_shrinker = {
.shrink = gfs2_shrink_qd_memory,
.seeks = DEFAULT_SEEKS,
@@ -40,7 +42,8 @@ static void gfs2_init_inode_once(void *foo)
inode_init_once(&ip->i_inode);
init_rwsem(&ip->i_rw_mutex);
INIT_LIST_HEAD(&ip->i_trunc_list);
- ip->i_alloc = NULL;
+ ip->i_qadata = NULL;
+ ip->i_res = NULL;
ip->i_hash_cache = NULL;
}
@@ -145,12 +148,19 @@ static int __init init_gfs2_fs(void)
if (!gfs_recovery_wq)
goto fail_wq;
+ gfs2_control_wq = alloc_workqueue("gfs2_control",
+ WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0);
+ if (!gfs2_control_wq)
+ goto fail_control;
+
gfs2_register_debugfs();
printk("GFS2 installed\n");
return 0;
+fail_control:
+ destroy_workqueue(gfs_recovery_wq);
fail_wq:
unregister_filesystem(&gfs2meta_fs_type);
fail_unregister:
@@ -194,6 +204,7 @@ static void __exit exit_gfs2_fs(void)
unregister_filesystem(&gfs2_fs_type);
unregister_filesystem(&gfs2meta_fs_type);
destroy_workqueue(gfs_recovery_wq);
+ destroy_workqueue(gfs2_control_wq);
rcu_barrier();
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index be29858900f6..181586e673f9 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -435,7 +435,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
if (buffer_uptodate(first_bh))
goto out;
if (!buffer_locked(first_bh))
- ll_rw_block(READ_SYNC | REQ_META | REQ_PRIO, 1, &first_bh);
+ ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh);
dblock++;
extlen--;
@@ -444,7 +444,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
bh = gfs2_getbuf(gl, dblock, CREATE);
if (!buffer_uptodate(bh) && !buffer_locked(bh))
- ll_rw_block(READA, 1, &bh);
+ ll_rw_block(READA | REQ_META, 1, &bh);
brelse(bh);
dblock++;
extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index cb23c2be731a..24f609c9ef91 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -224,7 +224,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
bio->bi_end_io = end_bio_io_page;
bio->bi_private = page;
- submit_bio(READ_SYNC | REQ_META | REQ_PRIO, bio);
+ submit_bio(READ_SYNC | REQ_META, bio);
wait_on_page_locked(page);
bio_put(bio);
if (!PageUptodate(page)) {
@@ -562,8 +562,12 @@ static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
{
char *message = "FIRSTMOUNT=Done";
char *envp[] = { message, NULL };
- struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- ls->ls_first_done = 1;
+
+ fs_info(sdp, "first mount done, others may mount\n");
+
+ if (sdp->sd_lockstruct.ls_ops->lm_first_done)
+ sdp->sd_lockstruct.ls_ops->lm_first_done(sdp);
+
kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
}
@@ -796,6 +800,11 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
fs_err(sdp, "can't get quota file inode: %d\n", error);
goto fail_rindex;
}
+
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ goto fail_qinode;
+
return 0;
fail_qinode:
@@ -944,7 +953,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
struct gfs2_args *args = &sdp->sd_args;
const char *proto = sdp->sd_proto_name;
const char *table = sdp->sd_table_name;
- const char *fsname;
char *o, *options;
int ret;
@@ -1004,21 +1012,12 @@ hostdata_error:
}
}
- if (sdp->sd_args.ar_spectator)
- snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
- else
- snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
- sdp->sd_lockstruct.ls_jid);
-
- fsname = strchr(table, ':');
- if (fsname)
- fsname++;
if (lm->lm_mount == NULL) {
fs_info(sdp, "Now mounting FS...\n");
complete_all(&sdp->sd_locking_init);
return 0;
}
- ret = lm->lm_mount(sdp, fsname);
+ ret = lm->lm_mount(sdp, table);
if (ret == 0)
fs_info(sdp, "Joined cluster. Now mounting FS...\n");
complete_all(&sdp->sd_locking_init);
@@ -1084,7 +1083,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
if (sdp->sd_args.ar_spectator) {
sb->s_flags |= MS_RDONLY;
- set_bit(SDF_NORECOVERY, &sdp->sd_flags);
+ set_bit(SDF_RORECOVERY, &sdp->sd_flags);
}
if (sdp->sd_args.ar_posix_acl)
sb->s_flags |= MS_POSIXACL;
@@ -1124,6 +1123,8 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
if (error)
goto fail;
+ snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name);
+
gfs2_create_debugfs_file(sdp);
error = gfs2_sys_fs_add(sdp);
@@ -1160,6 +1161,13 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
goto fail_sb;
}
+ if (sdp->sd_args.ar_spectator)
+ snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s",
+ sdp->sd_table_name);
+ else
+ snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u",
+ sdp->sd_table_name, sdp->sd_lockstruct.ls_jid);
+
error = init_inodes(sdp, DO);
if (error)
goto fail_sb;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 7e528dc14f85..a45b21b03915 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -494,11 +494,11 @@ static void qdsb_put(struct gfs2_quota_data *qd)
int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = ip->i_alloc;
- struct gfs2_quota_data **qd = al->al_qd;
+ struct gfs2_qadata *qa = ip->i_qadata;
+ struct gfs2_quota_data **qd = qa->qa_qd;
int error;
- if (gfs2_assert_warn(sdp, !al->al_qd_num) ||
+ if (gfs2_assert_warn(sdp, !qa->qa_qd_num) ||
gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
return -EIO;
@@ -508,20 +508,20 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd);
if (error)
goto out;
- al->al_qd_num++;
+ qa->qa_qd_num++;
qd++;
error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd);
if (error)
goto out;
- al->al_qd_num++;
+ qa->qa_qd_num++;
qd++;
if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) {
error = qdsb_get(sdp, QUOTA_USER, uid, qd);
if (error)
goto out;
- al->al_qd_num++;
+ qa->qa_qd_num++;
qd++;
}
@@ -529,7 +529,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
error = qdsb_get(sdp, QUOTA_GROUP, gid, qd);
if (error)
goto out;
- al->al_qd_num++;
+ qa->qa_qd_num++;
qd++;
}
@@ -542,16 +542,16 @@ out:
void gfs2_quota_unhold(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = ip->i_alloc;
+ struct gfs2_qadata *qa = ip->i_qadata;
unsigned int x;
gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
- for (x = 0; x < al->al_qd_num; x++) {
- qdsb_put(al->al_qd[x]);
- al->al_qd[x] = NULL;
+ for (x = 0; x < qa->qa_qd_num; x++) {
+ qdsb_put(qa->qa_qd[x]);
+ qa->qa_qd[x] = NULL;
}
- al->al_qd_num = 0;
+ qa->qa_qd_num = 0;
}
static int sort_qd(const void *a, const void *b)
@@ -712,7 +712,7 @@ get_a_page:
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh)) {
- ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
+ ll_rw_block(READ | REQ_META, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh))
goto unlock_out;
@@ -762,7 +762,6 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
struct gfs2_quota_data *qd;
loff_t offset;
unsigned int nalloc = 0, blocks;
- struct gfs2_alloc *al = NULL;
int error;
gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
@@ -792,26 +791,19 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
nalloc++;
}
- al = gfs2_alloc_get(ip);
- if (!al) {
- error = -ENOMEM;
- goto out_gunlock;
- }
/*
* 1 blk for unstuffing inode if stuffed. We add this extra
* block to the reservation unconditionally. If the inode
* doesn't need unstuffing, the block will be released to the
* rgrp since it won't be allocated during the transaction
*/
- al->al_requested = 1;
/* +3 in the end for unstuffing block, inode size update block
* and another block in case quota straddles page boundary and
* two blocks need to be updated instead of 1 */
blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
- if (nalloc)
- al->al_requested += nalloc * (data_blocks + ind_blocks);
- error = gfs2_inplace_reserve(ip);
+ error = gfs2_inplace_reserve(ip, 1 +
+ (nalloc * (data_blocks + ind_blocks)));
if (error)
goto out_alloc;
@@ -840,8 +832,6 @@ out_end_trans:
out_ipres:
gfs2_inplace_release(ip);
out_alloc:
- gfs2_alloc_put(ip);
-out_gunlock:
gfs2_glock_dq_uninit(&i_gh);
out:
while (qx--)
@@ -925,7 +915,7 @@ fail:
int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = ip->i_alloc;
+ struct gfs2_qadata *qa = ip->i_qadata;
struct gfs2_quota_data *qd;
unsigned int x;
int error = 0;
@@ -938,15 +928,15 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
return 0;
- sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
+ sort(qa->qa_qd, qa->qa_qd_num, sizeof(struct gfs2_quota_data *),
sort_qd, NULL);
- for (x = 0; x < al->al_qd_num; x++) {
+ for (x = 0; x < qa->qa_qd_num; x++) {
int force = NO_FORCE;
- qd = al->al_qd[x];
+ qd = qa->qa_qd[x];
if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
force = FORCE;
- error = do_glock(qd, force, &al->al_qd_ghs[x]);
+ error = do_glock(qd, force, &qa->qa_qd_ghs[x]);
if (error)
break;
}
@@ -955,7 +945,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
set_bit(GIF_QD_LOCKED, &ip->i_flags);
else {
while (x--)
- gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
+ gfs2_glock_dq_uninit(&qa->qa_qd_ghs[x]);
gfs2_quota_unhold(ip);
}
@@ -1000,7 +990,7 @@ static int need_sync(struct gfs2_quota_data *qd)
void gfs2_quota_unlock(struct gfs2_inode *ip)
{
- struct gfs2_alloc *al = ip->i_alloc;
+ struct gfs2_qadata *qa = ip->i_qadata;
struct gfs2_quota_data *qda[4];
unsigned int count = 0;
unsigned int x;
@@ -1008,14 +998,14 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
goto out;
- for (x = 0; x < al->al_qd_num; x++) {
+ for (x = 0; x < qa->qa_qd_num; x++) {
struct gfs2_quota_data *qd;
int sync;
- qd = al->al_qd[x];
+ qd = qa->qa_qd[x];
sync = need_sync(qd);
- gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
+ gfs2_glock_dq_uninit(&qa->qa_qd_ghs[x]);
if (sync && qd_trylock(qd))
qda[count++] = qd;
@@ -1048,7 +1038,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = ip->i_alloc;
+ struct gfs2_qadata *qa = ip->i_qadata;
struct gfs2_quota_data *qd;
s64 value;
unsigned int x;
@@ -1060,8 +1050,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
return 0;
- for (x = 0; x < al->al_qd_num; x++) {
- qd = al->al_qd[x];
+ for (x = 0; x < qa->qa_qd_num; x++) {
+ qd = qa->qa_qd[x];
if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
(qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
@@ -1099,7 +1089,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
u32 uid, u32 gid)
{
- struct gfs2_alloc *al = ip->i_alloc;
+ struct gfs2_qadata *qa = ip->i_qadata;
struct gfs2_quota_data *qd;
unsigned int x;
@@ -1108,8 +1098,8 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
if (ip->i_diskflags & GFS2_DIF_SYSTEM)
return;
- for (x = 0; x < al->al_qd_num; x++) {
- qd = al->al_qd[x];
+ for (x = 0; x < qa->qa_qd_num; x++) {
+ qd = qa->qa_qd[x];
if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
(qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
@@ -1427,8 +1417,8 @@ int gfs2_quotad(void *data)
/* Check for & recover partially truncated inodes */
quotad_check_trunc_list(sdp);
- if (freezing(current))
- refrigerator();
+ try_to_freeze();
+
t = min(quotad_timeo, statfs_timeo);
prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
@@ -1529,7 +1519,6 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
unsigned int data_blocks, ind_blocks;
unsigned int blocks = 0;
int alloc_required;
- struct gfs2_alloc *al;
loff_t offset;
int error;
@@ -1594,15 +1583,12 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
if (gfs2_is_stuffed(ip))
alloc_required = 1;
if (alloc_required) {
- al = gfs2_alloc_get(ip);
- if (al == NULL)
- goto out_i;
gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
&data_blocks, &ind_blocks);
- blocks = al->al_requested = 1 + data_blocks + ind_blocks;
- error = gfs2_inplace_reserve(ip);
+ blocks = 1 + data_blocks + ind_blocks;
+ error = gfs2_inplace_reserve(ip, blocks);
if (error)
- goto out_alloc;
+ goto out_i;
blocks += gfs2_rg_blocks(ip);
}
@@ -1617,11 +1603,8 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
gfs2_trans_end(sdp);
out_release:
- if (alloc_required) {
+ if (alloc_required)
gfs2_inplace_release(ip);
-out_alloc:
- gfs2_alloc_put(ip);
- }
out_i:
gfs2_glock_dq_uninit(&i_gh);
out_q:
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f2a02edcac8f..963b2d75200c 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -436,12 +436,16 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
char env_status[20];
char *envp[] = { env_jid, env_status, NULL };
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
ls->ls_recover_jid_done = jid;
ls->ls_recover_jid_status = message;
sprintf(env_jid, "JID=%d", jid);
sprintf(env_status, "RECOVERY=%s",
message == LM_RD_SUCCESS ? "Done" : "Failed");
kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
+
+ if (sdp->sd_lockstruct.ls_ops->lm_recovery_result)
+ sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message);
}
void gfs2_recover_func(struct work_struct *work)
@@ -512,7 +516,9 @@ void gfs2_recover_func(struct work_struct *work)
if (error)
goto fail_gunlock_ji;
- if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
+ if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) {
+ ro = 1;
+ } else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
ro = 1;
} else {
@@ -577,6 +583,7 @@ fail_gunlock_j:
fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
fail:
+ jd->jd_recover_error = error;
gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
done:
clear_bit(JDF_RECOVERY, &jd->jd_flags);
@@ -605,6 +612,6 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait,
TASK_UNINTERRUPTIBLE);
- return 0;
+ return wait ? jd->jd_recover_error : 0;
}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 96bd6d759f29..49ada95209d0 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -65,8 +65,8 @@ static const char valid_change[16] = {
};
static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
- unsigned char old_state, unsigned char new_state,
- unsigned int *n);
+ unsigned char old_state,
+ struct gfs2_bitmap **rbi);
/**
* gfs2_setbit - Set a bit in the bitmaps
@@ -683,16 +683,21 @@ int gfs2_rindex_update(struct gfs2_sbd *sdp)
struct gfs2_glock *gl = ip->i_gl;
struct gfs2_holder ri_gh;
int error = 0;
+ int unlock_required = 0;
/* Read new copy from disk if we don't have the latest */
if (!sdp->sd_rindex_uptodate) {
mutex_lock(&sdp->sd_rindex_mutex);
- error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
- if (error)
- return error;
+ if (!gfs2_glock_is_locked_by_me(gl)) {
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
+ if (error)
+ return error;
+ unlock_required = 1;
+ }
if (!sdp->sd_rindex_uptodate)
error = gfs2_ri_update(ip);
- gfs2_glock_dq_uninit(&ri_gh);
+ if (unlock_required)
+ gfs2_glock_dq_uninit(&ri_gh);
mutex_unlock(&sdp->sd_rindex_mutex);
}
@@ -860,22 +865,36 @@ fail:
}
/**
- * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
+ * gfs2_qadata_get - get the struct gfs2_qadata structure for an inode
* @ip: the incore GFS2 inode structure
*
- * Returns: the struct gfs2_alloc
+ * Returns: the struct gfs2_qadata
*/
-struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
+struct gfs2_qadata *gfs2_qadata_get(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
int error;
- BUG_ON(ip->i_alloc != NULL);
- ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS);
+ BUG_ON(ip->i_qadata != NULL);
+ ip->i_qadata = kzalloc(sizeof(struct gfs2_qadata), GFP_NOFS);
error = gfs2_rindex_update(sdp);
if (error)
fs_warn(sdp, "rindex update returns %d\n", error);
- return ip->i_alloc;
+ return ip->i_qadata;
+}
+
+/**
+ * gfs2_blkrsv_get - get the struct gfs2_blkreserv structure for an inode
+ * @ip: the incore GFS2 inode structure
+ *
+ * Returns: the struct gfs2_qadata
+ */
+
+static struct gfs2_blkreserv *gfs2_blkrsv_get(struct gfs2_inode *ip)
+{
+ BUG_ON(ip->i_res != NULL);
+ ip->i_res = kzalloc(sizeof(struct gfs2_blkreserv), GFP_NOFS);
+ return ip->i_res;
}
/**
@@ -890,15 +909,20 @@ struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
{
- const struct gfs2_alloc *al = ip->i_alloc;
+ const struct gfs2_blkreserv *rs = ip->i_res;
if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
return 0;
- if (rgd->rd_free_clone >= al->al_requested)
+ if (rgd->rd_free_clone >= rs->rs_requested)
return 1;
return 0;
}
+static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
+{
+ return (bi->bi_start * GFS2_NBBY) + blk;
+}
+
/**
* try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
* @rgd: The rgrp
@@ -912,20 +936,20 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
u32 goal = 0, block;
u64 no_addr;
struct gfs2_sbd *sdp = rgd->rd_sbd;
- unsigned int n;
struct gfs2_glock *gl;
struct gfs2_inode *ip;
int error;
int found = 0;
+ struct gfs2_bitmap *bi;
while (goal < rgd->rd_data) {
down_write(&sdp->sd_log_flush_lock);
- n = 1;
- block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
- GFS2_BLKST_UNLINKED, &n);
+ block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, &bi);
up_write(&sdp->sd_log_flush_lock);
if (block == BFITNOENT)
break;
+
+ block = gfs2_bi2rgd_blk(bi, block);
/* rgblk_search can return a block < goal, so we need to
keep it marching forward. */
no_addr = block + rgd->rd_data0;
@@ -977,8 +1001,8 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *rgd, *begin = NULL;
- struct gfs2_alloc *al = ip->i_alloc;
- int error, rg_locked;
+ struct gfs2_blkreserv *rs = ip->i_res;
+ int error, rg_locked, flags = LM_FLAG_TRY;
int loops = 0;
if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
@@ -997,7 +1021,7 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
error = 0;
} else {
error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
- LM_FLAG_TRY, &al->al_rgd_gh);
+ flags, &rs->rs_rgd_gh);
}
switch (error) {
case 0:
@@ -1008,12 +1032,14 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
if (rgd->rd_flags & GFS2_RDF_CHECK)
try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
if (!rg_locked)
- gfs2_glock_dq_uninit(&al->al_rgd_gh);
+ gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
/* fall through */
case GLR_TRYFAILED:
rgd = gfs2_rgrpd_get_next(rgd);
- if (rgd == begin)
+ if (rgd == begin) {
+ flags = 0;
loops++;
+ }
break;
default:
return error;
@@ -1023,6 +1049,13 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
return -ENOSPC;
}
+static void gfs2_blkrsv_put(struct gfs2_inode *ip)
+{
+ BUG_ON(ip->i_res == NULL);
+ kfree(ip->i_res);
+ ip->i_res = NULL;
+}
+
/**
* gfs2_inplace_reserve - Reserve space in the filesystem
* @ip: the inode to reserve space for
@@ -1030,16 +1063,23 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
* Returns: errno
*/
-int gfs2_inplace_reserve(struct gfs2_inode *ip)
+int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = ip->i_alloc;
+ struct gfs2_blkreserv *rs;
int error = 0;
u64 last_unlinked = NO_BLOCK;
int tries = 0;
- if (gfs2_assert_warn(sdp, al->al_requested))
- return -EINVAL;
+ rs = gfs2_blkrsv_get(ip);
+ if (!rs)
+ return -ENOMEM;
+
+ rs->rs_requested = requested;
+ if (gfs2_assert_warn(sdp, requested)) {
+ error = -EINVAL;
+ goto out;
+ }
do {
error = get_local_rgrp(ip, &last_unlinked);
@@ -1056,6 +1096,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip)
gfs2_log_flush(sdp, NULL);
} while (tries++ < 3);
+out:
+ if (error)
+ gfs2_blkrsv_put(ip);
return error;
}
@@ -1068,10 +1111,11 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip)
void gfs2_inplace_release(struct gfs2_inode *ip)
{
- struct gfs2_alloc *al = ip->i_alloc;
+ struct gfs2_blkreserv *rs = ip->i_res;
- if (al->al_rgd_gh.gh_gl)
- gfs2_glock_dq_uninit(&al->al_rgd_gh);
+ if (rs->rs_rgd_gh.gh_gl)
+ gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+ gfs2_blkrsv_put(ip);
}
/**
@@ -1108,39 +1152,35 @@ static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
}
/**
- * rgblk_search - find a block in @old_state, change allocation
- * state to @new_state
+ * rgblk_search - find a block in @state
* @rgd: the resource group descriptor
* @goal: the goal block within the RG (start here to search for avail block)
- * @old_state: GFS2_BLKST_XXX the before-allocation state to find
- * @new_state: GFS2_BLKST_XXX the after-allocation block state
- * @n: The extent length
+ * @state: GFS2_BLKST_XXX the before-allocation state to find
+ * @dinode: TRUE if the first block we allocate is for a dinode
+ * @rbi: address of the pointer to the bitmap containing the block found
*
- * Walk rgrp's bitmap to find bits that represent a block in @old_state.
- * Add the found bitmap buffer to the transaction.
- * Set the found bits to @new_state to change block's allocation state.
+ * Walk rgrp's bitmap to find bits that represent a block in @state.
*
* This function never fails, because we wouldn't call it unless we
* know (from reservation results, etc.) that a block is available.
*
- * Scope of @goal and returned block is just within rgrp, not the whole
- * filesystem.
+ * Scope of @goal is just within rgrp, not the whole filesystem.
+ * Scope of @returned block is just within bitmap, not the whole filesystem.
*
- * Returns: the block number allocated
+ * Returns: the block number found relative to the bitmap rbi
*/
static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
- unsigned char old_state, unsigned char new_state,
- unsigned int *n)
+ unsigned char state,
+ struct gfs2_bitmap **rbi)
{
struct gfs2_bitmap *bi = NULL;
const u32 length = rgd->rd_length;
u32 blk = BFITNOENT;
unsigned int buf, x;
- const unsigned int elen = *n;
const u8 *buffer = NULL;
- *n = 0;
+ *rbi = NULL;
/* Find bitmap block that contains bits for goal block */
for (buf = 0; buf < length; buf++) {
bi = rgd->rd_bits + buf;
@@ -1163,21 +1203,21 @@ do_search:
bi = rgd->rd_bits + buf;
if (test_bit(GBF_FULL, &bi->bi_flags) &&
- (old_state == GFS2_BLKST_FREE))
+ (state == GFS2_BLKST_FREE))
goto skip;
/* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
bitmaps, so we must search the originals for that. */
buffer = bi->bi_bh->b_data + bi->bi_offset;
WARN_ON(!buffer_uptodate(bi->bi_bh));
- if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
+ if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
buffer = bi->bi_clone + bi->bi_offset;
- blk = gfs2_bitfit(buffer, bi->bi_len, goal, old_state);
+ blk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
if (blk != BFITNOENT)
break;
- if ((goal == 0) && (old_state == GFS2_BLKST_FREE))
+ if ((goal == 0) && (state == GFS2_BLKST_FREE))
set_bit(GBF_FULL, &bi->bi_flags);
/* Try next bitmap block (wrap back to rgrp header if at end) */
@@ -1187,16 +1227,37 @@ skip:
goal = 0;
}
- if (blk == BFITNOENT)
- return blk;
+ if (blk != BFITNOENT)
+ *rbi = bi;
- *n = 1;
- if (old_state == new_state)
- goto out;
+ return blk;
+}
+
+/**
+ * gfs2_alloc_extent - allocate an extent from a given bitmap
+ * @rgd: the resource group descriptor
+ * @bi: the bitmap within the rgrp
+ * @blk: the block within the bitmap
+ * @dinode: TRUE if the first block we allocate is for a dinode
+ * @n: The extent length
+ *
+ * Add the found bitmap buffer to the transaction.
+ * Set the found bits to @new_state to change block's allocation state.
+ * Returns: starting block number of the extent (fs scope)
+ */
+static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
+ u32 blk, bool dinode, unsigned int *n)
+{
+ const unsigned int elen = *n;
+ u32 goal;
+ const u8 *buffer = NULL;
+ *n = 0;
+ buffer = bi->bi_bh->b_data + bi->bi_offset;
gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
- bi, blk, new_state);
+ bi, blk, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
+ (*n)++;
goal = blk;
while (*n < elen) {
goal++;
@@ -1206,11 +1267,12 @@ skip:
GFS2_BLKST_FREE)
break;
gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
- bi, goal, new_state);
+ bi, goal, GFS2_BLKST_USED);
(*n)++;
}
-out:
- return (bi->bi_start * GFS2_NBBY) + blk;
+ blk = gfs2_bi2rgd_blk(bi, blk);
+ rgd->rd_last_alloc = blk + *n - 1;
+ return rgd->rd_data0 + blk;
}
/**
@@ -1298,121 +1360,93 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
}
/**
- * gfs2_alloc_block - Allocate one or more blocks
+ * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
* @ip: the inode to allocate the block for
* @bn: Used to return the starting block number
- * @n: requested number of blocks/extent length (value/result)
+ * @ndata: requested number of blocks/extent length (value/result)
+ * @dinode: 1 if we're allocating a dinode block, else 0
+ * @generation: the generation number of the inode
*
* Returns: 0 or error
*/
-int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
+int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
+ bool dinode, u64 *generation)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct buffer_head *dibh;
- struct gfs2_alloc *al = ip->i_alloc;
struct gfs2_rgrpd *rgd;
- u32 goal, blk;
- u64 block;
+ unsigned int ndata;
+ u32 goal, blk; /* block, within the rgrp scope */
+ u64 block; /* block, within the file system scope */
int error;
+ struct gfs2_bitmap *bi;
/* Only happens if there is a bug in gfs2, return something distinctive
* to ensure that it is noticed.
*/
- if (al == NULL)
+ if (ip->i_res == NULL)
return -ECANCELED;
rgd = ip->i_rgd;
- if (rgrp_contains_block(rgd, ip->i_goal))
+ if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
goal = ip->i_goal - rgd->rd_data0;
else
goal = rgd->rd_last_alloc;
- blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n);
+ blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
/* Since all blocks are reserved in advance, this shouldn't happen */
if (blk == BFITNOENT)
goto rgrp_error;
- rgd->rd_last_alloc = blk;
- block = rgd->rd_data0 + blk;
- ip->i_goal = block + *n - 1;
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error == 0) {
- struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal);
- brelse(dibh);
+ block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
+ ndata = *nblocks;
+ if (dinode)
+ ndata--;
+
+ if (!dinode) {
+ ip->i_goal = block + ndata - 1;
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error == 0) {
+ struct gfs2_dinode *di =
+ (struct gfs2_dinode *)dibh->b_data;
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ di->di_goal_meta = di->di_goal_data =
+ cpu_to_be64(ip->i_goal);
+ brelse(dibh);
+ }
}
- if (rgd->rd_free < *n)
+ if (rgd->rd_free < *nblocks)
goto rgrp_error;
- rgd->rd_free -= *n;
-
- gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
- gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
-
- al->al_alloced += *n;
-
- gfs2_statfs_change(sdp, 0, -(s64)*n, 0);
- gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid);
-
- rgd->rd_free_clone -= *n;
- trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED);
- *bn = block;
- return 0;
-
-rgrp_error:
- gfs2_rgrp_error(rgd);
- return -EIO;
-}
-
-/**
- * gfs2_alloc_di - Allocate a dinode
- * @dip: the directory that the inode is going in
- * @bn: the block number which is allocated
- * @generation: the generation number of the inode
- *
- * Returns: 0 on success or error
- */
-
-int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
- struct gfs2_alloc *al = dip->i_alloc;
- struct gfs2_rgrpd *rgd = dip->i_rgd;
- u32 blk;
- u64 block;
- unsigned int n = 1;
-
- blk = rgblk_search(rgd, rgd->rd_last_alloc,
- GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n);
-
- /* Since all blocks are reserved in advance, this shouldn't happen */
- if (blk == BFITNOENT)
- goto rgrp_error;
-
- rgd->rd_last_alloc = blk;
- block = rgd->rd_data0 + blk;
- if (rgd->rd_free == 0)
- goto rgrp_error;
-
- rgd->rd_free--;
- rgd->rd_dinodes++;
- *generation = rgd->rd_igeneration++;
- if (*generation == 0)
+ rgd->rd_free -= *nblocks;
+ if (dinode) {
+ rgd->rd_dinodes++;
*generation = rgd->rd_igeneration++;
+ if (*generation == 0)
+ *generation = rgd->rd_igeneration++;
+ }
+
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
- al->al_alloced++;
+ gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
+ if (dinode)
+ gfs2_trans_add_unrevoke(sdp, block, 1);
- gfs2_statfs_change(sdp, 0, -1, +1);
- gfs2_trans_add_unrevoke(sdp, block, 1);
+ /*
+ * This needs reviewing to see why we cannot do the quota change
+ * at this point in the dinode case.
+ */
+ if (ndata)
+ gfs2_quota_change(ip, ndata, ip->i_inode.i_uid,
+ ip->i_inode.i_gid);
- rgd->rd_free_clone--;
- trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
+ rgd->rd_free_clone -= *nblocks;
+ trace_gfs2_block_alloc(ip, block, *nblocks,
+ dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
*bn = block;
return 0;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index cf5c50180192..ceec9106cdf4 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -28,19 +28,19 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
-extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
-static inline void gfs2_alloc_put(struct gfs2_inode *ip)
+extern struct gfs2_qadata *gfs2_qadata_get(struct gfs2_inode *ip);
+static inline void gfs2_qadata_put(struct gfs2_inode *ip)
{
- BUG_ON(ip->i_alloc == NULL);
- kfree(ip->i_alloc);
- ip->i_alloc = NULL;
+ BUG_ON(ip->i_qadata == NULL);
+ kfree(ip->i_qadata);
+ ip->i_qadata = NULL;
}
-extern int gfs2_inplace_reserve(struct gfs2_inode *ip);
+extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested);
extern void gfs2_inplace_release(struct gfs2_inode *ip);
-extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
-extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
+extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
+ bool dinode, u64 *generation);
extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 71e420989f77..4553ce515f62 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1284,18 +1284,18 @@ static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
/**
* gfs2_show_options - Show mount options for /proc/mounts
* @s: seq_file structure
- * @mnt: vfsmount
+ * @root: root of this (sub)tree
*
* Returns: 0 on success or error code
*/
-static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+static int gfs2_show_options(struct seq_file *s, struct dentry *root)
{
- struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
+ struct gfs2_sbd *sdp = root->d_sb->s_fs_info;
struct gfs2_args *args = &sdp->sd_args;
int val;
- if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
+ if (is_ancestor(root, sdp->sd_master_dir))
seq_printf(s, ",meta");
if (args->ar_lockproto[0])
seq_printf(s, ",lockproto=%s", args->ar_lockproto);
@@ -1399,8 +1399,9 @@ static void gfs2_final_release_pages(struct gfs2_inode *ip)
static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al;
+ struct gfs2_qadata *qa;
struct gfs2_rgrpd *rgd;
+ struct gfs2_holder gh;
int error;
if (gfs2_get_inode_blocks(&ip->i_inode) != 1) {
@@ -1408,8 +1409,8 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
return -EIO;
}
- al = gfs2_alloc_get(ip);
- if (!al)
+ qa = gfs2_qadata_get(ip);
+ if (!qa)
return -ENOMEM;
error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
@@ -1423,8 +1424,7 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
goto out_qs;
}
- error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
- &al->al_rgd_gh);
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
if (error)
goto out_qs;
@@ -1440,11 +1440,11 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
gfs2_trans_end(sdp);
out_rg_gunlock:
- gfs2_glock_dq_uninit(&al->al_rgd_gh);
+ gfs2_glock_dq_uninit(&gh);
out_qs:
gfs2_quota_unhold(ip);
out:
- gfs2_alloc_put(ip);
+ gfs2_qadata_put(ip);
return error;
}
@@ -1582,7 +1582,6 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
static void gfs2_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(gfs2_inode_cachep, inode);
}
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 443cabcfcd23..d33172c291ba 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -298,7 +298,7 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf)
ssize_t ret;
int val = 0;
- if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))
val = 1;
ret = sprintf(buf, "%d\n", val);
return ret;
@@ -313,9 +313,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
val = simple_strtol(buf, NULL, 0);
if (val == 1)
- set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
+ set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
else if (val == 0) {
- clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
+ clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
smp_mb__after_clear_bit();
gfs2_glock_thaw(sdp);
} else {
@@ -350,8 +350,8 @@ static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
goto out;
if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
goto out;
- sdp->sd_lockstruct.ls_first = first;
- rv = 0;
+ sdp->sd_lockstruct.ls_first = first;
+ rv = 0;
out:
spin_unlock(&sdp->sd_jindex_spin);
return rv ? rv : len;
@@ -360,19 +360,14 @@ out:
static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- return sprintf(buf, "%d\n", ls->ls_first_done);
+ return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags));
}
-static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid)
{
- unsigned jid;
struct gfs2_jdesc *jd;
int rv;
- rv = sscanf(buf, "%u", &jid);
- if (rv != 1)
- return -EINVAL;
-
rv = -ESHUTDOWN;
spin_lock(&sdp->sd_jindex_spin);
if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
@@ -389,6 +384,20 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
}
out:
spin_unlock(&sdp->sd_jindex_spin);
+ return rv;
+}
+
+static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+ unsigned jid;
+ int rv;
+
+ rv = sscanf(buf, "%u", &jid);
+ if (rv != 1)
+ return -EINVAL;
+
+ rv = gfs2_recover_set(sdp, jid);
+
return rv ? rv : len;
}
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index e94560e836d7..79182d6ad6ac 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -19,5 +19,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
int gfs2_sys_init(void);
void gfs2_sys_uninit(void);
+int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid);
+
#endif /* __SYS_DOT_H__ */
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index f8f101ef600c..125d4572e1c0 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -30,9 +30,9 @@ struct gfs2_glock;
* block, or all of the blocks in the rg, whichever is smaller */
static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip)
{
- const struct gfs2_alloc *al = ip->i_alloc;
- if (al->al_requested < ip->i_rgd->rd_length)
- return al->al_requested + 1;
+ const struct gfs2_blkreserv *rs = ip->i_res;
+ if (rs->rs_requested < ip->i_rgd->rd_length)
+ return rs->rs_requested + 1;
return ip->i_rgd->rd_length;
}
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 71d7bf830c09..e9636591b5d5 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -321,11 +321,11 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
struct gfs2_ea_header *ea,
struct gfs2_ea_header *prev, int leave)
{
- struct gfs2_alloc *al;
+ struct gfs2_qadata *qa;
int error;
- al = gfs2_alloc_get(ip);
- if (!al)
+ qa = gfs2_qadata_get(ip);
+ if (!qa)
return -ENOMEM;
error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
@@ -336,7 +336,7 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
gfs2_quota_unhold(ip);
out_alloc:
- gfs2_alloc_put(ip);
+ gfs2_qadata_put(ip);
return error;
}
@@ -549,9 +549,10 @@ int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata)
goto out;
error = gfs2_ea_get_copy(ip, &el, data, len);
- if (error == 0)
- error = len;
- *ppdata = data;
+ if (error < 0)
+ kfree(data);
+ else
+ *ppdata = data;
out:
brelse(el.el_bh);
return error;
@@ -609,7 +610,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
u64 block;
int error;
- error = gfs2_alloc_block(ip, &block, &n);
+ error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
if (error)
return error;
gfs2_trans_add_unrevoke(sdp, block, 1);
@@ -671,7 +672,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
int mh_size = sizeof(struct gfs2_meta_header);
unsigned int n = 1;
- error = gfs2_alloc_block(ip, &block, &n);
+ error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
if (error)
return error;
gfs2_trans_add_unrevoke(sdp, block, 1);
@@ -708,21 +709,19 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
unsigned int blks,
ea_skeleton_call_t skeleton_call, void *private)
{
- struct gfs2_alloc *al;
+ struct gfs2_qadata *qa;
struct buffer_head *dibh;
int error;
- al = gfs2_alloc_get(ip);
- if (!al)
+ qa = gfs2_qadata_get(ip);
+ if (!qa)
return -ENOMEM;
error = gfs2_quota_lock_check(ip);
if (error)
goto out;
- al->al_requested = blks;
-
- error = gfs2_inplace_reserve(ip);
+ error = gfs2_inplace_reserve(ip, blks);
if (error)
goto out_gunlock_q;
@@ -751,7 +750,7 @@ out_ipres:
out_gunlock_q:
gfs2_quota_unlock(ip);
out:
- gfs2_alloc_put(ip);
+ gfs2_qadata_put(ip);
return error;
}
@@ -991,7 +990,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
} else {
u64 blk;
unsigned int n = 1;
- error = gfs2_alloc_block(ip, &blk, &n);
+ error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL);
if (error)
return error;
gfs2_trans_add_unrevoke(sdp, blk, 1);
@@ -1435,9 +1434,9 @@ out:
static int ea_dealloc_block(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = ip->i_alloc;
struct gfs2_rgrpd *rgd;
struct buffer_head *dibh;
+ struct gfs2_holder gh;
int error;
rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr);
@@ -1446,8 +1445,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
return -EIO;
}
- error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
- &al->al_rgd_gh);
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
if (error)
return error;
@@ -1471,7 +1469,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
gfs2_trans_end(sdp);
out_gunlock:
- gfs2_glock_dq_uninit(&al->al_rgd_gh);
+ gfs2_glock_dq_uninit(&gh);
return error;
}
@@ -1484,11 +1482,11 @@ out_gunlock:
int gfs2_ea_dealloc(struct gfs2_inode *ip)
{
- struct gfs2_alloc *al;
+ struct gfs2_qadata *qa;
int error;
- al = gfs2_alloc_get(ip);
- if (!al)
+ qa = gfs2_qadata_get(ip);
+ if (!qa)
return -ENOMEM;
error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
@@ -1510,7 +1508,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
out_quota:
gfs2_quota_unhold(ip);
out_alloc:
- gfs2_alloc_put(ip);
+ gfs2_qadata_put(ip);
return error;
}
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index bce4eef91a06..62fc14ea4b73 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -186,7 +186,7 @@ static int hfs_dir_release(struct inode *inode, struct file *file)
* a directory and return a corresponding inode, given the inode for
* the directory and the name (and its length) of the new file.
*/
-static int hfs_create(struct inode *dir, struct dentry *dentry, int mode,
+static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
struct inode *inode;
@@ -216,7 +216,7 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, int mode,
* in a directory, given the inode for the parent directory and the
* name (and its length) of the new directory.
*/
-static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
int res;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index ad97c2d58287..1bf967c6bfdc 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -184,7 +184,7 @@ extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern const struct address_space_operations hfs_aops;
extern const struct address_space_operations hfs_btree_aops;
-extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int);
+extern struct inode *hfs_new_inode(struct inode *, struct qstr *, umode_t);
extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
extern int hfs_write_inode(struct inode *, struct writeback_control *);
extern int hfs_inode_setattr(struct dentry *, struct iattr *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a1a9fdcd2a00..737dbeb64320 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -169,7 +169,7 @@ const struct address_space_operations hfs_aops = {
/*
* hfs_new_inode
*/
-struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
+struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct inode *inode = new_inode(sb);
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 1b55f704fb22..8137fb3e6780 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -133,9 +133,9 @@ static int hfs_remount(struct super_block *sb, int *flags, char *data)
return 0;
}
-static int hfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
+static int hfs_show_options(struct seq_file *seq, struct dentry *root)
{
- struct hfs_sb_info *sbi = HFS_SB(mnt->mnt_sb);
+ struct hfs_sb_info *sbi = HFS_SB(root->d_sb);
if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator);
@@ -170,7 +170,6 @@ static struct inode *hfs_alloc_inode(struct super_block *sb)
static void hfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(hfs_inode_cachep, HFS_I(inode));
}
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 4536cd3f15ae..88e155f895c6 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -424,7 +424,7 @@ out:
}
static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
struct inode *inode;
@@ -453,13 +453,13 @@ out:
return res;
}
-static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
+static int hfsplus_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
return hfsplus_mknod(dir, dentry, mode, 0);
}
-static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
}
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index d7674d051f52..21a5b7fc6db4 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -402,7 +402,7 @@ void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *);
void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *);
int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *);
int hfsplus_cat_write_inode(struct inode *);
-struct inode *hfsplus_new_inode(struct super_block *, int);
+struct inode *hfsplus_new_inode(struct super_block *, umode_t);
void hfsplus_delete_inode(struct inode *);
int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
@@ -419,7 +419,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
int hfsplus_parse_options(char *, struct hfsplus_sb_info *);
int hfsplus_parse_options_remount(char *input, int *force);
void hfsplus_fill_defaults(struct hfsplus_sb_info *);
-int hfsplus_show_options(struct seq_file *, struct vfsmount *);
+int hfsplus_show_options(struct seq_file *, struct dentry *);
/* super.c */
struct inode *hfsplus_iget(struct super_block *, unsigned long);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 40e1413be4cf..6643b242bdd7 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -378,7 +378,7 @@ static const struct file_operations hfsplus_file_operations = {
.unlocked_ioctl = hfsplus_ioctl,
};
-struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
+struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)
{
struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
struct inode *inode = new_inode(sb);
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index fbaa6690c8e0..f66c7655b3f7 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -43,7 +43,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
unsigned int flags;
int err = 0;
- err = mnt_want_write(file->f_path.mnt);
+ err = mnt_want_write_file(file);
if (err)
goto out;
@@ -94,7 +94,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
out_unlock_inode:
mutex_unlock(&inode->i_mutex);
out_drop_write:
- mnt_drop_write(file->f_path.mnt);
+ mnt_drop_write_file(file);
out:
return err;
}
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index bb62a5882147..06fa5618600c 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -206,9 +206,9 @@ done:
return 1;
}
-int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
+int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
{
- struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb);
if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index d24a9b666a23..427682ca9e48 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -499,9 +499,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
if (!sbi->hidden_dir) {
mutex_lock(&sbi->vh_mutex);
sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
- hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str,
- sbi->hidden_dir);
+ if (!sbi->hidden_dir) {
+ mutex_unlock(&sbi->vh_mutex);
+ err = -ENOMEM;
+ goto out_put_root;
+ }
+ err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root,
+ &str, sbi->hidden_dir);
mutex_unlock(&sbi->vh_mutex);
+ if (err)
+ goto out_put_hidden_dir;
hfsplus_mark_inode_dirty(sbi->hidden_dir,
HFSPLUS_I_CAT_DIRTY);
@@ -558,7 +565,6 @@ static void hfsplus_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
}
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index bf15a43016b9..3cbfa93cd782 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -39,7 +39,7 @@
struct hostfs_iattr {
unsigned int ia_valid;
- mode_t ia_mode;
+ unsigned short ia_mode;
uid_t ia_uid;
gid_t ia_gid;
loff_t ia_size;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2f72da5ae686..e130bd46d671 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -250,7 +250,6 @@ static void hostfs_evict_inode(struct inode *inode)
static void hostfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kfree(HOSTFS_I(inode));
}
@@ -259,9 +258,9 @@ static void hostfs_destroy_inode(struct inode *inode)
call_rcu(&inode->i_rcu, hostfs_i_callback);
}
-static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
{
- const char *root_path = vfs->mnt_sb->s_fs_info;
+ const char *root_path = root->d_sb->s_fs_info;
size_t offset = strlen(root_ino) + 1;
if (strlen(root_path) > offset)
@@ -552,7 +551,7 @@ static int read_name(struct inode *ino, char *name)
return 0;
}
-int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
+int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
struct inode *inode;
@@ -677,7 +676,7 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
return err;
}
-int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode)
+int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode)
{
char *file;
int err;
@@ -701,7 +700,7 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
return err;
}
-int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
char *name;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index ea91fcb0ef9b..30dd7b10b507 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -8,7 +8,7 @@
#include <linux/sched.h>
#include "hpfs_fn.h"
-static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
const unsigned char *name = dentry->d_name.name;
unsigned len = dentry->d_name.len;
@@ -115,7 +115,7 @@ bail:
return err;
}
-static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
+static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd)
{
const unsigned char *name = dentry->d_name.name;
unsigned len = dentry->d_name.len;
@@ -201,7 +201,7 @@ bail:
return err;
}
-static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
+static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
{
const unsigned char *name = dentry->d_name.name;
unsigned len = dentry->d_name.len;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 98580a3b5005..3690467c944e 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -181,7 +181,6 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb)
static void hpfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
}
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index f590b1160c6c..d92f4ce80925 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -622,7 +622,6 @@ void hppfs_evict_inode(struct inode *ino)
static void hppfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kfree(HPPFS_I(inode));
}
@@ -726,7 +725,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
sb->s_fs_info = proc_mnt;
err = -ENOMEM;
- root_inode = get_inode(sb, dget(proc_mnt->mnt_sb->s_root));
+ root_inode = get_inode(sb, dget(proc_mnt->mnt_root));
if (!root_inode)
goto out_mntput;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 0be5a78598d0..1e85a7ac0217 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -447,8 +447,8 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
return 0;
}
-static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
- gid_t gid, int mode, dev_t dev)
+static struct inode *hugetlbfs_get_root(struct super_block *sb,
+ struct hugetlbfs_config *config)
{
struct inode *inode;
@@ -456,9 +456,31 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
if (inode) {
struct hugetlbfs_inode_info *info;
inode->i_ino = get_next_ino();
- inode->i_mode = mode;
- inode->i_uid = uid;
- inode->i_gid = gid;
+ inode->i_mode = S_IFDIR | config->mode;
+ inode->i_uid = config->uid;
+ inode->i_gid = config->gid;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ info = HUGETLBFS_I(inode);
+ mpol_shared_policy_init(&info->policy, NULL);
+ inode->i_op = &hugetlbfs_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ /* directory inodes start off with i_nlink == 2 (for "." entry) */
+ inc_nlink(inode);
+ }
+ return inode;
+}
+
+static struct inode *hugetlbfs_get_inode(struct super_block *sb,
+ struct inode *dir,
+ umode_t mode, dev_t dev)
+{
+ struct inode *inode;
+
+ inode = new_inode(sb);
+ if (inode) {
+ struct hugetlbfs_inode_info *info;
+ inode->i_ino = get_next_ino();
+ inode_init_owner(inode, dir, mode);
inode->i_mapping->a_ops = &hugetlbfs_aops;
inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -500,20 +522,12 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
* File creation. Allocate an inode, and we're done..
*/
static int hugetlbfs_mknod(struct inode *dir,
- struct dentry *dentry, int mode, dev_t dev)
+ struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
int error = -ENOSPC;
- gid_t gid;
-
- if (dir->i_mode & S_ISGID) {
- gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else {
- gid = current_fsgid();
- }
- inode = hugetlbfs_get_inode(dir->i_sb, current_fsuid(), gid, mode, dev);
+
+ inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
if (inode) {
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
d_instantiate(dentry, inode);
@@ -523,7 +537,7 @@ static int hugetlbfs_mknod(struct inode *dir,
return error;
}
-static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
if (!retval)
@@ -531,7 +545,7 @@ static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
return retval;
}
-static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
+static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd)
{
return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
}
@@ -541,15 +555,8 @@ static int hugetlbfs_symlink(struct inode *dir,
{
struct inode *inode;
int error = -ENOSPC;
- gid_t gid;
-
- if (dir->i_mode & S_ISGID)
- gid = dir->i_gid;
- else
- gid = current_fsgid();
- inode = hugetlbfs_get_inode(dir->i_sb, current_fsuid(),
- gid, S_IFLNK|S_IRWXUGO, 0);
+ inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
if (inode) {
int l = strlen(symname)+1;
error = page_symlink(inode, symname, l);
@@ -576,7 +583,8 @@ static int hugetlbfs_set_page_dirty(struct page *page)
}
static int hugetlbfs_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
int rc;
@@ -666,7 +674,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
static void hugetlbfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}
@@ -858,8 +865,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
sb->s_time_gran = 1;
- inode = hugetlbfs_get_inode(sb, config.uid, config.gid,
- S_IFDIR | config.mode, 0);
+ inode = hugetlbfs_get_root(sb, &config);
if (!inode)
goto out_free;
@@ -957,8 +963,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
path.mnt = mntget(hugetlbfs_vfsmount);
error = -ENOSPC;
- inode = hugetlbfs_get_inode(root->d_sb, current_fsuid(),
- current_fsgid(), S_IFREG | S_IRWXUGO, 0);
+ inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
if (!inode)
goto out_dentry;
diff --git a/fs/inode.c b/fs/inode.c
index ee4e66b998f4..83ab215baab1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -26,6 +26,7 @@
#include <linux/ima.h>
#include <linux/cred.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
+#include <linux/ratelimit.h>
#include "internal.h"
/*
@@ -191,6 +192,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
}
inode->i_private = NULL;
inode->i_mapping = mapping;
+ INIT_LIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif
@@ -241,6 +243,11 @@ void __destroy_inode(struct inode *inode)
BUG_ON(inode_has_buffers(inode));
security_inode_free(inode);
fsnotify_inode_delete(inode);
+ if (!inode->i_nlink) {
+ WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
+ atomic_long_dec(&inode->i_sb->s_remove_count);
+ }
+
#ifdef CONFIG_FS_POSIX_ACL
if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED)
posix_acl_release(inode->i_acl);
@@ -254,7 +261,6 @@ EXPORT_SYMBOL(__destroy_inode);
static void i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(inode_cachep, inode);
}
@@ -268,6 +274,82 @@ static void destroy_inode(struct inode *inode)
call_rcu(&inode->i_rcu, i_callback);
}
+/**
+ * drop_nlink - directly drop an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink. In cases
+ * where we are attempting to track writes to the
+ * filesystem, a decrement to zero means an imminent
+ * write when the file is truncated and actually unlinked
+ * on the filesystem.
+ */
+void drop_nlink(struct inode *inode)
+{
+ WARN_ON(inode->i_nlink == 0);
+ inode->__i_nlink--;
+ if (!inode->i_nlink)
+ atomic_long_inc(&inode->i_sb->s_remove_count);
+}
+EXPORT_SYMBOL(drop_nlink);
+
+/**
+ * clear_nlink - directly zero an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink. See
+ * drop_nlink() for why we care about i_nlink hitting zero.
+ */
+void clear_nlink(struct inode *inode)
+{
+ if (inode->i_nlink) {
+ inode->__i_nlink = 0;
+ atomic_long_inc(&inode->i_sb->s_remove_count);
+ }
+}
+EXPORT_SYMBOL(clear_nlink);
+
+/**
+ * set_nlink - directly set an inode's link count
+ * @inode: inode
+ * @nlink: new nlink (should be non-zero)
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink.
+ */
+void set_nlink(struct inode *inode, unsigned int nlink)
+{
+ if (!nlink) {
+ clear_nlink(inode);
+ } else {
+ /* Yes, some filesystems do change nlink from zero to one */
+ if (inode->i_nlink == 0)
+ atomic_long_dec(&inode->i_sb->s_remove_count);
+
+ inode->__i_nlink = nlink;
+ }
+}
+EXPORT_SYMBOL(set_nlink);
+
+/**
+ * inc_nlink - directly increment an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink. Currently,
+ * it is only here for parity with dec_nlink().
+ */
+void inc_nlink(struct inode *inode)
+{
+ if (WARN_ON(inode->i_nlink == 0))
+ atomic_long_dec(&inode->i_sb->s_remove_count);
+
+ inode->__i_nlink++;
+}
+EXPORT_SYMBOL(inc_nlink);
+
void address_space_init_once(struct address_space *mapping)
{
memset(mapping, 0, sizeof(*mapping));
@@ -290,7 +372,6 @@ void inode_init_once(struct inode *inode)
{
memset(inode, 0, sizeof(*inode));
INIT_HLIST_NODE(&inode->i_hash);
- INIT_LIST_HEAD(&inode->i_dentry);
INIT_LIST_HEAD(&inode->i_devices);
INIT_LIST_HEAD(&inode->i_wb_list);
INIT_LIST_HEAD(&inode->i_lru);
@@ -692,6 +773,8 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
else
__count_vm_events(PGINODESTEAL, reap);
spin_unlock(&sb->s_inode_lru_lock);
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += reap;
dispose_list(&freeable);
}
@@ -855,8 +938,7 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode)
struct file_system_type *type = inode->i_sb->s_type;
/* Set new key only if filesystem hasn't already changed it */
- if (!lockdep_match_class(&inode->i_mutex,
- &type->i_mutex_key)) {
+ if (lockdep_match_class(&inode->i_mutex, &type->i_mutex_key)) {
/*
* ensure nobody is actually holding i_mutex
*/
@@ -883,6 +965,7 @@ void unlock_new_inode(struct inode *inode)
spin_lock(&inode->i_lock);
WARN_ON(!(inode->i_state & I_NEW));
inode->i_state &= ~I_NEW;
+ smp_mb();
wake_up_bit(&inode->i_state, __I_NEW);
spin_unlock(&inode->i_lock);
}
@@ -1508,7 +1591,7 @@ void file_update_time(struct file *file)
if (sync_it & S_MTIME)
inode->i_mtime = now;
mark_inode_dirty_sync(inode);
- mnt_drop_write(file->f_path.mnt);
+ mnt_drop_write_file(file);
}
EXPORT_SYMBOL(file_update_time);
@@ -1568,7 +1651,7 @@ __setup("ihash_entries=", set_ihash_entries);
*/
void __init inode_init_early(void)
{
- int loop;
+ unsigned int loop;
/* If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
@@ -1586,13 +1669,13 @@ void __init inode_init_early(void)
&i_hash_mask,
0);
- for (loop = 0; loop < (1 << i_hash_shift); loop++)
+ for (loop = 0; loop < (1U << i_hash_shift); loop++)
INIT_HLIST_HEAD(&inode_hashtable[loop]);
}
void __init inode_init(void)
{
- int loop;
+ unsigned int loop;
/* inode slab cache */
inode_cachep = kmem_cache_create("inode_cache",
@@ -1616,7 +1699,7 @@ void __init inode_init(void)
&i_hash_mask,
0);
- for (loop = 0; loop < (1 << i_hash_shift); loop++)
+ for (loop = 0; loop < (1U << i_hash_shift); loop++)
INIT_HLIST_HEAD(&inode_hashtable[loop]);
}
@@ -1647,7 +1730,7 @@ EXPORT_SYMBOL(init_special_inode);
* @mode: mode of the new inode
*/
void inode_init_owner(struct inode *inode, const struct inode *dir,
- mode_t mode)
+ umode_t mode)
{
inode->i_uid = current_fsuid();
if (dir && dir->i_mode & S_ISGID) {
diff --git a/fs/internal.h b/fs/internal.h
index fe327c20af83..9962c59ba280 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -15,19 +15,14 @@ struct super_block;
struct file_system_type;
struct linux_binprm;
struct path;
+struct mount;
/*
* block_dev.c
*/
#ifdef CONFIG_BLOCK
-extern struct super_block *blockdev_superblock;
extern void __init bdev_cache_init(void);
-static inline int sb_is_blkdev_sb(struct super_block *sb)
-{
- return sb == blockdev_superblock;
-}
-
extern int __sync_blockdev(struct block_device *bdev, int wait);
#else
@@ -35,11 +30,6 @@ static inline void bdev_cache_init(void)
{
}
-static inline int sb_is_blkdev_sb(struct super_block *sb)
-{
- return 0;
-}
-
static inline int __sync_blockdev(struct block_device *bdev, int wait)
{
return 0;
@@ -52,28 +42,17 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
extern void __init chrdev_init(void);
/*
- * exec.c
- */
-extern int check_unsafe_exec(struct linux_binprm *);
-
-/*
* namespace.c
*/
extern int copy_mount_options(const void __user *, unsigned long *);
extern int copy_mount_string(const void __user *, char **);
-extern unsigned int mnt_get_count(struct vfsmount *mnt);
-extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
extern struct vfsmount *lookup_mnt(struct path *);
-extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
- struct vfsmount *);
-extern void release_mounts(struct list_head *);
-extern void umount_tree(struct vfsmount *, int, struct list_head *);
-extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
extern int finish_automount(struct vfsmount *, struct path *);
extern void mnt_make_longterm(struct vfsmount *);
extern void mnt_make_shortterm(struct vfsmount *);
+extern int sb_prepare_remount_readonly(struct super_block *);
extern void __init mnt_init(void);
@@ -98,10 +77,9 @@ extern struct file *get_empty_filp(void);
*/
extern int do_remount_sb(struct super_block *, int, void *, int);
extern bool grab_super_passive(struct super_block *sb);
-extern void __put_super(struct super_block *sb);
-extern void put_super(struct super_block *sb);
extern struct dentry *mount_fs(struct file_system_type *,
int, const char *, void *);
+extern struct super_block *user_get_super(dev_t);
/*
* open.c
@@ -111,7 +89,7 @@ extern struct file *nameidata_to_filp(struct nameidata *);
extern void release_open_intent(struct nameidata *);
struct open_flags {
int open_flag;
- int mode;
+ umode_t mode;
int acc_mode;
int intent;
};
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 1d9b9fcb2db4..066836e81848 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -42,7 +42,7 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd,
error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
if (error == -ENOIOCTLCMD)
- error = -EINVAL;
+ error = -ENOTTY;
out:
return error;
}
diff --git a/fs/ioprio.c b/fs/ioprio.c
index f79dab83e17b..0f1b9515213b 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -48,28 +48,12 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
if (err)
return err;
- task_lock(task);
- do {
- ioc = task->io_context;
- /* see wmb() in current_io_context() */
- smp_read_barrier_depends();
- if (ioc)
- break;
-
- ioc = alloc_io_context(GFP_ATOMIC, -1);
- if (!ioc) {
- err = -ENOMEM;
- break;
- }
- task->io_context = ioc;
- } while (1);
-
- if (!err) {
- ioc->ioprio = ioprio;
- ioc->ioprio_changed = 1;
+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
+ if (ioc) {
+ ioc_ioprio_changed(ioc, ioprio);
+ put_io_context(ioc);
}
- task_unlock(task);
return err;
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index f950059525fc..bd62c76fb5df 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -85,7 +85,6 @@ static struct inode *isofs_alloc_inode(struct super_block *sb)
static void isofs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
}
@@ -170,8 +169,8 @@ struct iso9660_options{
unsigned char map;
unsigned char check;
unsigned int blocksize;
- mode_t fmode;
- mode_t dmode;
+ umode_t fmode;
+ umode_t dmode;
gid_t gid;
uid_t uid;
char *iocharset;
@@ -949,8 +948,11 @@ root_found:
/* get the root dentry */
s->s_root = d_alloc_root(inode);
- if (!(s->s_root))
- goto out_no_root;
+ if (!(s->s_root)) {
+ iput(inode);
+ error = -ENOMEM;
+ goto out_no_inode;
+ }
kfree(opt.iocharset);
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 7d33de84f52a..0e73f63d9274 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -50,14 +50,14 @@ struct isofs_sb_info {
unsigned int s_uid_set:1;
unsigned int s_gid_set:1;
- mode_t s_fmode;
- mode_t s_dmode;
+ umode_t s_fmode;
+ umode_t s_dmode;
gid_t s_gid;
uid_t s_uid;
struct nls_table *s_nls_iocharset; /* Native language support table */
};
-#define ISOFS_INVALID_MODE ((mode_t) -1)
+#define ISOFS_INVALID_MODE ((umode_t) -1)
static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb)
{
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index f94fc48ff3a0..05f0754f2b46 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -453,8 +453,6 @@ out:
*
* Return <0 on error, 0 on success, 1 if there was nothing to clean up.
*
- * Called with the journal lock held.
- *
* This is the only part of the journaling code which really needs to be
* aware of transaction aborts. Checkpointing involves writing to the
* main filesystem area rather than to the journal, so it can proceed
@@ -472,13 +470,14 @@ int cleanup_journal_tail(journal_t *journal)
if (is_journal_aborted(journal))
return 1;
- /* OK, work out the oldest transaction remaining in the log, and
+ /*
+ * OK, work out the oldest transaction remaining in the log, and
* the log block it starts at.
*
* If the log is now empty, we need to work out which is the
* next transaction ID we will write, and where it will
- * start. */
-
+ * start.
+ */
spin_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
transaction = journal->j_checkpoint_transactions;
@@ -504,7 +503,25 @@ int cleanup_journal_tail(journal_t *journal)
spin_unlock(&journal->j_state_lock);
return 1;
}
+ spin_unlock(&journal->j_state_lock);
+ /*
+ * We need to make sure that any blocks that were recently written out
+ * --- perhaps by log_do_checkpoint() --- are flushed out before we
+ * drop the transactions from the journal. It's unlikely this will be
+ * necessary, especially with an appropriately sized journal, but we
+ * need this to guarantee correctness. Fortunately
+ * cleanup_journal_tail() doesn't get called all that often.
+ */
+ if (journal->j_flags & JFS_BARRIER)
+ blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+
+ spin_lock(&journal->j_state_lock);
+ if (!tid_gt(first_tid, journal->j_tail_sequence)) {
+ spin_unlock(&journal->j_state_lock);
+ /* Someone else cleaned up journal so return 0 */
+ return 0;
+ }
/* OK, update the superblock to recover the freed space.
* Physical blocks come first: have we wrapped beyond the end of
* the log? */
@@ -537,7 +554,7 @@ int cleanup_journal_tail(journal_t *journal)
* them.
*
* Called with j_list_lock held.
- * Returns number of bufers reaped (for debug)
+ * Returns number of buffers reaped (for debug)
*/
static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 8799207df058..f2b9a571f4cf 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -392,6 +392,12 @@ void journal_commit_transaction(journal_t *journal)
jbd_debug (3, "JBD: commit phase 1\n");
/*
+ * Clear revoked flag to reflect there is no revoked buffers
+ * in the next transaction which is going to be started.
+ */
+ journal_clear_buffer_revoked_flags(journal);
+
+ /*
* Switch to a new revoke table.
*/
journal_switch_revoke_table(journal);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index fea8dd661d2b..59c09f9541b5 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -166,7 +166,7 @@ loop:
*/
jbd_debug(1, "Now suspending kjournald\n");
spin_unlock(&journal->j_state_lock);
- refrigerator();
+ try_to_freeze();
spin_lock(&journal->j_state_lock);
} else {
/*
@@ -721,7 +721,6 @@ static journal_t * journal_init_common (void)
init_waitqueue_head(&journal->j_wait_checkpoint);
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
- mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
spin_lock_init(&journal->j_list_lock);
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 5b43e96788e6..008bf062fd26 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,6 +20,7 @@
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
+#include <linux/blkdev.h>
#endif
/*
@@ -263,6 +264,9 @@ int journal_recover(journal_t *journal)
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
+ /* Flush disk caches to get replayed data on the permanent storage */
+ if (journal->j_flags & JFS_BARRIER)
+ blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
return err;
}
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 305a90763154..25c713e7071c 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -47,6 +47,10 @@
* overwriting the new data. We don't even need to clear the revoke
* bit here.
*
+ * We cache revoke status of a buffer in the current transaction in b_states
+ * bits. As the name says, revokevalid flag indicates that the cached revoke
+ * status of a buffer is valid and we can rely on the cached status.
+ *
* Revoke information on buffers is a tri-state value:
*
* RevokeValid clear: no cached revoke status, need to look it up
@@ -479,6 +483,36 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
return did_revoke;
}
+/*
+ * journal_clear_revoked_flags clears revoked flag of buffers in
+ * revoke table to reflect there is no revoked buffer in the next
+ * transaction which is going to be started.
+ */
+void journal_clear_buffer_revoked_flags(journal_t *journal)
+{
+ struct jbd_revoke_table_s *revoke = journal->j_revoke;
+ int i = 0;
+
+ for (i = 0; i < revoke->hash_size; i++) {
+ struct list_head *hash_list;
+ struct list_head *list_entry;
+ hash_list = &revoke->hash_table[i];
+
+ list_for_each(list_entry, hash_list) {
+ struct jbd_revoke_record_s *record;
+ struct buffer_head *bh;
+ record = (struct jbd_revoke_record_s *)list_entry;
+ bh = __find_get_block(journal->j_fs_dev,
+ record->blocknr,
+ journal->j_blocksize);
+ if (bh) {
+ clear_buffer_revoked(bh);
+ __brelse(bh);
+ }
+ }
+ }
+}
+
/* journal_switch_revoke table select j_revoke for next transaction
* we do not want to suspend any processing until all revokes are
* written -bzzz
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 7e59c6e66f9b..7fce94b04bc3 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -426,17 +426,34 @@ int journal_restart(handle_t *handle, int nblocks)
* void journal_lock_updates () - establish a transaction barrier.
* @journal: Journal to establish a barrier on.
*
- * This locks out any further updates from being started, and blocks
- * until all existing updates have completed, returning only once the
- * journal is in a quiescent state with no updates running.
- *
- * The journal lock should not be held on entry.
+ * This locks out any further updates from being started, and blocks until all
+ * existing updates have completed, returning only once the journal is in a
+ * quiescent state with no updates running.
+ *
+ * We do not use simple mutex for synchronization as there are syscalls which
+ * want to return with filesystem locked and that trips up lockdep. Also
+ * hibernate needs to lock filesystem but locked mutex then blocks hibernation.
+ * Since locking filesystem is rare operation, we use simple counter and
+ * waitqueue for locking.
*/
void journal_lock_updates(journal_t *journal)
{
DEFINE_WAIT(wait);
+wait:
+ /* Wait for previous locked operation to finish */
+ wait_event(journal->j_wait_transaction_locked,
+ journal->j_barrier_count == 0);
+
spin_lock(&journal->j_state_lock);
+ /*
+ * Check reliably under the lock whether we are the ones winning the race
+ * and locking the journal
+ */
+ if (journal->j_barrier_count > 0) {
+ spin_unlock(&journal->j_state_lock);
+ goto wait;
+ }
++journal->j_barrier_count;
/* Wait until there are no running updates */
@@ -460,14 +477,6 @@ void journal_lock_updates(journal_t *journal)
spin_lock(&journal->j_state_lock);
}
spin_unlock(&journal->j_state_lock);
-
- /*
- * We have now established a barrier against other normal updates, but
- * we also need to barrier against other journal_lock_updates() calls
- * to make sure that we serialise special journal-locked operations
- * too.
- */
- mutex_lock(&journal->j_barrier);
}
/**
@@ -475,14 +484,11 @@ void journal_lock_updates(journal_t *journal)
* @journal: Journal to release the barrier on.
*
* Release a transaction barrier obtained with journal_lock_updates().
- *
- * Should be called without the journal lock held.
*/
void journal_unlock_updates (journal_t *journal)
{
J_ASSERT(journal->j_barrier_count != 0);
- mutex_unlock(&journal->j_barrier);
spin_lock(&journal->j_state_lock);
--journal->j_barrier_count;
spin_unlock(&journal->j_state_lock);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 16a698bd906d..d49d202903fb 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -565,7 +565,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
*
* Called with the journal locked.
* Called with j_list_lock held.
- * Returns number of bufers reaped (for debug)
+ * Returns number of buffers reaped (for debug)
*/
static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 68d704db787f..5069b8475150 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -430,6 +430,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd_debug(3, "JBD2: commit phase 1\n");
/*
+ * Clear revoked flag to reflect there is no revoked buffers
+ * in the next transaction which is going to be started.
+ */
+ jbd2_clear_buffer_revoked_flags(journal);
+
+ /*
* Switch to a new revoke table.
*/
jbd2_journal_switch_revoke_table(journal);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0fa0123151d3..c0a5f9f1b127 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -173,7 +173,7 @@ loop:
*/
jbd_debug(1, "Now suspending kjournald2\n");
write_unlock(&journal->j_state_lock);
- refrigerator();
+ try_to_freeze();
write_lock(&journal->j_state_lock);
} else {
/*
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 69fd93588118..30b2867d6cc9 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -47,6 +47,10 @@
* overwriting the new data. We don't even need to clear the revoke
* bit here.
*
+ * We cache revoke status of a buffer in the current transaction in b_states
+ * bits. As the name says, revokevalid flag indicates that the cached revoke
+ * status of a buffer is valid and we can rely on the cached status.
+ *
* Revoke information on buffers is a tri-state value:
*
* RevokeValid clear: no cached revoke status, need to look it up
@@ -478,6 +482,36 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
return did_revoke;
}
+/*
+ * journal_clear_revoked_flag clears revoked flag of buffers in
+ * revoke table to reflect there is no revoked buffers in the next
+ * transaction which is going to be started.
+ */
+void jbd2_clear_buffer_revoked_flags(journal_t *journal)
+{
+ struct jbd2_revoke_table_s *revoke = journal->j_revoke;
+ int i = 0;
+
+ for (i = 0; i < revoke->hash_size; i++) {
+ struct list_head *hash_list;
+ struct list_head *list_entry;
+ hash_list = &revoke->hash_table[i];
+
+ list_for_each(list_entry, hash_list) {
+ struct jbd2_revoke_record_s *record;
+ struct buffer_head *bh;
+ record = (struct jbd2_revoke_record_s *)list_entry;
+ bh = __find_get_block(journal->j_fs_dev,
+ record->blocknr,
+ journal->j_blocksize);
+ if (bh) {
+ clear_buffer_revoked(bh);
+ __brelse(bh);
+ }
+ }
+ }
+}
+
/* journal_switch_revoke table select j_revoke for next transaction
* we do not want to suspend any processing until all revokes are
* written -bzzz
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a0e41a4c080e..35ae096bed5d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -517,12 +517,13 @@ void jbd2_journal_lock_updates(journal_t *journal)
break;
spin_lock(&transaction->t_handle_lock);
+ prepare_to_wait(&journal->j_wait_updates, &wait,
+ TASK_UNINTERRUPTIBLE);
if (!atomic_read(&transaction->t_updates)) {
spin_unlock(&transaction->t_handle_lock);
+ finish_wait(&journal->j_wait_updates, &wait);
break;
}
- prepare_to_wait(&journal->j_wait_updates, &wait,
- TASK_UNINTERRUPTIBLE);
spin_unlock(&transaction->t_handle_lock);
write_unlock(&journal->j_state_lock);
schedule();
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index be6169bd8acd..973ac5822bd7 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -22,16 +22,16 @@
static int jffs2_readdir (struct file *, void *, filldir_t);
-static int jffs2_create (struct inode *,struct dentry *,int,
+static int jffs2_create (struct inode *,struct dentry *,umode_t,
struct nameidata *);
static struct dentry *jffs2_lookup (struct inode *,struct dentry *,
struct nameidata *);
static int jffs2_link (struct dentry *,struct inode *,struct dentry *);
static int jffs2_unlink (struct inode *,struct dentry *);
static int jffs2_symlink (struct inode *,struct dentry *,const char *);
-static int jffs2_mkdir (struct inode *,struct dentry *,int);
+static int jffs2_mkdir (struct inode *,struct dentry *,umode_t);
static int jffs2_rmdir (struct inode *,struct dentry *);
-static int jffs2_mknod (struct inode *,struct dentry *,int,dev_t);
+static int jffs2_mknod (struct inode *,struct dentry *,umode_t,dev_t);
static int jffs2_rename (struct inode *, struct dentry *,
struct inode *, struct dentry *);
@@ -169,8 +169,8 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
/***********************************************************************/
-static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
- struct nameidata *nd)
+static int jffs2_create(struct inode *dir_i, struct dentry *dentry,
+ umode_t mode, struct nameidata *nd)
{
struct jffs2_raw_inode *ri;
struct jffs2_inode_info *f, *dir_f;
@@ -450,7 +450,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
}
-static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
+static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode)
{
struct jffs2_inode_info *f, *dir_f;
struct jffs2_sb_info *c;
@@ -618,7 +618,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
return ret;
}
-static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, dev_t rdev)
+static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct jffs2_inode_info *f, *dir_f;
struct jffs2_sb_info *c;
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index e513f1913c15..eafb8d37a6fb 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -74,7 +74,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
((struct erase_priv_struct *)instr->priv)->jeb = jeb;
((struct erase_priv_struct *)instr->priv)->c = c;
- ret = c->mtd->erase(c->mtd, instr);
+ ret = mtd_erase(c->mtd, instr);
if (!ret)
return;
@@ -335,13 +335,12 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
void *ebuf;
uint32_t ofs;
size_t retlen;
- int ret = -EIO;
-
- if (c->mtd->point) {
- unsigned long *wordebuf;
+ int ret;
+ unsigned long *wordebuf;
- ret = c->mtd->point(c->mtd, jeb->offset, c->sector_size,
- &retlen, &ebuf, NULL);
+ ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen,
+ &ebuf, NULL);
+ if (ret != -EOPNOTSUPP) {
if (ret) {
D1(printk(KERN_DEBUG "MTD point failed %d\n", ret));
goto do_flash_read;
@@ -349,7 +348,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
if (retlen < c->sector_size) {
/* Don't muck about if it won't let us point to the whole erase sector */
D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", retlen));
- c->mtd->unpoint(c->mtd, jeb->offset, retlen);
+ mtd_unpoint(c->mtd, jeb->offset, retlen);
goto do_flash_read;
}
wordebuf = ebuf-sizeof(*wordebuf);
@@ -358,7 +357,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
if (*++wordebuf != ~0)
break;
} while(--retlen);
- c->mtd->unpoint(c->mtd, jeb->offset, c->sector_size);
+ mtd_unpoint(c->mtd, jeb->offset, c->sector_size);
if (retlen) {
printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08tx\n",
*wordebuf, jeb->offset + c->sector_size-retlen*sizeof(*wordebuf));
@@ -381,7 +380,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
*bad_offset = ofs;
- ret = c->mtd->read(c->mtd, ofs, readlen, &retlen, ebuf);
+ ret = mtd_read(c->mtd, ofs, readlen, &retlen, ebuf);
if (ret) {
printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret);
ret = -EIO;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 4b8afe39a87f..2e0123867cb1 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -466,7 +466,6 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
if (insert_inode_locked(inode) < 0) {
make_bad_inode(inode);
- unlock_new_inode(inode);
iput(inode);
return ERR_PTR(-EINVAL);
}
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index ee57bac1ba6d..3093ac4fb24c 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -62,17 +62,15 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
#ifndef __ECOS
/* TODO: instead, incapsulate point() stuff to jffs2_flash_read(),
* adding and jffs2_flash_read_end() interface. */
- if (c->mtd->point) {
- err = c->mtd->point(c->mtd, ofs, len, &retlen,
- (void **)&buffer, NULL);
- if (!err && retlen < len) {
- JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
- c->mtd->unpoint(c->mtd, ofs, retlen);
- } else if (err)
+ err = mtd_point(c->mtd, ofs, len, &retlen, (void **)&buffer, NULL);
+ if (!err && retlen < len) {
+ JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
+ mtd_unpoint(c->mtd, ofs, retlen);
+ } else if (err) {
+ if (err != -EOPNOTSUPP)
JFFS2_WARNING("MTD point failed: error code %d.\n", err);
- else
- pointed = 1; /* succefully pointed to device */
- }
+ } else
+ pointed = 1; /* succefully pointed to device */
#endif
if (!pointed) {
@@ -101,7 +99,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
kfree(buffer);
#ifndef __ECOS
else
- c->mtd->unpoint(c->mtd, ofs, len);
+ mtd_unpoint(c->mtd, ofs, len);
#endif
if (crc != tn->data_crc) {
@@ -137,7 +135,7 @@ free_out:
kfree(buffer);
#ifndef __ECOS
else
- c->mtd->unpoint(c->mtd, ofs, len);
+ mtd_unpoint(c->mtd, ofs, len);
#endif
return err;
}
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 28107ca136e4..f99464833bb2 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -97,15 +97,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
size_t pointlen, try_size;
if (c->mtd->point) {
- ret = c->mtd->point(c->mtd, 0, c->mtd->size, &pointlen,
- (void **)&flashbuf, NULL);
+ ret = mtd_point(c->mtd, 0, c->mtd->size, &pointlen,
+ (void **)&flashbuf, NULL);
if (!ret && pointlen < c->mtd->size) {
/* Don't muck about if it won't let us point to the whole flash */
D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", pointlen));
- c->mtd->unpoint(c->mtd, 0, pointlen);
+ mtd_unpoint(c->mtd, 0, pointlen);
flashbuf = NULL;
}
- if (ret)
+ if (ret && ret != -EOPNOTSUPP)
D1(printk(KERN_DEBUG "MTD point failed %d\n", ret));
}
#endif
@@ -273,7 +273,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
kfree(flashbuf);
#ifndef __ECOS
else
- c->mtd->unpoint(c->mtd, 0, c->mtd->size);
+ mtd_unpoint(c->mtd, 0, c->mtd->size);
#endif
kfree(s);
return ret;
@@ -455,7 +455,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
if (jffs2_cleanmarker_oob(c)) {
int ret;
- if (c->mtd->block_isbad(c->mtd, jeb->offset))
+ if (mtd_block_isbad(c->mtd, jeb->offset))
return BLK_STATE_BADBLOCK;
ret = jffs2_check_nand_cleanmarker(c, jeb);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index e7e974454115..f2d96b5e64f6 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -45,7 +45,6 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
static void jffs2_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
}
@@ -97,9 +96,9 @@ static const char *jffs2_compr_name(unsigned int compr)
}
}
-static int jffs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+static int jffs2_show_options(struct seq_file *s, struct dentry *root)
{
- struct jffs2_sb_info *c = JFFS2_SB_INFO(mnt->mnt_sb);
+ struct jffs2_sb_info *c = JFFS2_SB_INFO(root->d_sb);
struct jffs2_mount_opts *opts = &c->mount_opts;
if (opts->override_compr)
@@ -336,9 +335,7 @@ static void jffs2_put_super (struct super_block *sb)
jffs2_flash_cleanup(c);
kfree(c->inocache_list);
jffs2_clear_xattr_subsystem(c);
- if (c->mtd->sync)
- c->mtd->sync(c->mtd);
-
+ mtd_sync(c->mtd);
D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
}
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index b09e51d2f81f..30e8f47e8a23 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -228,7 +228,7 @@ static int jffs2_verify_write(struct jffs2_sb_info *c, unsigned char *buf,
size_t retlen;
char *eccstr;
- ret = c->mtd->read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify);
+ ret = mtd_read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify);
if (ret && ret != -EUCLEAN && ret != -EBADMSG) {
printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x failed: %d\n", c->wbuf_ofs, ret);
return ret;
@@ -337,7 +337,8 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
}
/* Do the read... */
- ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf);
+ ret = mtd_read(c->mtd, start, c->wbuf_ofs - start, &retlen,
+ buf);
/* ECC recovered ? */
if ((ret == -EUCLEAN || ret == -EBADMSG) &&
@@ -413,13 +414,12 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
if (breakme++ == 20) {
printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs);
breakme = 0;
- c->mtd->write(c->mtd, ofs, towrite, &retlen,
- brokenbuf);
+ mtd_write(c->mtd, ofs, towrite, &retlen, brokenbuf);
ret = -EIO;
} else
#endif
- ret = c->mtd->write(c->mtd, ofs, towrite, &retlen,
- rewrite_buf);
+ ret = mtd_write(c->mtd, ofs, towrite, &retlen,
+ rewrite_buf);
if (ret || retlen != towrite || jffs2_verify_write(c, rewrite_buf, ofs)) {
/* Argh. We tried. Really we did. */
@@ -619,13 +619,14 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
if (breakme++ == 20) {
printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs);
breakme = 0;
- c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen,
- brokenbuf);
+ mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen,
+ brokenbuf);
ret = -EIO;
} else
#endif
- ret = c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf);
+ ret = mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize,
+ &retlen, c->wbuf);
if (ret) {
printk(KERN_WARNING "jffs2_flush_wbuf(): Write failed with %d\n", ret);
@@ -861,8 +862,8 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs,
v += wbuf_retlen;
if (vlen >= c->wbuf_pagesize) {
- ret = c->mtd->write(c->mtd, outvec_to, PAGE_DIV(vlen),
- &wbuf_retlen, v);
+ ret = mtd_write(c->mtd, outvec_to, PAGE_DIV(vlen),
+ &wbuf_retlen, v);
if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen))
goto outfile;
@@ -948,11 +949,11 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re
int ret;
if (!jffs2_is_writebuffered(c))
- return c->mtd->read(c->mtd, ofs, len, retlen, buf);
+ return mtd_read(c->mtd, ofs, len, retlen, buf);
/* Read flash */
down_read(&c->wbuf_sem);
- ret = c->mtd->read(c->mtd, ofs, len, retlen, buf);
+ ret = mtd_read(c->mtd, ofs, len, retlen, buf);
if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) {
if (ret == -EBADMSG)
@@ -1031,7 +1032,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
ops.datbuf = NULL;
- ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops);
+ ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
if (ret || ops.oobretlen != ops.ooblen) {
printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd"
" bytes, read %zd bytes, error %d\n",
@@ -1074,7 +1075,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
ops.datbuf = NULL;
- ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops);
+ ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
if (ret || ops.oobretlen != ops.ooblen) {
printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd"
" bytes, read %zd bytes, error %d\n",
@@ -1100,7 +1101,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
ops.datbuf = NULL;
- ret = c->mtd->write_oob(c->mtd, jeb->offset, &ops);
+ ret = mtd_write_oob(c->mtd, jeb->offset, &ops);
if (ret || ops.oobretlen != ops.ooblen) {
printk(KERN_ERR "cannot write OOB for EB at %08x, requested %zd"
" bytes, read %zd bytes, error %d\n",
@@ -1129,11 +1130,8 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *
if( ++jeb->bad_count < MAX_ERASE_FAILURES)
return 0;
- if (!c->mtd->block_markbad)
- return 1; // What else can we do?
-
printk(KERN_WARNING "JFFS2: marking eraseblock at %08x\n as bad", bad_offset);
- ret = c->mtd->block_markbad(c->mtd, bad_offset);
+ ret = mtd_block_markbad(c->mtd, bad_offset);
if (ret) {
D1(printk(KERN_WARNING "jffs2_write_nand_badblock(): Write failed for block at %08x: error %d\n", jeb->offset, ret));
diff --git a/fs/jffs2/writev.c b/fs/jffs2/writev.c
index b9276b11bac6..a1bda9dab3f8 100644
--- a/fs/jffs2/writev.c
+++ b/fs/jffs2/writev.c
@@ -13,30 +13,6 @@
#include <linux/mtd/mtd.h>
#include "nodelist.h"
-/* This ought to be in core MTD code. All registered MTD devices
- without writev should have this put in place. Bug the MTD
- maintainer */
-static inline int mtd_fake_writev(struct mtd_info *mtd, const struct kvec *vecs,
- unsigned long count, loff_t to, size_t *retlen)
-{
- unsigned long i;
- size_t totlen = 0, thislen;
- int ret = 0;
-
- for (i=0; i<count; i++) {
- if (!vecs[i].iov_len)
- continue;
- ret = mtd->write(mtd, to, vecs[i].iov_len, &thislen, vecs[i].iov_base);
- totlen += thislen;
- if (ret || thislen != vecs[i].iov_len)
- break;
- to += vecs[i].iov_len;
- }
- if (retlen)
- *retlen = totlen;
- return ret;
-}
-
int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs,
unsigned long count, loff_t to, size_t *retlen)
{
@@ -50,18 +26,14 @@ int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs,
}
}
- if (c->mtd->writev)
- return c->mtd->writev(c->mtd, vecs, count, to, retlen);
- else {
- return mtd_fake_writev(c->mtd, vecs, count, to, retlen);
- }
+ return mtd_writev(c->mtd, vecs, count, to, retlen);
}
int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len,
size_t *retlen, const u_char *buf)
{
int ret;
- ret = c->mtd->write(c->mtd, ofs, len, retlen, buf);
+ ret = mtd_write(c->mtd, ofs, len, retlen, buf);
if (jffs2_sum_active()) {
struct kvec vecs[1];
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 6f98a1866776..f19d1e04a374 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -68,7 +68,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
unsigned int oldflags;
int err;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
return err;
@@ -120,7 +120,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
setflags_out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return err;
}
default:
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index cc5f811ed383..2eb952c41a69 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2349,7 +2349,7 @@ int jfsIOWait(void *arg)
if (freezing(current)) {
spin_unlock_irq(&log_redrive_lock);
- refrigerator();
+ try_to_freeze();
} else {
set_current_state(TASK_INTERRUPTIBLE);
spin_unlock_irq(&log_redrive_lock);
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index af9606057dde..bb8b661bcc50 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2800,7 +2800,7 @@ int jfs_lazycommit(void *arg)
if (freezing(current)) {
LAZY_UNLOCK(flags);
- refrigerator();
+ try_to_freeze();
} else {
DECLARE_WAITQUEUE(wq, current);
@@ -2994,7 +2994,7 @@ int jfs_sync(void *arg)
if (freezing(current)) {
TXN_UNLOCK();
- refrigerator();
+ try_to_freeze();
} else {
set_current_state(TASK_INTERRUPTIBLE);
TXN_UNLOCK();
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a112ad96e474..5f7c160ea64f 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -72,7 +72,7 @@ static inline void free_ea_wmap(struct inode *inode)
* RETURN: Errors from subroutines
*
*/
-static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
+static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
int rc = 0;
@@ -205,7 +205,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
* note:
* EACCESS: user needs search+write permission on the parent directory
*/
-static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
+static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
{
int rc = 0;
tid_t tid; /* transaction id */
@@ -1353,7 +1353,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* FUNCTION: Create a special file (device)
*/
static int jfs_mknod(struct inode *dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
struct jfs_inode_info *jfs_ip;
struct btstack btstack;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index a44eff076c17..682bca642f38 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -119,7 +119,6 @@ static void jfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
struct jfs_inode_info *ji = JFS_IP(inode);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(jfs_inode_cachep, ji);
}
@@ -609,9 +608,9 @@ static int jfs_sync_fs(struct super_block *sb, int wait)
return 0;
}
-static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int jfs_show_options(struct seq_file *seq, struct dentry *root)
{
- struct jfs_sb_info *sbi = JFS_SBI(vfs->mnt_sb);
+ struct jfs_sb_info *sbi = JFS_SBI(root->d_sb);
if (sbi->uid != -1)
seq_printf(seq, ",uid=%d", sbi->uid);
diff --git a/fs/libfs.c b/fs/libfs.c
index f6d411eef1e7..5b2dbb3ba4fc 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -12,7 +12,7 @@
#include <linux/mutex.h>
#include <linux/exportfs.h>
#include <linux/writeback.h>
-#include <linux/buffer_head.h>
+#include <linux/buffer_head.h> /* sync_mapping_buffers */
#include <asm/uaccess.h>
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 23d7451b2938..65ba36b80a9e 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -55,7 +55,7 @@ static DEFINE_SPINLOCK(nsm_lock);
* Local NSM state
*/
u32 __read_mostly nsm_local_state;
-int __read_mostly nsm_use_hostnames;
+bool __read_mostly nsm_use_hostnames;
static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
{
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 1ca0679c80bf..2240d384d787 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -403,7 +403,7 @@ nlmsvc_match_sb(void *datap, struct nlm_file *file)
{
struct super_block *sb = datap;
- return sb == file->f_file->f_path.mnt->mnt_sb;
+ return sb == file->f_file->f_path.dentry->d_sb;
}
/**
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index 339e17e9133d..9c501449450d 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -13,13 +13,14 @@
#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
-static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
+static int logfs_mtd_read(struct super_block *sb, loff_t ofs, size_t len,
+ void *buf)
{
struct mtd_info *mtd = logfs_super(sb)->s_mtd;
size_t retlen;
int ret;
- ret = mtd->read(mtd, ofs, len, &retlen, buf);
+ ret = mtd_read(mtd, ofs, len, &retlen, buf);
BUG_ON(ret == -EINVAL);
if (ret)
return ret;
@@ -31,7 +32,8 @@ static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
return 0;
}
-static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
+static int loffs_mtd_write(struct super_block *sb, loff_t ofs, size_t len,
+ void *buf)
{
struct logfs_super *super = logfs_super(sb);
struct mtd_info *mtd = super->s_mtd;
@@ -47,7 +49,7 @@ static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
BUG_ON(len > PAGE_CACHE_SIZE);
page_start = ofs & PAGE_CACHE_MASK;
page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
- ret = mtd->write(mtd, ofs, len, &retlen, buf);
+ ret = mtd_write(mtd, ofs, len, &retlen, buf);
if (ret || (retlen != len))
return -EIO;
@@ -60,14 +62,15 @@ static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
* asynchronous properties. So just to prevent the first implementor of such
* a thing from breaking logfs in 2350, we do the usual pointless dance to
* declare a completion variable and wait for completion before returning
- * from mtd_erase(). What an exercise in futility!
+ * from logfs_mtd_erase(). What an exercise in futility!
*/
static void logfs_erase_callback(struct erase_info *ei)
{
complete((struct completion *)ei->priv);
}
-static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
+static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs,
+ size_t len)
{
struct logfs_super *super = logfs_super(sb);
struct address_space *mapping = super->s_mapping_inode->i_mapping;
@@ -84,7 +87,7 @@ static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
return 0;
}
-static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
+static int logfs_mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
int ensure_write)
{
struct mtd_info *mtd = logfs_super(sb)->s_mtd;
@@ -102,30 +105,29 @@ static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
ei.len = len;
ei.callback = logfs_erase_callback;
ei.priv = (long)&complete;
- ret = mtd->erase(mtd, &ei);
+ ret = mtd_erase(mtd, &ei);
if (ret)
return -EIO;
wait_for_completion(&complete);
if (ei.state != MTD_ERASE_DONE)
return -EIO;
- return mtd_erase_mapping(sb, ofs, len);
+ return logfs_mtd_erase_mapping(sb, ofs, len);
}
-static void mtd_sync(struct super_block *sb)
+static void logfs_mtd_sync(struct super_block *sb)
{
struct mtd_info *mtd = logfs_super(sb)->s_mtd;
- if (mtd->sync)
- mtd->sync(mtd);
+ mtd_sync(mtd);
}
-static int mtd_readpage(void *_sb, struct page *page)
+static int logfs_mtd_readpage(void *_sb, struct page *page)
{
struct super_block *sb = _sb;
int err;
- err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
+ err = logfs_mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
page_address(page));
if (err == -EUCLEAN || err == -EBADMSG) {
/* -EBADMSG happens regularly on power failures */
@@ -143,18 +145,15 @@ static int mtd_readpage(void *_sb, struct page *page)
return err;
}
-static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
+static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs)
{
struct logfs_super *super = logfs_super(sb);
struct address_space *mapping = super->s_mapping_inode->i_mapping;
- filler_t *filler = mtd_readpage;
+ filler_t *filler = logfs_mtd_readpage;
struct mtd_info *mtd = super->s_mtd;
- if (!mtd->block_isbad)
- return NULL;
-
*ofs = 0;
- while (mtd->block_isbad(mtd, *ofs)) {
+ while (mtd_block_isbad(mtd, *ofs)) {
*ofs += mtd->erasesize;
if (*ofs >= mtd->size)
return NULL;
@@ -163,18 +162,15 @@ static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
}
-static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
+static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs)
{
struct logfs_super *super = logfs_super(sb);
struct address_space *mapping = super->s_mapping_inode->i_mapping;
- filler_t *filler = mtd_readpage;
+ filler_t *filler = logfs_mtd_readpage;
struct mtd_info *mtd = super->s_mtd;
- if (!mtd->block_isbad)
- return NULL;
-
*ofs = mtd->size - mtd->erasesize;
- while (mtd->block_isbad(mtd, *ofs)) {
+ while (mtd_block_isbad(mtd, *ofs)) {
*ofs -= mtd->erasesize;
if (*ofs <= 0)
return NULL;
@@ -184,7 +180,7 @@ static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
}
-static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
+static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
size_t nr_pages)
{
struct logfs_super *super = logfs_super(sb);
@@ -196,8 +192,8 @@ static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
page = find_lock_page(mapping, index + i);
BUG_ON(!page);
- err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
- page_address(page));
+ err = loffs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
+ page_address(page));
unlock_page(page);
page_cache_release(page);
if (err)
@@ -206,7 +202,7 @@ static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
return 0;
}
-static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
+static void logfs_mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
{
struct logfs_super *super = logfs_super(sb);
int head;
@@ -227,15 +223,15 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
len += head;
}
len = PAGE_ALIGN(len);
- __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
+ __logfs_mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
}
-static void mtd_put_device(struct logfs_super *s)
+static void logfs_mtd_put_device(struct logfs_super *s)
{
put_mtd_device(s->s_mtd);
}
-static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
+static int logfs_mtd_can_write_buf(struct super_block *sb, u64 ofs)
{
struct logfs_super *super = logfs_super(sb);
void *buf;
@@ -244,7 +240,7 @@ static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
buf = kmalloc(super->s_writesize, GFP_KERNEL);
if (!buf)
return -ENOMEM;
- err = mtd_read(sb, ofs, super->s_writesize, buf);
+ err = logfs_mtd_read(sb, ofs, super->s_writesize, buf);
if (err)
goto out;
if (memchr_inv(buf, 0xff, super->s_writesize))
@@ -255,14 +251,14 @@ out:
}
static const struct logfs_device_ops mtd_devops = {
- .find_first_sb = mtd_find_first_sb,
- .find_last_sb = mtd_find_last_sb,
- .readpage = mtd_readpage,
- .writeseg = mtd_writeseg,
- .erase = mtd_erase,
- .can_write_buf = mtd_can_write_buf,
- .sync = mtd_sync,
- .put_device = mtd_put_device,
+ .find_first_sb = logfs_mtd_find_first_sb,
+ .find_last_sb = logfs_mtd_find_last_sb,
+ .readpage = logfs_mtd_readpage,
+ .writeseg = logfs_mtd_writeseg,
+ .erase = logfs_mtd_erase,
+ .can_write_buf = logfs_mtd_can_write_buf,
+ .sync = logfs_mtd_sync,
+ .put_device = logfs_mtd_put_device,
};
int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index b7d7f67cee5a..3de7a32cadbe 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -71,7 +71,7 @@ static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
static int write_inode(struct inode *inode)
{
- return __logfs_write_inode(inode, WF_LOCK);
+ return __logfs_write_inode(inode, NULL, WF_LOCK);
}
static s64 dir_seek_data(struct inode *inode, s64 pos)
@@ -482,7 +482,7 @@ out:
return ret;
}
-static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int logfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
@@ -501,7 +501,7 @@ static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
return __logfs_create(dir, dentry, inode, NULL, 0);
}
-static int logfs_create(struct inode *dir, struct dentry *dentry, int mode,
+static int logfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
struct inode *inode;
@@ -517,7 +517,7 @@ static int logfs_create(struct inode *dir, struct dentry *dentry, int mode,
return __logfs_create(dir, dentry, inode, NULL, 0);
}
-static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+static int logfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
struct inode *inode;
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index b548c87a86f1..3886cded283c 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -230,7 +230,9 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return ret;
mutex_lock(&inode->i_mutex);
+ logfs_get_wblocks(sb, NULL, WF_LOCK);
logfs_write_anchor(sb);
+ logfs_put_wblocks(sb, NULL, WF_LOCK);
mutex_unlock(&inode->i_mutex);
return 0;
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index caa4419285dc..d4efb061bdc5 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -367,7 +367,7 @@ static struct gc_candidate *get_candidate(struct super_block *sb)
int i, max_dist;
struct gc_candidate *cand = NULL, *this;
- max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
+ max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);
for (i = max_dist; i >= 0; i--) {
this = first_in_list(&super->s_low_list[i]);
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 7e441ad5f792..a422f42238b2 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -144,7 +144,6 @@ struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
static void logfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
}
@@ -287,7 +286,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
return 0;
- ret = __logfs_write_inode(inode, flags);
+ ret = __logfs_write_inode(inode, NULL, flags);
LOGFS_BUG_ON(ret, inode->i_sb);
return ret;
}
@@ -324,7 +323,7 @@ static void logfs_set_ino_generation(struct super_block *sb,
mutex_unlock(&super->s_journal_mutex);
}
-struct inode *logfs_new_inode(struct inode *dir, int mode)
+struct inode *logfs_new_inode(struct inode *dir, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct inode *inode;
@@ -364,7 +363,9 @@ static void logfs_init_once(void *_li)
static int logfs_sync_fs(struct super_block *sb, int wait)
{
+ logfs_get_wblocks(sb, NULL, WF_LOCK);
logfs_write_anchor(sb);
+ logfs_put_wblocks(sb, NULL, WF_LOCK);
return 0;
}
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 9da29706f91c..1e1c369df22b 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -612,7 +612,6 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
if (len == 0)
return logfs_write_header(super, header, 0, type);
- BUG_ON(len > sb->s_blocksize);
compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
if (compr_len < 0 || type == JE_ANCHOR) {
memcpy(data, buf, len);
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 398ecff6e548..5f0937609465 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -520,7 +520,7 @@ extern const struct super_operations logfs_super_operations;
struct inode *logfs_iget(struct super_block *sb, ino_t ino);
struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie);
void logfs_safe_iput(struct inode *inode, int cookie);
-struct inode *logfs_new_inode(struct inode *dir, int mode);
+struct inode *logfs_new_inode(struct inode *dir, umode_t mode);
struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
int logfs_init_inode_cache(void);
@@ -528,7 +528,7 @@ void logfs_destroy_inode_cache(void);
void logfs_set_blocks(struct inode *inode, u64 no);
/* these logically belong into inode.c but actually reside in readwrite.c */
int logfs_read_inode(struct inode *inode);
-int __logfs_write_inode(struct inode *inode, long flags);
+int __logfs_write_inode(struct inode *inode, struct page *, long flags);
void logfs_evict_inode(struct inode *inode);
/* journal.c */
@@ -577,6 +577,8 @@ void initialize_block_counters(struct page *page, struct logfs_block *block,
__be64 *array, int page_is_empty);
int logfs_exist_block(struct inode *inode, u64 bix);
int get_page_reserve(struct inode *inode, struct page *page);
+void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock);
+void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock);
extern struct logfs_block_ops indirect_block_ops;
/* segment.c */
@@ -594,6 +596,7 @@ int logfs_init_mapping(struct super_block *sb);
void logfs_sync_area(struct logfs_area *area);
void logfs_sync_segments(struct super_block *sb);
void freeseg(struct super_block *sb, u32 segno);
+void free_areas(struct super_block *sb);
/* area handling */
int logfs_init_areas(struct super_block *sb);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 2ac4217b7901..4153e65b0148 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -244,8 +244,7 @@ static void preunlock_page(struct super_block *sb, struct page *page, int lock)
* is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked
* in addition to PG_locked.
*/
-static void logfs_get_wblocks(struct super_block *sb, struct page *page,
- int lock)
+void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock)
{
struct logfs_super *super = logfs_super(sb);
@@ -260,8 +259,7 @@ static void logfs_get_wblocks(struct super_block *sb, struct page *page,
}
}
-static void logfs_put_wblocks(struct super_block *sb, struct page *page,
- int lock)
+void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock)
{
struct logfs_super *super = logfs_super(sb);
@@ -424,7 +422,7 @@ static void inode_write_block(struct logfs_block *block)
if (inode->i_ino == LOGFS_INO_MASTER)
logfs_write_anchor(inode->i_sb);
else {
- ret = __logfs_write_inode(inode, 0);
+ ret = __logfs_write_inode(inode, NULL, 0);
/* see indirect_write_block comment */
BUG_ON(ret);
}
@@ -560,8 +558,13 @@ static void inode_free_block(struct super_block *sb, struct logfs_block *block)
static void indirect_free_block(struct super_block *sb,
struct logfs_block *block)
{
- ClearPagePrivate(block->page);
- block->page->private = 0;
+ struct page *page = block->page;
+
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ page_cache_release(page);
+ set_page_private(page, 0);
+ }
__free_block(sb, block);
}
@@ -650,8 +653,11 @@ static void alloc_data_block(struct inode *inode, struct page *page)
logfs_unpack_index(page->index, &bix, &level);
block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
block->page = page;
+
SetPagePrivate(page);
- page->private = (unsigned long)block;
+ page_cache_get(page);
+ set_page_private(page, (unsigned long) block);
+
block->ops = &indirect_block_ops;
}
@@ -1570,11 +1576,15 @@ int logfs_write_buf(struct inode *inode, struct page *page, long flags)
static int __logfs_delete(struct inode *inode, struct page *page)
{
long flags = WF_DELETE;
+ int err;
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
if (page->index < I0_BLOCKS)
return logfs_write_direct(inode, page, flags);
+ err = grow_inode(inode, page->index, 0);
+ if (err)
+ return err;
return logfs_write_rec(inode, page, page->index, 0, flags);
}
@@ -1623,7 +1633,7 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
if (inode->i_ino == LOGFS_INO_MASTER)
logfs_write_anchor(inode->i_sb);
else {
- err = __logfs_write_inode(inode, flags);
+ err = __logfs_write_inode(inode, page, flags);
}
}
}
@@ -1873,7 +1883,7 @@ int logfs_truncate(struct inode *inode, u64 target)
logfs_get_wblocks(sb, NULL, 1);
err = __logfs_truncate(inode, size);
if (!err)
- err = __logfs_write_inode(inode, 0);
+ err = __logfs_write_inode(inode, NULL, 0);
logfs_put_wblocks(sb, NULL, 1);
}
@@ -1901,8 +1911,11 @@ static void move_page_to_inode(struct inode *inode, struct page *page)
li->li_block = block;
block->page = NULL;
- page->private = 0;
- ClearPagePrivate(page);
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ page_cache_release(page);
+ set_page_private(page, 0);
+ }
}
static void move_inode_to_page(struct page *page, struct inode *inode)
@@ -1918,8 +1931,12 @@ static void move_inode_to_page(struct page *page, struct inode *inode)
BUG_ON(PagePrivate(page));
block->ops = &indirect_block_ops;
block->page = page;
- page->private = (unsigned long)block;
- SetPagePrivate(page);
+
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ set_page_private(page, (unsigned long) block);
+ }
block->inode = NULL;
li->li_block = NULL;
@@ -2106,14 +2123,14 @@ void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
ec_level);
}
-int __logfs_write_inode(struct inode *inode, long flags)
+int __logfs_write_inode(struct inode *inode, struct page *page, long flags)
{
struct super_block *sb = inode->i_sb;
int ret;
- logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
+ logfs_get_wblocks(sb, page, flags & WF_LOCK);
ret = do_write_inode(inode);
- logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
+ logfs_put_wblocks(sb, page, flags & WF_LOCK);
return ret;
}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 9d5187353255..ab798ed1cc88 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -86,7 +86,11 @@ int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
BUG_ON(!page); /* FIXME: reserve a pool */
SetPageUptodate(page);
memcpy(page_address(page) + offset, buf, copylen);
- SetPagePrivate(page);
+
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ }
page_cache_release(page);
buf += copylen;
@@ -110,7 +114,10 @@ static void pad_partial_page(struct logfs_area *area)
page = get_mapping_page(sb, index, 0);
BUG_ON(!page); /* FIXME: reserve a pool */
memset(page_address(page) + offset, 0xff, len);
- SetPagePrivate(page);
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ }
page_cache_release(page);
}
}
@@ -130,7 +137,10 @@ static void pad_full_pages(struct logfs_area *area)
BUG_ON(!page); /* FIXME: reserve a pool */
SetPageUptodate(page);
memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
- SetPagePrivate(page);
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ }
page_cache_release(page);
index++;
no_indizes--;
@@ -485,8 +495,12 @@ static void move_btree_to_page(struct inode *inode, struct page *page,
mempool_free(item, super->s_alias_pool);
}
block->page = page;
- SetPagePrivate(page);
- page->private = (unsigned long)block;
+
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ set_page_private(page, (unsigned long) block);
+ }
block->ops = &indirect_block_ops;
initialize_block_counters(page, block, data, 0);
}
@@ -536,8 +550,12 @@ void move_page_to_btree(struct page *page)
list_add(&item->list, &block->item_list);
}
block->page = NULL;
- ClearPagePrivate(page);
- page->private = 0;
+
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ page_cache_release(page);
+ set_page_private(page, 0);
+ }
block->ops = &btree_block_ops;
err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
block);
@@ -702,7 +720,10 @@ void freeseg(struct super_block *sb, u32 segno)
page = find_get_page(mapping, ofs >> PAGE_SHIFT);
if (!page)
continue;
- ClearPagePrivate(page);
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ page_cache_release(page);
+ }
page_cache_release(page);
}
}
@@ -841,6 +862,16 @@ static void free_area(struct logfs_area *area)
kfree(area);
}
+void free_areas(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int i;
+
+ for_each_area(i)
+ free_area(super->s_area[i]);
+ free_area(super->s_journal_area);
+}
+
static struct logfs_area *alloc_area(struct super_block *sb)
{
struct logfs_area *area;
@@ -923,10 +954,6 @@ err:
void logfs_cleanup_areas(struct super_block *sb)
{
struct logfs_super *super = logfs_super(sb);
- int i;
btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
- for_each_area(i)
- free_area(super->s_area[i]);
- free_area(super->s_journal_area);
}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index e795c234ea33..c9ee7f5d1caf 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -486,14 +486,15 @@ static void logfs_kill_sb(struct super_block *sb)
/* Alias entries slow down mount, so evict as many as possible */
sync_filesystem(sb);
logfs_write_anchor(sb);
+ free_areas(sb);
/*
* From this point on alias entries are simply dropped - and any
* writes to the object store are considered bugs.
*/
- super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
log_super("LogFS: Now in shutdown\n");
generic_shutdown_super(sb);
+ super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index ef175cb8cfd8..4bc50dac8e97 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -209,7 +209,7 @@ void minix_free_inode(struct inode * inode)
mark_buffer_dirty(bh);
}
-struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
+struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error)
{
struct super_block *sb = dir->i_sb;
struct minix_sb_info *sbi = minix_sb(sb);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 4d46a6a59070..fa8b612b8ce2 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -71,7 +71,6 @@ static struct inode *minix_alloc_inode(struct super_block *sb)
static void minix_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(minix_inode_cachep, minix_i(inode));
}
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 26bbd55e82ea..c889ef0aa571 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -46,7 +46,7 @@ struct minix_sb_info {
extern struct inode *minix_iget(struct super_block *, unsigned long);
extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
-extern struct inode * minix_new_inode(const struct inode *, int, int *);
+extern struct inode * minix_new_inode(const struct inode *, umode_t, int *);
extern void minix_free_inode(struct inode * inode);
extern unsigned long minix_count_free_inodes(struct super_block *sb);
extern int minix_new_block(struct inode * inode);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 6e6777f1b4b2..2f76e38c2065 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -36,7 +36,7 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st
return NULL;
}
-static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_t rdev)
+static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
{
int error;
struct inode *inode;
@@ -54,7 +54,7 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_
return error;
}
-static int minix_create(struct inode * dir, struct dentry *dentry, int mode,
+static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
return minix_mknod(dir, dentry, mode, 0);
@@ -103,7 +103,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
return add_nondir(dentry, inode);
}
-static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
+static int minix_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode)
{
struct inode * inode;
int err = -EMLINK;
diff --git a/fs/mount.h b/fs/mount.h
new file mode 100644
index 000000000000..4ef36d93e5a2
--- /dev/null
+++ b/fs/mount.h
@@ -0,0 +1,76 @@
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/poll.h>
+
+struct mnt_namespace {
+ atomic_t count;
+ struct mount * root;
+ struct list_head list;
+ wait_queue_head_t poll;
+ int event;
+};
+
+struct mnt_pcp {
+ int mnt_count;
+ int mnt_writers;
+};
+
+struct mount {
+ struct list_head mnt_hash;
+ struct mount *mnt_parent;
+ struct dentry *mnt_mountpoint;
+ struct vfsmount mnt;
+#ifdef CONFIG_SMP
+ struct mnt_pcp __percpu *mnt_pcp;
+ atomic_t mnt_longterm; /* how many of the refs are longterm */
+#else
+ int mnt_count;
+ int mnt_writers;
+#endif
+ struct list_head mnt_mounts; /* list of children, anchored here */
+ struct list_head mnt_child; /* and going through their mnt_child */
+ struct list_head mnt_instance; /* mount instance on sb->s_mounts */
+ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
+ struct list_head mnt_list;
+ struct list_head mnt_expire; /* link in fs-specific expiry list */
+ struct list_head mnt_share; /* circular list of shared mounts */
+ struct list_head mnt_slave_list;/* list of slave mounts */
+ struct list_head mnt_slave; /* slave list entry */
+ struct mount *mnt_master; /* slave is on master->mnt_slave_list */
+ struct mnt_namespace *mnt_ns; /* containing namespace */
+#ifdef CONFIG_FSNOTIFY
+ struct hlist_head mnt_fsnotify_marks;
+ __u32 mnt_fsnotify_mask;
+#endif
+ int mnt_id; /* mount identifier */
+ int mnt_group_id; /* peer group identifier */
+ int mnt_expiry_mark; /* true if marked for expiry */
+ int mnt_pinned;
+ int mnt_ghosts;
+};
+
+static inline struct mount *real_mount(struct vfsmount *mnt)
+{
+ return container_of(mnt, struct mount, mnt);
+}
+
+static inline int mnt_has_parent(struct mount *mnt)
+{
+ return mnt != mnt->mnt_parent;
+}
+
+extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
+
+static inline void get_mnt_ns(struct mnt_namespace *ns)
+{
+ atomic_inc(&ns->count);
+}
+
+struct proc_mounts {
+ struct seq_file m; /* must be the first element */
+ struct mnt_namespace *ns;
+ struct path root;
+ int (*show)(struct seq_file *, struct vfsmount *);
+};
+
+extern const struct seq_operations mounts_op;
diff --git a/fs/mpage.c b/fs/mpage.c
index fdfae9fa98cd..643e9f55ef29 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -371,9 +371,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
sector_t last_block_in_bio = 0;
struct buffer_head map_bh;
unsigned long first_logical_block = 0;
- struct blk_plug plug;
-
- blk_start_plug(&plug);
map_bh.b_state = 0;
map_bh.b_size = 0;
@@ -395,7 +392,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
BUG_ON(!list_empty(pages));
if (bio)
mpage_bio_submit(READ, bio);
- blk_finish_plug(&plug);
return 0;
}
EXPORT_SYMBOL(mpage_readpages);
diff --git a/fs/namei.c b/fs/namei.c
index 5008f01787f5..46ea9cc16647 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -36,6 +36,7 @@
#include <asm/uaccess.h>
#include "internal.h"
+#include "mount.h"
/* [Feb-1997 T. Schoebel-Theuer]
* Fundamental changes in the pathname lookup mechanisms (namei)
@@ -139,21 +140,19 @@ static int do_getname(const char __user *filename, char *page)
static char *getname_flags(const char __user *filename, int flags, int *empty)
{
- char *tmp, *result;
+ char *result = __getname();
+ int retval;
- result = ERR_PTR(-ENOMEM);
- tmp = __getname();
- if (tmp) {
- int retval = do_getname(filename, tmp);
+ if (!result)
+ return ERR_PTR(-ENOMEM);
- result = tmp;
- if (retval < 0) {
- if (retval == -ENOENT && empty)
- *empty = 1;
- if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
- __putname(tmp);
- result = ERR_PTR(retval);
- }
+ retval = do_getname(filename, result);
+ if (retval < 0) {
+ if (retval == -ENOENT && empty)
+ *empty = 1;
+ if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
+ __putname(result);
+ return ERR_PTR(retval);
}
}
audit_getname(result);
@@ -676,36 +675,38 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
static int follow_up_rcu(struct path *path)
{
- struct vfsmount *parent;
+ struct mount *mnt = real_mount(path->mnt);
+ struct mount *parent;
struct dentry *mountpoint;
- parent = path->mnt->mnt_parent;
- if (parent == path->mnt)
+ parent = mnt->mnt_parent;
+ if (&parent->mnt == path->mnt)
return 0;
- mountpoint = path->mnt->mnt_mountpoint;
+ mountpoint = mnt->mnt_mountpoint;
path->dentry = mountpoint;
- path->mnt = parent;
+ path->mnt = &parent->mnt;
return 1;
}
int follow_up(struct path *path)
{
- struct vfsmount *parent;
+ struct mount *mnt = real_mount(path->mnt);
+ struct mount *parent;
struct dentry *mountpoint;
br_read_lock(vfsmount_lock);
- parent = path->mnt->mnt_parent;
- if (parent == path->mnt) {
+ parent = mnt->mnt_parent;
+ if (&parent->mnt == path->mnt) {
br_read_unlock(vfsmount_lock);
return 0;
}
- mntget(parent);
- mountpoint = dget(path->mnt->mnt_mountpoint);
+ mntget(&parent->mnt);
+ mountpoint = dget(mnt->mnt_mountpoint);
br_read_unlock(vfsmount_lock);
dput(path->dentry);
path->dentry = mountpoint;
mntput(path->mnt);
- path->mnt = parent;
+ path->mnt = &parent->mnt;
return 1;
}
@@ -884,7 +885,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
struct inode **inode)
{
for (;;) {
- struct vfsmount *mounted;
+ struct mount *mounted;
/*
* Don't forget we might have a non-mountpoint managed dentry
* that wants to block transit.
@@ -898,8 +899,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
mounted = __lookup_mnt(path->mnt, path->dentry, 1);
if (!mounted)
break;
- path->mnt = mounted;
- path->dentry = mounted->mnt_root;
+ path->mnt = &mounted->mnt;
+ path->dentry = mounted->mnt.mnt_root;
nd->flags |= LOOKUP_JUMPED;
nd->seq = read_seqcount_begin(&path->dentry->d_seq);
/*
@@ -915,12 +916,12 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
static void follow_mount_rcu(struct nameidata *nd)
{
while (d_mountpoint(nd->path.dentry)) {
- struct vfsmount *mounted;
+ struct mount *mounted;
mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
if (!mounted)
break;
- nd->path.mnt = mounted;
- nd->path.dentry = mounted->mnt_root;
+ nd->path.mnt = &mounted->mnt;
+ nd->path.dentry = mounted->mnt.mnt_root;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
}
}
@@ -1094,8 +1095,10 @@ static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentr
struct dentry *old;
/* Don't create child dentry for a dead directory. */
- if (unlikely(IS_DEADDIR(inode)))
+ if (unlikely(IS_DEADDIR(inode))) {
+ dput(dentry);
return ERR_PTR(-ENOENT);
+ }
old = inode->i_op->lookup(inode, dentry, nd);
if (unlikely(old)) {
@@ -1371,6 +1374,34 @@ static inline int can_lookup(struct inode *inode)
return 1;
}
+unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+{
+ unsigned long hash = init_name_hash();
+ while (len--)
+ hash = partial_name_hash(*name++, hash);
+ return end_name_hash(hash);
+}
+EXPORT_SYMBOL(full_name_hash);
+
+/*
+ * We know there's a real path component here of at least
+ * one character.
+ */
+static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+{
+ unsigned long hash = init_name_hash();
+ unsigned long len = 0, c;
+
+ c = (unsigned char)*name;
+ do {
+ len++;
+ hash = partial_name_hash(c, hash);
+ c = (unsigned char)name[len];
+ } while (c && c != '/');
+ *hashp = end_name_hash(hash);
+ return len;
+}
+
/*
* Name resolution.
* This is the basic name resolution function, turning a pathname into
@@ -1391,31 +1422,22 @@ static int link_path_walk(const char *name, struct nameidata *nd)
/* At this point we know we have a real path component. */
for(;;) {
- unsigned long hash;
struct qstr this;
- unsigned int c;
+ long len;
int type;
err = may_lookup(nd);
if (err)
break;
+ len = hash_name(name, &this.hash);
this.name = name;
- c = *(const unsigned char *)name;
-
- hash = init_name_hash();
- do {
- name++;
- hash = partial_name_hash(c, hash);
- c = *(const unsigned char *)name;
- } while (c && (c != '/'));
- this.len = name - (const char *) this.name;
- this.hash = end_name_hash(hash);
+ this.len = len;
type = LAST_NORM;
- if (this.name[0] == '.') switch (this.len) {
+ if (name[0] == '.') switch (len) {
case 2:
- if (this.name[1] == '.') {
+ if (name[1] == '.') {
type = LAST_DOTDOT;
nd->flags |= LOOKUP_JUMPED;
}
@@ -1434,12 +1456,18 @@ static int link_path_walk(const char *name, struct nameidata *nd)
}
}
- /* remove trailing slashes? */
- if (!c)
+ if (!name[len])
goto last_component;
- while (*++name == '/');
- if (!*name)
+ /*
+ * If it wasn't NUL, we know it was '/'. Skip that
+ * slash, and continue until no more slashes.
+ */
+ do {
+ len++;
+ } while (unlikely(name[len] == '/'));
+ if (!name[len])
goto last_component;
+ name += len;
err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
if (err < 0)
@@ -1772,24 +1800,21 @@ static struct dentry *lookup_hash(struct nameidata *nd)
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
struct qstr this;
- unsigned long hash;
unsigned int c;
WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
this.name = name;
this.len = len;
+ this.hash = full_name_hash(name, len);
if (!len)
return ERR_PTR(-EACCES);
- hash = init_name_hash();
while (len--) {
c = *(const unsigned char *)name++;
if (c == '/' || c == '\0')
return ERR_PTR(-EACCES);
- hash = partial_name_hash(c, hash);
}
- this.hash = end_name_hash(hash);
/*
* See if the low-level filesystem might want
* to use its own hash..
@@ -1976,7 +2001,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
}
}
-int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
+int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
int error = may_create(dir, dentry);
@@ -2137,7 +2162,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
/* sayonara */
error = complete_walk(nd);
if (error)
- return ERR_PTR(-ECHILD);
+ return ERR_PTR(error);
error = -ENOTDIR;
if (nd->flags & LOOKUP_DIRECTORY) {
@@ -2177,7 +2202,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
/* Negative dentry, just create the file */
if (!dentry->d_inode) {
- int mode = op->mode;
+ umode_t mode = op->mode;
if (!IS_POSIXACL(dir->d_inode))
mode &= ~current_umask();
/*
@@ -2236,7 +2261,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
/* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
error = complete_walk(nd);
if (error)
- goto exit;
+ return ERR_PTR(error);
error = -EISDIR;
if (S_ISDIR(nd->inode->i_mode))
goto exit;
@@ -2444,7 +2469,7 @@ struct dentry *user_path_create(int dfd, const char __user *pathname, struct pat
}
EXPORT_SYMBOL(user_path_create);
-int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
int error = may_create(dir, dentry);
@@ -2472,7 +2497,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
return error;
}
-static int may_mknod(mode_t mode)
+static int may_mknod(umode_t mode)
{
switch (mode & S_IFMT) {
case S_IFREG:
@@ -2489,7 +2514,7 @@ static int may_mknod(mode_t mode)
}
}
-SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
+SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
unsigned, dev)
{
struct dentry *dentry;
@@ -2536,12 +2561,12 @@ out_dput:
return error;
}
-SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev)
+SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
return sys_mknodat(AT_FDCWD, filename, mode, dev);
}
-int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int error = may_create(dir, dentry);
@@ -2562,7 +2587,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
return error;
}
-SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
+SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
struct dentry *dentry;
struct path path;
@@ -2590,7 +2615,7 @@ out_dput:
return error;
}
-SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
+SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
return sys_mkdirat(AT_FDCWD, pathname, mode);
}
diff --git a/fs/namespace.c b/fs/namespace.c
index cfc6d4448aa5..e6081996c9a2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -9,30 +9,17 @@
*/
#include <linux/syscalls.h>
-#include <linux/slab.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-#include <linux/percpu.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/acct.h>
+#include <linux/export.h>
#include <linux/capability.h>
-#include <linux/cpumask.h>
-#include <linux/module.h>
-#include <linux/sysfs.h>
-#include <linux/seq_file.h>
#include <linux/mnt_namespace.h>
#include <linux/namei.h>
-#include <linux/nsproxy.h>
#include <linux/security.h>
-#include <linux/mount.h>
-#include <linux/ramfs.h>
-#include <linux/log2.h>
#include <linux/idr.h>
-#include <linux/fs_struct.h>
-#include <linux/fsnotify.h>
-#include <asm/uaccess.h>
-#include <asm/unistd.h>
+#include <linux/acct.h> /* acct_auto_close_mnt */
+#include <linux/ramfs.h> /* init_rootfs */
+#include <linux/fs_struct.h> /* get_fs_root et.al. */
+#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
+#include <linux/uaccess.h>
#include "pnode.h"
#include "internal.h"
@@ -78,7 +65,7 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
* allocation is serialized by namespace_sem, but we need the spinlock to
* serialize with freeing.
*/
-static int mnt_alloc_id(struct vfsmount *mnt)
+static int mnt_alloc_id(struct mount *mnt)
{
int res;
@@ -95,7 +82,7 @@ retry:
return res;
}
-static void mnt_free_id(struct vfsmount *mnt)
+static void mnt_free_id(struct mount *mnt)
{
int id = mnt->mnt_id;
spin_lock(&mnt_id_lock);
@@ -110,7 +97,7 @@ static void mnt_free_id(struct vfsmount *mnt)
*
* mnt_group_ida is protected by namespace_sem
*/
-static int mnt_alloc_group_id(struct vfsmount *mnt)
+static int mnt_alloc_group_id(struct mount *mnt)
{
int res;
@@ -129,7 +116,7 @@ static int mnt_alloc_group_id(struct vfsmount *mnt)
/*
* Release a peer group ID
*/
-void mnt_release_group_id(struct vfsmount *mnt)
+void mnt_release_group_id(struct mount *mnt)
{
int id = mnt->mnt_group_id;
ida_remove(&mnt_group_ida, id);
@@ -141,7 +128,7 @@ void mnt_release_group_id(struct vfsmount *mnt)
/*
* vfsmount lock must be held for read
*/
-static inline void mnt_add_count(struct vfsmount *mnt, int n)
+static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
this_cpu_add(mnt->mnt_pcp->mnt_count, n);
@@ -152,35 +139,10 @@ static inline void mnt_add_count(struct vfsmount *mnt, int n)
#endif
}
-static inline void mnt_set_count(struct vfsmount *mnt, int n)
-{
-#ifdef CONFIG_SMP
- this_cpu_write(mnt->mnt_pcp->mnt_count, n);
-#else
- mnt->mnt_count = n;
-#endif
-}
-
-/*
- * vfsmount lock must be held for read
- */
-static inline void mnt_inc_count(struct vfsmount *mnt)
-{
- mnt_add_count(mnt, 1);
-}
-
-/*
- * vfsmount lock must be held for read
- */
-static inline void mnt_dec_count(struct vfsmount *mnt)
-{
- mnt_add_count(mnt, -1);
-}
-
/*
* vfsmount lock must be held for write
*/
-unsigned int mnt_get_count(struct vfsmount *mnt)
+unsigned int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
unsigned int count = 0;
@@ -196,9 +158,9 @@ unsigned int mnt_get_count(struct vfsmount *mnt)
#endif
}
-static struct vfsmount *alloc_vfsmnt(const char *name)
+static struct mount *alloc_vfsmnt(const char *name)
{
- struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
+ struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
if (mnt) {
int err;
@@ -277,7 +239,7 @@ int __mnt_is_readonly(struct vfsmount *mnt)
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
-static inline void mnt_inc_writers(struct vfsmount *mnt)
+static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
this_cpu_inc(mnt->mnt_pcp->mnt_writers);
@@ -286,7 +248,7 @@ static inline void mnt_inc_writers(struct vfsmount *mnt)
#endif
}
-static inline void mnt_dec_writers(struct vfsmount *mnt)
+static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
this_cpu_dec(mnt->mnt_pcp->mnt_writers);
@@ -295,7 +257,7 @@ static inline void mnt_dec_writers(struct vfsmount *mnt)
#endif
}
-static unsigned int mnt_get_writers(struct vfsmount *mnt)
+static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
unsigned int count = 0;
@@ -311,6 +273,15 @@ static unsigned int mnt_get_writers(struct vfsmount *mnt)
#endif
}
+static int mnt_is_readonly(struct vfsmount *mnt)
+{
+ if (mnt->mnt_sb->s_readonly_remount)
+ return 1;
+ /* Order wrt setting s_flags/s_readonly_remount in do_remount() */
+ smp_rmb();
+ return __mnt_is_readonly(mnt);
+}
+
/*
* Most r/o checks on a fs are for operations that take
* discrete amounts of time, like a write() or unlink().
@@ -321,7 +292,7 @@ static unsigned int mnt_get_writers(struct vfsmount *mnt)
*/
/**
* mnt_want_write - get write access to a mount
- * @mnt: the mount on which to take a write
+ * @m: the mount on which to take a write
*
* This tells the low-level filesystem that a write is
* about to be performed to it, and makes sure that
@@ -329,8 +300,9 @@ static unsigned int mnt_get_writers(struct vfsmount *mnt)
* the write operation is finished, mnt_drop_write()
* must be called. This is effectively a refcount.
*/
-int mnt_want_write(struct vfsmount *mnt)
+int mnt_want_write(struct vfsmount *m)
{
+ struct mount *mnt = real_mount(m);
int ret = 0;
preempt_disable();
@@ -341,7 +313,7 @@ int mnt_want_write(struct vfsmount *mnt)
* incremented count after it has set MNT_WRITE_HOLD.
*/
smp_mb();
- while (mnt->mnt_flags & MNT_WRITE_HOLD)
+ while (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
cpu_relax();
/*
* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
@@ -349,12 +321,10 @@ int mnt_want_write(struct vfsmount *mnt)
* MNT_WRITE_HOLD is cleared.
*/
smp_rmb();
- if (__mnt_is_readonly(mnt)) {
+ if (mnt_is_readonly(m)) {
mnt_dec_writers(mnt);
ret = -EROFS;
- goto out;
}
-out:
preempt_enable();
return ret;
}
@@ -378,7 +348,7 @@ int mnt_clone_write(struct vfsmount *mnt)
if (__mnt_is_readonly(mnt))
return -EROFS;
preempt_disable();
- mnt_inc_writers(mnt);
+ mnt_inc_writers(real_mount(mnt));
preempt_enable();
return 0;
}
@@ -412,17 +382,23 @@ EXPORT_SYMBOL_GPL(mnt_want_write_file);
void mnt_drop_write(struct vfsmount *mnt)
{
preempt_disable();
- mnt_dec_writers(mnt);
+ mnt_dec_writers(real_mount(mnt));
preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
-static int mnt_make_readonly(struct vfsmount *mnt)
+void mnt_drop_write_file(struct file *file)
+{
+ mnt_drop_write(file->f_path.mnt);
+}
+EXPORT_SYMBOL(mnt_drop_write_file);
+
+static int mnt_make_readonly(struct mount *mnt)
{
int ret = 0;
br_write_lock(vfsmount_lock);
- mnt->mnt_flags |= MNT_WRITE_HOLD;
+ mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
/*
* After storing MNT_WRITE_HOLD, we'll read the counters. This store
* should be visible before we do.
@@ -448,25 +424,61 @@ static int mnt_make_readonly(struct vfsmount *mnt)
if (mnt_get_writers(mnt) > 0)
ret = -EBUSY;
else
- mnt->mnt_flags |= MNT_READONLY;
+ mnt->mnt.mnt_flags |= MNT_READONLY;
/*
* MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
* that become unheld will see MNT_READONLY.
*/
smp_wmb();
- mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+ mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
br_write_unlock(vfsmount_lock);
return ret;
}
-static void __mnt_unmake_readonly(struct vfsmount *mnt)
+static void __mnt_unmake_readonly(struct mount *mnt)
{
br_write_lock(vfsmount_lock);
- mnt->mnt_flags &= ~MNT_READONLY;
+ mnt->mnt.mnt_flags &= ~MNT_READONLY;
+ br_write_unlock(vfsmount_lock);
+}
+
+int sb_prepare_remount_readonly(struct super_block *sb)
+{
+ struct mount *mnt;
+ int err = 0;
+
+ /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
+ if (atomic_long_read(&sb->s_remove_count))
+ return -EBUSY;
+
+ br_write_lock(vfsmount_lock);
+ list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+ if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
+ mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
+ smp_mb();
+ if (mnt_get_writers(mnt) > 0) {
+ err = -EBUSY;
+ break;
+ }
+ }
+ }
+ if (!err && atomic_long_read(&sb->s_remove_count))
+ err = -EBUSY;
+
+ if (!err) {
+ sb->s_readonly_remount = 1;
+ smp_wmb();
+ }
+ list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+ if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
+ mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+ }
br_write_unlock(vfsmount_lock);
+
+ return err;
}
-static void free_vfsmnt(struct vfsmount *mnt)
+static void free_vfsmnt(struct mount *mnt)
{
kfree(mnt->mnt_devname);
mnt_free_id(mnt);
@@ -481,20 +493,20 @@ static void free_vfsmnt(struct vfsmount *mnt)
* @dir. If @dir is set return the first mount else return the last mount.
* vfsmount_lock must be held for read or write.
*/
-struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
+struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
int dir)
{
struct list_head *head = mount_hashtable + hash(mnt, dentry);
struct list_head *tmp = head;
- struct vfsmount *p, *found = NULL;
+ struct mount *p, *found = NULL;
for (;;) {
tmp = dir ? tmp->next : tmp->prev;
p = NULL;
if (tmp == head)
break;
- p = list_entry(tmp, struct vfsmount, mnt_hash);
- if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
+ p = list_entry(tmp, struct mount, mnt_hash);
+ if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) {
found = p;
break;
}
@@ -508,16 +520,21 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
*/
struct vfsmount *lookup_mnt(struct path *path)
{
- struct vfsmount *child_mnt;
+ struct mount *child_mnt;
br_read_lock(vfsmount_lock);
- if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
- mntget(child_mnt);
- br_read_unlock(vfsmount_lock);
- return child_mnt;
+ child_mnt = __lookup_mnt(path->mnt, path->dentry, 1);
+ if (child_mnt) {
+ mnt_add_count(child_mnt, 1);
+ br_read_unlock(vfsmount_lock);
+ return &child_mnt->mnt;
+ } else {
+ br_read_unlock(vfsmount_lock);
+ return NULL;
+ }
}
-static inline int check_mnt(struct vfsmount *mnt)
+static inline int check_mnt(struct mount *mnt)
{
return mnt->mnt_ns == current->nsproxy->mnt_ns;
}
@@ -548,12 +565,12 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
* Clear dentry's mounted state if it has no remaining mounts.
* vfsmount_lock must be held for write.
*/
-static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry)
+static void dentry_reset_mounted(struct dentry *dentry)
{
unsigned u;
for (u = 0; u < HASH_SIZE; u++) {
- struct vfsmount *p;
+ struct mount *p;
list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
if (p->mnt_mountpoint == dentry)
@@ -568,25 +585,26 @@ static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry)
/*
* vfsmount lock must be held for write
*/
-static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
+static void detach_mnt(struct mount *mnt, struct path *old_path)
{
old_path->dentry = mnt->mnt_mountpoint;
- old_path->mnt = mnt->mnt_parent;
+ old_path->mnt = &mnt->mnt_parent->mnt;
mnt->mnt_parent = mnt;
- mnt->mnt_mountpoint = mnt->mnt_root;
+ mnt->mnt_mountpoint = mnt->mnt.mnt_root;
list_del_init(&mnt->mnt_child);
list_del_init(&mnt->mnt_hash);
- dentry_reset_mounted(old_path->mnt, old_path->dentry);
+ dentry_reset_mounted(old_path->dentry);
}
/*
* vfsmount lock must be held for write
*/
-void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
- struct vfsmount *child_mnt)
+void mnt_set_mountpoint(struct mount *mnt, struct dentry *dentry,
+ struct mount *child_mnt)
{
- child_mnt->mnt_parent = mntget(mnt);
+ mnt_add_count(mnt, 1); /* essentially, that's mntget */
child_mnt->mnt_mountpoint = dget(dentry);
+ child_mnt->mnt_parent = mnt;
spin_lock(&dentry->d_lock);
dentry->d_flags |= DCACHE_MOUNTED;
spin_unlock(&dentry->d_lock);
@@ -595,15 +613,15 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
/*
* vfsmount lock must be held for write
*/
-static void attach_mnt(struct vfsmount *mnt, struct path *path)
+static void attach_mnt(struct mount *mnt, struct path *path)
{
- mnt_set_mountpoint(path->mnt, path->dentry, mnt);
+ mnt_set_mountpoint(real_mount(path->mnt), path->dentry, mnt);
list_add_tail(&mnt->mnt_hash, mount_hashtable +
hash(path->mnt, path->dentry));
- list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
+ list_add_tail(&mnt->mnt_child, &real_mount(path->mnt)->mnt_mounts);
}
-static inline void __mnt_make_longterm(struct vfsmount *mnt)
+static inline void __mnt_make_longterm(struct mount *mnt)
{
#ifdef CONFIG_SMP
atomic_inc(&mnt->mnt_longterm);
@@ -611,7 +629,7 @@ static inline void __mnt_make_longterm(struct vfsmount *mnt)
}
/* needs vfsmount lock for write */
-static inline void __mnt_make_shortterm(struct vfsmount *mnt)
+static inline void __mnt_make_shortterm(struct mount *mnt)
{
#ifdef CONFIG_SMP
atomic_dec(&mnt->mnt_longterm);
@@ -621,10 +639,10 @@ static inline void __mnt_make_shortterm(struct vfsmount *mnt)
/*
* vfsmount lock must be held for write
*/
-static void commit_tree(struct vfsmount *mnt)
+static void commit_tree(struct mount *mnt)
{
- struct vfsmount *parent = mnt->mnt_parent;
- struct vfsmount *m;
+ struct mount *parent = mnt->mnt_parent;
+ struct mount *m;
LIST_HEAD(head);
struct mnt_namespace *n = parent->mnt_ns;
@@ -639,12 +657,12 @@ static void commit_tree(struct vfsmount *mnt)
list_splice(&head, n->list.prev);
list_add_tail(&mnt->mnt_hash, mount_hashtable +
- hash(parent, mnt->mnt_mountpoint));
+ hash(&parent->mnt, mnt->mnt_mountpoint));
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
touch_mnt_namespace(n);
}
-static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
+static struct mount *next_mnt(struct mount *p, struct mount *root)
{
struct list_head *next = p->mnt_mounts.next;
if (next == &p->mnt_mounts) {
@@ -657,14 +675,14 @@ static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
p = p->mnt_parent;
}
}
- return list_entry(next, struct vfsmount, mnt_child);
+ return list_entry(next, struct mount, mnt_child);
}
-static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
+static struct mount *skip_mnt_tree(struct mount *p)
{
struct list_head *prev = p->mnt_mounts.prev;
while (prev != &p->mnt_mounts) {
- p = list_entry(prev, struct vfsmount, mnt_child);
+ p = list_entry(prev, struct mount, mnt_child);
prev = p->mnt_mounts.prev;
}
return p;
@@ -673,7 +691,7 @@ static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
- struct vfsmount *mnt;
+ struct mount *mnt;
struct dentry *root;
if (!type)
@@ -684,7 +702,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
return ERR_PTR(-ENOMEM);
if (flags & MS_KERNMOUNT)
- mnt->mnt_flags = MNT_INTERNAL;
+ mnt->mnt.mnt_flags = MNT_INTERNAL;
root = mount_fs(type, flags, name, data);
if (IS_ERR(root)) {
@@ -692,19 +710,22 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
return ERR_CAST(root);
}
- mnt->mnt_root = root;
- mnt->mnt_sb = root->d_sb;
- mnt->mnt_mountpoint = mnt->mnt_root;
+ mnt->mnt.mnt_root = root;
+ mnt->mnt.mnt_sb = root->d_sb;
+ mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
- return mnt;
+ br_write_lock(vfsmount_lock);
+ list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
+ br_write_unlock(vfsmount_lock);
+ return &mnt->mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
-static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
+static struct mount *clone_mnt(struct mount *old, struct dentry *root,
int flag)
{
- struct super_block *sb = old->mnt_sb;
- struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
+ struct super_block *sb = old->mnt.mnt_sb;
+ struct mount *mnt = alloc_vfsmnt(old->mnt_devname);
if (mnt) {
if (flag & (CL_SLAVE | CL_PRIVATE))
@@ -718,12 +739,15 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
goto out_free;
}
- mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
+ mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD;
atomic_inc(&sb->s_active);
- mnt->mnt_sb = sb;
- mnt->mnt_root = dget(root);
- mnt->mnt_mountpoint = mnt->mnt_root;
+ mnt->mnt.mnt_sb = sb;
+ mnt->mnt.mnt_root = dget(root);
+ mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
+ br_write_lock(vfsmount_lock);
+ list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
+ br_write_unlock(vfsmount_lock);
if (flag & CL_SLAVE) {
list_add(&mnt->mnt_slave, &old->mnt_slave_list);
@@ -753,9 +777,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
return NULL;
}
-static inline void mntfree(struct vfsmount *mnt)
+static inline void mntfree(struct mount *mnt)
{
- struct super_block *sb = mnt->mnt_sb;
+ struct vfsmount *m = &mnt->mnt;
+ struct super_block *sb = m->mnt_sb;
/*
* This probably indicates that somebody messed
@@ -768,32 +793,32 @@ static inline void mntfree(struct vfsmount *mnt)
* so mnt_get_writers() below is safe.
*/
WARN_ON(mnt_get_writers(mnt));
- fsnotify_vfsmount_delete(mnt);
- dput(mnt->mnt_root);
+ fsnotify_vfsmount_delete(m);
+ dput(m->mnt_root);
free_vfsmnt(mnt);
deactivate_super(sb);
}
-static void mntput_no_expire(struct vfsmount *mnt)
+static void mntput_no_expire(struct mount *mnt)
{
put_again:
#ifdef CONFIG_SMP
br_read_lock(vfsmount_lock);
if (likely(atomic_read(&mnt->mnt_longterm))) {
- mnt_dec_count(mnt);
+ mnt_add_count(mnt, -1);
br_read_unlock(vfsmount_lock);
return;
}
br_read_unlock(vfsmount_lock);
br_write_lock(vfsmount_lock);
- mnt_dec_count(mnt);
+ mnt_add_count(mnt, -1);
if (mnt_get_count(mnt)) {
br_write_unlock(vfsmount_lock);
return;
}
#else
- mnt_dec_count(mnt);
+ mnt_add_count(mnt, -1);
if (likely(mnt_get_count(mnt)))
return;
br_write_lock(vfsmount_lock);
@@ -802,9 +827,10 @@ put_again:
mnt_add_count(mnt, mnt->mnt_pinned + 1);
mnt->mnt_pinned = 0;
br_write_unlock(vfsmount_lock);
- acct_auto_close_mnt(mnt);
+ acct_auto_close_mnt(&mnt->mnt);
goto put_again;
}
+ list_del(&mnt->mnt_instance);
br_write_unlock(vfsmount_lock);
mntfree(mnt);
}
@@ -812,10 +838,11 @@ put_again:
void mntput(struct vfsmount *mnt)
{
if (mnt) {
+ struct mount *m = real_mount(mnt);
/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
- if (unlikely(mnt->mnt_expiry_mark))
- mnt->mnt_expiry_mark = 0;
- mntput_no_expire(mnt);
+ if (unlikely(m->mnt_expiry_mark))
+ m->mnt_expiry_mark = 0;
+ mntput_no_expire(m);
}
}
EXPORT_SYMBOL(mntput);
@@ -823,7 +850,7 @@ EXPORT_SYMBOL(mntput);
struct vfsmount *mntget(struct vfsmount *mnt)
{
if (mnt)
- mnt_inc_count(mnt);
+ mnt_add_count(real_mount(mnt), 1);
return mnt;
}
EXPORT_SYMBOL(mntget);
@@ -831,16 +858,17 @@ EXPORT_SYMBOL(mntget);
void mnt_pin(struct vfsmount *mnt)
{
br_write_lock(vfsmount_lock);
- mnt->mnt_pinned++;
+ real_mount(mnt)->mnt_pinned++;
br_write_unlock(vfsmount_lock);
}
EXPORT_SYMBOL(mnt_pin);
-void mnt_unpin(struct vfsmount *mnt)
+void mnt_unpin(struct vfsmount *m)
{
+ struct mount *mnt = real_mount(m);
br_write_lock(vfsmount_lock);
if (mnt->mnt_pinned) {
- mnt_inc_count(mnt);
+ mnt_add_count(mnt, 1);
mnt->mnt_pinned--;
}
br_write_unlock(vfsmount_lock);
@@ -858,12 +886,12 @@ static inline void mangle(struct seq_file *m, const char *s)
*
* See also save_mount_options().
*/
-int generic_show_options(struct seq_file *m, struct vfsmount *mnt)
+int generic_show_options(struct seq_file *m, struct dentry *root)
{
const char *options;
rcu_read_lock();
- options = rcu_dereference(mnt->mnt_sb->s_options);
+ options = rcu_dereference(root->d_sb->s_options);
if (options != NULL && options[0]) {
seq_putc(m, ',');
@@ -907,10 +935,10 @@ void replace_mount_options(struct super_block *sb, char *options)
EXPORT_SYMBOL(replace_mount_options);
#ifdef CONFIG_PROC_FS
-/* iterator */
+/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
- struct proc_mounts *p = m->private;
+ struct proc_mounts *p = container_of(m, struct proc_mounts, m);
down_read(&namespace_sem);
return seq_list_start(&p->ns->list, *pos);
@@ -918,7 +946,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct proc_mounts *p = m->private;
+ struct proc_mounts *p = container_of(m, struct proc_mounts, m);
return seq_list_next(v, &p->ns->list, pos);
}
@@ -928,219 +956,18 @@ static void m_stop(struct seq_file *m, void *v)
up_read(&namespace_sem);
}
-int mnt_had_events(struct proc_mounts *p)
+static int m_show(struct seq_file *m, void *v)
{
- struct mnt_namespace *ns = p->ns;
- int res = 0;
-
- br_read_lock(vfsmount_lock);
- if (p->m.poll_event != ns->event) {
- p->m.poll_event = ns->event;
- res = 1;
- }
- br_read_unlock(vfsmount_lock);
-
- return res;
-}
-
-struct proc_fs_info {
- int flag;
- const char *str;
-};
-
-static int show_sb_opts(struct seq_file *m, struct super_block *sb)
-{
- static const struct proc_fs_info fs_info[] = {
- { MS_SYNCHRONOUS, ",sync" },
- { MS_DIRSYNC, ",dirsync" },
- { MS_MANDLOCK, ",mand" },
- { 0, NULL }
- };
- const struct proc_fs_info *fs_infop;
-
- for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
- if (sb->s_flags & fs_infop->flag)
- seq_puts(m, fs_infop->str);
- }
-
- return security_sb_show_options(m, sb);
-}
-
-static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
-{
- static const struct proc_fs_info mnt_info[] = {
- { MNT_NOSUID, ",nosuid" },
- { MNT_NODEV, ",nodev" },
- { MNT_NOEXEC, ",noexec" },
- { MNT_NOATIME, ",noatime" },
- { MNT_NODIRATIME, ",nodiratime" },
- { MNT_RELATIME, ",relatime" },
- { 0, NULL }
- };
- const struct proc_fs_info *fs_infop;
-
- for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
- if (mnt->mnt_flags & fs_infop->flag)
- seq_puts(m, fs_infop->str);
- }
-}
-
-static void show_type(struct seq_file *m, struct super_block *sb)
-{
- mangle(m, sb->s_type->name);
- if (sb->s_subtype && sb->s_subtype[0]) {
- seq_putc(m, '.');
- mangle(m, sb->s_subtype);
- }
-}
-
-static int show_vfsmnt(struct seq_file *m, void *v)
-{
- struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
- int err = 0;
- struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
-
- if (mnt->mnt_sb->s_op->show_devname) {
- err = mnt->mnt_sb->s_op->show_devname(m, mnt);
- if (err)
- goto out;
- } else {
- mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
- }
- seq_putc(m, ' ');
- seq_path(m, &mnt_path, " \t\n\\");
- seq_putc(m, ' ');
- show_type(m, mnt->mnt_sb);
- seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
- err = show_sb_opts(m, mnt->mnt_sb);
- if (err)
- goto out;
- show_mnt_opts(m, mnt);
- if (mnt->mnt_sb->s_op->show_options)
- err = mnt->mnt_sb->s_op->show_options(m, mnt);
- seq_puts(m, " 0 0\n");
-out:
- return err;
+ struct proc_mounts *p = container_of(m, struct proc_mounts, m);
+ struct mount *r = list_entry(v, struct mount, mnt_list);
+ return p->show(m, &r->mnt);
}
const struct seq_operations mounts_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_vfsmnt
-};
-
-static int show_mountinfo(struct seq_file *m, void *v)
-{
- struct proc_mounts *p = m->private;
- struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
- struct super_block *sb = mnt->mnt_sb;
- struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
- struct path root = p->root;
- int err = 0;
-
- seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id,
- MAJOR(sb->s_dev), MINOR(sb->s_dev));
- if (sb->s_op->show_path)
- err = sb->s_op->show_path(m, mnt);
- else
- seq_dentry(m, mnt->mnt_root, " \t\n\\");
- if (err)
- goto out;
- seq_putc(m, ' ');
-
- /* mountpoints outside of chroot jail will give SEQ_SKIP on this */
- err = seq_path_root(m, &mnt_path, &root, " \t\n\\");
- if (err)
- goto out;
-
- seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
- show_mnt_opts(m, mnt);
-
- /* Tagged fields ("foo:X" or "bar") */
- if (IS_MNT_SHARED(mnt))
- seq_printf(m, " shared:%i", mnt->mnt_group_id);
- if (IS_MNT_SLAVE(mnt)) {
- int master = mnt->mnt_master->mnt_group_id;
- int dom = get_dominating_id(mnt, &p->root);
- seq_printf(m, " master:%i", master);
- if (dom && dom != master)
- seq_printf(m, " propagate_from:%i", dom);
- }
- if (IS_MNT_UNBINDABLE(mnt))
- seq_puts(m, " unbindable");
-
- /* Filesystem specific data */
- seq_puts(m, " - ");
- show_type(m, sb);
- seq_putc(m, ' ');
- if (sb->s_op->show_devname)
- err = sb->s_op->show_devname(m, mnt);
- else
- mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
- if (err)
- goto out;
- seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
- err = show_sb_opts(m, sb);
- if (err)
- goto out;
- if (sb->s_op->show_options)
- err = sb->s_op->show_options(m, mnt);
- seq_putc(m, '\n');
-out:
- return err;
-}
-
-const struct seq_operations mountinfo_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_mountinfo,
-};
-
-static int show_vfsstat(struct seq_file *m, void *v)
-{
- struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
- struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
- int err = 0;
-
- /* device */
- if (mnt->mnt_sb->s_op->show_devname) {
- seq_puts(m, "device ");
- err = mnt->mnt_sb->s_op->show_devname(m, mnt);
- } else {
- if (mnt->mnt_devname) {
- seq_puts(m, "device ");
- mangle(m, mnt->mnt_devname);
- } else
- seq_puts(m, "no device");
- }
-
- /* mount point */
- seq_puts(m, " mounted on ");
- seq_path(m, &mnt_path, " \t\n\\");
- seq_putc(m, ' ');
-
- /* file system type */
- seq_puts(m, "with fstype ");
- show_type(m, mnt->mnt_sb);
-
- /* optional statistics */
- if (mnt->mnt_sb->s_op->show_stats) {
- seq_putc(m, ' ');
- if (!err)
- err = mnt->mnt_sb->s_op->show_stats(m, mnt);
- }
-
- seq_putc(m, '\n');
- return err;
-}
-
-const struct seq_operations mountstats_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_vfsstat,
+ .show = m_show,
};
#endif /* CONFIG_PROC_FS */
@@ -1152,11 +979,13 @@ const struct seq_operations mountstats_op = {
* open files, pwds, chroots or sub mounts that are
* busy.
*/
-int may_umount_tree(struct vfsmount *mnt)
+int may_umount_tree(struct vfsmount *m)
{
+ struct mount *mnt = real_mount(m);
int actual_refs = 0;
int minimum_refs = 0;
- struct vfsmount *p;
+ struct mount *p;
+ BUG_ON(!m);
/* write lock needed for mnt_get_count */
br_write_lock(vfsmount_lock);
@@ -1192,7 +1021,7 @@ int may_umount(struct vfsmount *mnt)
int ret = 1;
down_read(&namespace_sem);
br_write_lock(vfsmount_lock);
- if (propagate_mount_busy(mnt, 2))
+ if (propagate_mount_busy(real_mount(mnt), 2))
ret = 0;
br_write_unlock(vfsmount_lock);
up_read(&namespace_sem);
@@ -1203,25 +1032,25 @@ EXPORT_SYMBOL(may_umount);
void release_mounts(struct list_head *head)
{
- struct vfsmount *mnt;
+ struct mount *mnt;
while (!list_empty(head)) {
- mnt = list_first_entry(head, struct vfsmount, mnt_hash);
+ mnt = list_first_entry(head, struct mount, mnt_hash);
list_del_init(&mnt->mnt_hash);
- if (mnt->mnt_parent != mnt) {
+ if (mnt_has_parent(mnt)) {
struct dentry *dentry;
- struct vfsmount *m;
+ struct mount *m;
br_write_lock(vfsmount_lock);
dentry = mnt->mnt_mountpoint;
m = mnt->mnt_parent;
- mnt->mnt_mountpoint = mnt->mnt_root;
+ mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
m->mnt_ghosts--;
br_write_unlock(vfsmount_lock);
dput(dentry);
- mntput(m);
+ mntput(&m->mnt);
}
- mntput(mnt);
+ mntput(&mnt->mnt);
}
}
@@ -1229,10 +1058,10 @@ void release_mounts(struct list_head *head)
* vfsmount lock must be held for write
* namespace_sem must be held for write
*/
-void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
+void umount_tree(struct mount *mnt, int propagate, struct list_head *kill)
{
LIST_HEAD(tmp_list);
- struct vfsmount *p;
+ struct mount *p;
for (p = mnt; p; p = next_mnt(p, mnt))
list_move(&p->mnt_hash, &tmp_list);
@@ -1247,24 +1076,24 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
p->mnt_ns = NULL;
__mnt_make_shortterm(p);
list_del_init(&p->mnt_child);
- if (p->mnt_parent != p) {
+ if (mnt_has_parent(p)) {
p->mnt_parent->mnt_ghosts++;
- dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint);
+ dentry_reset_mounted(p->mnt_mountpoint);
}
change_mnt_propagation(p, MS_PRIVATE);
}
list_splice(&tmp_list, kill);
}
-static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
+static void shrink_submounts(struct mount *mnt, struct list_head *umounts);
-static int do_umount(struct vfsmount *mnt, int flags)
+static int do_umount(struct mount *mnt, int flags)
{
- struct super_block *sb = mnt->mnt_sb;
+ struct super_block *sb = mnt->mnt.mnt_sb;
int retval;
LIST_HEAD(umount_list);
- retval = security_sb_umount(mnt, flags);
+ retval = security_sb_umount(&mnt->mnt, flags);
if (retval)
return retval;
@@ -1275,7 +1104,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
* (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
*/
if (flags & MNT_EXPIRE) {
- if (mnt == current->fs->root.mnt ||
+ if (&mnt->mnt == current->fs->root.mnt ||
flags & (MNT_FORCE | MNT_DETACH))
return -EINVAL;
@@ -1317,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
* /reboot - static binary that would close all descriptors and
* call reboot(9). Then init(8) could umount root and exec /reboot.
*/
- if (mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
+ if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
/*
* Special case for "unmounting" root ...
* we just try to remount it readonly.
@@ -1359,6 +1188,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
struct path path;
+ struct mount *mnt;
int retval;
int lookup_flags = 0;
@@ -1371,21 +1201,22 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
if (retval)
goto out;
+ mnt = real_mount(path.mnt);
retval = -EINVAL;
if (path.dentry != path.mnt->mnt_root)
goto dput_and_out;
- if (!check_mnt(path.mnt))
+ if (!check_mnt(mnt))
goto dput_and_out;
retval = -EPERM;
if (!capable(CAP_SYS_ADMIN))
goto dput_and_out;
- retval = do_umount(path.mnt, flags);
+ retval = do_umount(mnt, flags);
dput_and_out:
/* we mustn't call path_put() as that would clear mnt_expiry_mark */
dput(path.dentry);
- mntput_no_expire(path.mnt);
+ mntput_no_expire(mnt);
out:
return retval;
}
@@ -1420,10 +1251,10 @@ static int mount_is_safe(struct path *path)
#endif
}
-struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
+struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
int flag)
{
- struct vfsmount *res, *p, *q, *r, *s;
+ struct mount *res, *p, *q, *r;
struct path path;
if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
@@ -1436,6 +1267,7 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
p = mnt;
list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
+ struct mount *s;
if (!is_subdir(r->mnt_mountpoint, dentry))
continue;
@@ -1449,9 +1281,9 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
q = q->mnt_parent;
}
p = s;
- path.mnt = q;
+ path.mnt = &q->mnt;
path.dentry = p->mnt_mountpoint;
- q = clone_mnt(p, p->mnt_root, flag);
+ q = clone_mnt(p, p->mnt.mnt_root, flag);
if (!q)
goto Enomem;
br_write_lock(vfsmount_lock);
@@ -1474,11 +1306,12 @@ Enomem:
struct vfsmount *collect_mounts(struct path *path)
{
- struct vfsmount *tree;
+ struct mount *tree;
down_write(&namespace_sem);
- tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE);
+ tree = copy_tree(real_mount(path->mnt), path->dentry,
+ CL_COPY_ALL | CL_PRIVATE);
up_write(&namespace_sem);
- return tree;
+ return tree ? &tree->mnt : NULL;
}
void drop_collected_mounts(struct vfsmount *mnt)
@@ -1486,7 +1319,7 @@ void drop_collected_mounts(struct vfsmount *mnt)
LIST_HEAD(umount_list);
down_write(&namespace_sem);
br_write_lock(vfsmount_lock);
- umount_tree(mnt, 0, &umount_list);
+ umount_tree(real_mount(mnt), 0, &umount_list);
br_write_unlock(vfsmount_lock);
up_write(&namespace_sem);
release_mounts(&umount_list);
@@ -1495,21 +1328,21 @@ void drop_collected_mounts(struct vfsmount *mnt)
int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
struct vfsmount *root)
{
- struct vfsmount *mnt;
+ struct mount *mnt;
int res = f(root, arg);
if (res)
return res;
- list_for_each_entry(mnt, &root->mnt_list, mnt_list) {
- res = f(mnt, arg);
+ list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
+ res = f(&mnt->mnt, arg);
if (res)
return res;
}
return 0;
}
-static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)
+static void cleanup_group_ids(struct mount *mnt, struct mount *end)
{
- struct vfsmount *p;
+ struct mount *p;
for (p = mnt; p != end; p = next_mnt(p, mnt)) {
if (p->mnt_group_id && !IS_MNT_SHARED(p))
@@ -1517,9 +1350,9 @@ static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)
}
}
-static int invent_group_ids(struct vfsmount *mnt, bool recurse)
+static int invent_group_ids(struct mount *mnt, bool recurse)
{
- struct vfsmount *p;
+ struct mount *p;
for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
@@ -1597,13 +1430,13 @@ static int invent_group_ids(struct vfsmount *mnt, bool recurse)
* Must be called without spinlocks held, since this function can sleep
* in allocations.
*/
-static int attach_recursive_mnt(struct vfsmount *source_mnt,
+static int attach_recursive_mnt(struct mount *source_mnt,
struct path *path, struct path *parent_path)
{
LIST_HEAD(tree_list);
- struct vfsmount *dest_mnt = path->mnt;
+ struct mount *dest_mnt = real_mount(path->mnt);
struct dentry *dest_dentry = path->dentry;
- struct vfsmount *child, *p;
+ struct mount *child, *p;
int err;
if (IS_MNT_SHARED(dest_mnt)) {
@@ -1624,7 +1457,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
if (parent_path) {
detach_mnt(source_mnt, parent_path);
attach_mnt(source_mnt, path);
- touch_mnt_namespace(parent_path->mnt->mnt_ns);
+ touch_mnt_namespace(source_mnt->mnt_ns);
} else {
mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
commit_tree(source_mnt);
@@ -1672,13 +1505,13 @@ static void unlock_mount(struct path *path)
mutex_unlock(&path->dentry->d_inode->i_mutex);
}
-static int graft_tree(struct vfsmount *mnt, struct path *path)
+static int graft_tree(struct mount *mnt, struct path *path)
{
- if (mnt->mnt_sb->s_flags & MS_NOUSER)
+ if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
return -EINVAL;
if (S_ISDIR(path->dentry->d_inode->i_mode) !=
- S_ISDIR(mnt->mnt_root->d_inode->i_mode))
+ S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
return -ENOTDIR;
if (d_unlinked(path->dentry))
@@ -1709,7 +1542,8 @@ static int flags_to_propagation_type(int flags)
*/
static int do_change_type(struct path *path, int flag)
{
- struct vfsmount *m, *mnt = path->mnt;
+ struct mount *m;
+ struct mount *mnt = real_mount(path->mnt);
int recurse = flag & MS_REC;
int type;
int err = 0;
@@ -1749,7 +1583,7 @@ static int do_loopback(struct path *path, char *old_name,
{
LIST_HEAD(umount_list);
struct path old_path;
- struct vfsmount *mnt = NULL;
+ struct mount *mnt = NULL, *old;
int err = mount_is_safe(path);
if (err)
return err;
@@ -1763,18 +1597,20 @@ static int do_loopback(struct path *path, char *old_name,
if (err)
goto out;
+ old = real_mount(old_path.mnt);
+
err = -EINVAL;
- if (IS_MNT_UNBINDABLE(old_path.mnt))
+ if (IS_MNT_UNBINDABLE(old))
goto out2;
- if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
+ if (!check_mnt(real_mount(path->mnt)) || !check_mnt(old))
goto out2;
err = -ENOMEM;
if (recurse)
- mnt = copy_tree(old_path.mnt, old_path.dentry, 0);
+ mnt = copy_tree(old, old_path.dentry, 0);
else
- mnt = clone_mnt(old_path.mnt, old_path.dentry, 0);
+ mnt = clone_mnt(old, old_path.dentry, 0);
if (!mnt)
goto out2;
@@ -1804,9 +1640,9 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
return 0;
if (readonly_request)
- error = mnt_make_readonly(mnt);
+ error = mnt_make_readonly(real_mount(mnt));
else
- __mnt_unmake_readonly(mnt);
+ __mnt_unmake_readonly(real_mount(mnt));
return error;
}
@@ -1820,11 +1656,12 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
{
int err;
struct super_block *sb = path->mnt->mnt_sb;
+ struct mount *mnt = real_mount(path->mnt);
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!check_mnt(path->mnt))
+ if (!check_mnt(mnt))
return -EINVAL;
if (path->dentry != path->mnt->mnt_root)
@@ -1841,22 +1678,22 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
err = do_remount_sb(sb, flags, data, 0);
if (!err) {
br_write_lock(vfsmount_lock);
- mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
- path->mnt->mnt_flags = mnt_flags;
+ mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
+ mnt->mnt.mnt_flags = mnt_flags;
br_write_unlock(vfsmount_lock);
}
up_write(&sb->s_umount);
if (!err) {
br_write_lock(vfsmount_lock);
- touch_mnt_namespace(path->mnt->mnt_ns);
+ touch_mnt_namespace(mnt->mnt_ns);
br_write_unlock(vfsmount_lock);
}
return err;
}
-static inline int tree_contains_unbindable(struct vfsmount *mnt)
+static inline int tree_contains_unbindable(struct mount *mnt)
{
- struct vfsmount *p;
+ struct mount *p;
for (p = mnt; p; p = next_mnt(p, mnt)) {
if (IS_MNT_UNBINDABLE(p))
return 1;
@@ -1867,7 +1704,8 @@ static inline int tree_contains_unbindable(struct vfsmount *mnt)
static int do_move_mount(struct path *path, char *old_name)
{
struct path old_path, parent_path;
- struct vfsmount *p;
+ struct mount *p;
+ struct mount *old;
int err = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1881,8 +1719,11 @@ static int do_move_mount(struct path *path, char *old_name)
if (err < 0)
goto out;
+ old = real_mount(old_path.mnt);
+ p = real_mount(path->mnt);
+
err = -EINVAL;
- if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
+ if (!check_mnt(p) || !check_mnt(old))
goto out1;
if (d_unlinked(path->dentry))
@@ -1892,7 +1733,7 @@ static int do_move_mount(struct path *path, char *old_name)
if (old_path.dentry != old_path.mnt->mnt_root)
goto out1;
- if (old_path.mnt == old_path.mnt->mnt_parent)
+ if (!mnt_has_parent(old))
goto out1;
if (S_ISDIR(path->dentry->d_inode->i_mode) !=
@@ -1901,28 +1742,26 @@ static int do_move_mount(struct path *path, char *old_name)
/*
* Don't move a mount residing in a shared parent.
*/
- if (old_path.mnt->mnt_parent &&
- IS_MNT_SHARED(old_path.mnt->mnt_parent))
+ if (IS_MNT_SHARED(old->mnt_parent))
goto out1;
/*
* Don't move a mount tree containing unbindable mounts to a destination
* mount which is shared.
*/
- if (IS_MNT_SHARED(path->mnt) &&
- tree_contains_unbindable(old_path.mnt))
+ if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
goto out1;
err = -ELOOP;
- for (p = path->mnt; p->mnt_parent != p; p = p->mnt_parent)
- if (p == old_path.mnt)
+ for (; mnt_has_parent(p); p = p->mnt_parent)
+ if (p == old)
goto out1;
- err = attach_recursive_mnt(old_path.mnt, path, &parent_path);
+ err = attach_recursive_mnt(old, path, &parent_path);
if (err)
goto out1;
/* if the mount is moved, it should no longer be expire
* automatically */
- list_del_init(&old_path.mnt->mnt_expire);
+ list_del_init(&old->mnt_expire);
out1:
unlock_mount(path);
out:
@@ -1955,7 +1794,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
return ERR_PTR(err);
}
-struct vfsmount *
+static struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
struct file_system_type *type = get_fs_type(fstype);
@@ -1969,12 +1808,11 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data)
put_filesystem(type);
return mnt;
}
-EXPORT_SYMBOL_GPL(do_kern_mount);
/*
* add a mount into a namespace's mount tree
*/
-static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
+static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
{
int err;
@@ -1985,20 +1823,20 @@ static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flag
return err;
err = -EINVAL;
- if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
+ if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(real_mount(path->mnt)))
goto unlock;
/* Refuse the same filesystem on the same mount point */
err = -EBUSY;
- if (path->mnt->mnt_sb == newmnt->mnt_sb &&
+ if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
path->mnt->mnt_root == path->dentry)
goto unlock;
err = -EINVAL;
- if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
+ if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
goto unlock;
- newmnt->mnt_flags = mnt_flags;
+ newmnt->mnt.mnt_flags = mnt_flags;
err = graft_tree(newmnt, path);
unlock:
@@ -2027,7 +1865,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
if (IS_ERR(mnt))
return PTR_ERR(mnt);
- err = do_add_mount(mnt, path, mnt_flags);
+ err = do_add_mount(real_mount(mnt), path, mnt_flags);
if (err)
mntput(mnt);
return err;
@@ -2035,11 +1873,12 @@ static int do_new_mount(struct path *path, char *type, int flags,
int finish_automount(struct vfsmount *m, struct path *path)
{
+ struct mount *mnt = real_mount(m);
int err;
/* The new mount record should have at least 2 refs to prevent it being
* expired before we get a chance to add it
*/
- BUG_ON(mnt_get_count(m) < 2);
+ BUG_ON(mnt_get_count(mnt) < 2);
if (m->mnt_sb == path->mnt->mnt_sb &&
m->mnt_root == path->dentry) {
@@ -2047,15 +1886,15 @@ int finish_automount(struct vfsmount *m, struct path *path)
goto fail;
}
- err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
+ err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
if (!err)
return 0;
fail:
/* remove m from any expiration list it may be on */
- if (!list_empty(&m->mnt_expire)) {
+ if (!list_empty(&mnt->mnt_expire)) {
down_write(&namespace_sem);
br_write_lock(vfsmount_lock);
- list_del_init(&m->mnt_expire);
+ list_del_init(&mnt->mnt_expire);
br_write_unlock(vfsmount_lock);
up_write(&namespace_sem);
}
@@ -2074,7 +1913,7 @@ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
down_write(&namespace_sem);
br_write_lock(vfsmount_lock);
- list_add_tail(&mnt->mnt_expire, expiry_list);
+ list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
br_write_unlock(vfsmount_lock);
up_write(&namespace_sem);
@@ -2088,7 +1927,7 @@ EXPORT_SYMBOL(mnt_set_expiry);
*/
void mark_mounts_for_expiry(struct list_head *mounts)
{
- struct vfsmount *mnt, *next;
+ struct mount *mnt, *next;
LIST_HEAD(graveyard);
LIST_HEAD(umounts);
@@ -2111,7 +1950,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
list_move(&mnt->mnt_expire, &graveyard);
}
while (!list_empty(&graveyard)) {
- mnt = list_first_entry(&graveyard, struct vfsmount, mnt_expire);
+ mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
touch_mnt_namespace(mnt->mnt_ns);
umount_tree(mnt, 1, &umounts);
}
@@ -2129,9 +1968,9 @@ EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
* search the list of submounts for a given mountpoint, and move any
* shrinkable submounts to the 'graveyard' list.
*/
-static int select_submounts(struct vfsmount *parent, struct list_head *graveyard)
+static int select_submounts(struct mount *parent, struct list_head *graveyard)
{
- struct vfsmount *this_parent = parent;
+ struct mount *this_parent = parent;
struct list_head *next;
int found = 0;
@@ -2140,10 +1979,10 @@ repeat:
resume:
while (next != &this_parent->mnt_mounts) {
struct list_head *tmp = next;
- struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child);
+ struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
next = tmp->next;
- if (!(mnt->mnt_flags & MNT_SHRINKABLE))
+ if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
continue;
/*
* Descend a level if the d_mounts list is non-empty.
@@ -2175,15 +2014,15 @@ resume:
*
* vfsmount_lock must be held for write
*/
-static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
+static void shrink_submounts(struct mount *mnt, struct list_head *umounts)
{
LIST_HEAD(graveyard);
- struct vfsmount *m;
+ struct mount *m;
/* extract submounts of 'mountpoint' from the expiration list */
while (select_submounts(mnt, &graveyard)) {
while (!list_empty(&graveyard)) {
- m = list_first_entry(&graveyard, struct vfsmount,
+ m = list_first_entry(&graveyard, struct mount,
mnt_expire);
touch_mnt_namespace(m->mnt_ns);
umount_tree(m, 1, umounts);
@@ -2370,12 +2209,13 @@ static struct mnt_namespace *alloc_mnt_ns(void)
void mnt_make_longterm(struct vfsmount *mnt)
{
- __mnt_make_longterm(mnt);
+ __mnt_make_longterm(real_mount(mnt));
}
-void mnt_make_shortterm(struct vfsmount *mnt)
+void mnt_make_shortterm(struct vfsmount *m)
{
#ifdef CONFIG_SMP
+ struct mount *mnt = real_mount(m);
if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
return;
br_write_lock(vfsmount_lock);
@@ -2393,7 +2233,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
{
struct mnt_namespace *new_ns;
struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
- struct vfsmount *p, *q;
+ struct mount *p, *q;
+ struct mount *old = mnt_ns->root;
+ struct mount *new;
new_ns = alloc_mnt_ns();
if (IS_ERR(new_ns))
@@ -2401,15 +2243,15 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
down_write(&namespace_sem);
/* First pass: copy the tree topology */
- new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root,
- CL_COPY_ALL | CL_EXPIRE);
- if (!new_ns->root) {
+ new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE);
+ if (!new) {
up_write(&namespace_sem);
kfree(new_ns);
return ERR_PTR(-ENOMEM);
}
+ new_ns->root = new;
br_write_lock(vfsmount_lock);
- list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
+ list_add_tail(&new_ns->list, &new->mnt_list);
br_write_unlock(vfsmount_lock);
/*
@@ -2417,27 +2259,27 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
* as belonging to new namespace. We have already acquired a private
* fs_struct, so tsk->fs->lock is not needed.
*/
- p = mnt_ns->root;
- q = new_ns->root;
+ p = old;
+ q = new;
while (p) {
q->mnt_ns = new_ns;
__mnt_make_longterm(q);
if (fs) {
- if (p == fs->root.mnt) {
- fs->root.mnt = mntget(q);
+ if (&p->mnt == fs->root.mnt) {
+ fs->root.mnt = mntget(&q->mnt);
__mnt_make_longterm(q);
- mnt_make_shortterm(p);
- rootmnt = p;
+ mnt_make_shortterm(&p->mnt);
+ rootmnt = &p->mnt;
}
- if (p == fs->pwd.mnt) {
- fs->pwd.mnt = mntget(q);
+ if (&p->mnt == fs->pwd.mnt) {
+ fs->pwd.mnt = mntget(&q->mnt);
__mnt_make_longterm(q);
- mnt_make_shortterm(p);
- pwdmnt = p;
+ mnt_make_shortterm(&p->mnt);
+ pwdmnt = &p->mnt;
}
}
- p = next_mnt(p, mnt_ns->root);
- q = next_mnt(q, new_ns->root);
+ p = next_mnt(p, old);
+ q = next_mnt(q, new);
}
up_write(&namespace_sem);
@@ -2470,22 +2312,20 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
* create_mnt_ns - creates a private namespace and adds a root filesystem
* @mnt: pointer to the new root filesystem mountpoint
*/
-struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
+static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
{
- struct mnt_namespace *new_ns;
-
- new_ns = alloc_mnt_ns();
+ struct mnt_namespace *new_ns = alloc_mnt_ns();
if (!IS_ERR(new_ns)) {
+ struct mount *mnt = real_mount(m);
mnt->mnt_ns = new_ns;
__mnt_make_longterm(mnt);
new_ns->root = mnt;
- list_add(&new_ns->list, &new_ns->root->mnt_list);
+ list_add(&new_ns->list, &mnt->mnt_list);
} else {
- mntput(mnt);
+ mntput(m);
}
return new_ns;
}
-EXPORT_SYMBOL(create_mnt_ns);
struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
{
@@ -2559,6 +2399,31 @@ out_type:
}
/*
+ * Return true if path is reachable from root
+ *
+ * namespace_sem or vfsmount_lock is held
+ */
+bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
+ const struct path *root)
+{
+ while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
+ dentry = mnt->mnt_mountpoint;
+ mnt = mnt->mnt_parent;
+ }
+ return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
+}
+
+int path_is_under(struct path *path1, struct path *path2)
+{
+ int res;
+ br_read_lock(vfsmount_lock);
+ res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
+ br_read_unlock(vfsmount_lock);
+ return res;
+}
+EXPORT_SYMBOL(path_is_under);
+
+/*
* pivot_root Semantics:
* Moves the root file system of the current process to the directory put_old,
* makes new_root as the new root file system of the current process, and sets
@@ -2586,8 +2451,8 @@ out_type:
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
const char __user *, put_old)
{
- struct vfsmount *tmp;
struct path new, old, parent_path, root_parent, root;
+ struct mount *new_mnt, *root_mnt;
int error;
if (!capable(CAP_SYS_ADMIN))
@@ -2611,11 +2476,13 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
goto out3;
error = -EINVAL;
- if (IS_MNT_SHARED(old.mnt) ||
- IS_MNT_SHARED(new.mnt->mnt_parent) ||
- IS_MNT_SHARED(root.mnt->mnt_parent))
+ new_mnt = real_mount(new.mnt);
+ root_mnt = real_mount(root.mnt);
+ if (IS_MNT_SHARED(real_mount(old.mnt)) ||
+ IS_MNT_SHARED(new_mnt->mnt_parent) ||
+ IS_MNT_SHARED(root_mnt->mnt_parent))
goto out4;
- if (!check_mnt(root.mnt) || !check_mnt(new.mnt))
+ if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
goto out4;
error = -ENOENT;
if (d_unlinked(new.dentry))
@@ -2629,33 +2496,22 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
error = -EINVAL;
if (root.mnt->mnt_root != root.dentry)
goto out4; /* not a mountpoint */
- if (root.mnt->mnt_parent == root.mnt)
+ if (!mnt_has_parent(root_mnt))
goto out4; /* not attached */
if (new.mnt->mnt_root != new.dentry)
goto out4; /* not a mountpoint */
- if (new.mnt->mnt_parent == new.mnt)
+ if (!mnt_has_parent(new_mnt))
goto out4; /* not attached */
/* make sure we can reach put_old from new_root */
- tmp = old.mnt;
- if (tmp != new.mnt) {
- for (;;) {
- if (tmp->mnt_parent == tmp)
- goto out4; /* already mounted on put_old */
- if (tmp->mnt_parent == new.mnt)
- break;
- tmp = tmp->mnt_parent;
- }
- if (!is_subdir(tmp->mnt_mountpoint, new.dentry))
- goto out4;
- } else if (!is_subdir(old.dentry, new.dentry))
+ if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new))
goto out4;
br_write_lock(vfsmount_lock);
- detach_mnt(new.mnt, &parent_path);
- detach_mnt(root.mnt, &root_parent);
+ detach_mnt(new_mnt, &parent_path);
+ detach_mnt(root_mnt, &root_parent);
/* mount old root on put_old */
- attach_mnt(root.mnt, &old);
+ attach_mnt(root_mnt, &old);
/* mount new_root on / */
- attach_mnt(new.mnt, &root_parent);
+ attach_mnt(new_mnt, &root_parent);
touch_mnt_namespace(current->nsproxy->mnt_ns);
br_write_unlock(vfsmount_lock);
chroot_fs_refs(&root, &new);
@@ -2693,8 +2549,8 @@ static void __init init_mount_tree(void)
init_task.nsproxy->mnt_ns = ns;
get_mnt_ns(ns);
- root.mnt = ns->root;
- root.dentry = ns->root->mnt_root;
+ root.mnt = mnt;
+ root.dentry = mnt->mnt_root;
set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);
@@ -2707,7 +2563,7 @@ void __init mnt_init(void)
init_rwsem(&namespace_sem);
- mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
+ mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
@@ -2747,7 +2603,6 @@ void put_mnt_ns(struct mnt_namespace *ns)
release_mounts(&umount_list);
kfree(ns);
}
-EXPORT_SYMBOL(put_mnt_ns);
struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
{
@@ -2776,5 +2631,5 @@ EXPORT_SYMBOL(kern_unmount);
bool our_mnt(struct vfsmount *mnt)
{
- return check_mnt(mnt);
+ return check_mnt(real_mount(mnt));
}
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 9c51f621e901..aeed93a6bde0 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -30,15 +30,15 @@ static void ncp_do_readdir(struct file *, void *, filldir_t,
static int ncp_readdir(struct file *, void *, filldir_t);
-static int ncp_create(struct inode *, struct dentry *, int, struct nameidata *);
+static int ncp_create(struct inode *, struct dentry *, umode_t, struct nameidata *);
static struct dentry *ncp_lookup(struct inode *, struct dentry *, struct nameidata *);
static int ncp_unlink(struct inode *, struct dentry *);
-static int ncp_mkdir(struct inode *, struct dentry *, int);
+static int ncp_mkdir(struct inode *, struct dentry *, umode_t);
static int ncp_rmdir(struct inode *, struct dentry *);
static int ncp_rename(struct inode *, struct dentry *,
struct inode *, struct dentry *);
static int ncp_mknod(struct inode * dir, struct dentry *dentry,
- int mode, dev_t rdev);
+ umode_t mode, dev_t rdev);
#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
extern int ncp_symlink(struct inode *, struct dentry *, const char *);
#else
@@ -919,7 +919,7 @@ out_close:
goto out;
}
-int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
+int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev, __le32 attributes)
{
struct ncp_server *server = NCP_SERVER(dir);
@@ -928,7 +928,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
int opmode;
__u8 __name[NCP_MAXPATHLEN + 1];
- PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n",
+ PPRINTK("ncp_create_new: creating %s/%s, mode=%hx\n",
dentry->d_parent->d_name.name, dentry->d_name.name, mode);
ncp_age_dentry(server, dentry);
@@ -979,13 +979,13 @@ out:
return error;
}
-static int ncp_create(struct inode *dir, struct dentry *dentry, int mode,
+static int ncp_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
return ncp_create_new(dir, dentry, mode, 0, 0);
}
-static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct ncp_entry_info finfo;
struct ncp_server *server = NCP_SERVER(dir);
@@ -1201,12 +1201,12 @@ out:
}
static int ncp_mknod(struct inode * dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
if (!new_valid_dev(rdev))
return -EINVAL;
if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) {
- DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%o\n", mode);
+ DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%ho\n", mode);
return ncp_create_new(dir, dentry, mode, rdev, 0);
}
return -EPERM; /* Strange, but true */
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index cbd1a61c110a..3d1e34f8a68e 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -44,7 +44,7 @@
static void ncp_evict_inode(struct inode *);
static void ncp_put_super(struct super_block *);
static int ncp_statfs(struct dentry *, struct kstatfs *);
-static int ncp_show_options(struct seq_file *, struct vfsmount *);
+static int ncp_show_options(struct seq_file *, struct dentry *);
static struct kmem_cache * ncp_inode_cachep;
@@ -60,7 +60,6 @@ static struct inode *ncp_alloc_inode(struct super_block *sb)
static void ncp_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode));
}
@@ -323,9 +322,9 @@ static void ncp_stop_tasks(struct ncp_server *server) {
flush_work_sync(&server->timeout_tq);
}
-static int ncp_show_options(struct seq_file *seq, struct vfsmount *mnt)
+static int ncp_show_options(struct seq_file *seq, struct dentry *root)
{
- struct ncp_server *server = NCP_SBP(mnt->mnt_sb);
+ struct ncp_server *server = NCP_SBP(root->d_sb);
unsigned int tmp;
if (server->m.uid != 0)
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 790e92a9ec63..6958adfaff08 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -901,7 +901,7 @@ long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
ret = __ncp_ioctl(inode, cmd, arg);
outDropWrite:
if (need_drop_write)
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
out:
return ret;
}
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 09881e6aa5ad..32c06587351a 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -114,7 +114,7 @@ int ncp_dirhandle_alloc(struct ncp_server *, __u8 vol, __le32 dirent, __u8 *dirh
int ncp_dirhandle_free(struct ncp_server *, __u8 dirhandle);
int ncp_create_new(struct inode *dir, struct dentry *dentry,
- int mode, dev_t rdev, __le32 attributes);
+ umode_t mode, dev_t rdev, __le32 attributes);
static inline int ncp_is_nfs_extras(struct ncp_server* server, unsigned int volnum) {
#ifdef CONFIG_NCPFS_NFS_NS
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index 661f861d80c6..52439ddc8de0 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -108,7 +108,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) {
char *rawlink;
int length, err, i, outlen;
int kludge;
- int mode;
+ umode_t mode;
__le32 attr;
unsigned int hdr;
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 281ae95932c9..48cfac31f64c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -90,9 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
*/
struct parallel_io {
struct kref refcnt;
- struct rpc_call_ops call_ops;
- void (*pnfs_callback) (void *data);
+ void (*pnfs_callback) (void *data, int num_se);
void *data;
+ int bse_count;
};
static inline struct parallel_io *alloc_parallel(void *data)
@@ -103,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data)
if (rv) {
rv->data = data;
kref_init(&rv->refcnt);
+ rv->bse_count = 0;
}
return rv;
}
@@ -117,7 +118,7 @@ static void destroy_parallel(struct kref *kref)
struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
dprintk("%s enter\n", __func__);
- p->pnfs_callback(p->data);
+ p->pnfs_callback(p->data, p->bse_count);
kfree(p);
}
@@ -146,14 +147,19 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
{
struct bio *bio;
+ npg = min(npg, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOIO, npg);
- if (!bio)
- return NULL;
+ if (!bio && (current->flags & PF_MEMALLOC)) {
+ while (!bio && (npg /= 2))
+ bio = bio_alloc(GFP_NOIO, npg);
+ }
- bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
- bio->bi_bdev = be->be_mdev;
- bio->bi_end_io = end_io;
- bio->bi_private = par;
+ if (bio) {
+ bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+ bio->bi_bdev = be->be_mdev;
+ bio->bi_end_io = end_io;
+ bio->bi_private = par;
+ }
return bio;
}
@@ -212,22 +218,15 @@ static void bl_read_cleanup(struct work_struct *work)
}
static void
-bl_end_par_io_read(void *data)
+bl_end_par_io_read(void *data, int unused)
{
struct nfs_read_data *rdata = data;
+ rdata->task.tk_status = rdata->pnfs_error;
INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
schedule_work(&rdata->task.u.tk_work);
}
-/* We don't want normal .rpc_call_done callback used, so we replace it
- * with this stub.
- */
-static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
-{
- return;
-}
-
static enum pnfs_try_status
bl_read_pagelist(struct nfs_read_data *rdata)
{
@@ -247,8 +246,6 @@ bl_read_pagelist(struct nfs_read_data *rdata)
par = alloc_parallel(rdata);
if (!par)
goto use_mds;
- par->call_ops = *rdata->mds_ops;
- par->call_ops.rpc_call_done = bl_rpc_do_nothing;
par->pnfs_callback = bl_end_par_io_read;
/* At this point, we can no longer jump to use_mds */
@@ -322,6 +319,7 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
{
sector_t isect, end;
struct pnfs_block_extent *be;
+ struct pnfs_block_short_extent *se;
dprintk("%s(%llu, %u)\n", __func__, offset, count);
if (count == 0)
@@ -334,8 +332,11 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
be = bl_find_get_extent(bl, isect, NULL);
BUG_ON(!be); /* FIXME */
len = min(end, be->be_f_offset + be->be_length) - isect;
- if (be->be_state == PNFS_BLOCK_INVALID_DATA)
- bl_mark_for_commit(be, isect, len); /* What if fails? */
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ se = bl_pop_one_short_extent(be->be_inval);
+ BUG_ON(!se);
+ bl_mark_for_commit(be, isect, len, se);
+ }
isect += len;
bl_put_extent(be);
}
@@ -357,7 +358,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
end_page_writeback(page);
page_cache_release(page);
} while (bvec >= bio->bi_io_vec);
- if (!uptodate) {
+
+ if (unlikely(!uptodate)) {
if (!wdata->pnfs_error)
wdata->pnfs_error = -EIO;
pnfs_set_lo_fail(wdata->lseg);
@@ -366,7 +368,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
put_parallel(par);
}
-/* This is basically copied from mpage_end_io_read */
static void bl_end_io_write(struct bio *bio, int err)
{
struct parallel_io *par = bio->bi_private;
@@ -392,7 +393,7 @@ static void bl_write_cleanup(struct work_struct *work)
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
wdata = container_of(task, struct nfs_write_data, task);
- if (!wdata->pnfs_error) {
+ if (likely(!wdata->pnfs_error)) {
/* Marks for LAYOUTCOMMIT */
mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
wdata->args.offset, wdata->args.count);
@@ -401,11 +402,16 @@ static void bl_write_cleanup(struct work_struct *work)
}
/* Called when last of bios associated with a bl_write_pagelist call finishes */
-static void bl_end_par_io_write(void *data)
+static void bl_end_par_io_write(void *data, int num_se)
{
struct nfs_write_data *wdata = data;
- wdata->task.tk_status = 0;
+ if (unlikely(wdata->pnfs_error)) {
+ bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
+ num_se);
+ }
+
+ wdata->task.tk_status = wdata->pnfs_error;
wdata->verf.committed = NFS_FILE_SYNC;
INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
schedule_work(&wdata->task.u.tk_work);
@@ -484,6 +490,55 @@ cleanup:
return ret;
}
+/* Find or create a zeroing page marked being writeback.
+ * Return ERR_PTR on error, NULL to indicate skip this page and page itself
+ * to indicate write out.
+ */
+static struct page *
+bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
+ struct pnfs_block_extent *cow_read)
+{
+ struct page *page;
+ int locked = 0;
+ page = find_get_page(inode->i_mapping, index);
+ if (page)
+ goto check_page;
+
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (unlikely(!page)) {
+ dprintk("%s oom\n", __func__);
+ return ERR_PTR(-ENOMEM);
+ }
+ locked = 1;
+
+check_page:
+ /* PageDirty: Other will write this out
+ * PageWriteback: Other is writing this out
+ * PageUptodate: It was read before
+ */
+ if (PageDirty(page) || PageWriteback(page)) {
+ print_page(page);
+ if (locked)
+ unlock_page(page);
+ page_cache_release(page);
+ return NULL;
+ }
+
+ if (!locked) {
+ lock_page(page);
+ locked = 1;
+ goto check_page;
+ }
+ if (!PageUptodate(page)) {
+ /* New page, readin or zero it */
+ init_page_for_write(page, cow_read);
+ }
+ set_page_writeback(page);
+ unlock_page(page);
+
+ return page;
+}
+
static enum pnfs_try_status
bl_write_pagelist(struct nfs_write_data *wdata, int sync)
{
@@ -508,9 +563,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
*/
par = alloc_parallel(wdata);
if (!par)
- return PNFS_NOT_ATTEMPTED;
- par->call_ops = *wdata->mds_ops;
- par->call_ops.rpc_call_done = bl_rpc_do_nothing;
+ goto out_mds;
par->pnfs_callback = bl_end_par_io_write;
/* At this point, have to be more careful with error handling */
@@ -518,12 +571,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
if (!be || !is_writable(be, isect)) {
dprintk("%s no matching extents!\n", __func__);
- wdata->pnfs_error = -EINVAL;
- goto out;
+ goto out_mds;
}
/* First page inside INVALID extent */
if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ if (likely(!bl_push_one_short_extent(be->be_inval)))
+ par->bse_count++;
+ else
+ goto out_mds;
temp = offset >> PAGE_CACHE_SHIFT;
npg_zero = do_div(temp, npg_per_block);
isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
@@ -543,36 +599,16 @@ fill_invalid_ext:
dprintk("%s zero %dth page: index %lu isect %llu\n",
__func__, npg_zero, index,
(unsigned long long)isect);
- page =
- find_or_create_page(wdata->inode->i_mapping, index,
- GFP_NOFS);
- if (!page) {
- dprintk("%s oom\n", __func__);
- wdata->pnfs_error = -ENOMEM;
+ page = bl_find_get_zeroing_page(wdata->inode, index,
+ cow_read);
+ if (unlikely(IS_ERR(page))) {
+ wdata->pnfs_error = PTR_ERR(page);
goto out;
- }
-
- /* PageDirty: Other will write this out
- * PageWriteback: Other is writing this out
- * PageUptodate: It was read before
- * sector_initialized: already written out
- */
- if (PageDirty(page) || PageWriteback(page)) {
- print_page(page);
- unlock_page(page);
- page_cache_release(page);
+ } else if (page == NULL)
goto next_page;
- }
- if (!PageUptodate(page)) {
- /* New page, readin or zero it */
- init_page_for_write(page, cow_read);
- }
- set_page_writeback(page);
- unlock_page(page);
ret = bl_mark_sectors_init(be->be_inval, isect,
- PAGE_CACHE_SECTORS,
- NULL);
+ PAGE_CACHE_SECTORS);
if (unlikely(ret)) {
dprintk("%s bl_mark_sectors_init fail %d\n",
__func__, ret);
@@ -581,6 +617,19 @@ fill_invalid_ext:
wdata->pnfs_error = ret;
goto out;
}
+ if (likely(!bl_push_one_short_extent(be->be_inval)))
+ par->bse_count++;
+ else {
+ end_page_writeback(page);
+ page_cache_release(page);
+ wdata->pnfs_error = -ENOMEM;
+ goto out;
+ }
+ /* FIXME: This should be done in bi_end_io */
+ mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+ page->index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE);
+
bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
isect, page, be,
bl_end_io_write_zero, par);
@@ -589,10 +638,6 @@ fill_invalid_ext:
bio = NULL;
goto out;
}
- /* FIXME: This should be done in bi_end_io */
- mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
- page->index << PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE);
next_page:
isect += PAGE_CACHE_SECTORS;
extent_length -= PAGE_CACHE_SECTORS;
@@ -616,13 +661,21 @@ next_page:
wdata->pnfs_error = -EINVAL;
goto out;
}
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ if (likely(!bl_push_one_short_extent(
+ be->be_inval)))
+ par->bse_count++;
+ else {
+ wdata->pnfs_error = -ENOMEM;
+ goto out;
+ }
+ }
extent_length = be->be_length -
(isect - be->be_f_offset);
}
if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
ret = bl_mark_sectors_init(be->be_inval, isect,
- PAGE_CACHE_SECTORS,
- NULL);
+ PAGE_CACHE_SECTORS);
if (unlikely(ret)) {
dprintk("%s bl_mark_sectors_init fail %d\n",
__func__, ret);
@@ -664,6 +717,10 @@ out:
bl_submit_bio(WRITE, bio);
put_parallel(par);
return PNFS_ATTEMPTED;
+out_mds:
+ bl_put_extent(be);
+ kfree(par);
+ return PNFS_NOT_ATTEMPTED;
}
/* FIXME - range ignored */
@@ -690,11 +747,17 @@ static void
release_inval_marks(struct pnfs_inval_markings *marks)
{
struct pnfs_inval_tracking *pos, *temp;
+ struct pnfs_block_short_extent *se, *stemp;
list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
list_del(&pos->it_link);
kfree(pos);
}
+
+ list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
+ list_del(&se->bse_node);
+ kfree(se);
+ }
return;
}
@@ -779,16 +842,13 @@ bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
static void free_blk_mountid(struct block_mount_id *mid)
{
if (mid) {
- struct pnfs_block_dev *dev;
- spin_lock(&mid->bm_lock);
- while (!list_empty(&mid->bm_devlist)) {
- dev = list_first_entry(&mid->bm_devlist,
- struct pnfs_block_dev,
- bm_node);
+ struct pnfs_block_dev *dev, *tmp;
+
+ /* No need to take bm_lock as we are last user freeing bm_devlist */
+ list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
list_del(&dev->bm_node);
bl_free_block_dev(dev);
}
- spin_unlock(&mid->bm_lock);
kfree(mid);
}
}
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 42acf7ef5992..e31a2df28e70 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -70,6 +70,7 @@ struct pnfs_inval_markings {
spinlock_t im_lock;
struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
sector_t im_block_size; /* Server blocksize in sectors */
+ struct list_head im_extents; /* Short extents for INVAL->RW conversion */
};
struct pnfs_inval_tracking {
@@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
{
spin_lock_init(&marks->im_lock);
INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
+ INIT_LIST_HEAD(&marks->im_extents);
marks->im_block_size = blocksize;
marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
blocksize);
@@ -186,8 +188,7 @@ struct pnfs_block_extent *
bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
struct pnfs_block_extent **cow_read);
int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
- sector_t offset, sector_t length,
- sector_t **pages);
+ sector_t offset, sector_t length);
void bl_put_extent(struct pnfs_block_extent *be);
struct pnfs_block_extent *bl_alloc_extent(void);
int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
@@ -200,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
int bl_add_merge_extent(struct pnfs_block_layout *bl,
struct pnfs_block_extent *new);
int bl_mark_for_commit(struct pnfs_block_extent *be,
- sector_t offset, sector_t length);
+ sector_t offset, sector_t length,
+ struct pnfs_block_short_extent *new);
+int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
+struct pnfs_block_short_extent *
+bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
+void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 19fa7b0b8c00..1abac09f7cd5 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -110,13 +110,7 @@ static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
return 0;
} else {
struct pnfs_inval_tracking *new;
- if (storage)
- new = storage;
- else {
- new = kmalloc(sizeof(*new), GFP_NOFS);
- if (!new)
- return -ENOMEM;
- }
+ new = storage;
new->it_sector = s;
new->it_tags = (1 << tag);
list_add(&new->it_link, &pos->it_link);
@@ -139,11 +133,13 @@ static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
}
/* Ensure that future operations on given range of tree will not malloc */
-static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
+static int _preload_range(struct pnfs_inval_markings *marks,
+ u64 offset, u64 length)
{
u64 start, end, s;
int count, i, used = 0, status = -ENOMEM;
struct pnfs_inval_tracking **storage;
+ struct my_tree *tree = &marks->im_tree;
dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
start = normalize(offset, tree->mtt_step_size);
@@ -161,12 +157,11 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
goto out_cleanup;
}
- /* Now need lock - HOW??? */
-
+ spin_lock_bh(&marks->im_lock);
for (s = start; s < end; s += tree->mtt_step_size)
used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
+ spin_unlock_bh(&marks->im_lock);
- /* Unlock - HOW??? */
status = 0;
out_cleanup:
@@ -179,41 +174,14 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
return status;
}
-static void set_needs_init(sector_t *array, sector_t offset)
-{
- sector_t *p = array;
-
- dprintk("%s enter\n", __func__);
- if (!p)
- return;
- while (*p < offset)
- p++;
- if (*p == offset)
- return;
- else if (*p == ~0) {
- *p++ = offset;
- *p = ~0;
- return;
- } else {
- sector_t *save = p;
- dprintk("%s Adding %llu\n", __func__, (u64)offset);
- while (*p != ~0)
- p++;
- p++;
- memmove(save + 1, save, (char *)p - (char *)save);
- *save = offset;
- return;
- }
-}
-
/* We are relying on page lock to serialize this */
int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
{
int rv;
- spin_lock(&marks->im_lock);
+ spin_lock_bh(&marks->im_lock);
rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
- spin_unlock(&marks->im_lock);
+ spin_unlock_bh(&marks->im_lock);
return rv;
}
@@ -253,78 +221,39 @@ static int is_range_written(struct pnfs_inval_markings *marks,
{
int rv;
- spin_lock(&marks->im_lock);
+ spin_lock_bh(&marks->im_lock);
rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
- spin_unlock(&marks->im_lock);
+ spin_unlock_bh(&marks->im_lock);
return rv;
}
/* Marks sectors in [offest, offset_length) as having been initialized.
* All lengths are step-aligned, where step is min(pagesize, blocksize).
- * Notes where partial block is initialized, and helps prepare it for
- * complete initialization later.
+ * Currently assumes offset is page-aligned
*/
-/* Currently assumes offset is page-aligned */
int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
- sector_t offset, sector_t length,
- sector_t **pages)
+ sector_t offset, sector_t length)
{
- sector_t s, start, end;
- sector_t *array = NULL; /* Pages to mark */
+ sector_t start, end;
dprintk("%s(offset=%llu,len=%llu) enter\n",
__func__, (u64)offset, (u64)length);
- s = max((sector_t) 3,
- 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
- dprintk("%s set max=%llu\n", __func__, (u64)s);
- if (pages) {
- array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
- if (!array)
- goto outerr;
- array[0] = ~0;
- }
start = normalize(offset, marks->im_block_size);
end = normalize_up(offset + length, marks->im_block_size);
- if (_preload_range(&marks->im_tree, start, end - start))
+ if (_preload_range(marks, start, end - start))
goto outerr;
- spin_lock(&marks->im_lock);
-
- for (s = normalize_up(start, PAGE_CACHE_SECTORS);
- s < offset; s += PAGE_CACHE_SECTORS) {
- dprintk("%s pre-area pages\n", __func__);
- /* Portion of used block is not initialized */
- if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
- set_needs_init(array, s);
- }
+ spin_lock_bh(&marks->im_lock);
if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
goto out_unlock;
- for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
- s < end; s += PAGE_CACHE_SECTORS) {
- dprintk("%s post-area pages\n", __func__);
- if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
- set_needs_init(array, s);
- }
-
- spin_unlock(&marks->im_lock);
+ spin_unlock_bh(&marks->im_lock);
- if (pages) {
- if (array[0] == ~0) {
- kfree(array);
- *pages = NULL;
- } else
- *pages = array;
- }
return 0;
- out_unlock:
- spin_unlock(&marks->im_lock);
- outerr:
- if (pages) {
- kfree(array);
- *pages = NULL;
- }
+out_unlock:
+ spin_unlock_bh(&marks->im_lock);
+outerr:
return -ENOMEM;
}
@@ -338,9 +267,9 @@ static int mark_written_sectors(struct pnfs_inval_markings *marks,
dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
(u64)offset, (u64)length);
- spin_lock(&marks->im_lock);
+ spin_lock_bh(&marks->im_lock);
status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
- spin_unlock(&marks->im_lock);
+ spin_unlock_bh(&marks->im_lock);
return status;
}
@@ -440,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout *bl,
/* Note the range described by offset, length is guaranteed to be contained
* within be.
+ * new will be freed, either by this function or add_to_commitlist if they
+ * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
*/
int bl_mark_for_commit(struct pnfs_block_extent *be,
- sector_t offset, sector_t length)
+ sector_t offset, sector_t length,
+ struct pnfs_block_short_extent *new)
{
sector_t new_end, end = offset + length;
- struct pnfs_block_short_extent *new;
struct pnfs_block_layout *bl = container_of(be->be_inval,
struct pnfs_block_layout,
bl_inval);
- new = kmalloc(sizeof(*new), GFP_NOFS);
- if (!new)
- return -ENOMEM;
-
mark_written_sectors(be->be_inval, offset, length);
/* We want to add the range to commit list, but it must be
* block-normalized, and verified that the normalized range has
@@ -483,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be,
new->bse_mdev = be->be_mdev;
spin_lock(&bl->bl_ext_lock);
- /* new will be freed, either by add_to_commitlist if it decides not
- * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
- */
add_to_commitlist(bl, new);
spin_unlock(&bl->bl_ext_lock);
return 0;
@@ -933,3 +857,53 @@ clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
}
}
}
+
+int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
+{
+ struct pnfs_block_short_extent *new;
+
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (unlikely(!new))
+ return -ENOMEM;
+
+ spin_lock_bh(&marks->im_lock);
+ list_add(&new->bse_node, &marks->im_extents);
+ spin_unlock_bh(&marks->im_lock);
+
+ return 0;
+}
+
+struct pnfs_block_short_extent *
+bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
+{
+ struct pnfs_block_short_extent *rv = NULL;
+
+ spin_lock_bh(&marks->im_lock);
+ if (!list_empty(&marks->im_extents)) {
+ rv = list_entry((&marks->im_extents)->next,
+ struct pnfs_block_short_extent, bse_node);
+ list_del_init(&rv->bse_node);
+ }
+ spin_unlock_bh(&marks->im_lock);
+
+ return rv;
+}
+
+void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
+{
+ struct pnfs_block_short_extent *se = NULL, *tmp;
+
+ if (num_to_free <= 0)
+ return;
+
+ spin_lock(&marks->im_lock);
+ list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
+ list_del(&se->bse_node);
+ kfree(se);
+ if (--num_to_free == 0)
+ break;
+ }
+ spin_unlock(&marks->im_lock);
+
+ BUG_ON(num_to_free > 0);
+}
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 07df5f1d85e5..c89d3b9e483c 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -162,7 +162,7 @@ struct cb_layoutrecallargs {
};
};
-extern unsigned nfs4_callback_layoutrecall(
+extern __be32 nfs4_callback_layoutrecall(
struct cb_layoutrecallargs *args,
void *dummy, struct cb_process_state *cps);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 43926add945b..54cea8ad5a76 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -339,7 +339,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
dprintk("%s enter. slotid %d seqid %d\n",
__func__, args->csa_slotid, args->csa_sequenceid);
- if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS)
+ if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS)
return htonl(NFS4ERR_BADSLOT);
slot = tbl->slots + args->csa_slotid;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 726e59a9e50f..d50b2742f23b 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -305,6 +305,10 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
n = ntohl(*p++);
if (n <= 0)
goto out;
+ if (n > ULONG_MAX / sizeof(*args->devs)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
if (!args->devs) {
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 873bf00d51a2..31778f74357d 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -84,7 +84,7 @@ retry:
/*
* Turn off NFSv4 uid/gid mapping when using AUTH_SYS
*/
-static int nfs4_disable_idmapping = 0;
+static bool nfs4_disable_idmapping = true;
/*
* RPC cruft for NFS
@@ -185,7 +185,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
clp->cl_minorversion = cl_init->minorversion;
clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
#endif
- cred = rpc_lookup_machine_cred();
+ cred = rpc_lookup_machine_cred("*");
if (!IS_ERR(cred))
clp->cl_machine_cred = cred;
nfs_fscache_get_client_cookie(clp);
@@ -250,6 +250,11 @@ static void pnfs_init_server(struct nfs_server *server)
rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
}
+static void nfs4_destroy_server(struct nfs_server *server)
+{
+ nfs4_purge_state_owners(server);
+}
+
#else
static void nfs4_shutdown_client(struct nfs_client *clp)
{
@@ -1065,6 +1070,7 @@ static struct nfs_server *nfs_alloc_server(void)
INIT_LIST_HEAD(&server->master_link);
INIT_LIST_HEAD(&server->delegations);
INIT_LIST_HEAD(&server->layouts);
+ INIT_LIST_HEAD(&server->state_owners_lru);
atomic_set(&server->active, 0);
@@ -1538,6 +1544,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
nfs_server_insert_lists(server);
server->mount_time = jiffies;
+ server->destroy = nfs4_destroy_server;
out:
nfs_free_fattr(fattr);
return error;
@@ -1719,6 +1726,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
/* Copy data from the source */
server->nfs_client = source->nfs_client;
+ server->destroy = source->destroy;
atomic_inc(&server->nfs_client->cl_count);
nfs_server_copy_userdata(server, source);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ac2899098147..fd9a872fada0 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -47,13 +47,13 @@ static int nfs_opendir(struct inode *, struct file *);
static int nfs_closedir(struct inode *, struct file *);
static int nfs_readdir(struct file *, void *, filldir_t);
static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *);
-static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *);
-static int nfs_mkdir(struct inode *, struct dentry *, int);
+static int nfs_create(struct inode *, struct dentry *, umode_t, struct nameidata *);
+static int nfs_mkdir(struct inode *, struct dentry *, umode_t);
static int nfs_rmdir(struct inode *, struct dentry *);
static int nfs_unlink(struct inode *, struct dentry *);
static int nfs_symlink(struct inode *, struct dentry *, const char *);
static int nfs_link(struct dentry *, struct inode *, struct dentry *);
-static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
+static int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
static int nfs_rename(struct inode *, struct dentry *,
struct inode *, struct dentry *);
static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
@@ -112,7 +112,7 @@ const struct inode_operations nfs3_dir_inode_operations = {
#ifdef CONFIG_NFS_V4
static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *);
-static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd);
+static int nfs_open_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd);
const struct inode_operations nfs4_dir_inode_operations = {
.create = nfs_open_create,
.lookup = nfs_atomic_lookup,
@@ -1368,18 +1368,7 @@ static fmode_t flags_to_mode(int flags)
static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags)
{
- struct nfs_open_context *ctx;
- struct rpc_cred *cred;
- fmode_t fmode = flags_to_mode(open_flags);
-
- cred = rpc_lookup_cred();
- if (IS_ERR(cred))
- return ERR_CAST(cred);
- ctx = alloc_nfs_open_context(dentry, cred, fmode);
- put_rpccred(cred);
- if (ctx == NULL)
- return ERR_PTR(-ENOMEM);
- return ctx;
+ return alloc_nfs_open_context(dentry, flags_to_mode(open_flags));
}
static int do_open(struct inode *inode, struct file *filp)
@@ -1584,8 +1573,8 @@ no_open:
return nfs_lookup_revalidate(dentry, nd);
}
-static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
- struct nameidata *nd)
+static int nfs_open_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, struct nameidata *nd)
{
struct nfs_open_context *ctx = NULL;
struct iattr attr;
@@ -1675,8 +1664,8 @@ out_error:
* that the operation succeeded on the server, but an error in the
* reply path made it appear to have failed.
*/
-static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
- struct nameidata *nd)
+static int nfs_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, struct nameidata *nd)
{
struct iattr attr;
int error;
@@ -1704,7 +1693,7 @@ out_err:
* See comments for nfs_proc_create regarding failed operations.
*/
static int
-nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
+nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct iattr attr;
int status;
@@ -1730,7 +1719,7 @@ out_err:
/*
* See comments for nfs_proc_create regarding failed operations.
*/
-static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct iattr attr;
int error;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 606ef0f20aed..c43a452f7da2 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -272,13 +272,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
datasync);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
mutex_lock(&inode->i_mutex);
nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
status = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (status >= 0 && ret < 0)
+ status = ret;
have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
if (have_error)
ret = xchg(&ctx->error, 0);
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 47d1c6ff2d8e..2c05f1991e1e 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -38,6 +38,89 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/nfs_idmap.h>
+#include <linux/nfs_fs.h>
+
+/**
+ * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
+ * @fattr: fully initialised struct nfs_fattr
+ * @owner_name: owner name string cache
+ * @group_name: group name string cache
+ */
+void nfs_fattr_init_names(struct nfs_fattr *fattr,
+ struct nfs4_string *owner_name,
+ struct nfs4_string *group_name)
+{
+ fattr->owner_name = owner_name;
+ fattr->group_name = group_name;
+}
+
+static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr)
+{
+ fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME;
+ kfree(fattr->owner_name->data);
+}
+
+static void nfs_fattr_free_group_name(struct nfs_fattr *fattr)
+{
+ fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME;
+ kfree(fattr->group_name->data);
+}
+
+static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+ struct nfs4_string *owner = fattr->owner_name;
+ __u32 uid;
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME))
+ return false;
+ if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) {
+ fattr->uid = uid;
+ fattr->valid |= NFS_ATTR_FATTR_OWNER;
+ }
+ return true;
+}
+
+static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+ struct nfs4_string *group = fattr->group_name;
+ __u32 gid;
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME))
+ return false;
+ if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) {
+ fattr->gid = gid;
+ fattr->valid |= NFS_ATTR_FATTR_GROUP;
+ }
+ return true;
+}
+
+/**
+ * nfs_fattr_free_names - free up the NFSv4 owner and group strings
+ * @fattr: a fully initialised nfs_fattr structure
+ */
+void nfs_fattr_free_names(struct nfs_fattr *fattr)
+{
+ if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)
+ nfs_fattr_free_owner_name(fattr);
+ if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)
+ nfs_fattr_free_group_name(fattr);
+}
+
+/**
+ * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free
+ * @server: pointer to the filesystem nfs_server structure
+ * @fattr: a fully initialised nfs_fattr structure
+ *
+ * This helper maps the cached NFSv4 owner/group strings in fattr into
+ * their numeric uid/gid equivalents, and then frees the cached strings.
+ */
+void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+ if (nfs_fattr_map_owner_name(server, fattr))
+ nfs_fattr_free_owner_name(fattr);
+ if (nfs_fattr_map_group_name(server, fattr))
+ nfs_fattr_free_group_name(fattr);
+}
static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
{
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 50a15fa8cf98..f649fba8c384 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -38,6 +38,7 @@
#include <linux/nfs_xdr.h>
#include <linux/slab.h>
#include <linux/compat.h>
+#include <linux/freezer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -56,7 +57,7 @@
#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1
/* Default is to see 64-bit inode numbers */
-static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
+static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
static void nfs_invalidate_inode(struct inode *);
static int nfs_update_inode(struct inode *, struct nfs_fattr *);
@@ -77,7 +78,7 @@ int nfs_wait_bit_killable(void *word)
{
if (fatal_signal_pending(current))
return -ERESTARTSYS;
- schedule();
+ freezable_schedule();
return 0;
}
@@ -629,23 +630,28 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
nfs_revalidate_inode(server, inode);
}
-struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred, fmode_t f_mode)
+struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode)
{
struct nfs_open_context *ctx;
+ struct rpc_cred *cred = rpc_lookup_cred();
+ if (IS_ERR(cred))
+ return ERR_CAST(cred);
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
- if (ctx != NULL) {
- nfs_sb_active(dentry->d_sb);
- ctx->dentry = dget(dentry);
- ctx->cred = get_rpccred(cred);
- ctx->state = NULL;
- ctx->mode = f_mode;
- ctx->flags = 0;
- ctx->error = 0;
- nfs_init_lock_context(&ctx->lock_context);
- ctx->lock_context.open_context = ctx;
- INIT_LIST_HEAD(&ctx->list);
+ if (!ctx) {
+ put_rpccred(cred);
+ return ERR_PTR(-ENOMEM);
}
+ nfs_sb_active(dentry->d_sb);
+ ctx->dentry = dget(dentry);
+ ctx->cred = cred;
+ ctx->state = NULL;
+ ctx->mode = f_mode;
+ ctx->flags = 0;
+ ctx->error = 0;
+ nfs_init_lock_context(&ctx->lock_context);
+ ctx->lock_context.open_context = ctx;
+ INIT_LIST_HEAD(&ctx->list);
return ctx;
}
@@ -738,15 +744,10 @@ static void nfs_file_clear_open_context(struct file *filp)
int nfs_open(struct inode *inode, struct file *filp)
{
struct nfs_open_context *ctx;
- struct rpc_cred *cred;
- cred = rpc_lookup_cred();
- if (IS_ERR(cred))
- return PTR_ERR(cred);
- ctx = alloc_nfs_open_context(filp->f_path.dentry, cred, filp->f_mode);
- put_rpccred(cred);
- if (ctx == NULL)
- return -ENOMEM;
+ ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
nfs_file_set_open_context(filp, ctx);
put_nfs_open_context(ctx);
nfs_fscache_set_inode_cookie(inode, filp);
@@ -1019,6 +1020,8 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
fattr->valid = 0;
fattr->time_start = jiffies;
fattr->gencount = nfs_inc_attr_generation_counter();
+ fattr->owner_name = NULL;
+ fattr->group_name = NULL;
}
struct nfs_fattr *nfs_alloc_fattr(void)
@@ -1464,7 +1467,6 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
static void nfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 3f4d95751d52..8102db9b926c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -307,6 +307,8 @@ extern void nfs_readdata_release(struct nfs_read_data *rdata);
/* write.c */
extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
struct list_head *head);
+extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode, int ioflags);
extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
extern void nfs_writedata_release(struct nfs_write_data *wdata);
extern void nfs_commit_free(struct nfs_write_data *p);
@@ -330,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data);
#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
- struct page *, struct page *);
+ struct page *, struct page *, enum migrate_mode);
#else
#define nfs_migrate_page NULL
#endif
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index d4bc9ed91748..91943953a370 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -17,6 +17,7 @@
#include <linux/nfs_page.h>
#include <linux/lockd/bind.h>
#include <linux/nfs_mount.h>
+#include <linux/freezer.h>
#include "iostat.h"
#include "internal.h"
@@ -32,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
res = rpc_call_sync(clnt, msg, flags);
if (res != -EJUKEBOX && res != -EKEYEXPIRED)
break;
- schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
+ freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
res = -ERESTARTSYS;
} while (!fatal_signal_pending(current));
return res;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 693ae22f8731..4d7d0aedc101 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -94,6 +94,8 @@ struct nfs_unique_id {
struct nfs4_state_owner {
struct nfs_unique_id so_owner_id;
struct nfs_server *so_server;
+ struct list_head so_lru;
+ unsigned long so_expires;
struct rb_node so_server_node;
struct rpc_cred *so_cred; /* Associated cred */
@@ -319,6 +321,7 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
extern void nfs4_put_state_owner(struct nfs4_state_owner *);
+extern void nfs4_purge_state_owners(struct nfs_server *);
extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
extern void nfs4_put_open_state(struct nfs4_state *);
extern void nfs4_close_state(struct nfs4_state *, fmode_t);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index a62d36b9a99e..71ec08617e23 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -49,13 +49,14 @@ filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
loff_t offset)
{
u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
- u64 tmp;
+ u64 stripe_no;
+ u32 rem;
offset -= flseg->pattern_offset;
- tmp = offset;
- do_div(tmp, stripe_width);
+ stripe_no = div_u64(offset, stripe_width);
+ div_u64_rem(offset, flseg->stripe_unit, &rem);
- return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit);
+ return stripe_no * flseg->stripe_unit + rem;
}
/* This function is used by the layout driver to calculate the
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index ed388aae9689..8ae91908f5aa 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -382,7 +382,7 @@ decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)
{
struct nfs4_pnfs_ds_addr *da = NULL;
char *buf, *portstr;
- u32 port;
+ __be16 port;
int nlen, rlen;
int tmp[2];
__be32 *p;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d9f4d78c3413..ec9f6ef6c5dd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -52,9 +52,11 @@
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/module.h>
+#include <linux/nfs_idmap.h>
#include <linux/sunrpc/bc_xprt.h>
#include <linux/xattr.h>
#include <linux/utsname.h>
+#include <linux/freezer.h>
#include "nfs4_fs.h"
#include "delegation.h"
@@ -243,7 +245,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
*timeout = NFS4_POLL_RETRY_MIN;
if (*timeout > NFS4_POLL_RETRY_MAX)
*timeout = NFS4_POLL_RETRY_MAX;
- schedule_timeout_killable(*timeout);
+ freezable_schedule_timeout_killable(*timeout);
if (fatal_signal_pending(current))
res = -ERESTARTSYS;
*timeout <<= 1;
@@ -363,9 +365,8 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
* Must be called while holding tbl->slot_tbl_lock
*/
static void
-nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
+nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
{
- int free_slotid = free_slot - tbl->slots;
int slotid = free_slotid;
BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
@@ -430,7 +431,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
}
spin_lock(&tbl->slot_tbl_lock);
- nfs4_free_slot(tbl, res->sr_slot);
+ nfs4_free_slot(tbl, res->sr_slot - tbl->slots);
nfs4_check_drain_fc_complete(res->sr_session);
spin_unlock(&tbl->slot_tbl_lock);
res->sr_slot = NULL;
@@ -553,13 +554,10 @@ int nfs41_setup_sequence(struct nfs4_session *session,
spin_lock(&tbl->slot_tbl_lock);
if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
!rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
- /*
- * The state manager will wait until the slot table is empty.
- * Schedule the reset thread
- */
+ /* The state manager will wait until the slot table is empty */
rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
spin_unlock(&tbl->slot_tbl_lock);
- dprintk("%s Schedule Session Reset\n", __func__);
+ dprintk("%s session is draining\n", __func__);
return -EAGAIN;
}
@@ -764,6 +762,8 @@ struct nfs4_opendata {
struct nfs_openres o_res;
struct nfs_open_confirmargs c_arg;
struct nfs_open_confirmres c_res;
+ struct nfs4_string owner_name;
+ struct nfs4_string group_name;
struct nfs_fattr f_attr;
struct nfs_fattr dir_attr;
struct dentry *dir;
@@ -787,6 +787,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
p->o_res.server = p->o_arg.server;
nfs_fattr_init(&p->f_attr);
nfs_fattr_init(&p->dir_attr);
+ nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name);
}
static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
@@ -818,6 +819,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
p->o_arg.name = &dentry->d_name;
p->o_arg.server = server;
p->o_arg.bitmask = server->attr_bitmask;
+ p->o_arg.dir_bitmask = server->cache_consistency_bitmask;
p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
if (flags & O_CREAT) {
u32 *s;
@@ -854,6 +856,7 @@ static void nfs4_opendata_free(struct kref *kref)
dput(p->dir);
dput(p->dentry);
nfs_sb_deactive(sb);
+ nfs_fattr_free_names(&p->f_attr);
kfree(p);
}
@@ -1578,6 +1581,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
if (status != 0 || !data->rpc_done)
return status;
+ nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr);
+
nfs_refresh_inode(dir, o_res->dir_attr);
if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
@@ -1610,6 +1615,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
return status;
}
+ nfs_fattr_map_and_free_names(server, &data->f_attr);
+
if (o_arg->open_flags & O_CREAT) {
update_changeattr(dir, &o_res->cinfo);
nfs_post_op_update_inode(dir, o_res->dir_attr);
@@ -3430,19 +3437,6 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
*/
#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT)
-static void buf_to_pages(const void *buf, size_t buflen,
- struct page **pages, unsigned int *pgbase)
-{
- const void *p = buf;
-
- *pgbase = offset_in_page(buf);
- p -= *pgbase;
- while (p < buf + buflen) {
- *(pages++) = virt_to_page(p);
- p += PAGE_CACHE_SIZE;
- }
-}
-
static int buf_to_pages_noslab(const void *buf, size_t buflen,
struct page **pages, unsigned int *pgbase)
{
@@ -3539,9 +3533,19 @@ out:
nfs4_set_cached_acl(inode, acl);
}
+/*
+ * The getxattr API returns the required buffer length when called with a
+ * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating
+ * the required buf. On a NULL buf, we send a page of data to the server
+ * guessing that the ACL request can be serviced by a page. If so, we cache
+ * up to the page of ACL data, and the 2nd call to getxattr is serviced by
+ * the cache. If not so, we throw away the page, and cache the required
+ * length. The next getxattr call will then produce another round trip to
+ * the server, this time with the input buf of the required size.
+ */
static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
{
- struct page *pages[NFS4ACL_MAXPAGES];
+ struct page *pages[NFS4ACL_MAXPAGES] = {NULL, };
struct nfs_getaclargs args = {
.fh = NFS_FH(inode),
.acl_pages = pages,
@@ -3556,41 +3560,60 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
.rpc_argp = &args,
.rpc_resp = &res,
};
- struct page *localpage = NULL;
- int ret;
+ int ret = -ENOMEM, npages, i, acl_len = 0;
- if (buflen < PAGE_SIZE) {
- /* As long as we're doing a round trip to the server anyway,
- * let's be prepared for a page of acl data. */
- localpage = alloc_page(GFP_KERNEL);
- resp_buf = page_address(localpage);
- if (localpage == NULL)
- return -ENOMEM;
- args.acl_pages[0] = localpage;
- args.acl_pgbase = 0;
- args.acl_len = PAGE_SIZE;
- } else {
- resp_buf = buf;
- buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
+ npages = (buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ /* As long as we're doing a round trip to the server anyway,
+ * let's be prepared for a page of acl data. */
+ if (npages == 0)
+ npages = 1;
+
+ for (i = 0; i < npages; i++) {
+ pages[i] = alloc_page(GFP_KERNEL);
+ if (!pages[i])
+ goto out_free;
+ }
+ if (npages > 1) {
+ /* for decoding across pages */
+ res.acl_scratch = alloc_page(GFP_KERNEL);
+ if (!res.acl_scratch)
+ goto out_free;
}
- ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0);
+ args.acl_len = npages * PAGE_SIZE;
+ args.acl_pgbase = 0;
+ /* Let decode_getfacl know not to fail if the ACL data is larger than
+ * the page we send as a guess */
+ if (buf == NULL)
+ res.acl_flags |= NFS4_ACL_LEN_REQUEST;
+ resp_buf = page_address(pages[0]);
+
+ dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n",
+ __func__, buf, buflen, npages, args.acl_len);
+ ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode),
+ &msg, &args.seq_args, &res.seq_res, 0);
if (ret)
goto out_free;
- if (res.acl_len > args.acl_len)
- nfs4_write_cached_acl(inode, NULL, res.acl_len);
+
+ acl_len = res.acl_len - res.acl_data_offset;
+ if (acl_len > args.acl_len)
+ nfs4_write_cached_acl(inode, NULL, acl_len);
else
- nfs4_write_cached_acl(inode, resp_buf, res.acl_len);
+ nfs4_write_cached_acl(inode, resp_buf + res.acl_data_offset,
+ acl_len);
if (buf) {
ret = -ERANGE;
- if (res.acl_len > buflen)
+ if (acl_len > buflen)
goto out_free;
- if (localpage)
- memcpy(buf, resp_buf, res.acl_len);
+ _copy_from_pages(buf, pages, res.acl_data_offset,
+ res.acl_len);
}
- ret = res.acl_len;
+ ret = acl_len;
out_free:
- if (localpage)
- __free_page(localpage);
+ for (i = 0; i < npages; i++)
+ if (pages[i])
+ __free_page(pages[i]);
+ if (res.acl_scratch)
+ __free_page(res.acl_scratch);
return ret;
}
@@ -3621,6 +3644,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
nfs_zap_acl_cache(inode);
ret = nfs4_read_cached_acl(inode, buf, buflen);
if (ret != -ENOENT)
+ /* -ENOENT is returned if there is no ACL or if there is an ACL
+ * but no cached acl data, just the acl length */
return ret;
return nfs4_get_acl_uncached(inode, buf, buflen);
}
@@ -3958,7 +3983,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
static unsigned long
nfs4_set_lock_task_retry(unsigned long timeout)
{
- schedule_timeout_killable(timeout);
+ freezable_schedule_timeout_killable(timeout);
timeout <<= 1;
if (timeout > NFS4_LOCK_MAXTIMEOUT)
return NFS4_LOCK_MAXTIMEOUT;
@@ -4858,8 +4883,10 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
clp->cl_rpcclient->cl_auth->au_flavor);
res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL);
- if (unlikely(!res.server_scope))
- return -ENOMEM;
+ if (unlikely(!res.server_scope)) {
+ status = -ENOMEM;
+ goto out;
+ }
status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
if (!status)
@@ -4876,12 +4903,13 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
clp->server_scope = NULL;
}
- if (!clp->server_scope)
+ if (!clp->server_scope) {
clp->server_scope = res.server_scope;
- else
- kfree(res.server_scope);
+ goto out;
+ }
}
-
+ kfree(res.server_scope);
+out:
dprintk("<-- %s status= %d\n", __func__, status);
return status;
}
@@ -4983,37 +5011,53 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
return status;
}
+static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags)
+{
+ return kcalloc(max_slots, sizeof(struct nfs4_slot), gfp_flags);
+}
+
+static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *new,
+ u32 max_slots,
+ u32 ivalue)
+{
+ struct nfs4_slot *old = NULL;
+ u32 i;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ if (new) {
+ old = tbl->slots;
+ tbl->slots = new;
+ tbl->max_slots = max_slots;
+ }
+ tbl->highest_used_slotid = -1; /* no slot is currently used */
+ for (i = 0; i < tbl->max_slots; i++)
+ tbl->slots[i].seq_nr = ivalue;
+ spin_unlock(&tbl->slot_tbl_lock);
+ kfree(old);
+}
+
/*
- * Reset a slot table
+ * (re)Initialise a slot table
*/
-static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
- int ivalue)
+static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
+ u32 ivalue)
{
struct nfs4_slot *new = NULL;
- int i;
- int ret = 0;
+ int ret = -ENOMEM;
dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
max_reqs, tbl->max_slots);
/* Does the newly negotiated max_reqs match the existing slot table? */
if (max_reqs != tbl->max_slots) {
- ret = -ENOMEM;
- new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
- GFP_NOFS);
+ new = nfs4_alloc_slots(max_reqs, GFP_NOFS);
if (!new)
goto out;
- ret = 0;
- kfree(tbl->slots);
- }
- spin_lock(&tbl->slot_tbl_lock);
- if (new) {
- tbl->slots = new;
- tbl->max_slots = max_reqs;
}
- for (i = 0; i < tbl->max_slots; ++i)
- tbl->slots[i].seq_nr = ivalue;
- spin_unlock(&tbl->slot_tbl_lock);
+ ret = 0;
+
+ nfs4_add_and_init_slots(tbl, new, max_reqs, ivalue);
dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
tbl, tbl->slots, tbl->max_slots);
out:
@@ -5021,23 +5065,6 @@ out:
return ret;
}
-/*
- * Reset the forechannel and backchannel slot tables
- */
-static int nfs4_reset_slot_tables(struct nfs4_session *session)
-{
- int status;
-
- status = nfs4_reset_slot_table(&session->fc_slot_table,
- session->fc_attrs.max_reqs, 1);
- if (status)
- return status;
-
- status = nfs4_reset_slot_table(&session->bc_slot_table,
- session->bc_attrs.max_reqs, 0);
- return status;
-}
-
/* Destroy the slot table */
static void nfs4_destroy_slot_tables(struct nfs4_session *session)
{
@@ -5053,59 +5080,26 @@ static void nfs4_destroy_slot_tables(struct nfs4_session *session)
}
/*
- * Initialize slot table
+ * Initialize or reset the forechannel and backchannel tables
*/
-static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
- int max_slots, int ivalue)
-{
- struct nfs4_slot *slot;
- int ret = -ENOMEM;
-
- BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE);
-
- dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
-
- slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS);
- if (!slot)
- goto out;
- ret = 0;
-
- spin_lock(&tbl->slot_tbl_lock);
- tbl->max_slots = max_slots;
- tbl->slots = slot;
- tbl->highest_used_slotid = -1; /* no slot is currently used */
- spin_unlock(&tbl->slot_tbl_lock);
- dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
- tbl, tbl->slots, tbl->max_slots);
-out:
- dprintk("<-- %s: return %d\n", __func__, ret);
- return ret;
-}
-
-/*
- * Initialize the forechannel and backchannel tables
- */
-static int nfs4_init_slot_tables(struct nfs4_session *session)
+static int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
{
struct nfs4_slot_table *tbl;
- int status = 0;
-
- tbl = &session->fc_slot_table;
- if (tbl->slots == NULL) {
- status = nfs4_init_slot_table(tbl,
- session->fc_attrs.max_reqs, 1);
- if (status)
- return status;
- }
-
- tbl = &session->bc_slot_table;
- if (tbl->slots == NULL) {
- status = nfs4_init_slot_table(tbl,
- session->bc_attrs.max_reqs, 0);
- if (status)
- nfs4_destroy_slot_tables(session);
- }
+ int status;
+ dprintk("--> %s\n", __func__);
+ /* Fore channel */
+ tbl = &ses->fc_slot_table;
+ status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
+ if (status) /* -ENOMEM */
+ return status;
+ /* Back channel */
+ tbl = &ses->bc_slot_table;
+ status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
+ if (status && tbl->slots == NULL)
+ /* Fore and back channel share a connection so get
+ * both slot tables or neither */
+ nfs4_destroy_slot_tables(ses);
return status;
}
@@ -5293,13 +5287,9 @@ int nfs4_proc_create_session(struct nfs_client *clp)
if (status)
goto out;
- /* Init and reset the fore channel */
- status = nfs4_init_slot_tables(session);
- dprintk("slot table initialization returned %d\n", status);
- if (status)
- goto out;
- status = nfs4_reset_slot_tables(session);
- dprintk("slot table reset returned %d\n", status);
+ /* Init or reset the session slot tables */
+ status = nfs4_setup_session_slot_tables(session);
+ dprintk("slot table setup returned %d\n", status);
if (status)
goto out;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6a7107ae6b72..45392032e7bd 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -49,6 +49,7 @@
#include <linux/ratelimit.h>
#include <linux/workqueue.h>
#include <linux/bitops.h>
+#include <linux/jiffies.h>
#include "nfs4_fs.h"
#include "callback.h"
@@ -377,31 +378,24 @@ nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
{
struct rb_node **p = &server->state_owners.rb_node,
*parent = NULL;
- struct nfs4_state_owner *sp, *res = NULL;
+ struct nfs4_state_owner *sp;
while (*p != NULL) {
parent = *p;
sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
- if (server < sp->so_server) {
- p = &parent->rb_left;
- continue;
- }
- if (server > sp->so_server) {
- p = &parent->rb_right;
- continue;
- }
if (cred < sp->so_cred)
p = &parent->rb_left;
else if (cred > sp->so_cred)
p = &parent->rb_right;
else {
+ if (!list_empty(&sp->so_lru))
+ list_del_init(&sp->so_lru);
atomic_inc(&sp->so_count);
- res = sp;
- break;
+ return sp;
}
}
- return res;
+ return NULL;
}
static struct nfs4_state_owner *
@@ -421,6 +415,8 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
else if (new->so_cred > sp->so_cred)
p = &parent->rb_right;
else {
+ if (!list_empty(&sp->so_lru))
+ list_del_init(&sp->so_lru);
atomic_inc(&sp->so_count);
return sp;
}
@@ -462,6 +458,7 @@ nfs4_alloc_state_owner(void)
spin_lock_init(&sp->so_sequence.lock);
INIT_LIST_HEAD(&sp->so_sequence.list);
atomic_set(&sp->so_count, 1);
+ INIT_LIST_HEAD(&sp->so_lru);
return sp;
}
@@ -479,6 +476,38 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
}
}
+static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
+{
+ rpc_destroy_wait_queue(&sp->so_sequence.wait);
+ put_rpccred(sp->so_cred);
+ kfree(sp);
+}
+
+static void nfs4_gc_state_owners(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state_owner *sp, *tmp;
+ unsigned long time_min, time_max;
+ LIST_HEAD(doomed);
+
+ spin_lock(&clp->cl_lock);
+ time_max = jiffies;
+ time_min = (long)time_max - (long)clp->cl_lease_time;
+ list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
+ /* NB: LRU is sorted so that oldest is at the head */
+ if (time_in_range(sp->so_expires, time_min, time_max))
+ break;
+ list_move(&sp->so_lru, &doomed);
+ nfs4_remove_state_owner_locked(sp);
+ }
+ spin_unlock(&clp->cl_lock);
+
+ list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
+ list_del(&sp->so_lru);
+ nfs4_free_state_owner(sp);
+ }
+}
+
/**
* nfs4_get_state_owner - Look up a state owner given a credential
* @server: nfs_server to search
@@ -496,10 +525,10 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
sp = nfs4_find_state_owner_locked(server, cred);
spin_unlock(&clp->cl_lock);
if (sp != NULL)
- return sp;
+ goto out;
new = nfs4_alloc_state_owner();
if (new == NULL)
- return NULL;
+ goto out;
new->so_server = server;
new->so_cred = cred;
spin_lock(&clp->cl_lock);
@@ -511,26 +540,58 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
rpc_destroy_wait_queue(&new->so_sequence.wait);
kfree(new);
}
+out:
+ nfs4_gc_state_owners(server);
return sp;
}
/**
* nfs4_put_state_owner - Release a nfs4_state_owner
* @sp: state owner data to release
- *
*/
void nfs4_put_state_owner(struct nfs4_state_owner *sp)
{
- struct nfs_client *clp = sp->so_server->nfs_client;
- struct rpc_cred *cred = sp->so_cred;
+ struct nfs_server *server = sp->so_server;
+ struct nfs_client *clp = server->nfs_client;
if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
return;
- nfs4_remove_state_owner_locked(sp);
+
+ if (!RB_EMPTY_NODE(&sp->so_server_node)) {
+ sp->so_expires = jiffies;
+ list_add_tail(&sp->so_lru, &server->state_owners_lru);
+ spin_unlock(&clp->cl_lock);
+ } else {
+ nfs4_remove_state_owner_locked(sp);
+ spin_unlock(&clp->cl_lock);
+ nfs4_free_state_owner(sp);
+ }
+}
+
+/**
+ * nfs4_purge_state_owners - Release all cached state owners
+ * @server: nfs_server with cached state owners to release
+ *
+ * Called at umount time. Remaining state owners will be on
+ * the LRU with ref count of zero.
+ */
+void nfs4_purge_state_owners(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state_owner *sp, *tmp;
+ LIST_HEAD(doomed);
+
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
+ list_move(&sp->so_lru, &doomed);
+ nfs4_remove_state_owner_locked(sp);
+ }
spin_unlock(&clp->cl_lock);
- rpc_destroy_wait_queue(&sp->so_sequence.wait);
- put_rpccred(cred);
- kfree(sp);
+
+ list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
+ list_del(&sp->so_lru);
+ nfs4_free_state_owner(sp);
+ }
}
static struct nfs4_state *
@@ -1071,6 +1132,8 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4
{
struct nfs_client *clp = server->nfs_client;
+ if (test_and_clear_bit(NFS_DELEGATED_STATE, &state->flags))
+ nfs_async_inode_return_delegation(state->inode, &state->stateid);
nfs4_state_mark_reclaim_nograce(clp, state);
nfs4_schedule_state_manager(clp);
}
@@ -1402,6 +1465,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov
restart:
rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ nfs4_purge_state_owners(server);
spin_lock(&clp->cl_lock);
for (pos = rb_first(&server->state_owners);
pos != NULL;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e6161b213ed1..33bd8d0f745d 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2298,7 +2298,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_getfh(xdr, &hdr);
encode_getfattr(xdr, args->bitmask, &hdr);
encode_restorefh(xdr, &hdr);
- encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_getfattr(xdr, args->dir_bitmask, &hdr);
encode_nops(&hdr);
}
@@ -2517,11 +2517,12 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fh, &hdr);
- replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
+ replen = hdr.replen + op_decode_hdr_maxsz + 1;
encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
args->acl_pages, args->acl_pgbase, args->acl_len);
+
encode_nops(&hdr);
}
@@ -3790,7 +3791,8 @@ out_overflow:
}
static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
- const struct nfs_server *server, uint32_t *uid, int may_sleep)
+ const struct nfs_server *server, uint32_t *uid,
+ struct nfs4_string *owner_name)
{
uint32_t len;
__be32 *p;
@@ -3807,8 +3809,12 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
goto out_overflow;
- if (!may_sleep) {
- /* do nothing */
+ if (owner_name != NULL) {
+ owner_name->data = kmemdup(p, len, GFP_NOWAIT);
+ if (owner_name->data != NULL) {
+ owner_name->len = len;
+ ret = NFS_ATTR_FATTR_OWNER_NAME;
+ }
} else if (len < XDR_MAX_NETOBJ) {
if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
ret = NFS_ATTR_FATTR_OWNER;
@@ -3828,7 +3834,8 @@ out_overflow:
}
static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
- const struct nfs_server *server, uint32_t *gid, int may_sleep)
+ const struct nfs_server *server, uint32_t *gid,
+ struct nfs4_string *group_name)
{
uint32_t len;
__be32 *p;
@@ -3845,8 +3852,12 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
goto out_overflow;
- if (!may_sleep) {
- /* do nothing */
+ if (group_name != NULL) {
+ group_name->data = kmemdup(p, len, GFP_NOWAIT);
+ if (group_name->data != NULL) {
+ group_name->len = len;
+ ret = NFS_ATTR_FATTR_GROUP_NAME;
+ }
} else if (len < XDR_MAX_NETOBJ) {
if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
ret = NFS_ATTR_FATTR_GROUP;
@@ -4283,7 +4294,7 @@ xdr_error:
static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
struct nfs_fattr *fattr, struct nfs_fh *fh,
- const struct nfs_server *server, int may_sleep)
+ const struct nfs_server *server)
{
int status;
umode_t fmode = 0;
@@ -4350,12 +4361,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
goto xdr_error;
fattr->valid |= status;
- status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep);
+ status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, fattr->owner_name);
if (status < 0)
goto xdr_error;
fattr->valid |= status;
- status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep);
+ status = decode_attr_group(xdr, bitmap, server, &fattr->gid, fattr->group_name);
if (status < 0)
goto xdr_error;
fattr->valid |= status;
@@ -4396,7 +4407,7 @@ xdr_error:
}
static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
- struct nfs_fh *fh, const struct nfs_server *server, int may_sleep)
+ struct nfs_fh *fh, const struct nfs_server *server)
{
__be32 *savep;
uint32_t attrlen,
@@ -4415,7 +4426,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
if (status < 0)
goto xdr_error;
- status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep);
+ status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server);
if (status < 0)
goto xdr_error;
@@ -4426,9 +4437,9 @@ xdr_error:
}
static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
- const struct nfs_server *server, int may_sleep)
+ const struct nfs_server *server)
{
- return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep);
+ return decode_getfattr_generic(xdr, fattr, NULL, server);
}
/*
@@ -4957,17 +4968,18 @@ decode_restorefh(struct xdr_stream *xdr)
}
static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
- size_t *acl_len)
+ struct nfs_getaclres *res)
{
- __be32 *savep;
+ __be32 *savep, *bm_p;
uint32_t attrlen,
bitmap[3] = {0};
struct kvec *iov = req->rq_rcv_buf.head;
int status;
- *acl_len = 0;
+ res->acl_len = 0;
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
goto out;
+ bm_p = xdr->p;
if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
goto out;
if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
@@ -4979,18 +4991,30 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
size_t hdrlen;
u32 recvd;
+ /* The bitmap (xdr len + bitmaps) and the attr xdr len words
+ * are stored with the acl data to handle the problem of
+ * variable length bitmaps.*/
+ xdr->p = bm_p;
+ res->acl_data_offset = be32_to_cpup(bm_p) + 2;
+ res->acl_data_offset <<= 2;
+
/* We ignore &savep and don't do consistency checks on
* the attr length. Let userspace figure it out.... */
hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base;
+ attrlen += res->acl_data_offset;
recvd = req->rq_rcv_buf.len - hdrlen;
if (attrlen > recvd) {
- dprintk("NFS: server cheating in getattr"
- " acl reply: attrlen %u > recvd %u\n",
+ if (res->acl_flags & NFS4_ACL_LEN_REQUEST) {
+ /* getxattr interface called with a NULL buf */
+ res->acl_len = attrlen;
+ goto out;
+ }
+ dprintk("NFS: acl reply: attrlen %u > recvd %u\n",
attrlen, recvd);
return -EINVAL;
}
xdr_read_pages(xdr, attrlen);
- *acl_len = attrlen;
+ res->acl_len = attrlen;
} else
status = -EOPNOTSUPP;
@@ -5696,8 +5720,7 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
status = decode_open_downgrade(xdr, res);
if (status != 0)
goto out;
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -5723,8 +5746,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_access(xdr, res);
if (status != 0)
goto out;
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -5753,8 +5775,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- status = decode_getfattr(xdr, res->fattr, res->server
- ,!RPC_IS_ASYNC(rqstp->rq_task));
+ status = decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -5780,8 +5801,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
goto out;
status = decode_getfh(xdr, res->fh);
if (status == 0)
- status = decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ status = decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -5807,8 +5827,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_remove(xdr, &res->cinfo);
if (status)
goto out;
- decode_getfattr(xdr, res->dir_attr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->dir_attr, res->server);
out:
return status;
}
@@ -5841,14 +5860,12 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
if (status)
goto out;
/* Current FH is target directory */
- if (decode_getfattr(xdr, res->new_fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
+ if (decode_getfattr(xdr, res->new_fattr, res->server))
goto out;
status = decode_restorefh(xdr);
if (status)
goto out;
- decode_getfattr(xdr, res->old_fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->old_fattr, res->server);
out:
return status;
}
@@ -5884,14 +5901,12 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
* Note order: OP_LINK leaves the directory as the current
* filehandle.
*/
- if (decode_getfattr(xdr, res->dir_attr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
+ if (decode_getfattr(xdr, res->dir_attr, res->server))
goto out;
status = decode_restorefh(xdr);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -5923,14 +5938,12 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- if (decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
+ if (decode_getfattr(xdr, res->fattr, res->server))
goto out;
status = decode_restorefh(xdr);
if (status)
goto out;
- decode_getfattr(xdr, res->dir_fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->dir_fattr, res->server);
out:
return status;
}
@@ -5962,8 +5975,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_putfh(xdr);
if (status)
goto out;
- status = decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ status = decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6019,6 +6031,10 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
struct compound_hdr hdr;
int status;
+ if (res->acl_scratch != NULL) {
+ void *p = page_address(res->acl_scratch);
+ xdr_set_scratch_buffer(xdr, p, PAGE_SIZE);
+ }
status = decode_compound_hdr(xdr, &hdr);
if (status)
goto out;
@@ -6028,7 +6044,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_putfh(xdr);
if (status)
goto out;
- status = decode_getacl(xdr, rqstp, &res->acl_len);
+ status = decode_getacl(xdr, rqstp, res);
out:
return status;
@@ -6061,8 +6077,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
* an ESTALE error. Shouldn't be a problem,
* though, since fattr->valid will remain unset.
*/
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6093,13 +6108,11 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
goto out;
if (decode_getfh(xdr, &res->fh) != 0)
goto out;
- if (decode_getfattr(xdr, res->f_attr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
+ if (decode_getfattr(xdr, res->f_attr, res->server) != 0)
goto out;
if (decode_restorefh(xdr) != 0)
goto out;
- decode_getfattr(xdr, res->dir_attr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->dir_attr, res->server);
out:
return status;
}
@@ -6147,8 +6160,7 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
status = decode_open(xdr, res);
if (status)
goto out;
- decode_getfattr(xdr, res->f_attr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->f_attr, res->server);
out:
return status;
}
@@ -6175,8 +6187,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
status = decode_setattr(xdr);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6356,8 +6367,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
if (status)
goto out;
if (res->fattr)
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
if (!status)
status = res->count;
out:
@@ -6386,8 +6396,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
if (status)
goto out;
if (res->fattr)
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6546,8 +6555,7 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
status = decode_delegreturn(xdr);
if (status != 0)
goto out;
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6576,8 +6584,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
goto out;
xdr_enter_page(xdr, PAGE_SIZE);
status = decode_getfattr(xdr, &res->fs_locations->fattr,
- res->fs_locations->server,
- !RPC_IS_ASYNC(req->rq_task));
+ res->fs_locations->server);
out:
return status;
}
@@ -6826,8 +6833,7 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
status = decode_layoutcommit(xdr, rqstp, res);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6958,7 +6964,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
goto out_overflow;
if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
- entry->server, 1) < 0)
+ entry->server) < 0)
goto out_overflow;
if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
entry->ino = entry->fattr->mounted_on_fileid;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index c807ab93140e..55d01280a609 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -551,7 +551,8 @@ static const struct nfs_pageio_ops objio_pg_write_ops = {
static struct pnfs_layoutdriver_type objlayout_type = {
.id = LAYOUT_OSD2_OBJECTS,
.name = "LAYOUT_OSD2_OBJECTS",
- .flags = PNFS_LAYOUTRET_ON_SETATTR,
+ .flags = PNFS_LAYOUTRET_ON_SETATTR |
+ PNFS_LAYOUTRET_ON_ERROR,
.alloc_layout_hdr = objlayout_alloc_layout_hdr,
.free_layout_hdr = objlayout_free_layout_hdr,
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 72074e3a04f9..b3c29039f5b8 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -254,6 +254,8 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
oir->status = rdata->task.tk_status = status;
if (status >= 0)
rdata->res.count = status;
+ else
+ rdata->pnfs_error = status;
objlayout_iodone(oir);
/* must not use oir after this point */
@@ -334,6 +336,8 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
if (status >= 0) {
wdata->res.count = status;
wdata->verf.committed = oir->committed;
+ } else {
+ wdata->pnfs_error = status;
}
objlayout_iodone(oir);
/* must not use oir after this point */
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8e672a2b2d69..17149a490065 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1166,6 +1166,33 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
+static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head)
+{
+ struct nfs_pageio_descriptor pgio;
+ LIST_HEAD(failed);
+
+ /* Resend all requests through the MDS */
+ nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE);
+ while (!list_empty(head)) {
+ struct nfs_page *req = nfs_list_entry(head->next);
+
+ nfs_list_remove_request(req);
+ if (!nfs_pageio_add_request(&pgio, req))
+ nfs_list_add_request(req, &failed);
+ }
+ nfs_pageio_complete(&pgio);
+
+ if (!list_empty(&failed)) {
+ /* For some reason our attempt to resend pages. Mark the
+ * overall send request as having failed, and let
+ * nfs_writeback_release_full deal with the error.
+ */
+ list_move(&failed, head);
+ return -EIO;
+ }
+ return 0;
+}
+
/*
* Called by non rpc-based layout drivers
*/
@@ -1175,9 +1202,17 @@ void pnfs_ld_write_done(struct nfs_write_data *data)
pnfs_set_layoutcommit(data);
data->mds_ops->rpc_call_done(&data->task, data);
} else {
- put_lseg(data->lseg);
- data->lseg = NULL;
dprintk("pnfs write error = %d\n", data->pnfs_error);
+ if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
+ PNFS_LAYOUTRET_ON_ERROR) {
+ /* Don't lo_commit on error, Server will needs to
+ * preform a file recovery.
+ */
+ clear_bit(NFS_INO_LAYOUTCOMMIT,
+ &NFS_I(data->inode)->flags);
+ pnfs_return_layout(data->inode);
+ }
+ data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages);
}
data->mds_ops->rpc_release(data);
}
@@ -1267,6 +1302,9 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
put_lseg(data->lseg);
data->lseg = NULL;
dprintk("pnfs write error = %d\n", data->pnfs_error);
+ if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
+ PNFS_LAYOUTRET_ON_ERROR)
+ pnfs_return_layout(data->inode);
nfs_pageio_init_read_mds(&pgio, data->inode);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1509530cb111..53d593a0a4f2 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -68,6 +68,7 @@ enum {
enum layoutdriver_policy_flags {
/* Should the pNFS client commit and return the layout upon a setattr */
PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
+ PNFS_LAYOUTRET_ON_ERROR = 1 << 1,
};
struct nfs4_deviceid_node;
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index f48125da198a..0c672588fe5a 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -41,6 +41,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/lockd/bind.h>
+#include <linux/freezer.h>
#include "internal.h"
#define NFSDBG_FACILITY NFSDBG_PROC
@@ -59,7 +60,7 @@ nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
res = rpc_call_sync(clnt, msg, flags);
if (res != -EKEYEXPIRED)
break;
- schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
+ freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
res = -ERESTARTSYS;
} while (!fatal_signal_pending(current));
return res;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 134777406ee3..3dfa4f112c0a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -41,7 +41,6 @@
#include <linux/lockd/bind.h>
#include <linux/seq_file.h>
#include <linux/mount.h>
-#include <linux/mnt_namespace.h>
#include <linux/namei.h>
#include <linux/nfs_idmap.h>
#include <linux/vfs.h>
@@ -263,10 +262,10 @@ static match_table_t nfs_local_lock_tokens = {
static void nfs_umount_begin(struct super_block *);
static int nfs_statfs(struct dentry *, struct kstatfs *);
-static int nfs_show_options(struct seq_file *, struct vfsmount *);
-static int nfs_show_devname(struct seq_file *, struct vfsmount *);
-static int nfs_show_path(struct seq_file *, struct vfsmount *);
-static int nfs_show_stats(struct seq_file *, struct vfsmount *);
+static int nfs_show_options(struct seq_file *, struct dentry *);
+static int nfs_show_devname(struct seq_file *, struct dentry *);
+static int nfs_show_path(struct seq_file *, struct dentry *);
+static int nfs_show_stats(struct seq_file *, struct dentry *);
static struct dentry *nfs_fs_mount(struct file_system_type *,
int, const char *, void *);
static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
@@ -721,9 +720,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
/*
* Describe the mount options on this VFS mountpoint
*/
-static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int nfs_show_options(struct seq_file *m, struct dentry *root)
{
- struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
+ struct nfs_server *nfss = NFS_SB(root->d_sb);
nfs_show_mount_options(m, nfss, 0);
@@ -761,14 +760,14 @@ static void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
#endif
#endif
-static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
+static int nfs_show_devname(struct seq_file *m, struct dentry *root)
{
char *page = (char *) __get_free_page(GFP_KERNEL);
char *devname, *dummy;
int err = 0;
if (!page)
return -ENOMEM;
- devname = nfs_path(&dummy, mnt->mnt_root, page, PAGE_SIZE);
+ devname = nfs_path(&dummy, root, page, PAGE_SIZE);
if (IS_ERR(devname))
err = PTR_ERR(devname);
else
@@ -777,7 +776,7 @@ static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
return err;
}
-static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt)
+static int nfs_show_path(struct seq_file *m, struct dentry *dentry)
{
seq_puts(m, "/");
return 0;
@@ -786,10 +785,10 @@ static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt)
/*
* Present statistical information for this VFS mountpoint
*/
-static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
+static int nfs_show_stats(struct seq_file *m, struct dentry *root)
{
int i, cpu;
- struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
+ struct nfs_server *nfss = NFS_SB(root->d_sb);
struct rpc_auth *auth = nfss->client->cl_auth;
struct nfs_iostats totals = { };
@@ -799,10 +798,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
* Display all mount option settings
*/
seq_printf(m, "\n\topts:\t");
- seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw");
- seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
- seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : "");
- seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
+ seq_puts(m, root->d_sb->s_flags & MS_RDONLY ? "ro" : "rw");
+ seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
+ seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : "");
+ seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
nfs_show_mount_options(m, nfss, 1);
seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
@@ -909,10 +908,24 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
data->auth_flavor_len = 1;
data->version = version;
data->minorversion = 0;
+ security_init_mnt_opts(&data->lsm_opts);
}
return data;
}
+static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data)
+{
+ if (data) {
+ kfree(data->client_address);
+ kfree(data->mount_server.hostname);
+ kfree(data->nfs_server.export_path);
+ kfree(data->nfs_server.hostname);
+ kfree(data->fscache_uniq);
+ security_free_mnt_opts(&data->lsm_opts);
+ kfree(data);
+ }
+}
+
/*
* Sanity-check a server address provided by the mount command.
*
@@ -2220,9 +2233,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
mntfh = nfs_alloc_fhandle();
if (data == NULL || mntfh == NULL)
- goto out_free_fh;
-
- security_init_mnt_opts(&data->lsm_opts);
+ goto out;
/* Validate the mount data */
error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
@@ -2234,8 +2245,6 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
#ifdef CONFIG_NFS_V4
if (data->version == 4) {
mntroot = nfs4_try_mount(flags, dev_name, data);
- kfree(data->client_address);
- kfree(data->nfs_server.export_path);
goto out;
}
#endif /* CONFIG_NFS_V4 */
@@ -2290,13 +2299,8 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
s->s_flags |= MS_ACTIVE;
out:
- kfree(data->nfs_server.hostname);
- kfree(data->mount_server.hostname);
- kfree(data->fscache_uniq);
- security_free_mnt_opts(&data->lsm_opts);
-out_free_fh:
+ nfs_free_parsed_mount_data(data);
nfs_free_fhandle(mntfh);
- kfree(data);
return mntroot;
out_err_nosb:
@@ -2623,9 +2627,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
mntfh = nfs_alloc_fhandle();
if (data == NULL || mntfh == NULL)
- goto out_free_fh;
-
- security_init_mnt_opts(&data->lsm_opts);
+ goto out;
/* Get a volume representation */
server = nfs4_create_server(data, mntfh);
@@ -2677,13 +2679,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
s->s_flags |= MS_ACTIVE;
- security_free_mnt_opts(&data->lsm_opts);
nfs_free_fhandle(mntfh);
return mntroot;
out:
- security_free_mnt_opts(&data->lsm_opts);
-out_free_fh:
nfs_free_fhandle(mntfh);
return ERR_PTR(error);
@@ -2788,11 +2787,15 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
const char *export_path)
{
struct dentry *dentry;
- int ret = nfs_referral_loop_protect();
+ int err;
+
+ if (IS_ERR(root_mnt))
+ return ERR_CAST(root_mnt);
- if (ret) {
+ err = nfs_referral_loop_protect();
+ if (err) {
mntput(root_mnt);
- return ERR_PTR(ret);
+ return ERR_PTR(err);
}
dentry = mount_subtree(root_mnt, export_path);
@@ -2816,9 +2819,7 @@ static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
data->nfs_server.hostname);
data->nfs_server.export_path = export_path;
- res = ERR_CAST(root_mnt);
- if (!IS_ERR(root_mnt))
- res = nfs_follow_remote_path(root_mnt, export_path);
+ res = nfs_follow_remote_path(root_mnt, export_path);
dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n",
IS_ERR(res) ? PTR_ERR(res) : 0,
@@ -2838,7 +2839,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
data = nfs_alloc_parsed_mount_data(4);
if (data == NULL)
- goto out_free_data;
+ goto out;
/* Validate the mount data */
error = nfs4_validate_mount_data(raw_data, data, dev_name);
@@ -2852,12 +2853,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
error = PTR_ERR(res);
out:
- kfree(data->client_address);
- kfree(data->nfs_server.export_path);
- kfree(data->nfs_server.hostname);
- kfree(data->fscache_uniq);
-out_free_data:
- kfree(data);
+ nfs_free_parsed_mount_data(data);
dprintk("<-- nfs4_mount() = %d%s\n", error,
error != 0 ? " [error]" : "");
return res;
@@ -3079,9 +3075,7 @@ static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
flags, data, data->hostname);
data->mnt_path = export_path;
- res = ERR_CAST(root_mnt);
- if (!IS_ERR(root_mnt))
- res = nfs_follow_remote_path(root_mnt, export_path);
+ res = nfs_follow_remote_path(root_mnt, export_path);
dprintk("<-- nfs4_referral_mount() = %ld%s\n",
IS_ERR(res) ? PTR_ERR(res) : 0,
IS_ERR(res) ? " [error]" : "");
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1dda78db6a73..834f0fe96f89 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1052,7 +1052,7 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = {
.pg_doio = nfs_generic_pg_writepages,
};
-static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
+void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
struct inode *inode, int ioflags)
{
nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
@@ -1166,13 +1166,7 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
static void nfs_writeback_release_full(void *calldata)
{
struct nfs_write_data *data = calldata;
- int ret, status = data->task.tk_status;
- struct nfs_pageio_descriptor pgio;
-
- if (data->pnfs_error) {
- nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
- pgio.pg_recoalesce = 1;
- }
+ int status = data->task.tk_status;
/* Update attributes as result of writeback. */
while (!list_empty(&data->pages)) {
@@ -1188,11 +1182,6 @@ static void nfs_writeback_release_full(void *calldata)
req->wb_bytes,
(long long)req_offset(req));
- if (data->pnfs_error) {
- dprintk(", pnfs error = %d\n", data->pnfs_error);
- goto next;
- }
-
if (status < 0) {
nfs_set_pageerror(page);
nfs_context_set_write_error(req->wb_context, status);
@@ -1212,19 +1201,7 @@ remove_request:
next:
nfs_clear_page_tag_locked(req);
nfs_end_page_writeback(page);
- if (data->pnfs_error) {
- lock_page(page);
- nfs_pageio_cond_complete(&pgio, page->index);
- ret = nfs_page_async_flush(&pgio, page, 0);
- if (ret) {
- nfs_set_pageerror(page);
- dprintk("rewrite to MDS error = %d\n", ret);
- }
- unlock_page(page);
- }
}
- if (data->pnfs_error)
- nfs_pageio_complete(&pgio);
nfs_writedata_release(calldata);
}
@@ -1711,7 +1688,7 @@ out_error:
#ifdef CONFIG_MIGRATION
int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
- struct page *page)
+ struct page *page, enum migrate_mode mode)
{
/*
* If PagePrivate is set, then the page is currently associated with
@@ -1726,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
nfs_fscache_release_page(page, GFP_KERNEL);
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}
#endif
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 10e6366608f2..8df1ea4a6ff9 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -80,3 +80,13 @@ config NFSD_V4
available from http://linux-nfs.org/.
If unsure, say N.
+
+config NFSD_FAULT_INJECTION
+ bool "NFS server manual fault injection"
+ depends on NFSD_V4 && DEBUG_KERNEL
+ help
+ This option enables support for manually injecting faults
+ into the NFS server. This is intended to be used for
+ testing error recovery on the NFS client.
+
+ If unsure, say N.
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 9b118ee20193..af32ef06b4fe 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_NFSD) += nfsd.o
nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
+nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 62f3b9074e84..cf8a6bd062fa 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -87,7 +87,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
struct svc_expkey key;
struct svc_expkey *ek = NULL;
- if (mesg[mlen-1] != '\n')
+ if (mlen < 1 || mesg[mlen-1] != '\n')
return -EINVAL;
mesg[mlen-1] = 0;
@@ -1226,12 +1226,12 @@ nfsd_export_init(void)
int rv;
dprintk("nfsd: initializing export module.\n");
- rv = cache_register(&svc_export_cache);
+ rv = cache_register_net(&svc_export_cache, &init_net);
if (rv)
return rv;
- rv = cache_register(&svc_expkey_cache);
+ rv = cache_register_net(&svc_expkey_cache, &init_net);
if (rv)
- cache_unregister(&svc_export_cache);
+ cache_unregister_net(&svc_export_cache, &init_net);
return rv;
}
@@ -1255,8 +1255,8 @@ nfsd_export_shutdown(void)
dprintk("nfsd: shutting down export module.\n");
- cache_unregister(&svc_expkey_cache);
- cache_unregister(&svc_export_cache);
+ cache_unregister_net(&svc_expkey_cache, &init_net);
+ cache_unregister_net(&svc_export_cache, &init_net);
svcauth_unix_purge();
dprintk("nfsd: export shutdown complete.\n");
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
new file mode 100644
index 000000000000..ce7f0758d84c
--- /dev/null
+++ b/fs/nfsd/fault_inject.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
+ *
+ * Uses debugfs to create fault injection points for client testing
+ */
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+
+#include "state.h"
+#include "fault_inject.h"
+
+struct nfsd_fault_inject_op {
+ char *file;
+ void (*func)(u64);
+};
+
+static struct nfsd_fault_inject_op inject_ops[] = {
+ {
+ .file = "forget_clients",
+ .func = nfsd_forget_clients,
+ },
+ {
+ .file = "forget_locks",
+ .func = nfsd_forget_locks,
+ },
+ {
+ .file = "forget_openowners",
+ .func = nfsd_forget_openowners,
+ },
+ {
+ .file = "forget_delegations",
+ .func = nfsd_forget_delegations,
+ },
+ {
+ .file = "recall_delegations",
+ .func = nfsd_recall_delegations,
+ },
+};
+
+static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op);
+static struct dentry *debug_dir;
+
+static int nfsd_inject_set(void *op_ptr, u64 val)
+{
+ struct nfsd_fault_inject_op *op = op_ptr;
+
+ if (val == 0)
+ printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file);
+ else
+ printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val);
+
+ op->func(val);
+ return 0;
+}
+
+static int nfsd_inject_get(void *data, u64 *val)
+{
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n");
+
+void nfsd_fault_inject_cleanup(void)
+{
+ debugfs_remove_recursive(debug_dir);
+}
+
+int nfsd_fault_inject_init(void)
+{
+ unsigned int i;
+ struct nfsd_fault_inject_op *op;
+ mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+
+ debug_dir = debugfs_create_dir("nfsd", NULL);
+ if (!debug_dir)
+ goto fail;
+
+ for (i = 0; i < NUM_INJECT_OPS; i++) {
+ op = &inject_ops[i];
+ if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd))
+ goto fail;
+ }
+ return 0;
+
+fail:
+ nfsd_fault_inject_cleanup();
+ return -ENOMEM;
+}
diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h
new file mode 100644
index 000000000000..90bd0570956c
--- /dev/null
+++ b/fs/nfsd/fault_inject.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
+ *
+ * Function definitions for fault injection
+ */
+
+#ifndef LINUX_NFSD_FAULT_INJECT_H
+#define LINUX_NFSD_FAULT_INJECT_H
+
+#ifdef CONFIG_NFSD_FAULT_INJECTION
+int nfsd_fault_inject_init(void);
+void nfsd_fault_inject_cleanup(void);
+void nfsd_forget_clients(u64);
+void nfsd_forget_locks(u64);
+void nfsd_forget_openowners(u64);
+void nfsd_forget_delegations(u64);
+void nfsd_recall_delegations(u64);
+#else /* CONFIG_NFSD_FAULT_INJECTION */
+static inline int nfsd_fault_inject_init(void) { return 0; }
+static inline void nfsd_fault_inject_cleanup(void) {}
+static inline void nfsd_forget_clients(u64 num) {}
+static inline void nfsd_forget_locks(u64 num) {}
+static inline void nfsd_forget_openowners(u64 num) {}
+static inline void nfsd_forget_delegations(u64 num) {}
+static inline void nfsd_recall_delegations(u64 num) {}
+#endif /* CONFIG_NFSD_FAULT_INJECTION */
+
+#endif /* LINUX_NFSD_FAULT_INJECT_H */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7748d6a18d97..6f3ebb48b12f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -718,7 +718,7 @@ int set_callback_cred(void)
{
if (callback_cred)
return 0;
- callback_cred = rpc_lookup_machine_cred();
+ callback_cred = rpc_lookup_machine_cred("nfs");
if (!callback_cred)
return -ENOMEM;
return 0;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 55780a22fdbd..94096273cd6c 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -36,6 +36,7 @@
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <net/net_namespace.h>
#include "idmap.h"
#include "nfsd.h"
@@ -466,20 +467,20 @@ nfsd_idmap_init(void)
{
int rv;
- rv = cache_register(&idtoname_cache);
+ rv = cache_register_net(&idtoname_cache, &init_net);
if (rv)
return rv;
- rv = cache_register(&nametoid_cache);
+ rv = cache_register_net(&nametoid_cache, &init_net);
if (rv)
- cache_unregister(&idtoname_cache);
+ cache_unregister_net(&idtoname_cache, &init_net);
return rv;
}
void
nfsd_idmap_shutdown(void)
{
- cache_unregister(&idtoname_cache);
- cache_unregister(&nametoid_cache);
+ cache_unregister_net(&idtoname_cache, &init_net);
+ cache_unregister_net(&nametoid_cache, &init_net);
}
static int
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index fa383361bc61..896da74ec563 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -266,10 +266,6 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
{
__be32 status;
- /* Only reclaims from previously confirmed clients are valid */
- if ((status = nfs4_check_open_reclaim(&open->op_clientid)))
- return status;
-
/* We don't know the target directory, and therefore can not
* set the change info
*/
@@ -373,6 +369,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+ status = nfs4_check_open_reclaim(&open->op_clientid);
+ if (status)
+ goto out;
case NFS4_OPEN_CLAIM_FH:
case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
status = do_open_fhandle(rqstp, &cstate->current_fh,
@@ -838,7 +837,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status;
}
}
- status = mnt_want_write(cstate->current_fh.fh_export->ex_path.mnt);
+ status = fh_want_write(&cstate->current_fh);
if (status)
return status;
status = nfs_ok;
@@ -856,7 +855,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
0, (time_t)0);
out:
- mnt_drop_write(cstate->current_fh.fh_export->ex_path.mnt);
+ fh_drop_write(&cstate->current_fh);
return status;
}
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index ed083b9a731b..0b3e875d1abd 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -117,8 +117,7 @@ out_no_tfm:
return status;
}
-int
-nfsd4_create_clid_dir(struct nfs4_client *clp)
+void nfsd4_create_clid_dir(struct nfs4_client *clp)
{
const struct cred *original_cred;
char *dname = clp->cl_recdir;
@@ -127,13 +126,14 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
- if (!rec_file || clp->cl_firststate)
- return 0;
-
+ if (clp->cl_firststate)
+ return;
clp->cl_firststate = 1;
+ if (!rec_file)
+ return;
status = nfs4_save_creds(&original_cred);
if (status < 0)
- return status;
+ return;
dir = rec_file->f_path.dentry;
/* lock the parent */
@@ -144,14 +144,21 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
status = PTR_ERR(dentry);
goto out_unlock;
}
- status = -EEXIST;
if (dentry->d_inode)
+ /*
+ * In the 4.1 case, where we're called from
+ * reclaim_complete(), records from the previous reboot
+ * may still be left, so this is OK.
+ *
+ * In the 4.0 case, we should never get here; but we may
+ * as well be forgiving and just succeed silently.
+ */
goto out_put;
- status = mnt_want_write(rec_file->f_path.mnt);
+ status = mnt_want_write_file(rec_file);
if (status)
goto out_put;
status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
- mnt_drop_write(rec_file->f_path.mnt);
+ mnt_drop_write_file(rec_file);
out_put:
dput(dentry);
out_unlock:
@@ -164,7 +171,6 @@ out_unlock:
" and is writeable", status,
user_recovery_dirname);
nfs4_reset_creds(original_cred);
- return status;
}
typedef int (recdir_func)(struct dentry *, struct dentry *);
@@ -268,7 +274,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
if (!rec_file || !clp->cl_firststate)
return;
- status = mnt_want_write(rec_file->f_path.mnt);
+ status = mnt_want_write_file(rec_file);
if (status)
goto out;
clp->cl_firststate = 0;
@@ -281,7 +287,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
nfs4_reset_creds(original_cred);
if (status == 0)
vfs_fsync(rec_file, 0);
- mnt_drop_write(rec_file->f_path.mnt);
+ mnt_drop_write_file(rec_file);
out:
if (status)
printk("NFSD: Failed to remove expired client state directory"
@@ -311,13 +317,13 @@ nfsd4_recdir_purge_old(void) {
if (!rec_file)
return;
- status = mnt_want_write(rec_file->f_path.mnt);
+ status = mnt_want_write_file(rec_file);
if (status)
goto out;
status = nfsd4_list_rec_dir(purge_old);
if (status == 0)
vfs_fsync(rec_file, 0);
- mnt_drop_write(rec_file->f_path.mnt);
+ mnt_drop_write_file(rec_file);
out:
if (status)
printk("nfsd4: failed to purge old clients from recovery"
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 47e94e33a975..e8c98f009670 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,12 +49,20 @@
time_t nfsd4_lease = 90; /* default lease time */
time_t nfsd4_grace = 90;
static time_t boot_time;
-static stateid_t zerostateid; /* bits all 0 */
-static stateid_t onestateid; /* bits all 1 */
+
+#define all_ones {{~0,~0},~0}
+static const stateid_t one_stateid = {
+ .si_generation = ~0,
+ .si_opaque = all_ones,
+};
+static const stateid_t zero_stateid = {
+ /* all fields zero */
+};
+
static u64 current_sessionid = 1;
-#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
-#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
+#define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t)))
+#define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t)))
/* forward declarations */
static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner);
@@ -133,21 +141,21 @@ unsigned int max_delegations;
* Open owner state (share locks)
*/
-/* hash tables for open owners */
-#define OPEN_OWNER_HASH_BITS 8
-#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS)
-#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1)
+/* hash tables for lock and open owners */
+#define OWNER_HASH_BITS 8
+#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
+#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)
-static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
+static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
{
unsigned int ret;
ret = opaque_hashval(ownername->data, ownername->len);
ret += clientid;
- return ret & OPEN_OWNER_HASH_MASK;
+ return ret & OWNER_HASH_MASK;
}
-static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE];
+static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
/* hash table for nfs4_file */
#define FILE_HASH_BITS 8
@@ -514,6 +522,7 @@ static void unhash_lockowner(struct nfs4_lockowner *lo)
list_del(&lo->lo_owner.so_strhash);
list_del(&lo->lo_perstateid);
+ list_del(&lo->lo_owner_ino_hash);
while (!list_empty(&lo->lo_owner.so_stateids)) {
stp = list_first_entry(&lo->lo_owner.so_stateids,
struct nfs4_ol_stateid, st_perstateowner);
@@ -658,7 +667,7 @@ static int nfsd4_sanitize_slot_size(u32 size)
/*
* XXX: If we run out of reserved DRC memory we could (up to a point)
* re-negotiate active sessions and reduce their slot usage to make
- * rooom for new connections. For now we just fail the create session.
+ * room for new connections. For now we just fail the create session.
*/
static int nfsd4_get_drc_mem(int slotsize, u32 num)
{
@@ -985,12 +994,11 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
if (clp == NULL)
return NULL;
- clp->cl_name.data = kmalloc(name.len, GFP_KERNEL);
+ clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL);
if (clp->cl_name.data == NULL) {
kfree(clp);
return NULL;
}
- memcpy(clp->cl_name.data, name.data, name.len);
clp->cl_name.len = name.len;
return clp;
}
@@ -1058,7 +1066,6 @@ expire_client(struct nfs4_client *clp)
spin_unlock(&recall_lock);
while (!list_empty(&reaplist)) {
dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
- list_del_init(&dp->dl_recall_lru);
unhash_delegation(dp);
}
while (!list_empty(&clp->cl_openowners)) {
@@ -2301,7 +2308,7 @@ nfsd4_free_slabs(void)
nfsd4_free_slab(&deleg_slab);
}
-static int
+int
nfsd4_init_slabs(void)
{
openowner_slab = kmem_cache_create("nfsd4_openowners",
@@ -2373,7 +2380,7 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
{
- list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]);
+ list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
list_add(&oo->oo_perclient, &clp->cl_openowners);
}
@@ -2436,7 +2443,9 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open)
struct nfs4_stateowner *so;
struct nfs4_openowner *oo;
- list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) {
+ list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
+ if (!so->so_is_open_owner)
+ continue;
if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
oo = openowner(so);
renew_client(oo->oo_owner.so_client);
@@ -2580,7 +2589,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
if (open->op_file == NULL)
return nfserr_jukebox;
- strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner);
+ strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner);
oo = find_openstateowner_str(strhashval, open);
open->op_openowner = oo;
if (!oo) {
@@ -3123,7 +3132,6 @@ nfs4_laundromat(void)
spin_unlock(&recall_lock);
list_for_each_safe(pos, next, &reaplist) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- list_del_init(&dp->dl_recall_lru);
unhash_delegation(dp);
}
test_val = nfsd4_lease;
@@ -3718,13 +3726,11 @@ out:
}
-/*
- * Lock owner state (byte-range locks)
- */
#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start))
-#define LOCK_HASH_BITS 8
-#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS)
-#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1)
+
+#define LOCKOWNER_INO_HASH_BITS 8
+#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
+#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1)
static inline u64
end_offset(u64 start, u64 len)
@@ -3746,16 +3752,14 @@ last_byte_offset(u64 start, u64 len)
return end > start ? end - 1: NFS4_MAX_UINT64;
}
-static inline unsigned int
-lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
- struct xdr_netobj *ownername)
+static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername)
{
return (file_hashval(inode) + cl_id
+ opaque_hashval(ownername->data, ownername->len))
- & LOCK_HASH_MASK;
+ & LOCKOWNER_INO_HASH_MASK;
}
-static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE];
+static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE];
/*
* TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
@@ -3809,23 +3813,39 @@ nevermind:
deny->ld_type = NFS4_WRITE_LT;
}
+static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, clientid_t *clid, struct xdr_netobj *owner)
+{
+ struct nfs4_ol_stateid *lst;
+
+ if (!same_owner_str(&lo->lo_owner, owner, clid))
+ return false;
+ lst = list_first_entry(&lo->lo_owner.so_stateids,
+ struct nfs4_ol_stateid, st_perstateowner);
+ return lst->st_file->fi_inode == inode;
+}
+
static struct nfs4_lockowner *
find_lockowner_str(struct inode *inode, clientid_t *clid,
struct xdr_netobj *owner)
{
- unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner);
- struct nfs4_stateowner *op;
+ unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner);
+ struct nfs4_lockowner *lo;
- list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) {
- if (same_owner_str(op, owner, clid))
- return lockowner(op);
+ list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
+ if (same_lockowner_ino(lo, inode, clid, owner))
+ return lo;
}
return NULL;
}
static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp)
{
- list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]);
+ struct inode *inode = open_stp->st_file->fi_inode;
+ unsigned int inohash = lockowner_ino_hashval(inode,
+ clp->cl_clientid.cl_id, &lo->lo_owner.so_owner);
+
+ list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
+ list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]);
list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
}
@@ -3834,7 +3854,7 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s
* Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has
* occurred.
*
- * strhashval = lock_ownerstr_hashval
+ * strhashval = ownerstr_hashval
*/
static struct nfs4_lockowner *
@@ -3892,6 +3912,37 @@ static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
__set_bit(access, &lock_stp->st_access_bmap);
}
+__be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new)
+{
+ struct nfs4_file *fi = ost->st_file;
+ struct nfs4_openowner *oo = openowner(ost->st_stateowner);
+ struct nfs4_client *cl = oo->oo_owner.so_client;
+ struct nfs4_lockowner *lo;
+ unsigned int strhashval;
+
+ lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner);
+ if (lo) {
+ if (!cstate->minorversion)
+ return nfserr_bad_seqid;
+ /* XXX: a lockowner always has exactly one stateid: */
+ *lst = list_first_entry(&lo->lo_owner.so_stateids,
+ struct nfs4_ol_stateid, st_perstateowner);
+ return nfs_ok;
+ }
+ strhashval = ownerstr_hashval(cl->cl_clientid.cl_id,
+ &lock->v.new.owner);
+ lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
+ if (lo == NULL)
+ return nfserr_jukebox;
+ *lst = alloc_init_lock_stateid(lo, fi, ost);
+ if (*lst == NULL) {
+ release_lockowner(lo);
+ return nfserr_jukebox;
+ }
+ *new = true;
+ return nfs_ok;
+}
+
/*
* LOCK operation
*/
@@ -3907,7 +3958,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct file_lock file_lock;
struct file_lock conflock;
__be32 status = 0;
- unsigned int strhashval;
+ bool new_state = false;
int lkflg;
int err;
@@ -3933,10 +3984,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* lock stateid.
*/
struct nfs4_ol_stateid *open_stp = NULL;
-
+
+ if (nfsd4_has_session(cstate))
+ /* See rfc 5661 18.10.3: given clientid is ignored: */
+ memcpy(&lock->v.new.clientid,
+ &cstate->session->se_client->cl_clientid,
+ sizeof(clientid_t));
+
status = nfserr_stale_clientid;
- if (!nfsd4_has_session(cstate) &&
- STALE_CLIENTID(&lock->lk_new_clientid))
+ if (STALE_CLIENTID(&lock->lk_new_clientid))
goto out;
/* validate and update open stateid and open seqid */
@@ -3948,25 +4004,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
open_sop = openowner(open_stp->st_stateowner);
status = nfserr_bad_stateid;
- if (!nfsd4_has_session(cstate) &&
- !same_clid(&open_sop->oo_owner.so_client->cl_clientid,
+ if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
&lock->v.new.clientid))
goto out;
- /* create lockowner and lock stateid */
- fp = open_stp->st_file;
- strhashval = lock_ownerstr_hashval(fp->fi_inode,
- open_sop->oo_owner.so_client->cl_clientid.cl_id,
- &lock->v.new.owner);
- /* XXX: Do we need to check for duplicate stateowners on
- * the same file, or should they just be allowed (and
- * create new stateids)? */
- status = nfserr_jukebox;
- lock_sop = alloc_init_lock_stateowner(strhashval,
- open_sop->oo_owner.so_client, open_stp, lock);
- if (lock_sop == NULL)
- goto out;
- lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp);
- if (lock_stp == NULL)
+ status = lookup_or_create_lock_state(cstate, open_stp, lock,
+ &lock_stp, &new_state);
+ if (status)
goto out;
} else {
/* lock (lock owner + lock stateid) already exists */
@@ -3976,10 +4019,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
NFS4_LOCK_STID, &lock_stp);
if (status)
goto out;
- lock_sop = lockowner(lock_stp->st_stateowner);
- fp = lock_stp->st_file;
}
- /* lock_sop and lock_stp have been created or found */
+ lock_sop = lockowner(lock_stp->st_stateowner);
+ fp = lock_stp->st_file;
lkflg = setlkflg(lock->lk_type);
status = nfs4_check_openmode(lock_stp, lkflg);
@@ -4054,7 +4096,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
break;
}
out:
- if (status && lock->lk_is_new && lock_sop)
+ if (status && new_state)
release_lockowner(lock_sop);
if (!cstate->replay_owner)
nfs4_unlock_state();
@@ -4251,7 +4293,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
struct nfs4_ol_stateid *stp;
struct xdr_netobj *owner = &rlockowner->rl_owner;
struct list_head matches;
- int i;
+ unsigned int hashval = ownerstr_hashval(clid->cl_id, owner);
__be32 status;
dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
@@ -4266,22 +4308,19 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
nfs4_lock_state();
status = nfserr_locks_held;
- /* XXX: we're doing a linear search through all the lockowners.
- * Yipes! For now we'll just hope clients aren't really using
- * release_lockowner much, but eventually we have to fix these
- * data structures. */
INIT_LIST_HEAD(&matches);
- for (i = 0; i < LOCK_HASH_SIZE; i++) {
- list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) {
- if (!same_owner_str(sop, owner, clid))
- continue;
- list_for_each_entry(stp, &sop->so_stateids,
- st_perstateowner) {
- lo = lockowner(sop);
- if (check_for_locks(stp->st_file, lo))
- goto out;
- list_add(&lo->lo_list, &matches);
- }
+
+ list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) {
+ if (sop->so_is_open_owner)
+ continue;
+ if (!same_owner_str(sop, owner, clid))
+ continue;
+ list_for_each_entry(stp, &sop->so_stateids,
+ st_perstateowner) {
+ lo = lockowner(sop);
+ if (check_for_locks(stp->st_file, lo))
+ goto out;
+ list_add(&lo->lo_list, &matches);
}
}
/* Clients probably won't expect us to return with some (but not all)
@@ -4394,16 +4433,127 @@ nfs4_check_open_reclaim(clientid_t *clid)
return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad;
}
+#ifdef CONFIG_NFSD_FAULT_INJECTION
+
+void nfsd_forget_clients(u64 num)
+{
+ struct nfs4_client *clp, *next;
+ int count = 0;
+
+ nfs4_lock_state();
+ list_for_each_entry_safe(clp, next, &client_lru, cl_lru) {
+ nfsd4_remove_clid_dir(clp);
+ expire_client(clp);
+ if (++count == num)
+ break;
+ }
+ nfs4_unlock_state();
+
+ printk(KERN_INFO "NFSD: Forgot %d clients", count);
+}
+
+static void release_lockowner_sop(struct nfs4_stateowner *sop)
+{
+ release_lockowner(lockowner(sop));
+}
+
+static void release_openowner_sop(struct nfs4_stateowner *sop)
+{
+ release_openowner(openowner(sop));
+}
+
+static int nfsd_release_n_owners(u64 num, bool is_open_owner,
+ void (*release_sop)(struct nfs4_stateowner *))
+{
+ int i, count = 0;
+ struct nfs4_stateowner *sop, *next;
+
+ for (i = 0; i < OWNER_HASH_SIZE; i++) {
+ list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) {
+ if (sop->so_is_open_owner != is_open_owner)
+ continue;
+ release_sop(sop);
+ if (++count == num)
+ return count;
+ }
+ }
+ return count;
+}
+
+void nfsd_forget_locks(u64 num)
+{
+ int count;
+
+ nfs4_lock_state();
+ count = nfsd_release_n_owners(num, false, release_lockowner_sop);
+ nfs4_unlock_state();
+
+ printk(KERN_INFO "NFSD: Forgot %d locks", count);
+}
+
+void nfsd_forget_openowners(u64 num)
+{
+ int count;
+
+ nfs4_lock_state();
+ count = nfsd_release_n_owners(num, true, release_openowner_sop);
+ nfs4_unlock_state();
+
+ printk(KERN_INFO "NFSD: Forgot %d open owners", count);
+}
+
+int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegation *))
+{
+ int i, count = 0;
+ struct nfs4_file *fp, *fnext;
+ struct nfs4_delegation *dp, *dnext;
+
+ for (i = 0; i < FILE_HASH_SIZE; i++) {
+ list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) {
+ list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) {
+ deleg_func(dp);
+ if (++count == num)
+ return count;
+ }
+ }
+ }
+
+ return count;
+}
+
+void nfsd_forget_delegations(u64 num)
+{
+ unsigned int count;
+
+ nfs4_lock_state();
+ count = nfsd_process_n_delegations(num, unhash_delegation);
+ nfs4_unlock_state();
+
+ printk(KERN_INFO "NFSD: Forgot %d delegations", count);
+}
+
+void nfsd_recall_delegations(u64 num)
+{
+ unsigned int count;
+
+ nfs4_lock_state();
+ spin_lock(&recall_lock);
+ count = nfsd_process_n_delegations(num, nfsd_break_one_deleg);
+ spin_unlock(&recall_lock);
+ nfs4_unlock_state();
+
+ printk(KERN_INFO "NFSD: Recalled %d delegations", count);
+}
+
+#endif /* CONFIG_NFSD_FAULT_INJECTION */
+
/* initialization to perform at module load time: */
-int
+void
nfs4_state_init(void)
{
- int i, status;
+ int i;
- status = nfsd4_init_slabs();
- if (status)
- return status;
for (i = 0; i < CLIENT_HASH_SIZE; i++) {
INIT_LIST_HEAD(&conf_id_hashtbl[i]);
INIT_LIST_HEAD(&conf_str_hashtbl[i]);
@@ -4416,18 +4566,15 @@ nfs4_state_init(void)
for (i = 0; i < FILE_HASH_SIZE; i++) {
INIT_LIST_HEAD(&file_hashtbl[i]);
}
- for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]);
- }
- for (i = 0; i < LOCK_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]);
+ for (i = 0; i < OWNER_HASH_SIZE; i++) {
+ INIT_LIST_HEAD(&ownerstr_hashtbl[i]);
}
- memset(&onestateid, ~0, sizeof(stateid_t));
+ for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]);
INIT_LIST_HEAD(&close_lru);
INIT_LIST_HEAD(&client_lru);
INIT_LIST_HEAD(&del_recall_lru);
reclaim_str_hashtbl_size = 0;
- return 0;
}
static void
@@ -4526,7 +4673,6 @@ __nfs4_state_shutdown(void)
spin_unlock(&recall_lock);
list_for_each_safe(pos, next, &reaplist) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- list_del_init(&dp->dl_recall_lru);
unhash_delegation(dp);
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b6fa792d6b85..0ec5a1b9700e 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -215,10 +215,9 @@ defer_free(struct nfsd4_compoundargs *argp,
static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
{
if (p == argp->tmp) {
- p = kmalloc(nbytes, GFP_KERNEL);
+ p = kmemdup(argp->tmp, nbytes, GFP_KERNEL);
if (!p)
return NULL;
- memcpy(p, argp->tmp, nbytes);
} else {
BUG_ON(p != argp->tmpp);
argp->tmpp = NULL;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c45a2ea4a090..748eda93ce59 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -18,6 +18,7 @@
#include "idmap.h"
#include "nfsd.h"
#include "cache.h"
+#include "fault_inject.h"
/*
* We have a single directory with several nodes in it.
@@ -272,7 +273,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
* 2. Is that directory a mount point, or
* 3. Is that directory the root of an exported file system?
*/
- error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb);
+ error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb);
path_put(&path);
return error;
@@ -1128,9 +1129,13 @@ static int __init init_nfsd(void)
int retval;
printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
- retval = nfs4_state_init(); /* nfs4 locking state */
+ retval = nfsd4_init_slabs();
if (retval)
return retval;
+ nfs4_state_init();
+ retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+ if (retval)
+ goto out_free_slabs;
nfsd_stat_init(); /* Statistics */
retval = nfsd_reply_cache_init();
if (retval)
@@ -1161,6 +1166,8 @@ out_free_cache:
nfsd_reply_cache_shutdown();
out_free_stat:
nfsd_stat_shutdown();
+ nfsd_fault_inject_cleanup();
+out_free_slabs:
nfsd4_free_slabs();
return retval;
}
@@ -1175,6 +1182,7 @@ static void __exit exit_nfsd(void)
nfsd_lockd_shutdown();
nfsd_idmap_shutdown();
nfsd4_free_slabs();
+ nfsd_fault_inject_cleanup();
unregister_filesystem(&nfsd_fs_type);
}
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 58134a23fdfb..1d1e8589b4ce 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -104,14 +104,16 @@ static inline int nfsd_v4client(struct svc_rqst *rq)
*/
#ifdef CONFIG_NFSD_V4
extern unsigned int max_delegations;
-int nfs4_state_init(void);
+void nfs4_state_init(void);
+int nfsd4_init_slabs(void);
void nfsd4_free_slabs(void);
int nfs4_state_start(void);
void nfs4_state_shutdown(void);
void nfs4_reset_lease(time_t leasetime);
int nfs4_reset_recoverydir(char *recdir);
#else
-static inline int nfs4_state_init(void) { return 0; }
+static inline void nfs4_state_init(void) { }
+static inline int nfsd4_init_slabs(void) { return 0; }
static inline void nfsd4_free_slabs(void) { }
static inline int nfs4_state_start(void) { return 0; }
static inline void nfs4_state_shutdown(void) { }
@@ -338,15 +340,15 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
}
/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
-#define NFSD_WRITEONLY_ATTRS_WORD1 \
-(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+#define NFSD_WRITEONLY_ATTRS_WORD1 \
+ (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */
-#define NFSD_WRITEABLE_ATTRS_WORD0 \
-(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL )
-#define NFSD_WRITEABLE_ATTRS_WORD1 \
-(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
- | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+#define NFSD_WRITEABLE_ATTRS_WORD0 \
+ (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL)
+#define NFSD_WRITEABLE_ATTRS_WORD1 \
+ (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
+ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
#define NFSD_WRITEABLE_ATTRS_WORD2 0
#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index c763de5c1157..68454e75fce9 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -59,7 +59,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
* the write call).
*/
static inline __be32
-nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int requested)
+nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, umode_t requested)
{
mode &= S_IFMT;
@@ -293,7 +293,7 @@ out:
* include/linux/nfsd/nfsd.h.
*/
__be32
-fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
+fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
{
struct svc_export *exp;
struct dentry *dentry;
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index c16f8d8331b5..e5e6707ba687 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -102,7 +102,7 @@ extern char * SVCFH_fmt(struct svc_fh *fhp);
/*
* Function prototypes
*/
-__be32 fh_verify(struct svc_rqst *, struct svc_fh *, int, int);
+__be32 fh_verify(struct svc_rqst *, struct svc_fh *, umode_t, int);
__be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *);
__be32 fh_update(struct svc_fh *);
void fh_put(struct svc_fh *);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index a3cf38476a1b..ffb5df1db94f 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -366,6 +366,7 @@ struct nfs4_openowner {
struct nfs4_lockowner {
struct nfs4_stateowner lo_owner; /* must be first element */
+ struct list_head lo_owner_ino_hash; /* hash by owner,file */
struct list_head lo_perstateid; /* for lockowners only */
struct list_head lo_list; /* for temporary uses */
};
@@ -482,7 +483,7 @@ extern void nfsd4_shutdown_recdir(void);
extern int nfs4_client_to_reclaim(const char *name);
extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
extern void nfsd4_recdir_purge_old(void);
-extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
+extern void nfsd4_create_clid_dir(struct nfs4_client *clp);
extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
extern void release_session_client(struct nfsd4_session *);
extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 7a2e442623c8..edf6d3ed8777 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -307,7 +307,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
struct dentry *dentry;
struct inode *inode;
int accmode = NFSD_MAY_SATTR;
- int ftype = 0;
+ umode_t ftype = 0;
__be32 err;
int host_err;
int size_change = 0;
@@ -594,8 +594,19 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
return error;
}
-#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction."
-#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type"
+/*
+ * NFS junction information is stored in an extended attribute.
+ */
+#define NFSD_JUNCTION_XATTR_NAME XATTR_TRUSTED_PREFIX "junction.nfs"
+
+/**
+ * nfsd4_is_junction - Test if an object could be an NFS junction
+ *
+ * @dentry: object to test
+ *
+ * Returns 1 if "dentry" appears to contain NFS junction information.
+ * Otherwise 0 is returned.
+ */
int nfsd4_is_junction(struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
@@ -606,7 +617,7 @@ int nfsd4_is_junction(struct dentry *dentry)
return 0;
if (!(inode->i_mode & S_ISVTX))
return 0;
- if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0)
+ if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0)
return 0;
return 1;
}
@@ -730,7 +741,7 @@ static int nfsd_open_break_lease(struct inode *inode, int access)
* N.B. After this call fhp needs an fh_put
*/
__be32
-nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
+nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
int access, struct file **filp)
{
struct dentry *dentry;
@@ -1300,7 +1311,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
}
- host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ host_err = fh_want_write(fhp);
if (host_err)
goto out_nfserr;
@@ -1325,7 +1336,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
break;
}
if (host_err < 0) {
- mnt_drop_write(fhp->fh_export->ex_path.mnt);
+ fh_drop_write(fhp);
goto out_nfserr;
}
@@ -1339,7 +1350,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err2 = nfserrno(commit_metadata(fhp));
if (err2)
err = err2;
- mnt_drop_write(fhp->fh_export->ex_path.mnt);
+ fh_drop_write(fhp);
/*
* Update the file handle to get the new inode info.
*/
@@ -1430,7 +1441,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
v_atime = verifier[1]&0x7fffffff;
}
- host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ host_err = fh_want_write(fhp);
if (host_err)
goto out_nfserr;
if (dchild->d_inode) {
@@ -1469,13 +1480,13 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
case NFS3_CREATE_GUARDED:
err = nfserr_exist;
}
- mnt_drop_write(fhp->fh_export->ex_path.mnt);
+ fh_drop_write(fhp);
goto out;
}
host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
if (host_err < 0) {
- mnt_drop_write(fhp->fh_export->ex_path.mnt);
+ fh_drop_write(fhp);
goto out_nfserr;
}
if (created)
@@ -1503,7 +1514,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (!err)
err = nfserrno(commit_metadata(fhp));
- mnt_drop_write(fhp->fh_export->ex_path.mnt);
+ fh_drop_write(fhp);
/*
* Update the filehandle to get the new inode info.
*/
@@ -1600,7 +1611,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (IS_ERR(dnew))
goto out_nfserr;
- host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ host_err = fh_want_write(fhp);
if (host_err)
goto out_nfserr;
@@ -1621,7 +1632,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfserrno(commit_metadata(fhp));
fh_unlock(fhp);
- mnt_drop_write(fhp->fh_export->ex_path.mnt);
+ fh_drop_write(fhp);
cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
dput(dnew);
@@ -1674,7 +1685,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
dold = tfhp->fh_dentry;
- host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt);
+ host_err = fh_want_write(tfhp);
if (host_err) {
err = nfserrno(host_err);
goto out_dput;
@@ -1699,7 +1710,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
err = nfserrno(host_err);
}
out_drop_write:
- mnt_drop_write(tfhp->fh_export->ex_path.mnt);
+ fh_drop_write(tfhp);
out_dput:
dput(dnew);
out_unlock:
@@ -1776,7 +1787,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
host_err = -EXDEV;
if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
goto out_dput_new;
- host_err = mnt_want_write(ffhp->fh_export->ex_path.mnt);
+ host_err = fh_want_write(ffhp);
if (host_err)
goto out_dput_new;
@@ -1795,7 +1806,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
host_err = commit_metadata(ffhp);
}
out_drop_write:
- mnt_drop_write(ffhp->fh_export->ex_path.mnt);
+ fh_drop_write(ffhp);
out_dput_new:
dput(ndentry);
out_dput_old:
@@ -1854,7 +1865,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
if (!type)
type = rdentry->d_inode->i_mode & S_IFMT;
- host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ host_err = fh_want_write(fhp);
if (host_err)
goto out_put;
@@ -1868,7 +1879,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
if (!host_err)
host_err = commit_metadata(fhp);
out_drop_write:
- mnt_drop_write(fhp->fh_export->ex_path.mnt);
+ fh_drop_write(fhp);
out_put:
dput(rdentry);
@@ -2270,7 +2281,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
} else
size = 0;
- error = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ error = fh_want_write(fhp);
if (error)
goto getout;
if (size)
@@ -2284,7 +2295,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
error = 0;
}
}
- mnt_drop_write(fhp->fh_export->ex_path.mnt);
+ fh_drop_write(fhp);
getout:
kfree(value);
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 3f54ad03bb2b..1dcd238e11a0 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -66,7 +66,7 @@ __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
loff_t, unsigned long);
#endif /* CONFIG_NFSD_V3 */
-__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int,
+__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
int, struct file **);
void nfsd_close(struct file *);
__be32 nfsd_read(struct svc_rqst *, struct svc_fh *,
@@ -106,4 +106,14 @@ struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
#endif
+static inline int fh_want_write(struct svc_fh *fh)
+{
+ return mnt_want_write(fh->fh_export->ex_path.mnt);
+}
+
+static inline void fh_drop_write(struct svc_fh *fh)
+{
+ mnt_drop_write(fh->fh_export->ex_path.mnt);
+}
+
#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 3a1923943b14..ca35b3a46d17 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -251,7 +251,7 @@ nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {
static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
{
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index b50ffb72e5b3..8f7b95ac1f7e 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -291,7 +291,7 @@ const struct address_space_operations nilfs_aops = {
.is_partially_uptodate = block_is_partially_uptodate,
};
-struct inode *nilfs_new_inode(struct inode *dir, int mode)
+struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct the_nilfs *nilfs = sb->s_fs_info;
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index ac258beeda3c..2a70fce70c65 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -27,7 +27,7 @@
#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
#include <linux/vmalloc.h>
#include <linux/compat.h> /* compat_ptr() */
-#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */
+#include <linux/mount.h> /* mnt_want_write_file(), mnt_drop_write_file() */
#include <linux/buffer_head.h>
#include <linux/nilfs2_fs.h>
#include "nilfs.h"
@@ -119,7 +119,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
if (get_user(flags, (int __user *)argp))
return -EFAULT;
- ret = mnt_want_write(filp->f_path.mnt);
+ ret = mnt_want_write_file(filp);
if (ret)
return ret;
@@ -154,7 +154,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
ret = nilfs_transaction_commit(inode->i_sb);
out:
mutex_unlock(&inode->i_mutex);
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return ret;
}
@@ -174,7 +174,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write(filp->f_path.mnt);
+ ret = mnt_want_write_file(filp);
if (ret)
return ret;
@@ -194,7 +194,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
up_read(&inode->i_sb->s_umount);
out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return ret;
}
@@ -210,7 +210,7 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write(filp->f_path.mnt);
+ ret = mnt_want_write_file(filp);
if (ret)
return ret;
@@ -225,7 +225,7 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
else
nilfs_transaction_commit(inode->i_sb); /* never fails */
out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return ret;
}
@@ -591,7 +591,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write(filp->f_path.mnt);
+ ret = mnt_want_write_file(filp);
if (ret)
return ret;
@@ -603,6 +603,8 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
nsegs = argv[4].v_nmembs;
if (argv[4].v_size != argsz[4])
goto out;
+ if (nsegs > UINT_MAX / sizeof(__u64))
+ goto out;
/*
* argv[4] points to segment numbers this ioctl cleans. We
@@ -675,7 +677,7 @@ out_free:
vfree(kbufs[n]);
kfree(kbufs[4]);
out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return ret;
}
@@ -710,7 +712,7 @@ static int nilfs_ioctl_resize(struct inode *inode, struct file *filp,
if (!capable(CAP_SYS_ADMIN))
goto out;
- ret = mnt_want_write(filp->f_path.mnt);
+ ret = mnt_want_write_file(filp);
if (ret)
goto out;
@@ -721,7 +723,7 @@ static int nilfs_ioctl_resize(struct inode *inode, struct file *filp,
ret = nilfs_resize_fs(inode->i_sb, newsize);
out_drop_write:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
out:
return ret;
}
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 768982de10e4..1cd3f624dffc 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -84,7 +84,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
* If the create succeeds, we fill in the inode information
* with d_instantiate().
*/
-static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode,
+static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
struct inode *inode;
@@ -112,7 +112,7 @@ static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode,
}
static int
-nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
+nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct inode *inode;
struct nilfs_transaction_info ti;
@@ -213,7 +213,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
return err;
}
-static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct nilfs_transaction_info ti;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 3777d138f895..250add84da76 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -246,7 +246,7 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
/* inode.c */
void nilfs_inode_add_blocks(struct inode *inode, int n);
void nilfs_inode_sub_blocks(struct inode *inode, int n);
-extern struct inode *nilfs_new_inode(struct inode *, int);
+extern struct inode *nilfs_new_inode(struct inode *, umode_t);
extern void nilfs_free_inode(struct inode *);
extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern void nilfs_set_inode_flags(struct inode *);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index bb24ab6c282f..0e72ad6f22aa 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2470,7 +2470,7 @@ static int nilfs_segctor_thread(void *arg)
if (freezing(current)) {
spin_unlock(&sci->sc_state_lock);
- refrigerator();
+ try_to_freeze();
spin_lock(&sci->sc_state_lock);
} else {
DEFINE_WAIT(wait);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8351c44a7320..08e3d4f9df18 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -175,8 +175,6 @@ static void nilfs_i_callback(struct rcu_head *head)
struct inode *inode = container_of(head, struct inode, i_rcu);
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
- INIT_LIST_HEAD(&inode->i_dentry);
-
if (mdi) {
kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
kfree(mdi);
@@ -650,11 +648,11 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int nilfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
- struct super_block *sb = vfs->mnt_sb;
+ struct super_block *sb = dentry->d_sb;
struct the_nilfs *nilfs = sb->s_fs_info;
- struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
+ struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root;
if (!nilfs_test_opt(nilfs, BARRIER))
seq_puts(seq, ",nobarrier");
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index d32714094375..501b7f8b739f 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -409,6 +409,12 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block);
nilfs->ns_r_segments_percentage =
le32_to_cpu(sbp->s_r_segments_percentage);
+ if (nilfs->ns_r_segments_percentage < 1 ||
+ nilfs->ns_r_segments_percentage > 99) {
+ printk(KERN_ERR "NILFS: invalid reserved segments percentage.\n");
+ return -EINVAL;
+ }
+
nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments));
nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
return 0;
@@ -515,6 +521,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
brelse(sbh[1]);
sbh[1] = NULL;
sbp[1] = NULL;
+ valid[1] = 0;
swp = 0;
}
if (!valid[swp]) {
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 44a88a9fa2c8..fea6bd5831dc 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -52,7 +52,7 @@ static const struct utf8_table utf8_table[] =
#define SURROGATE_LOW 0x00000400
#define SURROGATE_BITS 0x000003ff
-int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
+int utf8_to_utf32(const u8 *s, int inlen, unicode_t *pu)
{
unsigned long l;
int c0, c, nc;
@@ -71,7 +71,7 @@ int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
*pu = (unicode_t) l;
return nc;
}
- if (len <= nc)
+ if (inlen <= nc)
return -1;
s++;
c = (*s ^ 0x80) & 0xFF;
@@ -83,7 +83,7 @@ int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
}
EXPORT_SYMBOL(utf8_to_utf32);
-int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
+int utf32_to_utf8(unicode_t u, u8 *s, int maxout)
{
unsigned long l;
int c, nc;
@@ -97,7 +97,7 @@ int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
return -1;
nc = 0;
- for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) {
+ for (t = utf8_table; t->cmask && maxout; t++, maxout--) {
nc++;
if (l <= t->lmask) {
c = t->shift;
@@ -114,34 +114,57 @@ int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
}
EXPORT_SYMBOL(utf32_to_utf8);
-int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
+static inline void put_utf16(wchar_t *s, unsigned c, enum utf16_endian endian)
+{
+ switch (endian) {
+ default:
+ *s = (wchar_t) c;
+ break;
+ case UTF16_LITTLE_ENDIAN:
+ *s = __cpu_to_le16(c);
+ break;
+ case UTF16_BIG_ENDIAN:
+ *s = __cpu_to_be16(c);
+ break;
+ }
+}
+
+int utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian,
+ wchar_t *pwcs, int maxout)
{
u16 *op;
int size;
unicode_t u;
op = pwcs;
- while (*s && len > 0) {
+ while (inlen > 0 && maxout > 0 && *s) {
if (*s & 0x80) {
- size = utf8_to_utf32(s, len, &u);
+ size = utf8_to_utf32(s, inlen, &u);
if (size < 0)
return -EINVAL;
+ s += size;
+ inlen -= size;
if (u >= PLANE_SIZE) {
+ if (maxout < 2)
+ break;
u -= PLANE_SIZE;
- *op++ = (wchar_t) (SURROGATE_PAIR |
- ((u >> 10) & SURROGATE_BITS));
- *op++ = (wchar_t) (SURROGATE_PAIR |
+ put_utf16(op++, SURROGATE_PAIR |
+ ((u >> 10) & SURROGATE_BITS),
+ endian);
+ put_utf16(op++, SURROGATE_PAIR |
SURROGATE_LOW |
- (u & SURROGATE_BITS));
+ (u & SURROGATE_BITS),
+ endian);
+ maxout -= 2;
} else {
- *op++ = (wchar_t) u;
+ put_utf16(op++, u, endian);
+ maxout--;
}
- s += size;
- len -= size;
} else {
- *op++ = *s++;
- len--;
+ put_utf16(op++, *s++, endian);
+ inlen--;
+ maxout--;
}
}
return op - pwcs;
@@ -160,27 +183,27 @@ static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
}
}
-int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
- u8 *s, int maxlen)
+int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian,
+ u8 *s, int maxout)
{
u8 *op;
int size;
unsigned long u, v;
op = s;
- while (len > 0 && maxlen > 0) {
+ while (inlen > 0 && maxout > 0) {
u = get_utf16(*pwcs, endian);
if (!u)
break;
pwcs++;
- len--;
+ inlen--;
if (u > 0x7f) {
if ((u & SURROGATE_MASK) == SURROGATE_PAIR) {
if (u & SURROGATE_LOW) {
/* Ignore character and move on */
continue;
}
- if (len <= 0)
+ if (inlen <= 0)
break;
v = get_utf16(*pwcs, endian);
if ((v & SURROGATE_MASK) != SURROGATE_PAIR ||
@@ -191,18 +214,18 @@ int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10)
+ (v & SURROGATE_BITS);
pwcs++;
- len--;
+ inlen--;
}
- size = utf32_to_utf8(u, op, maxlen);
+ size = utf32_to_utf8(u, op, maxout);
if (size == -1) {
/* Ignore character and move on */
} else {
op += size;
- maxlen -= size;
+ maxout -= size;
}
} else {
*op++ = (u8) u;
- maxlen--;
+ maxout--;
}
}
return op - s;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9fde1c00a296..3568c8a8b138 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -16,6 +16,8 @@
#include <asm/ioctls.h>
+#include "../../mount.h"
+
#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
#define FANOTIFY_DEFAULT_MAX_MARKS 8192
#define FANOTIFY_DEFAULT_MAX_LISTENERS 128
@@ -546,7 +548,7 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
fsnotify_put_mark(fsn_mark);
- if (removed & mnt->mnt_fsnotify_mask)
+ if (removed & real_mount(mnt)->mnt_fsnotify_mask)
fsnotify_recalc_vfsmount_mask(mnt);
return 0;
@@ -623,7 +625,7 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
}
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
- if (added & ~mnt->mnt_fsnotify_mask)
+ if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
fsnotify_recalc_vfsmount_mask(mnt);
err:
fsnotify_put_mark(fsn_mark);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 79b47cbb5cd8..ccb14d3fc0de 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -26,6 +26,7 @@
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
+#include "../mount.h"
/*
* Clear all of the marks on an inode when it is being evicted from core
@@ -205,13 +206,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
struct fsnotify_group *inode_group, *vfsmount_group;
struct fsnotify_event *event = NULL;
- struct vfsmount *mnt;
+ struct mount *mnt;
int idx, ret = 0;
/* global tests shouldn't care about events on child only the specific event */
__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
if (data_is == FSNOTIFY_EVENT_PATH)
- mnt = ((struct path *)data)->mnt;
+ mnt = real_mount(((struct path *)data)->mnt);
else
mnt = NULL;
@@ -262,11 +263,11 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
/* we didn't use the vfsmount_mark */
vfsmount_group = NULL;
} else if (vfsmount_group > inode_group) {
- ret = send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
+ ret = send_to_group(to_tell, &mnt->mnt, NULL, vfsmount_mark, mask, data,
data_is, cookie, file_name, &event);
inode_group = NULL;
} else {
- ret = send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
+ ret = send_to_group(to_tell, &mnt->mnt, inode_mark, vfsmount_mark,
mask, data, data_is, cookie, file_name,
&event);
}
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index e14587d55689..f104d565b682 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -135,9 +135,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
- /* 1 from caller and 1 for being on i_list/g_list */
- BUG_ON(atomic_read(&mark->refcnt) < 2);
-
spin_lock(&group->mark_lock);
if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
@@ -182,6 +179,11 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
iput(inode);
/*
+ * We don't necessarily have a ref on mark from caller so the above iput
+ * may have already destroyed it. Don't touch from now on.
+ */
+
+ /*
* it's possible that this group tried to destroy itself, but this
* this mark was simultaneously being freed by inode. If that's the
* case, we finish freeing the group here.
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 778fe6cae3b0..b7b4b0e8554f 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -28,15 +28,17 @@
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
+#include "../mount.h"
void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
{
struct fsnotify_mark *mark, *lmark;
struct hlist_node *pos, *n;
+ struct mount *m = real_mount(mnt);
LIST_HEAD(free_list);
spin_lock(&mnt->mnt_root->d_lock);
- hlist_for_each_entry_safe(mark, pos, n, &mnt->mnt_fsnotify_marks, m.m_list) {
+ hlist_for_each_entry_safe(mark, pos, n, &m->mnt_fsnotify_marks, m.m_list) {
list_add(&mark->m.free_m_list, &free_list);
hlist_del_init_rcu(&mark->m.m_list);
fsnotify_get_mark(mark);
@@ -59,15 +61,16 @@ void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
*/
static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt)
{
+ struct mount *m = real_mount(mnt);
struct fsnotify_mark *mark;
struct hlist_node *pos;
__u32 new_mask = 0;
assert_spin_locked(&mnt->mnt_root->d_lock);
- hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list)
+ hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list)
new_mask |= mark->mask;
- mnt->mnt_fsnotify_mask = new_mask;
+ m->mnt_fsnotify_mask = new_mask;
}
/*
@@ -101,12 +104,13 @@ void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group,
struct vfsmount *mnt)
{
+ struct mount *m = real_mount(mnt);
struct fsnotify_mark *mark;
struct hlist_node *pos;
assert_spin_locked(&mnt->mnt_root->d_lock);
- hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) {
+ hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list) {
if (mark->group == group) {
fsnotify_get_mark(mark);
return mark;
@@ -140,6 +144,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group, struct vfsmount *mnt,
int allow_dups)
{
+ struct mount *m = real_mount(mnt);
struct fsnotify_mark *lmark;
struct hlist_node *node, *last = NULL;
int ret = 0;
@@ -154,13 +159,13 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
mark->m.mnt = mnt;
/* is mark the first mark? */
- if (hlist_empty(&mnt->mnt_fsnotify_marks)) {
- hlist_add_head_rcu(&mark->m.m_list, &mnt->mnt_fsnotify_marks);
+ if (hlist_empty(&m->mnt_fsnotify_marks)) {
+ hlist_add_head_rcu(&mark->m.m_list, &m->mnt_fsnotify_marks);
goto out;
}
/* should mark be in the middle of the current list? */
- hlist_for_each_entry(lmark, node, &mnt->mnt_fsnotify_marks, m.m_list) {
+ hlist_for_each_entry(lmark, node, &m->mnt_fsnotify_marks, m.m_list) {
last = node;
if ((lmark->group == group) && !allow_dups) {
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index f14fde2b03d6..e0281992ddc3 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1,7 +1,7 @@
/**
* attrib.c - NTFS attribute operations. Part of the Linux-NTFS project.
*
- * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
* Copyright (c) 2002 Richard Russon
*
* This program/include file is free software; you can redistribute it and/or
@@ -345,10 +345,10 @@ LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
unsigned long flags;
bool is_retry = false;
+ BUG_ON(!ni);
ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
ni->mft_no, (unsigned long long)vcn,
write_locked ? "write" : "read");
- BUG_ON(!ni);
BUG_ON(!NInoNonResident(ni));
BUG_ON(vcn < 0);
if (!ni->runlist.rl) {
@@ -469,9 +469,9 @@ runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
int err = 0;
bool is_retry = false;
+ BUG_ON(!ni);
ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.",
ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out");
- BUG_ON(!ni);
BUG_ON(!NInoNonResident(ni));
BUG_ON(vcn < 0);
if (!ni->runlist.rl) {
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 97e2dacbc867..2eaa66652944 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -335,7 +335,6 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
static void ntfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
}
@@ -2301,16 +2300,16 @@ void ntfs_evict_big_inode(struct inode *vi)
/**
* ntfs_show_options - show mount options in /proc/mounts
* @sf: seq_file in which to write our mount options
- * @mnt: vfs mount whose mount options to display
+ * @root: root of the mounted tree whose mount options to display
*
* Called by the VFS once for each mounted ntfs volume when someone reads
* /proc/mounts in order to display the NTFS specific mount options of each
- * mount. The mount options of the vfs mount @mnt are written to the seq file
+ * mount. The mount options of fs specified by @root are written to the seq file
* @sf and success is returned.
*/
-int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
+int ntfs_show_options(struct seq_file *sf, struct dentry *root)
{
- ntfs_volume *vol = NTFS_SB(mnt->mnt_sb);
+ ntfs_volume *vol = NTFS_SB(root->d_sb);
int i;
seq_printf(sf, ",uid=%i", vol->uid);
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index fe8e7e928889..db29695f845c 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -298,7 +298,7 @@ extern void ntfs_clear_extent_inode(ntfs_inode *ni);
extern int ntfs_read_inode_mount(struct inode *vi);
-extern int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt);
+extern int ntfs_show_options(struct seq_file *sf, struct dentry *root);
#ifdef NTFS_RW
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 382857f9c7db..3014a36a255b 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
/**
* mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
*
- * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
* Copyright (c) 2002 Richard Russon
*
* This program/include file is free software; you can redistribute it and/or
@@ -1367,7 +1367,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
ntfs_error(vol->sb, "Failed to merge runlists for mft "
"bitmap.");
if (ntfs_cluster_free_from_rl(vol, rl2)) {
- ntfs_error(vol->sb, "Failed to dealocate "
+ ntfs_error(vol->sb, "Failed to deallocate "
"allocated cluster.%s", es);
NVolSetErrors(vol);
}
@@ -1805,7 +1805,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
ntfs_error(vol->sb, "Failed to merge runlists for mft data "
"attribute.");
if (ntfs_cluster_free_from_rl(vol, rl2)) {
- ntfs_error(vol->sb, "Failed to dealocate clusters "
+ ntfs_error(vol->sb, "Failed to deallocate clusters "
"from the mft data attribute.%s", es);
NVolSetErrors(vol);
}
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index b52706da4645..f907611cca73 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1,7 +1,7 @@
/*
* super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
*
- * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
* Copyright (c) 2001,2002 Richard Russon
*
* This program/include file is free software; you can redistribute it and/or
@@ -104,7 +104,7 @@ static bool parse_options(ntfs_volume *vol, char *opt)
int errors = 0, sloppy = 0;
uid_t uid = (uid_t)-1;
gid_t gid = (gid_t)-1;
- mode_t fmask = (mode_t)-1, dmask = (mode_t)-1;
+ umode_t fmask = (umode_t)-1, dmask = (umode_t)-1;
int mft_zone_multiplier = -1, on_errors = -1;
int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1;
struct nls_table *nls_map = NULL, *old_nls;
@@ -287,9 +287,9 @@ no_mount_options:
vol->uid = uid;
if (gid != (gid_t)-1)
vol->gid = gid;
- if (fmask != (mode_t)-1)
+ if (fmask != (umode_t)-1)
vol->fmask = fmask;
- if (dmask != (mode_t)-1)
+ if (dmask != (umode_t)-1)
vol->dmask = dmask;
if (show_sys_files != -1) {
if (show_sys_files)
@@ -1239,7 +1239,6 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
{
MFT_REF mref;
struct inode *vi;
- ntfs_inode *ni;
struct page *page;
u32 *kaddr, *kend;
ntfs_name *name = NULL;
@@ -1290,7 +1289,6 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
"is not the system volume.", i_size_read(vi));
goto iput_out;
}
- ni = NTFS_I(vi);
page = ntfs_map_page(vi->i_mapping, 0);
if (IS_ERR(page)) {
ntfs_error(vol->sb, "Failed to read from hiberfil.sys.");
@@ -3198,7 +3196,7 @@ MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparm
MODULE_VERSION(NTFS_VERSION);
MODULE_LICENSE("GPL");
#ifdef DEBUG
-module_param(debug_msgs, bool, 0);
+module_param(debug_msgs, bint, 0);
MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
#endif
diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h
index 406ab55dfb32..15e3ba8d521a 100644
--- a/fs/ntfs/volume.h
+++ b/fs/ntfs/volume.h
@@ -48,8 +48,8 @@ typedef struct {
unsigned long flags; /* Miscellaneous flags, see below. */
uid_t uid; /* uid that files will be mounted as. */
gid_t gid; /* gid that files will be mounted as. */
- mode_t fmask; /* The mask for file permissions. */
- mode_t dmask; /* The mask for directory
+ umode_t fmask; /* The mask for file permissions. */
+ umode_t dmask; /* The mask for directory
permissions. */
u8 mft_zone_multiplier; /* Initial mft zone multiplier. */
u8 on_errors; /* What to do on filesystem errors. */
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index dc45deb19e68..73ba81928bce 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -553,7 +553,7 @@ void o2net_debugfs_exit(void)
int o2net_debugfs_init(void)
{
- mode_t mode = S_IFREG|S_IRUSR;
+ umode_t mode = S_IFREG|S_IRUSR;
o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
if (o2net_dentry)
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b42076797049..abfac0d7ae9c 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -354,7 +354,6 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
static void dlmfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
}
@@ -401,16 +400,14 @@ static struct backing_dev_info dlmfs_backing_dev_info = {
static struct inode *dlmfs_get_root_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
- int mode = S_IFDIR | 0755;
+ umode_t mode = S_IFDIR | 0755;
struct dlmfs_inode_private *ip;
if (inode) {
ip = DLMFS_I(inode);
inode->i_ino = get_next_ino();
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
+ inode_init_owner(inode, NULL, mode);
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inc_nlink(inode);
@@ -424,7 +421,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
static struct inode *dlmfs_get_inode(struct inode *parent,
struct dentry *dentry,
- int mode)
+ umode_t mode)
{
struct super_block *sb = parent->i_sb;
struct inode * inode = new_inode(sb);
@@ -434,9 +431,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
return NULL;
inode->i_ino = get_next_ino();
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
+ inode_init_owner(inode, parent, mode);
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -473,13 +468,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
inc_nlink(inode);
break;
}
-
- if (parent->i_mode & S_ISGID) {
- inode->i_gid = parent->i_gid;
- if (S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
- }
-
return inode;
}
@@ -489,7 +477,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
/* SMP-safe */
static int dlmfs_mkdir(struct inode * dir,
struct dentry * dentry,
- int mode)
+ umode_t mode)
{
int status;
struct inode *inode = NULL;
@@ -537,7 +525,7 @@ bail:
static int dlmfs_create(struct inode *dir,
struct dentry *dentry,
- int mode,
+ umode_t mode,
struct nameidata *nd)
{
int status = 0;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6e396683c3d4..061591a3ab08 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2128,7 +2128,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
* remove_suid() calls ->setattr without any hint that
* we may have already done our cluster locking. Since
* ocfs2_setattr() *must* take cluster locks to
- * proceeed, this will lead us to recursively lock the
+ * proceed, this will lead us to recursively lock the
* inode. There's also the dinode i_size state which
* can be lost via setattr during extending writes (we
* set inode->i_size at the end of a write. */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 726ff265b296..a6fda3c188aa 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -906,12 +906,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- status = mnt_want_write(filp->f_path.mnt);
+ status = mnt_want_write_file(filp);
if (status)
return status;
status = ocfs2_set_inode_attr(inode, flags,
OCFS2_FL_MODIFIABLE);
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return status;
case OCFS2_IOC_RESVSP:
case OCFS2_IOC_RESVSP64:
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 184c76b8c293..b1e3fce72ea4 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -1059,7 +1059,7 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
struct ocfs2_move_extents range;
struct ocfs2_move_extents_context *context = NULL;
- status = mnt_want_write(filp->f_path.mnt);
+ status = mnt_want_write_file(filp);
if (status)
return status;
@@ -1145,7 +1145,7 @@ out:
kfree(context);
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return status;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index a8b2bfea574e..a9856e3eaaf0 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -185,7 +185,7 @@ bail:
return ret;
}
-static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
+static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
{
struct inode *inode;
@@ -207,7 +207,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
static int ocfs2_mknod(struct inode *dir,
struct dentry *dentry,
- int mode,
+ umode_t mode,
dev_t dev)
{
int status = 0;
@@ -602,7 +602,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
static int ocfs2_mkdir(struct inode *dir,
struct dentry *dentry,
- int mode)
+ umode_t mode)
{
int ret;
@@ -617,7 +617,7 @@ static int ocfs2_mkdir(struct inode *dir,
static int ocfs2_create(struct inode *dir,
struct dentry *dentry,
- int mode,
+ umode_t mode,
struct nameidata *nd)
{
int ret;
@@ -1053,7 +1053,7 @@ static int ocfs2_rename(struct inode *old_dir,
handle_t *handle = NULL;
struct buffer_head *old_dir_bh = NULL;
struct buffer_head *new_dir_bh = NULL;
- nlink_t old_dir_nlink = old_dir->i_nlink;
+ u32 old_dir_nlink = old_dir->i_nlink;
struct ocfs2_dinode *old_di;
struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index a5ebe421195f..286edf1e231f 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -827,8 +827,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
goto out;
}
- rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name),
- &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN);
+ rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN,
+ NULL, NULL, NULL, &fsdlm);
if (rc) {
ocfs2_live_connection_drop(control);
goto out;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4994f8b0e604..604e12c4e979 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -108,7 +108,7 @@ static int ocfs2_parse_options(struct super_block *sb, char *options,
int is_remount);
static int ocfs2_check_set_options(struct super_block *sb,
struct mount_options *options);
-static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt);
+static int ocfs2_show_options(struct seq_file *s, struct dentry *root);
static void ocfs2_put_super(struct super_block *sb);
static int ocfs2_mount_volume(struct super_block *sb);
static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -569,7 +569,6 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
static void ocfs2_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
}
@@ -1534,9 +1533,9 @@ bail:
return status;
}
-static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
{
- struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
+ struct ocfs2_super *osb = OCFS2_SB(root->d_sb);
unsigned long opts = osb->s_mount_opt;
unsigned int local_alloc_megs;
@@ -1568,8 +1567,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (osb->preferred_slot != OCFS2_INVALID_SLOT)
seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
- if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME))
- seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
+ seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
if (osb->osb_commit_interval)
seq_printf(s, ",commit=%u",
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index aa9e8777b09a..0ba9ea1e7961 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -623,7 +623,7 @@ int ocfs2_calc_security_init(struct inode *dir,
int ocfs2_calc_xattr_init(struct inode *dir,
struct buffer_head *dir_bh,
- int mode,
+ umode_t mode,
struct ocfs2_security_xattr_info *si,
int *want_clusters,
int *xattr_credits,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index d63cfb72316b..e5c7f15465b4 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *,
struct ocfs2_security_xattr_info *,
int *, int *, struct ocfs2_alloc_context **);
int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
- int, struct ocfs2_security_xattr_info *,
+ umode_t, struct ocfs2_security_xattr_info *,
int *, int *, int *);
/*
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 98e544274390..f00576ec320f 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -255,7 +255,7 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry)
return 0;
}
-static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode)
+static int omfs_add_node(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int err;
struct inode *inode = omfs_new_inode(dir, mode);
@@ -279,12 +279,12 @@ out_free_inode:
return err;
}
-static int omfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int omfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return omfs_add_node(dir, dentry, mode | S_IFDIR);
}
-static int omfs_create(struct inode *dir, struct dentry *dentry, int mode,
+static int omfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
return omfs_add_node(dir, dentry, mode | S_IFREG);
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index e043c4cb9a97..6065bb0ba207 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -28,7 +28,7 @@ struct buffer_head *omfs_bread(struct super_block *sb, sector_t block)
return sb_bread(sb, clus_to_blk(sbi, block));
}
-struct inode *omfs_new_inode(struct inode *dir, int mode)
+struct inode *omfs_new_inode(struct inode *dir, umode_t mode)
{
struct inode *inode;
u64 new_block;
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index 7d414fef501a..8941f12c6b01 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -60,7 +60,7 @@ extern int omfs_shrink_inode(struct inode *inode);
/* inode.c */
extern struct buffer_head *omfs_bread(struct super_block *sb, sector_t block);
extern struct inode *omfs_iget(struct super_block *sb, ino_t inode);
-extern struct inode *omfs_new_inode(struct inode *dir, int mode);
+extern struct inode *omfs_new_inode(struct inode *dir, umode_t mode);
extern int omfs_reserve_block(struct super_block *sb, sector_t block);
extern int omfs_find_empty_block(struct super_block *sb, int mode, ino_t *ino);
extern int omfs_sync_inode(struct inode *inode);
diff --git a/fs/open.c b/fs/open.c
index 22c41b543f2d..77becc041149 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -456,7 +456,7 @@ static int chmod_common(struct path *path, umode_t mode)
if (error)
return error;
mutex_lock(&inode->i_mutex);
- error = security_path_chmod(path->dentry, path->mnt, mode);
+ error = security_path_chmod(path, mode);
if (error)
goto out_unlock;
newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
@@ -468,7 +468,7 @@ out_unlock:
return error;
}
-SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
+SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
{
struct file * file;
int err = -EBADF;
@@ -482,7 +482,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
return err;
}
-SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
+SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode)
{
struct path path;
int error;
@@ -495,7 +495,7 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
return error;
}
-SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode)
+SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
{
return sys_fchmodat(AT_FDCWD, filename, mode);
}
@@ -608,7 +608,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
dentry = file->f_path.dentry;
audit_inode(NULL, dentry);
error = chown_common(&file->f_path, user, group);
- mnt_drop_write(file->f_path.mnt);
+ mnt_drop_write_file(file);
out_fput:
fput(file);
out:
@@ -877,7 +877,7 @@ void fd_install(unsigned int fd, struct file *file)
EXPORT_SYMBOL(fd_install);
-static inline int build_open_flags(int flags, int mode, struct open_flags *op)
+static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
{
int lookup_flags = 0;
int acc_mode;
@@ -948,7 +948,7 @@ static inline int build_open_flags(int flags, int mode, struct open_flags *op)
* have to. But in generally you should not do this, so please move
* along, nothing to see here..
*/
-struct file *filp_open(const char *filename, int flags, int mode)
+struct file *filp_open(const char *filename, int flags, umode_t mode)
{
struct open_flags op;
int lookup = build_open_flags(flags, mode, &op);
@@ -970,7 +970,7 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
}
EXPORT_SYMBOL(file_open_root);
-long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
+long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
struct open_flags op;
int lookup = build_open_flags(flags, mode, &op);
@@ -994,7 +994,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
return fd;
}
-SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
+SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
long ret;
@@ -1008,7 +1008,7 @@ SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
}
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
- int, mode)
+ umode_t, mode)
{
long ret;
@@ -1027,7 +1027,7 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
* For backward compatibility? Maybe this should be moved
* into arch/i386 instead?
*/
-SYSCALL_DEFINE2(creat, const char __user *, pathname, int, mode)
+SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
{
return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode);
}
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index e4e0ff7962e2..a88c03bc749d 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -346,7 +346,6 @@ static struct inode *openprom_alloc_inode(struct super_block *sb)
static void openprom_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(op_inode_cachep, OP_I(inode));
}
diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig
deleted file mode 100644
index cb5f0a3f1b03..000000000000
--- a/fs/partitions/Kconfig
+++ /dev/null
@@ -1,251 +0,0 @@
-#
-# Partition configuration
-#
-config PARTITION_ADVANCED
- bool "Advanced partition selection"
- help
- Say Y here if you would like to use hard disks under Linux which
- were partitioned under an operating system running on a different
- architecture than your Linux system.
-
- Note that the answer to this question won't directly affect the
- kernel: saying N will just cause the configurator to skip all
- the questions about foreign partitioning schemes.
-
- If unsure, say N.
-
-config ACORN_PARTITION
- bool "Acorn partition support" if PARTITION_ADVANCED
- default y if ARCH_ACORN
- help
- Support hard disks partitioned under Acorn operating systems.
-
-config ACORN_PARTITION_CUMANA
- bool "Cumana partition support" if PARTITION_ADVANCED
- default y if ARCH_ACORN
- depends on ACORN_PARTITION
- help
- Say Y here if you would like to use hard disks under Linux which
- were partitioned using the Cumana interface on Acorn machines.
-
-config ACORN_PARTITION_EESOX
- bool "EESOX partition support" if PARTITION_ADVANCED
- default y if ARCH_ACORN
- depends on ACORN_PARTITION
-
-config ACORN_PARTITION_ICS
- bool "ICS partition support" if PARTITION_ADVANCED
- default y if ARCH_ACORN
- depends on ACORN_PARTITION
- help
- Say Y here if you would like to use hard disks under Linux which
- were partitioned using the ICS interface on Acorn machines.
-
-config ACORN_PARTITION_ADFS
- bool "Native filecore partition support" if PARTITION_ADVANCED
- default y if ARCH_ACORN
- depends on ACORN_PARTITION
- help
- The Acorn Disc Filing System is the standard file system of the
- RiscOS operating system which runs on Acorn's ARM-based Risc PC
- systems and the Acorn Archimedes range of machines. If you say
- `Y' here, Linux will support disk partitions created under ADFS.
-
-config ACORN_PARTITION_POWERTEC
- bool "PowerTec partition support" if PARTITION_ADVANCED
- default y if ARCH_ACORN
- depends on ACORN_PARTITION
- help
- Support reading partition tables created on Acorn machines using
- the PowerTec SCSI drive.
-
-config ACORN_PARTITION_RISCIX
- bool "RISCiX partition support" if PARTITION_ADVANCED
- default y if ARCH_ACORN
- depends on ACORN_PARTITION
- help
- Once upon a time, there was a native Unix port for the Acorn series
- of machines called RISCiX. If you say 'Y' here, Linux will be able
- to read disks partitioned under RISCiX.
-
-config OSF_PARTITION
- bool "Alpha OSF partition support" if PARTITION_ADVANCED
- default y if ALPHA
- help
- Say Y here if you would like to use hard disks under Linux which
- were partitioned on an Alpha machine.
-
-config AMIGA_PARTITION
- bool "Amiga partition table support" if PARTITION_ADVANCED
- default y if (AMIGA || AFFS_FS=y)
- help
- Say Y here if you would like to use hard disks under Linux which
- were partitioned under AmigaOS.
-
-config ATARI_PARTITION
- bool "Atari partition table support" if PARTITION_ADVANCED
- default y if ATARI
- help
- Say Y here if you would like to use hard disks under Linux which
- were partitioned under the Atari OS.
-
-config IBM_PARTITION
- bool "IBM disk label and partition support"
- depends on PARTITION_ADVANCED && S390
- help
- Say Y here if you would like to be able to read the hard disk
- partition table format used by IBM DASD disks operating under CMS.
- Otherwise, say N.
-
-config MAC_PARTITION
- bool "Macintosh partition map support" if PARTITION_ADVANCED
- default y if (MAC || PPC_PMAC)
- help
- Say Y here if you would like to use hard disks under Linux which
- were partitioned on a Macintosh.
-
-config MSDOS_PARTITION
- bool "PC BIOS (MSDOS partition tables) support" if PARTITION_ADVANCED
- default y
- help
- Say Y here.
-
-config BSD_DISKLABEL
- bool "BSD disklabel (FreeBSD partition tables) support"
- depends on PARTITION_ADVANCED && MSDOS_PARTITION
- help
- FreeBSD uses its own hard disk partition scheme on your PC. It
- requires only one entry in the primary partition table of your disk
- and manages it similarly to DOS extended partitions, putting in its
- first sector a new partition table in BSD disklabel format. Saying Y
- here allows you to read these disklabels and further mount FreeBSD
- partitions from within Linux if you have also said Y to "UFS
- file system support", above. If you don't know what all this is
- about, say N.
-
-config MINIX_SUBPARTITION
- bool "Minix subpartition support"
- depends on PARTITION_ADVANCED && MSDOS_PARTITION
- help
- Minix 2.0.0/2.0.2 subpartition table support for Linux.
- Say Y here if you want to mount and use Minix 2.0.0/2.0.2
- subpartitions.
-
-config SOLARIS_X86_PARTITION
- bool "Solaris (x86) partition table support"
- depends on PARTITION_ADVANCED && MSDOS_PARTITION
- help
- Like most systems, Solaris x86 uses its own hard disk partition
- table format, incompatible with all others. Saying Y here allows you
- to read these partition tables and further mount Solaris x86
- partitions from within Linux if you have also said Y to "UFS
- file system support", above.
-
-config UNIXWARE_DISKLABEL
- bool "Unixware slices support"
- depends on PARTITION_ADVANCED && MSDOS_PARTITION
- ---help---
- Like some systems, UnixWare uses its own slice table inside a
- partition (VTOC - Virtual Table of Contents). Its format is
- incompatible with all other OSes. Saying Y here allows you to read
- VTOC and further mount UnixWare partitions read-only from within
- Linux if you have also said Y to "UFS file system support" or
- "System V and Coherent file system support", above.
-
- This is mainly used to carry data from a UnixWare box to your
- Linux box via a removable medium like magneto-optical, ZIP or
- removable IDE drives. Note, however, that a good portable way to
- transport files and directories between unixes (and even other
- operating systems) is given by the tar program ("man tar" or
- preferably "info tar").
-
- If you don't know what all this is about, say N.
-
-config LDM_PARTITION
- bool "Windows Logical Disk Manager (Dynamic Disk) support"
- depends on PARTITION_ADVANCED
- ---help---
- Say Y here if you would like to use hard disks under Linux which
- were partitioned using Windows 2000's/XP's or Vista's Logical Disk
- Manager. They are also known as "Dynamic Disks".
-
- Note this driver only supports Dynamic Disks with a protective MBR
- label, i.e. DOS partition table. It does not support GPT labelled
- Dynamic Disks yet as can be created with Vista.
-
- Windows 2000 introduced the concept of Dynamic Disks to get around
- the limitations of the PC's partitioning scheme. The Logical Disk
- Manager allows the user to repartition a disk and create spanned,
- mirrored, striped or RAID volumes, all without the need for
- rebooting.
-
- Normal partitions are now called Basic Disks under Windows 2000, XP,
- and Vista.
-
- For a fuller description read <file:Documentation/ldm.txt>.
-
- If unsure, say N.
-
-config LDM_DEBUG
- bool "Windows LDM extra logging"
- depends on LDM_PARTITION
- help
- Say Y here if you would like LDM to log verbosely. This could be
- helpful if the driver doesn't work as expected and you'd like to
- report a bug.
-
- If unsure, say N.
-
-config SGI_PARTITION
- bool "SGI partition support" if PARTITION_ADVANCED
- default y if DEFAULT_SGI_PARTITION
- help
- Say Y here if you would like to be able to read the hard disk
- partition table format used by SGI machines.
-
-config ULTRIX_PARTITION
- bool "Ultrix partition table support" if PARTITION_ADVANCED
- default y if MACH_DECSTATION
- help
- Say Y here if you would like to be able to read the hard disk
- partition table format used by DEC (now Compaq) Ultrix machines.
- Otherwise, say N.
-
-config SUN_PARTITION
- bool "Sun partition tables support" if PARTITION_ADVANCED
- default y if (SPARC || SUN3 || SUN3X)
- ---help---
- Like most systems, SunOS uses its own hard disk partition table
- format, incompatible with all others. Saying Y here allows you to
- read these partition tables and further mount SunOS partitions from
- within Linux if you have also said Y to "UFS file system support",
- above. This is mainly used to carry data from a SPARC under SunOS to
- your Linux box via a removable medium like magneto-optical or ZIP
- drives; note however that a good portable way to transport files and
- directories between unixes (and even other operating systems) is
- given by the tar program ("man tar" or preferably "info tar"). If
- you don't know what all this is about, say N.
-
-config KARMA_PARTITION
- bool "Karma Partition support"
- depends on PARTITION_ADVANCED
- help
- Say Y here if you would like to mount the Rio Karma MP3 player, as it
- uses a proprietary partition table.
-
-config EFI_PARTITION
- bool "EFI GUID Partition support"
- depends on PARTITION_ADVANCED
- select CRC32
- help
- Say Y here if you would like to use hard disks under Linux which
- were partitioned using EFI GPT.
-
-config SYSV68_PARTITION
- bool "SYSV68 partition table support" if PARTITION_ADVANCED
- default y if VME
- help
- Say Y here if you would like to be able to read the hard disk
- partition table format used by Motorola Delta machines (using
- sysv68).
- Otherwise, say N.
diff --git a/fs/partitions/Makefile b/fs/partitions/Makefile
deleted file mode 100644
index 03af8eac51da..000000000000
--- a/fs/partitions/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# Makefile for the linux kernel.
-#
-
-obj-$(CONFIG_BLOCK) := check.o
-
-obj-$(CONFIG_ACORN_PARTITION) += acorn.o
-obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
-obj-$(CONFIG_ATARI_PARTITION) += atari.o
-obj-$(CONFIG_MAC_PARTITION) += mac.o
-obj-$(CONFIG_LDM_PARTITION) += ldm.o
-obj-$(CONFIG_MSDOS_PARTITION) += msdos.o
-obj-$(CONFIG_OSF_PARTITION) += osf.o
-obj-$(CONFIG_SGI_PARTITION) += sgi.o
-obj-$(CONFIG_SUN_PARTITION) += sun.o
-obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o
-obj-$(CONFIG_IBM_PARTITION) += ibm.o
-obj-$(CONFIG_EFI_PARTITION) += efi.o
-obj-$(CONFIG_KARMA_PARTITION) += karma.o
-obj-$(CONFIG_SYSV68_PARTITION) += sysv68.o
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
deleted file mode 100644
index fbeb697374d5..000000000000
--- a/fs/partitions/acorn.c
+++ /dev/null
@@ -1,556 +0,0 @@
-/*
- * linux/fs/partitions/acorn.c
- *
- * Copyright (c) 1996-2000 Russell King.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Scan ADFS partitions on hard disk drives. Unfortunately, there
- * isn't a standard for partitioning drives on Acorn machines, so
- * every single manufacturer of SCSI and IDE cards created their own
- * method.
- */
-#include <linux/buffer_head.h>
-#include <linux/adfs_fs.h>
-
-#include "check.h"
-#include "acorn.h"
-
-/*
- * Partition types. (Oh for reusability)
- */
-#define PARTITION_RISCIX_MFM 1
-#define PARTITION_RISCIX_SCSI 2
-#define PARTITION_LINUX 9
-
-#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
- defined(CONFIG_ACORN_PARTITION_ADFS)
-static struct adfs_discrecord *
-adfs_partition(struct parsed_partitions *state, char *name, char *data,
- unsigned long first_sector, int slot)
-{
- struct adfs_discrecord *dr;
- unsigned int nr_sects;
-
- if (adfs_checkbblk(data))
- return NULL;
-
- dr = (struct adfs_discrecord *)(data + 0x1c0);
-
- if (dr->disc_size == 0 && dr->disc_size_high == 0)
- return NULL;
-
- nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) |
- (le32_to_cpu(dr->disc_size) >> 9);
-
- if (name) {
- strlcat(state->pp_buf, " [", PAGE_SIZE);
- strlcat(state->pp_buf, name, PAGE_SIZE);
- strlcat(state->pp_buf, "]", PAGE_SIZE);
- }
- put_partition(state, slot, first_sector, nr_sects);
- return dr;
-}
-#endif
-
-#ifdef CONFIG_ACORN_PARTITION_RISCIX
-
-struct riscix_part {
- __le32 start;
- __le32 length;
- __le32 one;
- char name[16];
-};
-
-struct riscix_record {
- __le32 magic;
-#define RISCIX_MAGIC cpu_to_le32(0x4a657320)
- __le32 date;
- struct riscix_part part[8];
-};
-
-#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
- defined(CONFIG_ACORN_PARTITION_ADFS)
-static int riscix_partition(struct parsed_partitions *state,
- unsigned long first_sect, int slot,
- unsigned long nr_sects)
-{
- Sector sect;
- struct riscix_record *rr;
-
- rr = read_part_sector(state, first_sect, &sect);
- if (!rr)
- return -1;
-
- strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE);
-
-
- if (rr->magic == RISCIX_MAGIC) {
- unsigned long size = nr_sects > 2 ? 2 : nr_sects;
- int part;
-
- strlcat(state->pp_buf, " <", PAGE_SIZE);
-
- put_partition(state, slot++, first_sect, size);
- for (part = 0; part < 8; part++) {
- if (rr->part[part].one &&
- memcmp(rr->part[part].name, "All\0", 4)) {
- put_partition(state, slot++,
- le32_to_cpu(rr->part[part].start),
- le32_to_cpu(rr->part[part].length));
- strlcat(state->pp_buf, "(", PAGE_SIZE);
- strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE);
- strlcat(state->pp_buf, ")", PAGE_SIZE);
- }
- }
-
- strlcat(state->pp_buf, " >\n", PAGE_SIZE);
- } else {
- put_partition(state, slot++, first_sect, nr_sects);
- }
-
- put_dev_sector(sect);
- return slot;
-}
-#endif
-#endif
-
-#define LINUX_NATIVE_MAGIC 0xdeafa1de
-#define LINUX_SWAP_MAGIC 0xdeafab1e
-
-struct linux_part {
- __le32 magic;
- __le32 start_sect;
- __le32 nr_sects;
-};
-
-#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
- defined(CONFIG_ACORN_PARTITION_ADFS)
-static int linux_partition(struct parsed_partitions *state,
- unsigned long first_sect, int slot,
- unsigned long nr_sects)
-{
- Sector sect;
- struct linux_part *linuxp;
- unsigned long size = nr_sects > 2 ? 2 : nr_sects;
-
- strlcat(state->pp_buf, " [Linux]", PAGE_SIZE);
-
- put_partition(state, slot++, first_sect, size);
-
- linuxp = read_part_sector(state, first_sect, &sect);
- if (!linuxp)
- return -1;
-
- strlcat(state->pp_buf, " <", PAGE_SIZE);
- while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) ||
- linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) {
- if (slot == state->limit)
- break;
- put_partition(state, slot++, first_sect +
- le32_to_cpu(linuxp->start_sect),
- le32_to_cpu(linuxp->nr_sects));
- linuxp ++;
- }
- strlcat(state->pp_buf, " >", PAGE_SIZE);
-
- put_dev_sector(sect);
- return slot;
-}
-#endif
-
-#ifdef CONFIG_ACORN_PARTITION_CUMANA
-int adfspart_check_CUMANA(struct parsed_partitions *state)
-{
- unsigned long first_sector = 0;
- unsigned int start_blk = 0;
- Sector sect;
- unsigned char *data;
- char *name = "CUMANA/ADFS";
- int first = 1;
- int slot = 1;
-
- /*
- * Try Cumana style partitions - sector 6 contains ADFS boot block
- * with pointer to next 'drive'.
- *
- * There are unknowns in this code - is the 'cylinder number' of the
- * next partition relative to the start of this one - I'm assuming
- * it is.
- *
- * Also, which ID did Cumana use?
- *
- * This is totally unfinished, and will require more work to get it
- * going. Hence it is totally untested.
- */
- do {
- struct adfs_discrecord *dr;
- unsigned int nr_sects;
-
- data = read_part_sector(state, start_blk * 2 + 6, &sect);
- if (!data)
- return -1;
-
- if (slot == state->limit)
- break;
-
- dr = adfs_partition(state, name, data, first_sector, slot++);
- if (!dr)
- break;
-
- name = NULL;
-
- nr_sects = (data[0x1fd] + (data[0x1fe] << 8)) *
- (dr->heads + (dr->lowsector & 0x40 ? 1 : 0)) *
- dr->secspertrack;
-
- if (!nr_sects)
- break;
-
- first = 0;
- first_sector += nr_sects;
- start_blk += nr_sects >> (BLOCK_SIZE_BITS - 9);
- nr_sects = 0; /* hmm - should be partition size */
-
- switch (data[0x1fc] & 15) {
- case 0: /* No partition / ADFS? */
- break;
-
-#ifdef CONFIG_ACORN_PARTITION_RISCIX
- case PARTITION_RISCIX_SCSI:
- /* RISCiX - we don't know how to find the next one. */
- slot = riscix_partition(state, first_sector, slot,
- nr_sects);
- break;
-#endif
-
- case PARTITION_LINUX:
- slot = linux_partition(state, first_sector, slot,
- nr_sects);
- break;
- }
- put_dev_sector(sect);
- if (slot == -1)
- return -1;
- } while (1);
- put_dev_sector(sect);
- return first ? 0 : 1;
-}
-#endif
-
-#ifdef CONFIG_ACORN_PARTITION_ADFS
-/*
- * Purpose: allocate ADFS partitions.
- *
- * Params : hd - pointer to gendisk structure to store partition info.
- * dev - device number to access.
- *
- * Returns: -1 on error, 0 for no ADFS boot sector, 1 for ok.
- *
- * Alloc : hda = whole drive
- * hda1 = ADFS partition on first drive.
- * hda2 = non-ADFS partition.
- */
-int adfspart_check_ADFS(struct parsed_partitions *state)
-{
- unsigned long start_sect, nr_sects, sectscyl, heads;
- Sector sect;
- unsigned char *data;
- struct adfs_discrecord *dr;
- unsigned char id;
- int slot = 1;
-
- data = read_part_sector(state, 6, &sect);
- if (!data)
- return -1;
-
- dr = adfs_partition(state, "ADFS", data, 0, slot++);
- if (!dr) {
- put_dev_sector(sect);
- return 0;
- }
-
- heads = dr->heads + ((dr->lowsector >> 6) & 1);
- sectscyl = dr->secspertrack * heads;
- start_sect = ((data[0x1fe] << 8) + data[0x1fd]) * sectscyl;
- id = data[0x1fc] & 15;
- put_dev_sector(sect);
-
- /*
- * Work out start of non-adfs partition.
- */
- nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
-
- if (start_sect) {
- switch (id) {
-#ifdef CONFIG_ACORN_PARTITION_RISCIX
- case PARTITION_RISCIX_SCSI:
- case PARTITION_RISCIX_MFM:
- slot = riscix_partition(state, start_sect, slot,
- nr_sects);
- break;
-#endif
-
- case PARTITION_LINUX:
- slot = linux_partition(state, start_sect, slot,
- nr_sects);
- break;
- }
- }
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
- return 1;
-}
-#endif
-
-#ifdef CONFIG_ACORN_PARTITION_ICS
-
-struct ics_part {
- __le32 start;
- __le32 size;
-};
-
-static int adfspart_check_ICSLinux(struct parsed_partitions *state,
- unsigned long block)
-{
- Sector sect;
- unsigned char *data = read_part_sector(state, block, &sect);
- int result = 0;
-
- if (data) {
- if (memcmp(data, "LinuxPart", 9) == 0)
- result = 1;
- put_dev_sector(sect);
- }
-
- return result;
-}
-
-/*
- * Check for a valid ICS partition using the checksum.
- */
-static inline int valid_ics_sector(const unsigned char *data)
-{
- unsigned long sum;
- int i;
-
- for (i = 0, sum = 0x50617274; i < 508; i++)
- sum += data[i];
-
- sum -= le32_to_cpu(*(__le32 *)(&data[508]));
-
- return sum == 0;
-}
-
-/*
- * Purpose: allocate ICS partitions.
- * Params : hd - pointer to gendisk structure to store partition info.
- * dev - device number to access.
- * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok.
- * Alloc : hda = whole drive
- * hda1 = ADFS partition 0 on first drive.
- * hda2 = ADFS partition 1 on first drive.
- * ..etc..
- */
-int adfspart_check_ICS(struct parsed_partitions *state)
-{
- const unsigned char *data;
- const struct ics_part *p;
- int slot;
- Sector sect;
-
- /*
- * Try ICS style partitions - sector 0 contains partition info.
- */
- data = read_part_sector(state, 0, &sect);
- if (!data)
- return -1;
-
- if (!valid_ics_sector(data)) {
- put_dev_sector(sect);
- return 0;
- }
-
- strlcat(state->pp_buf, " [ICS]", PAGE_SIZE);
-
- for (slot = 1, p = (const struct ics_part *)data; p->size; p++) {
- u32 start = le32_to_cpu(p->start);
- s32 size = le32_to_cpu(p->size); /* yes, it's signed. */
-
- if (slot == state->limit)
- break;
-
- /*
- * Negative sizes tell the RISC OS ICS driver to ignore
- * this partition - in effect it says that this does not
- * contain an ADFS filesystem.
- */
- if (size < 0) {
- size = -size;
-
- /*
- * Our own extension - We use the first sector
- * of the partition to identify what type this
- * partition is. We must not make this visible
- * to the filesystem.
- */
- if (size > 1 && adfspart_check_ICSLinux(state, start)) {
- start += 1;
- size -= 1;
- }
- }
-
- if (size)
- put_partition(state, slot++, start, size);
- }
-
- put_dev_sector(sect);
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
- return 1;
-}
-#endif
-
-#ifdef CONFIG_ACORN_PARTITION_POWERTEC
-struct ptec_part {
- __le32 unused1;
- __le32 unused2;
- __le32 start;
- __le32 size;
- __le32 unused5;
- char type[8];
-};
-
-static inline int valid_ptec_sector(const unsigned char *data)
-{
- unsigned char checksum = 0x2a;
- int i;
-
- /*
- * If it looks like a PC/BIOS partition, then it
- * probably isn't PowerTec.
- */
- if (data[510] == 0x55 && data[511] == 0xaa)
- return 0;
-
- for (i = 0; i < 511; i++)
- checksum += data[i];
-
- return checksum == data[511];
-}
-
-/*
- * Purpose: allocate ICS partitions.
- * Params : hd - pointer to gendisk structure to store partition info.
- * dev - device number to access.
- * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok.
- * Alloc : hda = whole drive
- * hda1 = ADFS partition 0 on first drive.
- * hda2 = ADFS partition 1 on first drive.
- * ..etc..
- */
-int adfspart_check_POWERTEC(struct parsed_partitions *state)
-{
- Sector sect;
- const unsigned char *data;
- const struct ptec_part *p;
- int slot = 1;
- int i;
-
- data = read_part_sector(state, 0, &sect);
- if (!data)
- return -1;
-
- if (!valid_ptec_sector(data)) {
- put_dev_sector(sect);
- return 0;
- }
-
- strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE);
-
- for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) {
- u32 start = le32_to_cpu(p->start);
- u32 size = le32_to_cpu(p->size);
-
- if (size)
- put_partition(state, slot++, start, size);
- }
-
- put_dev_sector(sect);
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
- return 1;
-}
-#endif
-
-#ifdef CONFIG_ACORN_PARTITION_EESOX
-struct eesox_part {
- char magic[6];
- char name[10];
- __le32 start;
- __le32 unused6;
- __le32 unused7;
- __le32 unused8;
-};
-
-/*
- * Guess who created this format?
- */
-static const char eesox_name[] = {
- 'N', 'e', 'i', 'l', ' ',
- 'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' '
-};
-
-/*
- * EESOX SCSI partition format.
- *
- * This is a goddamned awful partition format. We don't seem to store
- * the size of the partition in this table, only the start addresses.
- *
- * There are two possibilities where the size comes from:
- * 1. The individual ADFS boot block entries that are placed on the disk.
- * 2. The start address of the next entry.
- */
-int adfspart_check_EESOX(struct parsed_partitions *state)
-{
- Sector sect;
- const unsigned char *data;
- unsigned char buffer[256];
- struct eesox_part *p;
- sector_t start = 0;
- int i, slot = 1;
-
- data = read_part_sector(state, 7, &sect);
- if (!data)
- return -1;
-
- /*
- * "Decrypt" the partition table. God knows why...
- */
- for (i = 0; i < 256; i++)
- buffer[i] = data[i] ^ eesox_name[i & 15];
-
- put_dev_sector(sect);
-
- for (i = 0, p = (struct eesox_part *)buffer; i < 8; i++, p++) {
- sector_t next;
-
- if (memcmp(p->magic, "Eesox", 6))
- break;
-
- next = le32_to_cpu(p->start);
- if (i)
- put_partition(state, slot++, start, next - start);
- start = next;
- }
-
- if (i != 0) {
- sector_t size;
-
- size = get_capacity(state->bdev->bd_disk);
- put_partition(state, slot++, start, size - start);
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
- }
-
- return i ? 1 : 0;
-}
-#endif
diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h
deleted file mode 100644
index ede828529692..000000000000
--- a/fs/partitions/acorn.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * linux/fs/partitions/acorn.h
- *
- * Copyright (C) 1996-2001 Russell King.
- *
- * I _hate_ this partitioning mess - why can't we have one defined
- * format, and everyone stick to it?
- */
-
-int adfspart_check_CUMANA(struct parsed_partitions *state);
-int adfspart_check_ADFS(struct parsed_partitions *state);
-int adfspart_check_ICS(struct parsed_partitions *state);
-int adfspart_check_POWERTEC(struct parsed_partitions *state);
-int adfspart_check_EESOX(struct parsed_partitions *state);
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
deleted file mode 100644
index 70cbf44a1560..000000000000
--- a/fs/partitions/amiga.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * fs/partitions/amiga.c
- *
- * Code extracted from drivers/block/genhd.c
- *
- * Copyright (C) 1991-1998 Linus Torvalds
- * Re-organised Feb 1998 Russell King
- */
-
-#include <linux/types.h>
-#include <linux/affs_hardblocks.h>
-
-#include "check.h"
-#include "amiga.h"
-
-static __inline__ u32
-checksum_block(__be32 *m, int size)
-{
- u32 sum = 0;
-
- while (size--)
- sum += be32_to_cpu(*m++);
- return sum;
-}
-
-int amiga_partition(struct parsed_partitions *state)
-{
- Sector sect;
- unsigned char *data;
- struct RigidDiskBlock *rdb;
- struct PartitionBlock *pb;
- int start_sect, nr_sects, blk, part, res = 0;
- int blksize = 1; /* Multiplier for disk block size */
- int slot = 1;
- char b[BDEVNAME_SIZE];
-
- for (blk = 0; ; blk++, put_dev_sector(sect)) {
- if (blk == RDB_ALLOCATION_LIMIT)
- goto rdb_done;
- data = read_part_sector(state, blk, &sect);
- if (!data) {
- if (warn_no_part)
- printk("Dev %s: unable to read RDB block %d\n",
- bdevname(state->bdev, b), blk);
- res = -1;
- goto rdb_done;
- }
- if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK))
- continue;
-
- rdb = (struct RigidDiskBlock *)data;
- if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0)
- break;
- /* Try again with 0xdc..0xdf zeroed, Windows might have
- * trashed it.
- */
- *(__be32 *)(data+0xdc) = 0;
- if (checksum_block((__be32 *)data,
- be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
- printk("Warning: Trashed word at 0xd0 in block %d "
- "ignored in checksum calculation\n",blk);
- break;
- }
-
- printk("Dev %s: RDB in block %d has bad checksum\n",
- bdevname(state->bdev, b), blk);
- }
-
- /* blksize is blocks per 512 byte standard block */
- blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512;
-
- {
- char tmp[7 + 10 + 1 + 1];
-
- /* Be more informative */
- snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512);
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- }
- blk = be32_to_cpu(rdb->rdb_PartitionList);
- put_dev_sector(sect);
- for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
- blk *= blksize; /* Read in terms partition table understands */
- data = read_part_sector(state, blk, &sect);
- if (!data) {
- if (warn_no_part)
- printk("Dev %s: unable to read partition block %d\n",
- bdevname(state->bdev, b), blk);
- res = -1;
- goto rdb_done;
- }
- pb = (struct PartitionBlock *)data;
- blk = be32_to_cpu(pb->pb_Next);
- if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION))
- continue;
- if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 )
- continue;
-
- /* Tell Kernel about it */
-
- nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 -
- be32_to_cpu(pb->pb_Environment[9])) *
- be32_to_cpu(pb->pb_Environment[3]) *
- be32_to_cpu(pb->pb_Environment[5]) *
- blksize;
- if (!nr_sects)
- continue;
- start_sect = be32_to_cpu(pb->pb_Environment[9]) *
- be32_to_cpu(pb->pb_Environment[3]) *
- be32_to_cpu(pb->pb_Environment[5]) *
- blksize;
- put_partition(state,slot++,start_sect,nr_sects);
- {
- /* Be even more informative to aid mounting */
- char dostype[4];
- char tmp[42];
-
- __be32 *dt = (__be32 *)dostype;
- *dt = pb->pb_Environment[16];
- if (dostype[3] < ' ')
- snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)",
- dostype[0], dostype[1],
- dostype[2], dostype[3] + '@' );
- else
- snprintf(tmp, sizeof(tmp), " (%c%c%c%c)",
- dostype[0], dostype[1],
- dostype[2], dostype[3]);
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- snprintf(tmp, sizeof(tmp), "(res %d spb %d)",
- be32_to_cpu(pb->pb_Environment[6]),
- be32_to_cpu(pb->pb_Environment[4]));
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- }
- res = 1;
- }
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
-
-rdb_done:
- return res;
-}
diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h
deleted file mode 100644
index d094585cadaa..000000000000
--- a/fs/partitions/amiga.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * fs/partitions/amiga.h
- */
-
-int amiga_partition(struct parsed_partitions *state);
-
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
deleted file mode 100644
index 9875b05e80a2..000000000000
--- a/fs/partitions/atari.c
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * fs/partitions/atari.c
- *
- * Code extracted from drivers/block/genhd.c
- *
- * Copyright (C) 1991-1998 Linus Torvalds
- * Re-organised Feb 1998 Russell King
- */
-
-#include <linux/ctype.h>
-#include "check.h"
-#include "atari.h"
-
-/* ++guenther: this should be settable by the user ("make config")?.
- */
-#define ICD_PARTS
-
-/* check if a partition entry looks valid -- Atari format is assumed if at
- least one of the primary entries is ok this way */
-#define VALID_PARTITION(pi,hdsiz) \
- (((pi)->flg & 1) && \
- isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \
- be32_to_cpu((pi)->st) <= (hdsiz) && \
- be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz))
-
-static inline int OK_id(char *s)
-{
- return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 ||
- memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 ||
- memcmp (s, "RAW", 3) == 0 ;
-}
-
-int atari_partition(struct parsed_partitions *state)
-{
- Sector sect;
- struct rootsector *rs;
- struct partition_info *pi;
- u32 extensect;
- u32 hd_size;
- int slot;
-#ifdef ICD_PARTS
- int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
-#endif
-
- rs = read_part_sector(state, 0, &sect);
- if (!rs)
- return -1;
-
- /* Verify this is an Atari rootsector: */
- hd_size = state->bdev->bd_inode->i_size >> 9;
- if (!VALID_PARTITION(&rs->part[0], hd_size) &&
- !VALID_PARTITION(&rs->part[1], hd_size) &&
- !VALID_PARTITION(&rs->part[2], hd_size) &&
- !VALID_PARTITION(&rs->part[3], hd_size)) {
- /*
- * if there's no valid primary partition, assume that no Atari
- * format partition table (there's no reliable magic or the like
- * :-()
- */
- put_dev_sector(sect);
- return 0;
- }
-
- pi = &rs->part[0];
- strlcat(state->pp_buf, " AHDI", PAGE_SIZE);
- for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) {
- struct rootsector *xrs;
- Sector sect2;
- ulong partsect;
-
- if ( !(pi->flg & 1) )
- continue;
- /* active partition */
- if (memcmp (pi->id, "XGM", 3) != 0) {
- /* we don't care about other id's */
- put_partition (state, slot, be32_to_cpu(pi->st),
- be32_to_cpu(pi->siz));
- continue;
- }
- /* extension partition */
-#ifdef ICD_PARTS
- part_fmt = 1;
-#endif
- strlcat(state->pp_buf, " XGM<", PAGE_SIZE);
- partsect = extensect = be32_to_cpu(pi->st);
- while (1) {
- xrs = read_part_sector(state, partsect, &sect2);
- if (!xrs) {
- printk (" block %ld read failed\n", partsect);
- put_dev_sector(sect);
- return -1;
- }
-
- /* ++roman: sanity check: bit 0 of flg field must be set */
- if (!(xrs->part[0].flg & 1)) {
- printk( "\nFirst sub-partition in extended partition is not valid!\n" );
- put_dev_sector(sect2);
- break;
- }
-
- put_partition(state, slot,
- partsect + be32_to_cpu(xrs->part[0].st),
- be32_to_cpu(xrs->part[0].siz));
-
- if (!(xrs->part[1].flg & 1)) {
- /* end of linked partition list */
- put_dev_sector(sect2);
- break;
- }
- if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) {
- printk("\nID of extended partition is not XGM!\n");
- put_dev_sector(sect2);
- break;
- }
-
- partsect = be32_to_cpu(xrs->part[1].st) + extensect;
- put_dev_sector(sect2);
- if (++slot == state->limit) {
- printk( "\nMaximum number of partitions reached!\n" );
- break;
- }
- }
- strlcat(state->pp_buf, " >", PAGE_SIZE);
- }
-#ifdef ICD_PARTS
- if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */
- pi = &rs->icdpart[0];
- /* sanity check: no ICD format if first partition invalid */
- if (OK_id(pi->id)) {
- strlcat(state->pp_buf, " ICD<", PAGE_SIZE);
- for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) {
- /* accept only GEM,BGM,RAW,LNX,SWP partitions */
- if (!((pi->flg & 1) && OK_id(pi->id)))
- continue;
- part_fmt = 2;
- put_partition (state, slot,
- be32_to_cpu(pi->st),
- be32_to_cpu(pi->siz));
- }
- strlcat(state->pp_buf, " >", PAGE_SIZE);
- }
- }
-#endif
- put_dev_sector(sect);
-
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
-
- return 1;
-}
diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h
deleted file mode 100644
index fe2d32a89f36..000000000000
--- a/fs/partitions/atari.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * fs/partitions/atari.h
- * Moved by Russell King from:
- *
- * linux/include/linux/atari_rootsec.h
- * definitions for Atari Rootsector layout
- * by Andreas Schwab (schwab@ls5.informatik.uni-dortmund.de)
- *
- * modified for ICD/Supra partitioning scheme restricted to at most 12
- * partitions
- * by Guenther Kelleter (guenther@pool.informatik.rwth-aachen.de)
- */
-
-struct partition_info
-{
- u8 flg; /* bit 0: active; bit 7: bootable */
- char id[3]; /* "GEM", "BGM", "XGM", or other */
- __be32 st; /* start of partition */
- __be32 siz; /* length of partition */
-};
-
-struct rootsector
-{
- char unused[0x156]; /* room for boot code */
- struct partition_info icdpart[8]; /* info for ICD-partitions 5..12 */
- char unused2[0xc];
- u32 hd_siz; /* size of disk in blocks */
- struct partition_info part[4];
- u32 bsl_st; /* start of bad sector list */
- u32 bsl_cnt; /* length of bad sector list */
- u16 checksum; /* checksum for bootable disks */
-} __attribute__((__packed__));
-
-int atari_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
deleted file mode 100644
index e3c63d1c5e13..000000000000
--- a/fs/partitions/check.c
+++ /dev/null
@@ -1,687 +0,0 @@
-/*
- * fs/partitions/check.c
- *
- * Code extracted from drivers/block/genhd.c
- * Copyright (C) 1991-1998 Linus Torvalds
- * Re-organised Feb 1998 Russell King
- *
- * We now have independent partition support from the
- * block drivers, which allows all the partition code to
- * be grouped in one location, and it to be mostly self
- * contained.
- *
- * Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl}
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/kmod.h>
-#include <linux/ctype.h>
-#include <linux/genhd.h>
-#include <linux/blktrace_api.h>
-
-#include "check.h"
-
-#include "acorn.h"
-#include "amiga.h"
-#include "atari.h"
-#include "ldm.h"
-#include "mac.h"
-#include "msdos.h"
-#include "osf.h"
-#include "sgi.h"
-#include "sun.h"
-#include "ibm.h"
-#include "ultrix.h"
-#include "efi.h"
-#include "karma.h"
-#include "sysv68.h"
-
-#ifdef CONFIG_BLK_DEV_MD
-extern void md_autodetect_dev(dev_t dev);
-#endif
-
-int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
-
-static int (*check_part[])(struct parsed_partitions *) = {
- /*
- * Probe partition formats with tables at disk address 0
- * that also have an ADFS boot block at 0xdc0.
- */
-#ifdef CONFIG_ACORN_PARTITION_ICS
- adfspart_check_ICS,
-#endif
-#ifdef CONFIG_ACORN_PARTITION_POWERTEC
- adfspart_check_POWERTEC,
-#endif
-#ifdef CONFIG_ACORN_PARTITION_EESOX
- adfspart_check_EESOX,
-#endif
-
- /*
- * Now move on to formats that only have partition info at
- * disk address 0xdc0. Since these may also have stale
- * PC/BIOS partition tables, they need to come before
- * the msdos entry.
- */
-#ifdef CONFIG_ACORN_PARTITION_CUMANA
- adfspart_check_CUMANA,
-#endif
-#ifdef CONFIG_ACORN_PARTITION_ADFS
- adfspart_check_ADFS,
-#endif
-
-#ifdef CONFIG_EFI_PARTITION
- efi_partition, /* this must come before msdos */
-#endif
-#ifdef CONFIG_SGI_PARTITION
- sgi_partition,
-#endif
-#ifdef CONFIG_LDM_PARTITION
- ldm_partition, /* this must come before msdos */
-#endif
-#ifdef CONFIG_MSDOS_PARTITION
- msdos_partition,
-#endif
-#ifdef CONFIG_OSF_PARTITION
- osf_partition,
-#endif
-#ifdef CONFIG_SUN_PARTITION
- sun_partition,
-#endif
-#ifdef CONFIG_AMIGA_PARTITION
- amiga_partition,
-#endif
-#ifdef CONFIG_ATARI_PARTITION
- atari_partition,
-#endif
-#ifdef CONFIG_MAC_PARTITION
- mac_partition,
-#endif
-#ifdef CONFIG_ULTRIX_PARTITION
- ultrix_partition,
-#endif
-#ifdef CONFIG_IBM_PARTITION
- ibm_partition,
-#endif
-#ifdef CONFIG_KARMA_PARTITION
- karma_partition,
-#endif
-#ifdef CONFIG_SYSV68_PARTITION
- sysv68_partition,
-#endif
- NULL
-};
-
-/*
- * disk_name() is used by partition check code and the genhd driver.
- * It formats the devicename of the indicated disk into
- * the supplied buffer (of size at least 32), and returns
- * a pointer to that same buffer (for convenience).
- */
-
-char *disk_name(struct gendisk *hd, int partno, char *buf)
-{
- if (!partno)
- snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
- else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
- snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
- else
- snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
-
- return buf;
-}
-
-const char *bdevname(struct block_device *bdev, char *buf)
-{
- return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
-}
-
-EXPORT_SYMBOL(bdevname);
-
-/*
- * There's very little reason to use this, you should really
- * have a struct block_device just about everywhere and use
- * bdevname() instead.
- */
-const char *__bdevname(dev_t dev, char *buffer)
-{
- scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)",
- MAJOR(dev), MINOR(dev));
- return buffer;
-}
-
-EXPORT_SYMBOL(__bdevname);
-
-static struct parsed_partitions *
-check_partition(struct gendisk *hd, struct block_device *bdev)
-{
- struct parsed_partitions *state;
- int i, res, err;
-
- state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
- if (!state)
- return NULL;
- state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
- if (!state->pp_buf) {
- kfree(state);
- return NULL;
- }
- state->pp_buf[0] = '\0';
-
- state->bdev = bdev;
- disk_name(hd, 0, state->name);
- snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
- if (isdigit(state->name[strlen(state->name)-1]))
- sprintf(state->name, "p");
-
- state->limit = disk_max_parts(hd);
- i = res = err = 0;
- while (!res && check_part[i]) {
- memset(&state->parts, 0, sizeof(state->parts));
- res = check_part[i++](state);
- if (res < 0) {
- /* We have hit an I/O error which we don't report now.
- * But record it, and let the others do their job.
- */
- err = res;
- res = 0;
- }
-
- }
- if (res > 0) {
- printk(KERN_INFO "%s", state->pp_buf);
-
- free_page((unsigned long)state->pp_buf);
- return state;
- }
- if (state->access_beyond_eod)
- err = -ENOSPC;
- if (err)
- /* The partition is unrecognized. So report I/O errors if there were any */
- res = err;
- if (!res)
- strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE);
- else if (warn_no_part)
- strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE);
-
- printk(KERN_INFO "%s", state->pp_buf);
-
- free_page((unsigned long)state->pp_buf);
- kfree(state);
- return ERR_PTR(res);
-}
-
-static ssize_t part_partition_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct hd_struct *p = dev_to_part(dev);
-
- return sprintf(buf, "%d\n", p->partno);
-}
-
-static ssize_t part_start_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct hd_struct *p = dev_to_part(dev);
-
- return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
-}
-
-ssize_t part_size_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct hd_struct *p = dev_to_part(dev);
- return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
-}
-
-static ssize_t part_ro_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct hd_struct *p = dev_to_part(dev);
- return sprintf(buf, "%d\n", p->policy ? 1 : 0);
-}
-
-static ssize_t part_alignment_offset_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct hd_struct *p = dev_to_part(dev);
- return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
-}
-
-static ssize_t part_discard_alignment_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct hd_struct *p = dev_to_part(dev);
- return sprintf(buf, "%u\n", p->discard_alignment);
-}
-
-ssize_t part_stat_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct hd_struct *p = dev_to_part(dev);
- int cpu;
-
- cpu = part_stat_lock();
- part_round_stats(cpu, p);
- part_stat_unlock();
- return sprintf(buf,
- "%8lu %8lu %8llu %8u "
- "%8lu %8lu %8llu %8u "
- "%8u %8u %8u"
- "\n",
- part_stat_read(p, ios[READ]),
- part_stat_read(p, merges[READ]),
- (unsigned long long)part_stat_read(p, sectors[READ]),
- jiffies_to_msecs(part_stat_read(p, ticks[READ])),
- part_stat_read(p, ios[WRITE]),
- part_stat_read(p, merges[WRITE]),
- (unsigned long long)part_stat_read(p, sectors[WRITE]),
- jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
- part_in_flight(p),
- jiffies_to_msecs(part_stat_read(p, io_ticks)),
- jiffies_to_msecs(part_stat_read(p, time_in_queue)));
-}
-
-ssize_t part_inflight_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct hd_struct *p = dev_to_part(dev);
-
- return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
- atomic_read(&p->in_flight[1]));
-}
-
-#ifdef CONFIG_FAIL_MAKE_REQUEST
-ssize_t part_fail_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct hd_struct *p = dev_to_part(dev);
-
- return sprintf(buf, "%d\n", p->make_it_fail);
-}
-
-ssize_t part_fail_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct hd_struct *p = dev_to_part(dev);
- int i;
-
- if (count > 0 && sscanf(buf, "%d", &i) > 0)
- p->make_it_fail = (i == 0) ? 0 : 1;
-
- return count;
-}
-#endif
-
-static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
-static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
-static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
-static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
-static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
-static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
- NULL);
-static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
-static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
-#ifdef CONFIG_FAIL_MAKE_REQUEST
-static struct device_attribute dev_attr_fail =
- __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
-#endif
-
-static struct attribute *part_attrs[] = {
- &dev_attr_partition.attr,
- &dev_attr_start.attr,
- &dev_attr_size.attr,
- &dev_attr_ro.attr,
- &dev_attr_alignment_offset.attr,
- &dev_attr_discard_alignment.attr,
- &dev_attr_stat.attr,
- &dev_attr_inflight.attr,
-#ifdef CONFIG_FAIL_MAKE_REQUEST
- &dev_attr_fail.attr,
-#endif
- NULL
-};
-
-static struct attribute_group part_attr_group = {
- .attrs = part_attrs,
-};
-
-static const struct attribute_group *part_attr_groups[] = {
- &part_attr_group,
-#ifdef CONFIG_BLK_DEV_IO_TRACE
- &blk_trace_attr_group,
-#endif
- NULL
-};
-
-static void part_release(struct device *dev)
-{
- struct hd_struct *p = dev_to_part(dev);
- free_part_stats(p);
- free_part_info(p);
- kfree(p);
-}
-
-struct device_type part_type = {
- .name = "partition",
- .groups = part_attr_groups,
- .release = part_release,
-};
-
-static void delete_partition_rcu_cb(struct rcu_head *head)
-{
- struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
-
- part->start_sect = 0;
- part->nr_sects = 0;
- part_stat_set_all(part, 0);
- put_device(part_to_dev(part));
-}
-
-void __delete_partition(struct hd_struct *part)
-{
- call_rcu(&part->rcu_head, delete_partition_rcu_cb);
-}
-
-void delete_partition(struct gendisk *disk, int partno)
-{
- struct disk_part_tbl *ptbl = disk->part_tbl;
- struct hd_struct *part;
-
- if (partno >= ptbl->len)
- return;
-
- part = ptbl->part[partno];
- if (!part)
- return;
-
- blk_free_devt(part_devt(part));
- rcu_assign_pointer(ptbl->part[partno], NULL);
- rcu_assign_pointer(ptbl->last_lookup, NULL);
- kobject_put(part->holder_dir);
- device_del(part_to_dev(part));
-
- hd_struct_put(part);
-}
-
-static ssize_t whole_disk_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- return 0;
-}
-static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
- whole_disk_show, NULL);
-
-struct hd_struct *add_partition(struct gendisk *disk, int partno,
- sector_t start, sector_t len, int flags,
- struct partition_meta_info *info)
-{
- struct hd_struct *p;
- dev_t devt = MKDEV(0, 0);
- struct device *ddev = disk_to_dev(disk);
- struct device *pdev;
- struct disk_part_tbl *ptbl;
- const char *dname;
- int err;
-
- err = disk_expand_part_tbl(disk, partno);
- if (err)
- return ERR_PTR(err);
- ptbl = disk->part_tbl;
-
- if (ptbl->part[partno])
- return ERR_PTR(-EBUSY);
-
- p = kzalloc(sizeof(*p), GFP_KERNEL);
- if (!p)
- return ERR_PTR(-EBUSY);
-
- if (!init_part_stats(p)) {
- err = -ENOMEM;
- goto out_free;
- }
- pdev = part_to_dev(p);
-
- p->start_sect = start;
- p->alignment_offset =
- queue_limit_alignment_offset(&disk->queue->limits, start);
- p->discard_alignment =
- queue_limit_discard_alignment(&disk->queue->limits, start);
- p->nr_sects = len;
- p->partno = partno;
- p->policy = get_disk_ro(disk);
-
- if (info) {
- struct partition_meta_info *pinfo = alloc_part_info(disk);
- if (!pinfo)
- goto out_free_stats;
- memcpy(pinfo, info, sizeof(*info));
- p->info = pinfo;
- }
-
- dname = dev_name(ddev);
- if (isdigit(dname[strlen(dname) - 1]))
- dev_set_name(pdev, "%sp%d", dname, partno);
- else
- dev_set_name(pdev, "%s%d", dname, partno);
-
- device_initialize(pdev);
- pdev->class = &block_class;
- pdev->type = &part_type;
- pdev->parent = ddev;
-
- err = blk_alloc_devt(p, &devt);
- if (err)
- goto out_free_info;
- pdev->devt = devt;
-
- /* delay uevent until 'holders' subdir is created */
- dev_set_uevent_suppress(pdev, 1);
- err = device_add(pdev);
- if (err)
- goto out_put;
-
- err = -ENOMEM;
- p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
- if (!p->holder_dir)
- goto out_del;
-
- dev_set_uevent_suppress(pdev, 0);
- if (flags & ADDPART_FLAG_WHOLEDISK) {
- err = device_create_file(pdev, &dev_attr_whole_disk);
- if (err)
- goto out_del;
- }
-
- /* everything is up and running, commence */
- rcu_assign_pointer(ptbl->part[partno], p);
-
- /* suppress uevent if the disk suppresses it */
- if (!dev_get_uevent_suppress(ddev))
- kobject_uevent(&pdev->kobj, KOBJ_ADD);
-
- hd_ref_init(p);
- return p;
-
-out_free_info:
- free_part_info(p);
-out_free_stats:
- free_part_stats(p);
-out_free:
- kfree(p);
- return ERR_PTR(err);
-out_del:
- kobject_put(p->holder_dir);
- device_del(pdev);
-out_put:
- put_device(pdev);
- blk_free_devt(devt);
- return ERR_PTR(err);
-}
-
-static bool disk_unlock_native_capacity(struct gendisk *disk)
-{
- const struct block_device_operations *bdops = disk->fops;
-
- if (bdops->unlock_native_capacity &&
- !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
- printk(KERN_CONT "enabling native capacity\n");
- bdops->unlock_native_capacity(disk);
- disk->flags |= GENHD_FL_NATIVE_CAPACITY;
- return true;
- } else {
- printk(KERN_CONT "truncated\n");
- return false;
- }
-}
-
-int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
-{
- struct parsed_partitions *state = NULL;
- struct disk_part_iter piter;
- struct hd_struct *part;
- int p, highest, res;
-rescan:
- if (state && !IS_ERR(state)) {
- kfree(state);
- state = NULL;
- }
-
- if (bdev->bd_part_count)
- return -EBUSY;
- res = invalidate_partition(disk, 0);
- if (res)
- return res;
-
- disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
- while ((part = disk_part_iter_next(&piter)))
- delete_partition(disk, part->partno);
- disk_part_iter_exit(&piter);
-
- if (disk->fops->revalidate_disk)
- disk->fops->revalidate_disk(disk);
- check_disk_size_change(disk, bdev);
- bdev->bd_invalidated = 0;
- if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
- return 0;
- if (IS_ERR(state)) {
- /*
- * I/O error reading the partition table. If any
- * partition code tried to read beyond EOD, retry
- * after unlocking native capacity.
- */
- if (PTR_ERR(state) == -ENOSPC) {
- printk(KERN_WARNING "%s: partition table beyond EOD, ",
- disk->disk_name);
- if (disk_unlock_native_capacity(disk))
- goto rescan;
- }
- return -EIO;
- }
- /*
- * If any partition code tried to read beyond EOD, try
- * unlocking native capacity even if partition table is
- * successfully read as we could be missing some partitions.
- */
- if (state->access_beyond_eod) {
- printk(KERN_WARNING
- "%s: partition table partially beyond EOD, ",
- disk->disk_name);
- if (disk_unlock_native_capacity(disk))
- goto rescan;
- }
-
- /* tell userspace that the media / partition table may have changed */
- kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
-
- /* Detect the highest partition number and preallocate
- * disk->part_tbl. This is an optimization and not strictly
- * necessary.
- */
- for (p = 1, highest = 0; p < state->limit; p++)
- if (state->parts[p].size)
- highest = p;
-
- disk_expand_part_tbl(disk, highest);
-
- /* add partitions */
- for (p = 1; p < state->limit; p++) {
- sector_t size, from;
- struct partition_meta_info *info = NULL;
-
- size = state->parts[p].size;
- if (!size)
- continue;
-
- from = state->parts[p].from;
- if (from >= get_capacity(disk)) {
- printk(KERN_WARNING
- "%s: p%d start %llu is beyond EOD, ",
- disk->disk_name, p, (unsigned long long) from);
- if (disk_unlock_native_capacity(disk))
- goto rescan;
- continue;
- }
-
- if (from + size > get_capacity(disk)) {
- printk(KERN_WARNING
- "%s: p%d size %llu extends beyond EOD, ",
- disk->disk_name, p, (unsigned long long) size);
-
- if (disk_unlock_native_capacity(disk)) {
- /* free state and restart */
- goto rescan;
- } else {
- /*
- * we can not ignore partitions of broken tables
- * created by for example camera firmware, but
- * we limit them to the end of the disk to avoid
- * creating invalid block devices
- */
- size = get_capacity(disk) - from;
- }
- }
-
- if (state->parts[p].has_info)
- info = &state->parts[p].info;
- part = add_partition(disk, p, from, size,
- state->parts[p].flags,
- &state->parts[p].info);
- if (IS_ERR(part)) {
- printk(KERN_ERR " %s: p%d could not be added: %ld\n",
- disk->disk_name, p, -PTR_ERR(part));
- continue;
- }
-#ifdef CONFIG_BLK_DEV_MD
- if (state->parts[p].flags & ADDPART_FLAG_RAID)
- md_autodetect_dev(part_to_dev(part)->devt);
-#endif
- }
- kfree(state);
- return 0;
-}
-
-unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
-{
- struct address_space *mapping = bdev->bd_inode->i_mapping;
- struct page *page;
-
- page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
- NULL);
- if (!IS_ERR(page)) {
- if (PageError(page))
- goto fail;
- p->v = page;
- return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9);
-fail:
- page_cache_release(page);
- }
- p->v = NULL;
- return NULL;
-}
-
-EXPORT_SYMBOL(read_dev_sector);
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
deleted file mode 100644
index d68bf4dc3bc2..000000000000
--- a/fs/partitions/check.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#include <linux/pagemap.h>
-#include <linux/blkdev.h>
-#include <linux/genhd.h>
-
-/*
- * add_gd_partition adds a partitions details to the devices partition
- * description.
- */
-struct parsed_partitions {
- struct block_device *bdev;
- char name[BDEVNAME_SIZE];
- struct {
- sector_t from;
- sector_t size;
- int flags;
- bool has_info;
- struct partition_meta_info info;
- } parts[DISK_MAX_PARTS];
- int next;
- int limit;
- bool access_beyond_eod;
- char *pp_buf;
-};
-
-static inline void *read_part_sector(struct parsed_partitions *state,
- sector_t n, Sector *p)
-{
- if (n >= get_capacity(state->bdev->bd_disk)) {
- state->access_beyond_eod = true;
- return NULL;
- }
- return read_dev_sector(state->bdev, n, p);
-}
-
-static inline void
-put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
-{
- if (n < p->limit) {
- char tmp[1 + BDEVNAME_SIZE + 10 + 1];
-
- p->parts[n].from = from;
- p->parts[n].size = size;
- snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
- strlcat(p->pp_buf, tmp, PAGE_SIZE);
- }
-}
-
-extern int warn_no_part;
-
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
deleted file mode 100644
index 6296b403c67a..000000000000
--- a/fs/partitions/efi.c
+++ /dev/null
@@ -1,675 +0,0 @@
-/************************************************************
- * EFI GUID Partition Table handling
- *
- * http://www.uefi.org/specs/
- * http://www.intel.com/technology/efi/
- *
- * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
- * Copyright 2000,2001,2002,2004 Dell Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- *
- * TODO:
- *
- * Changelog:
- * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com>
- * - test for valid PMBR and valid PGPT before ever reading
- * AGPT, allow override with 'gpt' kernel command line option.
- * - check for first/last_usable_lba outside of size of disk
- *
- * Tue Mar 26 2002 Matt Domsch <Matt_Domsch@dell.com>
- * - Ported to 2.5.7-pre1 and 2.5.7-dj2
- * - Applied patch to avoid fault in alternate header handling
- * - cleaned up find_valid_gpt
- * - On-disk structure and copy in memory is *always* LE now -
- * swab fields as needed
- * - remove print_gpt_header()
- * - only use first max_p partition entries, to keep the kernel minor number
- * and partition numbers tied.
- *
- * Mon Feb 04 2002 Matt Domsch <Matt_Domsch@dell.com>
- * - Removed __PRIPTR_PREFIX - not being used
- *
- * Mon Jan 14 2002 Matt Domsch <Matt_Domsch@dell.com>
- * - Ported to 2.5.2-pre11 + library crc32 patch Linus applied
- *
- * Thu Dec 6 2001 Matt Domsch <Matt_Domsch@dell.com>
- * - Added compare_gpts().
- * - moved le_efi_guid_to_cpus() back into this file. GPT is the only
- * thing that keeps EFI GUIDs on disk.
- * - Changed gpt structure names and members to be simpler and more Linux-like.
- *
- * Wed Oct 17 2001 Matt Domsch <Matt_Domsch@dell.com>
- * - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck
- *
- * Wed Oct 10 2001 Matt Domsch <Matt_Domsch@dell.com>
- * - Changed function comments to DocBook style per Andreas Dilger suggestion.
- *
- * Mon Oct 08 2001 Matt Domsch <Matt_Domsch@dell.com>
- * - Change read_lba() to use the page cache per Al Viro's work.
- * - print u64s properly on all architectures
- * - fixed debug_printk(), now Dprintk()
- *
- * Mon Oct 01 2001 Matt Domsch <Matt_Domsch@dell.com>
- * - Style cleanups
- * - made most functions static
- * - Endianness addition
- * - remove test for second alternate header, as it's not per spec,
- * and is unnecessary. There's now a method to read/write the last
- * sector of an odd-sized disk from user space. No tools have ever
- * been released which used this code, so it's effectively dead.
- * - Per Asit Mallick of Intel, added a test for a valid PMBR.
- * - Added kernel command line option 'gpt' to override valid PMBR test.
- *
- * Wed Jun 6 2001 Martin Wilck <Martin.Wilck@Fujitsu-Siemens.com>
- * - added devfs volume UUID support (/dev/volumes/uuids) for
- * mounting file systems by the partition GUID.
- *
- * Tue Dec 5 2000 Matt Domsch <Matt_Domsch@dell.com>
- * - Moved crc32() to linux/lib, added efi_crc32().
- *
- * Thu Nov 30 2000 Matt Domsch <Matt_Domsch@dell.com>
- * - Replaced Intel's CRC32 function with an equivalent
- * non-license-restricted version.
- *
- * Wed Oct 25 2000 Matt Domsch <Matt_Domsch@dell.com>
- * - Fixed the last_lba() call to return the proper last block
- *
- * Thu Oct 12 2000 Matt Domsch <Matt_Domsch@dell.com>
- * - Thanks to Andries Brouwer for his debugging assistance.
- * - Code works, detects all the partitions.
- *
- ************************************************************/
-#include <linux/crc32.h>
-#include <linux/ctype.h>
-#include <linux/math64.h>
-#include <linux/slab.h>
-#include "check.h"
-#include "efi.h"
-
-/* This allows a kernel command line option 'gpt' to override
- * the test for invalid PMBR. Not __initdata because reloading
- * the partition tables happens after init too.
- */
-static int force_gpt;
-static int __init
-force_gpt_fn(char *str)
-{
- force_gpt = 1;
- return 1;
-}
-__setup("gpt", force_gpt_fn);
-
-
-/**
- * efi_crc32() - EFI version of crc32 function
- * @buf: buffer to calculate crc32 of
- * @len - length of buf
- *
- * Description: Returns EFI-style CRC32 value for @buf
- *
- * This function uses the little endian Ethernet polynomial
- * but seeds the function with ~0, and xor's with ~0 at the end.
- * Note, the EFI Specification, v1.02, has a reference to
- * Dr. Dobbs Journal, May 1994 (actually it's in May 1992).
- */
-static inline u32
-efi_crc32(const void *buf, unsigned long len)
-{
- return (crc32(~0L, buf, len) ^ ~0L);
-}
-
-/**
- * last_lba(): return number of last logical block of device
- * @bdev: block device
- *
- * Description: Returns last LBA value on success, 0 on error.
- * This is stored (by sd and ide-geometry) in
- * the part[0] entry for this disk, and is the number of
- * physical sectors available on the disk.
- */
-static u64 last_lba(struct block_device *bdev)
-{
- if (!bdev || !bdev->bd_inode)
- return 0;
- return div_u64(bdev->bd_inode->i_size,
- bdev_logical_block_size(bdev)) - 1ULL;
-}
-
-static inline int
-pmbr_part_valid(struct partition *part)
-{
- if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT &&
- le32_to_cpu(part->start_sect) == 1UL)
- return 1;
- return 0;
-}
-
-/**
- * is_pmbr_valid(): test Protective MBR for validity
- * @mbr: pointer to a legacy mbr structure
- *
- * Description: Returns 1 if PMBR is valid, 0 otherwise.
- * Validity depends on two things:
- * 1) MSDOS signature is in the last two bytes of the MBR
- * 2) One partition of type 0xEE is found
- */
-static int
-is_pmbr_valid(legacy_mbr *mbr)
-{
- int i;
- if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE)
- return 0;
- for (i = 0; i < 4; i++)
- if (pmbr_part_valid(&mbr->partition_record[i]))
- return 1;
- return 0;
-}
-
-/**
- * read_lba(): Read bytes from disk, starting at given LBA
- * @state
- * @lba
- * @buffer
- * @size_t
- *
- * Description: Reads @count bytes from @state->bdev into @buffer.
- * Returns number of bytes read on success, 0 on error.
- */
-static size_t read_lba(struct parsed_partitions *state,
- u64 lba, u8 *buffer, size_t count)
-{
- size_t totalreadcount = 0;
- struct block_device *bdev = state->bdev;
- sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
-
- if (!buffer || lba > last_lba(bdev))
- return 0;
-
- while (count) {
- int copied = 512;
- Sector sect;
- unsigned char *data = read_part_sector(state, n++, &sect);
- if (!data)
- break;
- if (copied > count)
- copied = count;
- memcpy(buffer, data, copied);
- put_dev_sector(sect);
- buffer += copied;
- totalreadcount +=copied;
- count -= copied;
- }
- return totalreadcount;
-}
-
-/**
- * alloc_read_gpt_entries(): reads partition entries from disk
- * @state
- * @gpt - GPT header
- *
- * Description: Returns ptes on success, NULL on error.
- * Allocates space for PTEs based on information found in @gpt.
- * Notes: remember to free pte when you're done!
- */
-static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
- gpt_header *gpt)
-{
- size_t count;
- gpt_entry *pte;
-
- if (!gpt)
- return NULL;
-
- count = le32_to_cpu(gpt->num_partition_entries) *
- le32_to_cpu(gpt->sizeof_partition_entry);
- if (!count)
- return NULL;
- pte = kzalloc(count, GFP_KERNEL);
- if (!pte)
- return NULL;
-
- if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
- (u8 *) pte,
- count) < count) {
- kfree(pte);
- pte=NULL;
- return NULL;
- }
- return pte;
-}
-
-/**
- * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
- * @state
- * @lba is the Logical Block Address of the partition table
- *
- * Description: returns GPT header on success, NULL on error. Allocates
- * and fills a GPT header starting at @ from @state->bdev.
- * Note: remember to free gpt when finished with it.
- */
-static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
- u64 lba)
-{
- gpt_header *gpt;
- unsigned ssz = bdev_logical_block_size(state->bdev);
-
- gpt = kzalloc(ssz, GFP_KERNEL);
- if (!gpt)
- return NULL;
-
- if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
- kfree(gpt);
- gpt=NULL;
- return NULL;
- }
-
- return gpt;
-}
-
-/**
- * is_gpt_valid() - tests one GPT header and PTEs for validity
- * @state
- * @lba is the logical block address of the GPT header to test
- * @gpt is a GPT header ptr, filled on return.
- * @ptes is a PTEs ptr, filled on return.
- *
- * Description: returns 1 if valid, 0 on error.
- * If valid, returns pointers to newly allocated GPT header and PTEs.
- */
-static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
- gpt_header **gpt, gpt_entry **ptes)
-{
- u32 crc, origcrc;
- u64 lastlba;
-
- if (!ptes)
- return 0;
- if (!(*gpt = alloc_read_gpt_header(state, lba)))
- return 0;
-
- /* Check the GUID Partition Table signature */
- if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) {
- pr_debug("GUID Partition Table Header signature is wrong:"
- "%lld != %lld\n",
- (unsigned long long)le64_to_cpu((*gpt)->signature),
- (unsigned long long)GPT_HEADER_SIGNATURE);
- goto fail;
- }
-
- /* Check the GUID Partition Table header size */
- if (le32_to_cpu((*gpt)->header_size) >
- bdev_logical_block_size(state->bdev)) {
- pr_debug("GUID Partition Table Header size is wrong: %u > %u\n",
- le32_to_cpu((*gpt)->header_size),
- bdev_logical_block_size(state->bdev));
- goto fail;
- }
-
- /* Check the GUID Partition Table CRC */
- origcrc = le32_to_cpu((*gpt)->header_crc32);
- (*gpt)->header_crc32 = 0;
- crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size));
-
- if (crc != origcrc) {
- pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n",
- crc, origcrc);
- goto fail;
- }
- (*gpt)->header_crc32 = cpu_to_le32(origcrc);
-
- /* Check that the my_lba entry points to the LBA that contains
- * the GUID Partition Table */
- if (le64_to_cpu((*gpt)->my_lba) != lba) {
- pr_debug("GPT my_lba incorrect: %lld != %lld\n",
- (unsigned long long)le64_to_cpu((*gpt)->my_lba),
- (unsigned long long)lba);
- goto fail;
- }
-
- /* Check the first_usable_lba and last_usable_lba are
- * within the disk.
- */
- lastlba = last_lba(state->bdev);
- if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
- pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
- (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
- (unsigned long long)lastlba);
- goto fail;
- }
- if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) {
- pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n",
- (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba),
- (unsigned long long)lastlba);
- goto fail;
- }
-
- /* Check that sizeof_partition_entry has the correct value */
- if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) {
- pr_debug("GUID Partitition Entry Size check failed.\n");
- goto fail;
- }
-
- if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
- goto fail;
-
- /* Check the GUID Partition Entry Array CRC */
- crc = efi_crc32((const unsigned char *) (*ptes),
- le32_to_cpu((*gpt)->num_partition_entries) *
- le32_to_cpu((*gpt)->sizeof_partition_entry));
-
- if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) {
- pr_debug("GUID Partitition Entry Array CRC check failed.\n");
- goto fail_ptes;
- }
-
- /* We're done, all's well */
- return 1;
-
- fail_ptes:
- kfree(*ptes);
- *ptes = NULL;
- fail:
- kfree(*gpt);
- *gpt = NULL;
- return 0;
-}
-
-/**
- * is_pte_valid() - tests one PTE for validity
- * @pte is the pte to check
- * @lastlba is last lba of the disk
- *
- * Description: returns 1 if valid, 0 on error.
- */
-static inline int
-is_pte_valid(const gpt_entry *pte, const u64 lastlba)
-{
- if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) ||
- le64_to_cpu(pte->starting_lba) > lastlba ||
- le64_to_cpu(pte->ending_lba) > lastlba)
- return 0;
- return 1;
-}
-
-/**
- * compare_gpts() - Search disk for valid GPT headers and PTEs
- * @pgpt is the primary GPT header
- * @agpt is the alternate GPT header
- * @lastlba is the last LBA number
- * Description: Returns nothing. Sanity checks pgpt and agpt fields
- * and prints warnings on discrepancies.
- *
- */
-static void
-compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
-{
- int error_found = 0;
- if (!pgpt || !agpt)
- return;
- if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) {
- printk(KERN_WARNING
- "GPT:Primary header LBA != Alt. header alternate_lba\n");
- printk(KERN_WARNING "GPT:%lld != %lld\n",
- (unsigned long long)le64_to_cpu(pgpt->my_lba),
- (unsigned long long)le64_to_cpu(agpt->alternate_lba));
- error_found++;
- }
- if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) {
- printk(KERN_WARNING
- "GPT:Primary header alternate_lba != Alt. header my_lba\n");
- printk(KERN_WARNING "GPT:%lld != %lld\n",
- (unsigned long long)le64_to_cpu(pgpt->alternate_lba),
- (unsigned long long)le64_to_cpu(agpt->my_lba));
- error_found++;
- }
- if (le64_to_cpu(pgpt->first_usable_lba) !=
- le64_to_cpu(agpt->first_usable_lba)) {
- printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n");
- printk(KERN_WARNING "GPT:%lld != %lld\n",
- (unsigned long long)le64_to_cpu(pgpt->first_usable_lba),
- (unsigned long long)le64_to_cpu(agpt->first_usable_lba));
- error_found++;
- }
- if (le64_to_cpu(pgpt->last_usable_lba) !=
- le64_to_cpu(agpt->last_usable_lba)) {
- printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n");
- printk(KERN_WARNING "GPT:%lld != %lld\n",
- (unsigned long long)le64_to_cpu(pgpt->last_usable_lba),
- (unsigned long long)le64_to_cpu(agpt->last_usable_lba));
- error_found++;
- }
- if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) {
- printk(KERN_WARNING "GPT:disk_guids don't match.\n");
- error_found++;
- }
- if (le32_to_cpu(pgpt->num_partition_entries) !=
- le32_to_cpu(agpt->num_partition_entries)) {
- printk(KERN_WARNING "GPT:num_partition_entries don't match: "
- "0x%x != 0x%x\n",
- le32_to_cpu(pgpt->num_partition_entries),
- le32_to_cpu(agpt->num_partition_entries));
- error_found++;
- }
- if (le32_to_cpu(pgpt->sizeof_partition_entry) !=
- le32_to_cpu(agpt->sizeof_partition_entry)) {
- printk(KERN_WARNING
- "GPT:sizeof_partition_entry values don't match: "
- "0x%x != 0x%x\n",
- le32_to_cpu(pgpt->sizeof_partition_entry),
- le32_to_cpu(agpt->sizeof_partition_entry));
- error_found++;
- }
- if (le32_to_cpu(pgpt->partition_entry_array_crc32) !=
- le32_to_cpu(agpt->partition_entry_array_crc32)) {
- printk(KERN_WARNING
- "GPT:partition_entry_array_crc32 values don't match: "
- "0x%x != 0x%x\n",
- le32_to_cpu(pgpt->partition_entry_array_crc32),
- le32_to_cpu(agpt->partition_entry_array_crc32));
- error_found++;
- }
- if (le64_to_cpu(pgpt->alternate_lba) != lastlba) {
- printk(KERN_WARNING
- "GPT:Primary header thinks Alt. header is not at the end of the disk.\n");
- printk(KERN_WARNING "GPT:%lld != %lld\n",
- (unsigned long long)le64_to_cpu(pgpt->alternate_lba),
- (unsigned long long)lastlba);
- error_found++;
- }
-
- if (le64_to_cpu(agpt->my_lba) != lastlba) {
- printk(KERN_WARNING
- "GPT:Alternate GPT header not at the end of the disk.\n");
- printk(KERN_WARNING "GPT:%lld != %lld\n",
- (unsigned long long)le64_to_cpu(agpt->my_lba),
- (unsigned long long)lastlba);
- error_found++;
- }
-
- if (error_found)
- printk(KERN_WARNING
- "GPT: Use GNU Parted to correct GPT errors.\n");
- return;
-}
-
-/**
- * find_valid_gpt() - Search disk for valid GPT headers and PTEs
- * @state
- * @gpt is a GPT header ptr, filled on return.
- * @ptes is a PTEs ptr, filled on return.
- * Description: Returns 1 if valid, 0 on error.
- * If valid, returns pointers to newly allocated GPT header and PTEs.
- * Validity depends on PMBR being valid (or being overridden by the
- * 'gpt' kernel command line option) and finding either the Primary
- * GPT header and PTEs valid, or the Alternate GPT header and PTEs
- * valid. If the Primary GPT header is not valid, the Alternate GPT header
- * is not checked unless the 'gpt' kernel command line option is passed.
- * This protects against devices which misreport their size, and forces
- * the user to decide to use the Alternate GPT.
- */
-static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
- gpt_entry **ptes)
-{
- int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
- gpt_header *pgpt = NULL, *agpt = NULL;
- gpt_entry *pptes = NULL, *aptes = NULL;
- legacy_mbr *legacymbr;
- u64 lastlba;
-
- if (!ptes)
- return 0;
-
- lastlba = last_lba(state->bdev);
- if (!force_gpt) {
- /* This will be added to the EFI Spec. per Intel after v1.02. */
- legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
- if (legacymbr) {
- read_lba(state, 0, (u8 *) legacymbr,
- sizeof (*legacymbr));
- good_pmbr = is_pmbr_valid(legacymbr);
- kfree(legacymbr);
- }
- if (!good_pmbr)
- goto fail;
- }
-
- good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
- &pgpt, &pptes);
- if (good_pgpt)
- good_agpt = is_gpt_valid(state,
- le64_to_cpu(pgpt->alternate_lba),
- &agpt, &aptes);
- if (!good_agpt && force_gpt)
- good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
-
- /* The obviously unsuccessful case */
- if (!good_pgpt && !good_agpt)
- goto fail;
-
- compare_gpts(pgpt, agpt, lastlba);
-
- /* The good cases */
- if (good_pgpt) {
- *gpt = pgpt;
- *ptes = pptes;
- kfree(agpt);
- kfree(aptes);
- if (!good_agpt) {
- printk(KERN_WARNING
- "Alternate GPT is invalid, "
- "using primary GPT.\n");
- }
- return 1;
- }
- else if (good_agpt) {
- *gpt = agpt;
- *ptes = aptes;
- kfree(pgpt);
- kfree(pptes);
- printk(KERN_WARNING
- "Primary GPT is invalid, using alternate GPT.\n");
- return 1;
- }
-
- fail:
- kfree(pgpt);
- kfree(agpt);
- kfree(pptes);
- kfree(aptes);
- *gpt = NULL;
- *ptes = NULL;
- return 0;
-}
-
-/**
- * efi_partition(struct parsed_partitions *state)
- * @state
- *
- * Description: called from check.c, if the disk contains GPT
- * partitions, sets up partition entries in the kernel.
- *
- * If the first block on the disk is a legacy MBR,
- * it will get handled by msdos_partition().
- * If it's a Protective MBR, we'll handle it here.
- *
- * We do not create a Linux partition for GPT, but
- * only for the actual data partitions.
- * Returns:
- * -1 if unable to read the partition table
- * 0 if this isn't our partition table
- * 1 if successful
- *
- */
-int efi_partition(struct parsed_partitions *state)
-{
- gpt_header *gpt = NULL;
- gpt_entry *ptes = NULL;
- u32 i;
- unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
- u8 unparsed_guid[37];
-
- if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
- kfree(gpt);
- kfree(ptes);
- return 0;
- }
-
- pr_debug("GUID Partition Table is valid! Yea!\n");
-
- for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
- struct partition_meta_info *info;
- unsigned label_count = 0;
- unsigned label_max;
- u64 start = le64_to_cpu(ptes[i].starting_lba);
- u64 size = le64_to_cpu(ptes[i].ending_lba) -
- le64_to_cpu(ptes[i].starting_lba) + 1ULL;
-
- if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
- continue;
-
- put_partition(state, i+1, start * ssz, size * ssz);
-
- /* If this is a RAID volume, tell md */
- if (!efi_guidcmp(ptes[i].partition_type_guid,
- PARTITION_LINUX_RAID_GUID))
- state->parts[i + 1].flags = ADDPART_FLAG_RAID;
-
- info = &state->parts[i + 1].info;
- /* Instead of doing a manual swap to big endian, reuse the
- * common ASCII hex format as the interim.
- */
- efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
- part_pack_uuid(unparsed_guid, info->uuid);
-
- /* Naively convert UTF16-LE to 7 bits. */
- label_max = min(sizeof(info->volname) - 1,
- sizeof(ptes[i].partition_name));
- info->volname[label_max] = 0;
- while (label_count < label_max) {
- u8 c = ptes[i].partition_name[label_count] & 0xff;
- if (c && !isprint(c))
- c = '!';
- info->volname[label_count] = c;
- label_count++;
- }
- state->parts[i + 1].has_info = true;
- }
- kfree(ptes);
- kfree(gpt);
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
- return 1;
-}
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
deleted file mode 100644
index b69ab729558f..000000000000
--- a/fs/partitions/efi.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/************************************************************
- * EFI GUID Partition Table
- * Per Intel EFI Specification v1.02
- * http://developer.intel.com/technology/efi/efi.htm
- *
- * By Matt Domsch <Matt_Domsch@dell.com> Fri Sep 22 22:15:56 CDT 2000
- * Copyright 2000,2001 Dell Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- ************************************************************/
-
-#ifndef FS_PART_EFI_H_INCLUDED
-#define FS_PART_EFI_H_INCLUDED
-
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/genhd.h>
-#include <linux/kernel.h>
-#include <linux/major.h>
-#include <linux/string.h>
-#include <linux/efi.h>
-
-#define MSDOS_MBR_SIGNATURE 0xaa55
-#define EFI_PMBR_OSTYPE_EFI 0xEF
-#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
-
-#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
-#define GPT_HEADER_REVISION_V1 0x00010000
-#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
-
-#define PARTITION_SYSTEM_GUID \
- EFI_GUID( 0xC12A7328, 0xF81F, 0x11d2, \
- 0xBA, 0x4B, 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B)
-#define LEGACY_MBR_PARTITION_GUID \
- EFI_GUID( 0x024DEE41, 0x33E7, 0x11d3, \
- 0x9D, 0x69, 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F)
-#define PARTITION_MSFT_RESERVED_GUID \
- EFI_GUID( 0xE3C9E316, 0x0B5C, 0x4DB8, \
- 0x81, 0x7D, 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE)
-#define PARTITION_BASIC_DATA_GUID \
- EFI_GUID( 0xEBD0A0A2, 0xB9E5, 0x4433, \
- 0x87, 0xC0, 0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7)
-#define PARTITION_LINUX_RAID_GUID \
- EFI_GUID( 0xa19d880f, 0x05fc, 0x4d3b, \
- 0xa0, 0x06, 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e)
-#define PARTITION_LINUX_SWAP_GUID \
- EFI_GUID( 0x0657fd6d, 0xa4ab, 0x43c4, \
- 0x84, 0xe5, 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f)
-#define PARTITION_LINUX_LVM_GUID \
- EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \
- 0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28)
-
-typedef struct _gpt_header {
- __le64 signature;
- __le32 revision;
- __le32 header_size;
- __le32 header_crc32;
- __le32 reserved1;
- __le64 my_lba;
- __le64 alternate_lba;
- __le64 first_usable_lba;
- __le64 last_usable_lba;
- efi_guid_t disk_guid;
- __le64 partition_entry_lba;
- __le32 num_partition_entries;
- __le32 sizeof_partition_entry;
- __le32 partition_entry_array_crc32;
-
- /* The rest of the logical block is reserved by UEFI and must be zero.
- * EFI standard handles this by:
- *
- * uint8_t reserved2[ BlockSize - 92 ];
- */
-} __attribute__ ((packed)) gpt_header;
-
-typedef struct _gpt_entry_attributes {
- u64 required_to_function:1;
- u64 reserved:47;
- u64 type_guid_specific:16;
-} __attribute__ ((packed)) gpt_entry_attributes;
-
-typedef struct _gpt_entry {
- efi_guid_t partition_type_guid;
- efi_guid_t unique_partition_guid;
- __le64 starting_lba;
- __le64 ending_lba;
- gpt_entry_attributes attributes;
- efi_char16_t partition_name[72 / sizeof (efi_char16_t)];
-} __attribute__ ((packed)) gpt_entry;
-
-typedef struct _legacy_mbr {
- u8 boot_code[440];
- __le32 unique_mbr_signature;
- __le16 unknown;
- struct partition partition_record[4];
- __le16 signature;
-} __attribute__ ((packed)) legacy_mbr;
-
-/* Functions */
-extern int efi_partition(struct parsed_partitions *state);
-
-#endif
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only. This must remain at the end
- * of the file.
- * --------------------------------------------------------------------------
- * Local variables:
- * c-indent-level: 4
- * c-brace-imaginary-offset: 0
- * c-brace-offset: -4
- * c-argdecl-indent: 4
- * c-label-offset: -4
- * c-continued-statement-offset: 4
- * c-continued-brace-offset: 0
- * indent-tabs-mode: nil
- * tab-width: 8
- * End:
- */
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
deleted file mode 100644
index d513a07f44bb..000000000000
--- a/fs/partitions/ibm.c
+++ /dev/null
@@ -1,275 +0,0 @@
-/*
- * File...........: linux/fs/partitions/ibm.c
- * Author(s)......: Holger Smolinski <Holger.Smolinski@de.ibm.com>
- * Volker Sameske <sameske@de.ibm.com>
- * Bugreports.to..: <Linux390@de.ibm.com>
- * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
- */
-
-#include <linux/buffer_head.h>
-#include <linux/hdreg.h>
-#include <linux/slab.h>
-#include <asm/dasd.h>
-#include <asm/ebcdic.h>
-#include <asm/uaccess.h>
-#include <asm/vtoc.h>
-
-#include "check.h"
-#include "ibm.h"
-
-/*
- * compute the block number from a
- * cyl-cyl-head-head structure
- */
-static sector_t
-cchh2blk (struct vtoc_cchh *ptr, struct hd_geometry *geo) {
-
- sector_t cyl;
- __u16 head;
-
- /*decode cylinder and heads for large volumes */
- cyl = ptr->hh & 0xFFF0;
- cyl <<= 12;
- cyl |= ptr->cc;
- head = ptr->hh & 0x000F;
- return cyl * geo->heads * geo->sectors +
- head * geo->sectors;
-}
-
-/*
- * compute the block number from a
- * cyl-cyl-head-head-block structure
- */
-static sector_t
-cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
-
- sector_t cyl;
- __u16 head;
-
- /*decode cylinder and heads for large volumes */
- cyl = ptr->hh & 0xFFF0;
- cyl <<= 12;
- cyl |= ptr->cc;
- head = ptr->hh & 0x000F;
- return cyl * geo->heads * geo->sectors +
- head * geo->sectors +
- ptr->b;
-}
-
-/*
- */
-int ibm_partition(struct parsed_partitions *state)
-{
- struct block_device *bdev = state->bdev;
- int blocksize, res;
- loff_t i_size, offset, size, fmt_size;
- dasd_information2_t *info;
- struct hd_geometry *geo;
- char type[5] = {0,};
- char name[7] = {0,};
- union label_t {
- struct vtoc_volume_label_cdl vol;
- struct vtoc_volume_label_ldl lnx;
- struct vtoc_cms_label cms;
- } *label;
- unsigned char *data;
- Sector sect;
- sector_t labelsect;
- char tmp[64];
-
- res = 0;
- blocksize = bdev_logical_block_size(bdev);
- if (blocksize <= 0)
- goto out_exit;
- i_size = i_size_read(bdev->bd_inode);
- if (i_size == 0)
- goto out_exit;
-
- info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL);
- if (info == NULL)
- goto out_exit;
- geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL);
- if (geo == NULL)
- goto out_nogeo;
- label = kmalloc(sizeof(union label_t), GFP_KERNEL);
- if (label == NULL)
- goto out_nolab;
-
- if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0 ||
- ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
- goto out_freeall;
-
- /*
- * Special case for FBA disks: label sector does not depend on
- * blocksize.
- */
- if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
- (info->cu_type == 0x3880 && info->dev_type == 0x3370))
- labelsect = info->label_block;
- else
- labelsect = info->label_block * (blocksize >> 9);
-
- /*
- * Get volume label, extract name and type.
- */
- data = read_part_sector(state, labelsect, &sect);
- if (data == NULL)
- goto out_readerr;
-
- memcpy(label, data, sizeof(union label_t));
- put_dev_sector(sect);
-
- if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) {
- strncpy(type, label->vol.vollbl, 4);
- strncpy(name, label->vol.volid, 6);
- } else {
- strncpy(type, label->lnx.vollbl, 4);
- strncpy(name, label->lnx.volid, 6);
- }
- EBCASC(type, 4);
- EBCASC(name, 6);
-
- res = 1;
-
- /*
- * Three different formats: LDL, CDL and unformated disk
- *
- * identified by info->format
- *
- * unformated disks we do not have to care about
- */
- if (info->format == DASD_FORMAT_LDL) {
- if (strncmp(type, "CMS1", 4) == 0) {
- /*
- * VM style CMS1 labeled disk
- */
- blocksize = label->cms.block_size;
- if (label->cms.disk_offset != 0) {
- snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name);
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- /* disk is reserved minidisk */
- offset = label->cms.disk_offset;
- size = (label->cms.block_count - 1)
- * (blocksize >> 9);
- } else {
- snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name);
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- offset = (info->label_block + 1);
- size = label->cms.block_count
- * (blocksize >> 9);
- }
- put_partition(state, 1, offset*(blocksize >> 9),
- size-offset*(blocksize >> 9));
- } else {
- if (strncmp(type, "LNX1", 4) == 0) {
- snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name);
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- if (label->lnx.ldl_version == 0xf2) {
- fmt_size = label->lnx.formatted_blocks
- * (blocksize >> 9);
- } else if (!strcmp(info->type, "ECKD")) {
- /* formated w/o large volume support */
- fmt_size = geo->cylinders * geo->heads
- * geo->sectors * (blocksize >> 9);
- } else {
- /* old label and no usable disk geometry
- * (e.g. DIAG) */
- fmt_size = i_size >> 9;
- }
- size = i_size >> 9;
- if (fmt_size < size)
- size = fmt_size;
- offset = (info->label_block + 1);
- } else {
- /* unlabeled disk */
- strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
- size = i_size >> 9;
- offset = (info->label_block + 1);
- }
- put_partition(state, 1, offset*(blocksize >> 9),
- size-offset*(blocksize >> 9));
- }
- } else if (info->format == DASD_FORMAT_CDL) {
- /*
- * New style CDL formatted disk
- */
- sector_t blk;
- int counter;
-
- /*
- * check if VOL1 label is available
- * if not, something is wrong, skipping partition detection
- */
- if (strncmp(type, "VOL1", 4) == 0) {
- snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name);
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- /*
- * get block number and read then go through format1
- * labels
- */
- blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
- counter = 0;
- data = read_part_sector(state, blk * (blocksize/512),
- &sect);
- while (data != NULL) {
- struct vtoc_format1_label f1;
-
- memcpy(&f1, data,
- sizeof(struct vtoc_format1_label));
- put_dev_sector(sect);
-
- /* skip FMT4 / FMT5 / FMT7 labels */
- if (f1.DS1FMTID == _ascebc['4']
- || f1.DS1FMTID == _ascebc['5']
- || f1.DS1FMTID == _ascebc['7']
- || f1.DS1FMTID == _ascebc['9']) {
- blk++;
- data = read_part_sector(state,
- blk * (blocksize/512), &sect);
- continue;
- }
-
- /* only FMT1 and 8 labels valid at this point */
- if (f1.DS1FMTID != _ascebc['1'] &&
- f1.DS1FMTID != _ascebc['8'])
- break;
-
- /* OK, we got valid partition data */
- offset = cchh2blk(&f1.DS1EXT1.llimit, geo);
- size = cchh2blk(&f1.DS1EXT1.ulimit, geo) -
- offset + geo->sectors;
- if (counter >= state->limit)
- break;
- put_partition(state, counter + 1,
- offset * (blocksize >> 9),
- size * (blocksize >> 9));
- counter++;
- blk++;
- data = read_part_sector(state,
- blk * (blocksize/512), &sect);
- }
-
- if (!data)
- /* Are we not supposed to report this ? */
- goto out_readerr;
- } else
- printk(KERN_WARNING "Warning, expected Label VOL1 not "
- "found, treating as CDL formated Disk");
-
- }
-
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
- goto out_freeall;
-
-
-out_readerr:
- res = -1;
-out_freeall:
- kfree(label);
-out_nolab:
- kfree(geo);
-out_nogeo:
- kfree(info);
-out_exit:
- return res;
-}
diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h
deleted file mode 100644
index 08fb0804a812..000000000000
--- a/fs/partitions/ibm.h
+++ /dev/null
@@ -1 +0,0 @@
-int ibm_partition(struct parsed_partitions *);
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
deleted file mode 100644
index 0ea19312706b..000000000000
--- a/fs/partitions/karma.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * fs/partitions/karma.c
- * Rio Karma partition info.
- *
- * Copyright (C) 2006 Bob Copeland (me@bobcopeland.com)
- * based on osf.c
- */
-
-#include "check.h"
-#include "karma.h"
-
-int karma_partition(struct parsed_partitions *state)
-{
- int i;
- int slot = 1;
- Sector sect;
- unsigned char *data;
- struct disklabel {
- u8 d_reserved[270];
- struct d_partition {
- __le32 p_res;
- u8 p_fstype;
- u8 p_res2[3];
- __le32 p_offset;
- __le32 p_size;
- } d_partitions[2];
- u8 d_blank[208];
- __le16 d_magic;
- } __attribute__((packed)) *label;
- struct d_partition *p;
-
- data = read_part_sector(state, 0, &sect);
- if (!data)
- return -1;
-
- label = (struct disklabel *)data;
- if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) {
- put_dev_sector(sect);
- return 0;
- }
-
- p = label->d_partitions;
- for (i = 0 ; i < 2; i++, p++) {
- if (slot == state->limit)
- break;
-
- if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) {
- put_partition(state, slot, le32_to_cpu(p->p_offset),
- le32_to_cpu(p->p_size));
- }
- slot++;
- }
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
- put_dev_sector(sect);
- return 1;
-}
-
diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h
deleted file mode 100644
index c764b2e9df21..000000000000
--- a/fs/partitions/karma.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/*
- * fs/partitions/karma.h
- */
-
-#define KARMA_LABEL_MAGIC 0xAB56
-
-int karma_partition(struct parsed_partitions *state);
-
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
deleted file mode 100644
index bd8ae788f689..000000000000
--- a/fs/partitions/ldm.c
+++ /dev/null
@@ -1,1570 +0,0 @@
-/**
- * ldm - Support for Windows Logical Disk Manager (Dynamic Disks)
- *
- * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
- * Copyright (c) 2001-2007 Anton Altaparmakov
- * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
- *
- * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
- *
- * This program is free software; you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation; either version 2 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program (in the main directory of the source in the file COPYING); if
- * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
- * Boston, MA 02111-1307 USA
- */
-
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/stringify.h>
-#include <linux/kernel.h>
-#include "ldm.h"
-#include "check.h"
-#include "msdos.h"
-
-/**
- * ldm_debug/info/error/crit - Output an error message
- * @f: A printf format string containing the message
- * @...: Variables to substitute into @f
- *
- * ldm_debug() writes a DEBUG level message to the syslog but only if the
- * driver was compiled with debug enabled. Otherwise, the call turns into a NOP.
- */
-#ifndef CONFIG_LDM_DEBUG
-#define ldm_debug(...) do {} while (0)
-#else
-#define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a)
-#endif
-
-#define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a)
-#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a)
-#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a)
-
-static __printf(3, 4)
-void _ldm_printk(const char *level, const char *function, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start (args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- printk("%s%s(): %pV\n", level, function, &vaf);
-
- va_end(args);
-}
-
-/**
- * ldm_parse_hexbyte - Convert a ASCII hex number to a byte
- * @src: Pointer to at least 2 characters to convert.
- *
- * Convert a two character ASCII hex string to a number.
- *
- * Return: 0-255 Success, the byte was parsed correctly
- * -1 Error, an invalid character was supplied
- */
-static int ldm_parse_hexbyte (const u8 *src)
-{
- unsigned int x; /* For correct wrapping */
- int h;
-
- /* high part */
- x = h = hex_to_bin(src[0]);
- if (h < 0)
- return -1;
-
- /* low part */
- h = hex_to_bin(src[1]);
- if (h < 0)
- return -1;
-
- return (x << 4) + h;
-}
-
-/**
- * ldm_parse_guid - Convert GUID from ASCII to binary
- * @src: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba
- * @dest: Memory block to hold binary GUID (16 bytes)
- *
- * N.B. The GUID need not be NULL terminated.
- *
- * Return: 'true' @dest contains binary GUID
- * 'false' @dest contents are undefined
- */
-static bool ldm_parse_guid (const u8 *src, u8 *dest)
-{
- static const int size[] = { 4, 2, 2, 2, 6 };
- int i, j, v;
-
- if (src[8] != '-' || src[13] != '-' ||
- src[18] != '-' || src[23] != '-')
- return false;
-
- for (j = 0; j < 5; j++, src++)
- for (i = 0; i < size[j]; i++, src+=2, *dest++ = v)
- if ((v = ldm_parse_hexbyte (src)) < 0)
- return false;
-
- return true;
-}
-
-/**
- * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure
- * @data: Raw database PRIVHEAD structure loaded from the device
- * @ph: In-memory privhead structure in which to return parsed information
- *
- * This parses the LDM database PRIVHEAD structure supplied in @data and
- * sets up the in-memory privhead structure @ph with the obtained information.
- *
- * Return: 'true' @ph contains the PRIVHEAD data
- * 'false' @ph contents are undefined
- */
-static bool ldm_parse_privhead(const u8 *data, struct privhead *ph)
-{
- bool is_vista = false;
-
- BUG_ON(!data || !ph);
- if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) {
- ldm_error("Cannot find PRIVHEAD structure. LDM database is"
- " corrupt. Aborting.");
- return false;
- }
- ph->ver_major = get_unaligned_be16(data + 0x000C);
- ph->ver_minor = get_unaligned_be16(data + 0x000E);
- ph->logical_disk_start = get_unaligned_be64(data + 0x011B);
- ph->logical_disk_size = get_unaligned_be64(data + 0x0123);
- ph->config_start = get_unaligned_be64(data + 0x012B);
- ph->config_size = get_unaligned_be64(data + 0x0133);
- /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */
- if (ph->ver_major == 2 && ph->ver_minor == 12)
- is_vista = true;
- if (!is_vista && (ph->ver_major != 2 || ph->ver_minor != 11)) {
- ldm_error("Expected PRIVHEAD version 2.11 or 2.12, got %d.%d."
- " Aborting.", ph->ver_major, ph->ver_minor);
- return false;
- }
- ldm_debug("PRIVHEAD version %d.%d (Windows %s).", ph->ver_major,
- ph->ver_minor, is_vista ? "Vista" : "2000/XP");
- if (ph->config_size != LDM_DB_SIZE) { /* 1 MiB in sectors. */
- /* Warn the user and continue, carefully. */
- ldm_info("Database is normally %u bytes, it claims to "
- "be %llu bytes.", LDM_DB_SIZE,
- (unsigned long long)ph->config_size);
- }
- if ((ph->logical_disk_size == 0) || (ph->logical_disk_start +
- ph->logical_disk_size > ph->config_start)) {
- ldm_error("PRIVHEAD disk size doesn't match real disk size");
- return false;
- }
- if (!ldm_parse_guid(data + 0x0030, ph->disk_id)) {
- ldm_error("PRIVHEAD contains an invalid GUID.");
- return false;
- }
- ldm_debug("Parsed PRIVHEAD successfully.");
- return true;
-}
-
-/**
- * ldm_parse_tocblock - Read the LDM Database TOCBLOCK structure
- * @data: Raw database TOCBLOCK structure loaded from the device
- * @toc: In-memory toc structure in which to return parsed information
- *
- * This parses the LDM Database TOCBLOCK (table of contents) structure supplied
- * in @data and sets up the in-memory tocblock structure @toc with the obtained
- * information.
- *
- * N.B. The *_start and *_size values returned in @toc are not range-checked.
- *
- * Return: 'true' @toc contains the TOCBLOCK data
- * 'false' @toc contents are undefined
- */
-static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
-{
- BUG_ON (!data || !toc);
-
- if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) {
- ldm_crit ("Cannot find TOCBLOCK, database may be corrupt.");
- return false;
- }
- strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name));
- toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0;
- toc->bitmap1_start = get_unaligned_be64(data + 0x2E);
- toc->bitmap1_size = get_unaligned_be64(data + 0x36);
-
- if (strncmp (toc->bitmap1_name, TOC_BITMAP1,
- sizeof (toc->bitmap1_name)) != 0) {
- ldm_crit ("TOCBLOCK's first bitmap is '%s', should be '%s'.",
- TOC_BITMAP1, toc->bitmap1_name);
- return false;
- }
- strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name));
- toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0;
- toc->bitmap2_start = get_unaligned_be64(data + 0x50);
- toc->bitmap2_size = get_unaligned_be64(data + 0x58);
- if (strncmp (toc->bitmap2_name, TOC_BITMAP2,
- sizeof (toc->bitmap2_name)) != 0) {
- ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.",
- TOC_BITMAP2, toc->bitmap2_name);
- return false;
- }
- ldm_debug ("Parsed TOCBLOCK successfully.");
- return true;
-}
-
-/**
- * ldm_parse_vmdb - Read the LDM Database VMDB structure
- * @data: Raw database VMDB structure loaded from the device
- * @vm: In-memory vmdb structure in which to return parsed information
- *
- * This parses the LDM Database VMDB structure supplied in @data and sets up
- * the in-memory vmdb structure @vm with the obtained information.
- *
- * N.B. The *_start, *_size and *_seq values will be range-checked later.
- *
- * Return: 'true' @vm contains VMDB info
- * 'false' @vm contents are undefined
- */
-static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
-{
- BUG_ON (!data || !vm);
-
- if (MAGIC_VMDB != get_unaligned_be32(data)) {
- ldm_crit ("Cannot find the VMDB, database may be corrupt.");
- return false;
- }
-
- vm->ver_major = get_unaligned_be16(data + 0x12);
- vm->ver_minor = get_unaligned_be16(data + 0x14);
- if ((vm->ver_major != 4) || (vm->ver_minor != 10)) {
- ldm_error ("Expected VMDB version %d.%d, got %d.%d. "
- "Aborting.", 4, 10, vm->ver_major, vm->ver_minor);
- return false;
- }
-
- vm->vblk_size = get_unaligned_be32(data + 0x08);
- if (vm->vblk_size == 0) {
- ldm_error ("Illegal VBLK size");
- return false;
- }
-
- vm->vblk_offset = get_unaligned_be32(data + 0x0C);
- vm->last_vblk_seq = get_unaligned_be32(data + 0x04);
-
- ldm_debug ("Parsed VMDB successfully.");
- return true;
-}
-
-/**
- * ldm_compare_privheads - Compare two privhead objects
- * @ph1: First privhead
- * @ph2: Second privhead
- *
- * This compares the two privhead structures @ph1 and @ph2.
- *
- * Return: 'true' Identical
- * 'false' Different
- */
-static bool ldm_compare_privheads (const struct privhead *ph1,
- const struct privhead *ph2)
-{
- BUG_ON (!ph1 || !ph2);
-
- return ((ph1->ver_major == ph2->ver_major) &&
- (ph1->ver_minor == ph2->ver_minor) &&
- (ph1->logical_disk_start == ph2->logical_disk_start) &&
- (ph1->logical_disk_size == ph2->logical_disk_size) &&
- (ph1->config_start == ph2->config_start) &&
- (ph1->config_size == ph2->config_size) &&
- !memcmp (ph1->disk_id, ph2->disk_id, GUID_SIZE));
-}
-
-/**
- * ldm_compare_tocblocks - Compare two tocblock objects
- * @toc1: First toc
- * @toc2: Second toc
- *
- * This compares the two tocblock structures @toc1 and @toc2.
- *
- * Return: 'true' Identical
- * 'false' Different
- */
-static bool ldm_compare_tocblocks (const struct tocblock *toc1,
- const struct tocblock *toc2)
-{
- BUG_ON (!toc1 || !toc2);
-
- return ((toc1->bitmap1_start == toc2->bitmap1_start) &&
- (toc1->bitmap1_size == toc2->bitmap1_size) &&
- (toc1->bitmap2_start == toc2->bitmap2_start) &&
- (toc1->bitmap2_size == toc2->bitmap2_size) &&
- !strncmp (toc1->bitmap1_name, toc2->bitmap1_name,
- sizeof (toc1->bitmap1_name)) &&
- !strncmp (toc1->bitmap2_name, toc2->bitmap2_name,
- sizeof (toc1->bitmap2_name)));
-}
-
-/**
- * ldm_validate_privheads - Compare the primary privhead with its backups
- * @state: Partition check state including device holding the LDM Database
- * @ph1: Memory struct to fill with ph contents
- *
- * Read and compare all three privheads from disk.
- *
- * The privheads on disk show the size and location of the main disk area and
- * the configuration area (the database). The values are range-checked against
- * @hd, which contains the real size of the disk.
- *
- * Return: 'true' Success
- * 'false' Error
- */
-static bool ldm_validate_privheads(struct parsed_partitions *state,
- struct privhead *ph1)
-{
- static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
- struct privhead *ph[3] = { ph1 };
- Sector sect;
- u8 *data;
- bool result = false;
- long num_sects;
- int i;
-
- BUG_ON (!state || !ph1);
-
- ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
- ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
- if (!ph[1] || !ph[2]) {
- ldm_crit ("Out of memory.");
- goto out;
- }
-
- /* off[1 & 2] are relative to ph[0]->config_start */
- ph[0]->config_start = 0;
-
- /* Read and parse privheads */
- for (i = 0; i < 3; i++) {
- data = read_part_sector(state, ph[0]->config_start + off[i],
- &sect);
- if (!data) {
- ldm_crit ("Disk read failed.");
- goto out;
- }
- result = ldm_parse_privhead (data, ph[i]);
- put_dev_sector (sect);
- if (!result) {
- ldm_error ("Cannot find PRIVHEAD %d.", i+1); /* Log again */
- if (i < 2)
- goto out; /* Already logged */
- else
- break; /* FIXME ignore for now, 3rd PH can fail on odd-sized disks */
- }
- }
-
- num_sects = state->bdev->bd_inode->i_size >> 9;
-
- if ((ph[0]->config_start > num_sects) ||
- ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
- ldm_crit ("Database extends beyond the end of the disk.");
- goto out;
- }
-
- if ((ph[0]->logical_disk_start > ph[0]->config_start) ||
- ((ph[0]->logical_disk_start + ph[0]->logical_disk_size)
- > ph[0]->config_start)) {
- ldm_crit ("Disk and database overlap.");
- goto out;
- }
-
- if (!ldm_compare_privheads (ph[0], ph[1])) {
- ldm_crit ("Primary and backup PRIVHEADs don't match.");
- goto out;
- }
- /* FIXME ignore this for now
- if (!ldm_compare_privheads (ph[0], ph[2])) {
- ldm_crit ("Primary and backup PRIVHEADs don't match.");
- goto out;
- }*/
- ldm_debug ("Validated PRIVHEADs successfully.");
- result = true;
-out:
- kfree (ph[1]);
- kfree (ph[2]);
- return result;
-}
-
-/**
- * ldm_validate_tocblocks - Validate the table of contents and its backups
- * @state: Partition check state including device holding the LDM Database
- * @base: Offset, into @state->bdev, of the database
- * @ldb: Cache of the database structures
- *
- * Find and compare the four tables of contents of the LDM Database stored on
- * @state->bdev and return the parsed information into @toc1.
- *
- * The offsets and sizes of the configs are range-checked against a privhead.
- *
- * Return: 'true' @toc1 contains validated TOCBLOCK info
- * 'false' @toc1 contents are undefined
- */
-static bool ldm_validate_tocblocks(struct parsed_partitions *state,
- unsigned long base, struct ldmdb *ldb)
-{
- static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
- struct tocblock *tb[4];
- struct privhead *ph;
- Sector sect;
- u8 *data;
- int i, nr_tbs;
- bool result = false;
-
- BUG_ON(!state || !ldb);
- ph = &ldb->ph;
- tb[0] = &ldb->toc;
- tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
- if (!tb[1]) {
- ldm_crit("Out of memory.");
- goto err;
- }
- tb[2] = (struct tocblock*)((u8*)tb[1] + sizeof(*tb[1]));
- tb[3] = (struct tocblock*)((u8*)tb[2] + sizeof(*tb[2]));
- /*
- * Try to read and parse all four TOCBLOCKs.
- *
- * Windows Vista LDM v2.12 does not always have all four TOCBLOCKs so
- * skip any that fail as long as we get at least one valid TOCBLOCK.
- */
- for (nr_tbs = i = 0; i < 4; i++) {
- data = read_part_sector(state, base + off[i], &sect);
- if (!data) {
- ldm_error("Disk read failed for TOCBLOCK %d.", i);
- continue;
- }
- if (ldm_parse_tocblock(data, tb[nr_tbs]))
- nr_tbs++;
- put_dev_sector(sect);
- }
- if (!nr_tbs) {
- ldm_crit("Failed to find a valid TOCBLOCK.");
- goto err;
- }
- /* Range check the TOCBLOCK against a privhead. */
- if (((tb[0]->bitmap1_start + tb[0]->bitmap1_size) > ph->config_size) ||
- ((tb[0]->bitmap2_start + tb[0]->bitmap2_size) >
- ph->config_size)) {
- ldm_crit("The bitmaps are out of range. Giving up.");
- goto err;
- }
- /* Compare all loaded TOCBLOCKs. */
- for (i = 1; i < nr_tbs; i++) {
- if (!ldm_compare_tocblocks(tb[0], tb[i])) {
- ldm_crit("TOCBLOCKs 0 and %d do not match.", i);
- goto err;
- }
- }
- ldm_debug("Validated %d TOCBLOCKs successfully.", nr_tbs);
- result = true;
-err:
- kfree(tb[1]);
- return result;
-}
-
-/**
- * ldm_validate_vmdb - Read the VMDB and validate it
- * @state: Partition check state including device holding the LDM Database
- * @base: Offset, into @bdev, of the database
- * @ldb: Cache of the database structures
- *
- * Find the vmdb of the LDM Database stored on @bdev and return the parsed
- * information in @ldb.
- *
- * Return: 'true' @ldb contains validated VBDB info
- * 'false' @ldb contents are undefined
- */
-static bool ldm_validate_vmdb(struct parsed_partitions *state,
- unsigned long base, struct ldmdb *ldb)
-{
- Sector sect;
- u8 *data;
- bool result = false;
- struct vmdb *vm;
- struct tocblock *toc;
-
- BUG_ON (!state || !ldb);
-
- vm = &ldb->vm;
- toc = &ldb->toc;
-
- data = read_part_sector(state, base + OFF_VMDB, &sect);
- if (!data) {
- ldm_crit ("Disk read failed.");
- return false;
- }
-
- if (!ldm_parse_vmdb (data, vm))
- goto out; /* Already logged */
-
- /* Are there uncommitted transactions? */
- if (get_unaligned_be16(data + 0x10) != 0x01) {
- ldm_crit ("Database is not in a consistent state. Aborting.");
- goto out;
- }
-
- if (vm->vblk_offset != 512)
- ldm_info ("VBLKs start at offset 0x%04x.", vm->vblk_offset);
-
- /*
- * The last_vblkd_seq can be before the end of the vmdb, just make sure
- * it is not out of bounds.
- */
- if ((vm->vblk_size * vm->last_vblk_seq) > (toc->bitmap1_size << 9)) {
- ldm_crit ("VMDB exceeds allowed size specified by TOCBLOCK. "
- "Database is corrupt. Aborting.");
- goto out;
- }
-
- result = true;
-out:
- put_dev_sector (sect);
- return result;
-}
-
-
-/**
- * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
- * @state: Partition check state including device holding the LDM Database
- *
- * This function provides a weak test to decide whether the device is a dynamic
- * disk or not. It looks for an MS-DOS-style partition table containing at
- * least one partition of type 0x42 (formerly SFS, now used by Windows for
- * dynamic disks).
- *
- * N.B. The only possible error can come from the read_part_sector and that is
- * only likely to happen if the underlying device is strange. If that IS
- * the case we should return zero to let someone else try.
- *
- * Return: 'true' @state->bdev is a dynamic disk
- * 'false' @state->bdev is not a dynamic disk, or an error occurred
- */
-static bool ldm_validate_partition_table(struct parsed_partitions *state)
-{
- Sector sect;
- u8 *data;
- struct partition *p;
- int i;
- bool result = false;
-
- BUG_ON(!state);
-
- data = read_part_sector(state, 0, &sect);
- if (!data) {
- ldm_info ("Disk read failed.");
- return false;
- }
-
- if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC))
- goto out;
-
- p = (struct partition*)(data + 0x01BE);
- for (i = 0; i < 4; i++, p++)
- if (SYS_IND (p) == LDM_PARTITION) {
- result = true;
- break;
- }
-
- if (result)
- ldm_debug ("Found W2K dynamic disk partition type.");
-
-out:
- put_dev_sector (sect);
- return result;
-}
-
-/**
- * ldm_get_disk_objid - Search a linked list of vblk's for a given Disk Id
- * @ldb: Cache of the database structures
- *
- * The LDM Database contains a list of all partitions on all dynamic disks.
- * The primary PRIVHEAD, at the beginning of the physical disk, tells us
- * the GUID of this disk. This function searches for the GUID in a linked
- * list of vblk's.
- *
- * Return: Pointer, A matching vblk was found
- * NULL, No match, or an error
- */
-static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb)
-{
- struct list_head *item;
-
- BUG_ON (!ldb);
-
- list_for_each (item, &ldb->v_disk) {
- struct vblk *v = list_entry (item, struct vblk, list);
- if (!memcmp (v->vblk.disk.disk_id, ldb->ph.disk_id, GUID_SIZE))
- return v;
- }
-
- return NULL;
-}
-
-/**
- * ldm_create_data_partitions - Create data partitions for this device
- * @pp: List of the partitions parsed so far
- * @ldb: Cache of the database structures
- *
- * The database contains ALL the partitions for ALL disk groups, so we need to
- * filter out this specific disk. Using the disk's object id, we can find all
- * the partitions in the database that belong to this disk.
- *
- * Add each partition in our database, to the parsed_partitions structure.
- *
- * N.B. This function creates the partitions in the order it finds partition
- * objects in the linked list.
- *
- * Return: 'true' Partition created
- * 'false' Error, probably a range checking problem
- */
-static bool ldm_create_data_partitions (struct parsed_partitions *pp,
- const struct ldmdb *ldb)
-{
- struct list_head *item;
- struct vblk *vb;
- struct vblk *disk;
- struct vblk_part *part;
- int part_num = 1;
-
- BUG_ON (!pp || !ldb);
-
- disk = ldm_get_disk_objid (ldb);
- if (!disk) {
- ldm_crit ("Can't find the ID of this disk in the database.");
- return false;
- }
-
- strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE);
-
- /* Create the data partitions */
- list_for_each (item, &ldb->v_part) {
- vb = list_entry (item, struct vblk, list);
- part = &vb->vblk.part;
-
- if (part->disk_id != disk->obj_id)
- continue;
-
- put_partition (pp, part_num, ldb->ph.logical_disk_start +
- part->start, part->size);
- part_num++;
- }
-
- strlcat(pp->pp_buf, "\n", PAGE_SIZE);
- return true;
-}
-
-
-/**
- * ldm_relative - Calculate the next relative offset
- * @buffer: Block of data being worked on
- * @buflen: Size of the block of data
- * @base: Size of the previous fixed width fields
- * @offset: Cumulative size of the previous variable-width fields
- *
- * Because many of the VBLK fields are variable-width, it's necessary
- * to calculate each offset based on the previous one and the length
- * of the field it pointed to.
- *
- * Return: -1 Error, the calculated offset exceeded the size of the buffer
- * n OK, a range-checked offset into buffer
- */
-static int ldm_relative(const u8 *buffer, int buflen, int base, int offset)
-{
-
- base += offset;
- if (!buffer || offset < 0 || base > buflen) {
- if (!buffer)
- ldm_error("!buffer");
- if (offset < 0)
- ldm_error("offset (%d) < 0", offset);
- if (base > buflen)
- ldm_error("base (%d) > buflen (%d)", base, buflen);
- return -1;
- }
- if (base + buffer[base] >= buflen) {
- ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base,
- buffer[base], buflen);
- return -1;
- }
- return buffer[base] + offset + 1;
-}
-
-/**
- * ldm_get_vnum - Convert a variable-width, big endian number, into cpu order
- * @block: Pointer to the variable-width number to convert
- *
- * Large numbers in the LDM Database are often stored in a packed format. Each
- * number is prefixed by a one byte width marker. All numbers in the database
- * are stored in big-endian byte order. This function reads one of these
- * numbers and returns the result
- *
- * N.B. This function DOES NOT perform any range checking, though the most
- * it will read is eight bytes.
- *
- * Return: n A number
- * 0 Zero, or an error occurred
- */
-static u64 ldm_get_vnum (const u8 *block)
-{
- u64 tmp = 0;
- u8 length;
-
- BUG_ON (!block);
-
- length = *block++;
-
- if (length && length <= 8)
- while (length--)
- tmp = (tmp << 8) | *block++;
- else
- ldm_error ("Illegal length %d.", length);
-
- return tmp;
-}
-
-/**
- * ldm_get_vstr - Read a length-prefixed string into a buffer
- * @block: Pointer to the length marker
- * @buffer: Location to copy string to
- * @buflen: Size of the output buffer
- *
- * Many of the strings in the LDM Database are not NULL terminated. Instead
- * they are prefixed by a one byte length marker. This function copies one of
- * these strings into a buffer.
- *
- * N.B. This function DOES NOT perform any range checking on the input.
- * If the buffer is too small, the output will be truncated.
- *
- * Return: 0, Error and @buffer contents are undefined
- * n, String length in characters (excluding NULL)
- * buflen-1, String was truncated.
- */
-static int ldm_get_vstr (const u8 *block, u8 *buffer, int buflen)
-{
- int length;
-
- BUG_ON (!block || !buffer);
-
- length = block[0];
- if (length >= buflen) {
- ldm_error ("Truncating string %d -> %d.", length, buflen);
- length = buflen - 1;
- }
- memcpy (buffer, block + 1, length);
- buffer[length] = 0;
- return length;
-}
-
-
-/**
- * ldm_parse_cmp3 - Read a raw VBLK Component object into a vblk structure
- * @buffer: Block of data being worked on
- * @buflen: Size of the block of data
- * @vb: In-memory vblk in which to return information
- *
- * Read a raw VBLK Component object (version 3) into a vblk structure.
- *
- * Return: 'true' @vb contains a Component VBLK
- * 'false' @vb contents are not defined
- */
-static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb)
-{
- int r_objid, r_name, r_vstate, r_child, r_parent, r_stripe, r_cols, len;
- struct vblk_comp *comp;
-
- BUG_ON (!buffer || !vb);
-
- r_objid = ldm_relative (buffer, buflen, 0x18, 0);
- r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
- r_vstate = ldm_relative (buffer, buflen, 0x18, r_name);
- r_child = ldm_relative (buffer, buflen, 0x1D, r_vstate);
- r_parent = ldm_relative (buffer, buflen, 0x2D, r_child);
-
- if (buffer[0x12] & VBLK_FLAG_COMP_STRIPE) {
- r_stripe = ldm_relative (buffer, buflen, 0x2E, r_parent);
- r_cols = ldm_relative (buffer, buflen, 0x2E, r_stripe);
- len = r_cols;
- } else {
- r_stripe = 0;
- r_cols = 0;
- len = r_parent;
- }
- if (len < 0)
- return false;
-
- len += VBLK_SIZE_CMP3;
- if (len != get_unaligned_be32(buffer + 0x14))
- return false;
-
- comp = &vb->vblk.comp;
- ldm_get_vstr (buffer + 0x18 + r_name, comp->state,
- sizeof (comp->state));
- comp->type = buffer[0x18 + r_vstate];
- comp->children = ldm_get_vnum (buffer + 0x1D + r_vstate);
- comp->parent_id = ldm_get_vnum (buffer + 0x2D + r_child);
- comp->chunksize = r_stripe ? ldm_get_vnum (buffer+r_parent+0x2E) : 0;
-
- return true;
-}
-
-/**
- * ldm_parse_dgr3 - Read a raw VBLK Disk Group object into a vblk structure
- * @buffer: Block of data being worked on
- * @buflen: Size of the block of data
- * @vb: In-memory vblk in which to return information
- *
- * Read a raw VBLK Disk Group object (version 3) into a vblk structure.
- *
- * Return: 'true' @vb contains a Disk Group VBLK
- * 'false' @vb contents are not defined
- */
-static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb)
-{
- int r_objid, r_name, r_diskid, r_id1, r_id2, len;
- struct vblk_dgrp *dgrp;
-
- BUG_ON (!buffer || !vb);
-
- r_objid = ldm_relative (buffer, buflen, 0x18, 0);
- r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
- r_diskid = ldm_relative (buffer, buflen, 0x18, r_name);
-
- if (buffer[0x12] & VBLK_FLAG_DGR3_IDS) {
- r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid);
- r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1);
- len = r_id2;
- } else {
- r_id1 = 0;
- r_id2 = 0;
- len = r_diskid;
- }
- if (len < 0)
- return false;
-
- len += VBLK_SIZE_DGR3;
- if (len != get_unaligned_be32(buffer + 0x14))
- return false;
-
- dgrp = &vb->vblk.dgrp;
- ldm_get_vstr (buffer + 0x18 + r_name, dgrp->disk_id,
- sizeof (dgrp->disk_id));
- return true;
-}
-
-/**
- * ldm_parse_dgr4 - Read a raw VBLK Disk Group object into a vblk structure
- * @buffer: Block of data being worked on
- * @buflen: Size of the block of data
- * @vb: In-memory vblk in which to return information
- *
- * Read a raw VBLK Disk Group object (version 4) into a vblk structure.
- *
- * Return: 'true' @vb contains a Disk Group VBLK
- * 'false' @vb contents are not defined
- */
-static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
-{
- char buf[64];
- int r_objid, r_name, r_id1, r_id2, len;
- struct vblk_dgrp *dgrp;
-
- BUG_ON (!buffer || !vb);
-
- r_objid = ldm_relative (buffer, buflen, 0x18, 0);
- r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
-
- if (buffer[0x12] & VBLK_FLAG_DGR4_IDS) {
- r_id1 = ldm_relative (buffer, buflen, 0x44, r_name);
- r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1);
- len = r_id2;
- } else {
- r_id1 = 0;
- r_id2 = 0;
- len = r_name;
- }
- if (len < 0)
- return false;
-
- len += VBLK_SIZE_DGR4;
- if (len != get_unaligned_be32(buffer + 0x14))
- return false;
-
- dgrp = &vb->vblk.dgrp;
-
- ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf));
- return true;
-}
-
-/**
- * ldm_parse_dsk3 - Read a raw VBLK Disk object into a vblk structure
- * @buffer: Block of data being worked on
- * @buflen: Size of the block of data
- * @vb: In-memory vblk in which to return information
- *
- * Read a raw VBLK Disk object (version 3) into a vblk structure.
- *
- * Return: 'true' @vb contains a Disk VBLK
- * 'false' @vb contents are not defined
- */
-static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb)
-{
- int r_objid, r_name, r_diskid, r_altname, len;
- struct vblk_disk *disk;
-
- BUG_ON (!buffer || !vb);
-
- r_objid = ldm_relative (buffer, buflen, 0x18, 0);
- r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
- r_diskid = ldm_relative (buffer, buflen, 0x18, r_name);
- r_altname = ldm_relative (buffer, buflen, 0x18, r_diskid);
- len = r_altname;
- if (len < 0)
- return false;
-
- len += VBLK_SIZE_DSK3;
- if (len != get_unaligned_be32(buffer + 0x14))
- return false;
-
- disk = &vb->vblk.disk;
- ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name,
- sizeof (disk->alt_name));
- if (!ldm_parse_guid (buffer + 0x19 + r_name, disk->disk_id))
- return false;
-
- return true;
-}
-
-/**
- * ldm_parse_dsk4 - Read a raw VBLK Disk object into a vblk structure
- * @buffer: Block of data being worked on
- * @buflen: Size of the block of data
- * @vb: In-memory vblk in which to return information
- *
- * Read a raw VBLK Disk object (version 4) into a vblk structure.
- *
- * Return: 'true' @vb contains a Disk VBLK
- * 'false' @vb contents are not defined
- */
-static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb)
-{
- int r_objid, r_name, len;
- struct vblk_disk *disk;
-
- BUG_ON (!buffer || !vb);
-
- r_objid = ldm_relative (buffer, buflen, 0x18, 0);
- r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
- len = r_name;
- if (len < 0)
- return false;
-
- len += VBLK_SIZE_DSK4;
- if (len != get_unaligned_be32(buffer + 0x14))
- return false;
-
- disk = &vb->vblk.disk;
- memcpy (disk->disk_id, buffer + 0x18 + r_name, GUID_SIZE);
- return true;
-}
-
-/**
- * ldm_parse_prt3 - Read a raw VBLK Partition object into a vblk structure
- * @buffer: Block of data being worked on
- * @buflen: Size of the block of data
- * @vb: In-memory vblk in which to return information
- *
- * Read a raw VBLK Partition object (version 3) into a vblk structure.
- *
- * Return: 'true' @vb contains a Partition VBLK
- * 'false' @vb contents are not defined
- */
-static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb)
-{
- int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len;
- struct vblk_part *part;
-
- BUG_ON(!buffer || !vb);
- r_objid = ldm_relative(buffer, buflen, 0x18, 0);
- if (r_objid < 0) {
- ldm_error("r_objid %d < 0", r_objid);
- return false;
- }
- r_name = ldm_relative(buffer, buflen, 0x18, r_objid);
- if (r_name < 0) {
- ldm_error("r_name %d < 0", r_name);
- return false;
- }
- r_size = ldm_relative(buffer, buflen, 0x34, r_name);
- if (r_size < 0) {
- ldm_error("r_size %d < 0", r_size);
- return false;
- }
- r_parent = ldm_relative(buffer, buflen, 0x34, r_size);
- if (r_parent < 0) {
- ldm_error("r_parent %d < 0", r_parent);
- return false;
- }
- r_diskid = ldm_relative(buffer, buflen, 0x34, r_parent);
- if (r_diskid < 0) {
- ldm_error("r_diskid %d < 0", r_diskid);
- return false;
- }
- if (buffer[0x12] & VBLK_FLAG_PART_INDEX) {
- r_index = ldm_relative(buffer, buflen, 0x34, r_diskid);
- if (r_index < 0) {
- ldm_error("r_index %d < 0", r_index);
- return false;
- }
- len = r_index;
- } else {
- r_index = 0;
- len = r_diskid;
- }
- if (len < 0) {
- ldm_error("len %d < 0", len);
- return false;
- }
- len += VBLK_SIZE_PRT3;
- if (len > get_unaligned_be32(buffer + 0x14)) {
- ldm_error("len %d > BE32(buffer + 0x14) %d", len,
- get_unaligned_be32(buffer + 0x14));
- return false;
- }
- part = &vb->vblk.part;
- part->start = get_unaligned_be64(buffer + 0x24 + r_name);
- part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name);
- part->size = ldm_get_vnum(buffer + 0x34 + r_name);
- part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size);
- part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent);
- if (vb->flags & VBLK_FLAG_PART_INDEX)
- part->partnum = buffer[0x35 + r_diskid];
- else
- part->partnum = 0;
- return true;
-}
-
-/**
- * ldm_parse_vol5 - Read a raw VBLK Volume object into a vblk structure
- * @buffer: Block of data being worked on
- * @buflen: Size of the block of data
- * @vb: In-memory vblk in which to return information
- *
- * Read a raw VBLK Volume object (version 5) into a vblk structure.
- *
- * Return: 'true' @vb contains a Volume VBLK
- * 'false' @vb contents are not defined
- */
-static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb)
-{
- int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size;
- int r_id1, r_id2, r_size2, r_drive, len;
- struct vblk_volu *volu;
-
- BUG_ON(!buffer || !vb);
- r_objid = ldm_relative(buffer, buflen, 0x18, 0);
- if (r_objid < 0) {
- ldm_error("r_objid %d < 0", r_objid);
- return false;
- }
- r_name = ldm_relative(buffer, buflen, 0x18, r_objid);
- if (r_name < 0) {
- ldm_error("r_name %d < 0", r_name);
- return false;
- }
- r_vtype = ldm_relative(buffer, buflen, 0x18, r_name);
- if (r_vtype < 0) {
- ldm_error("r_vtype %d < 0", r_vtype);
- return false;
- }
- r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype);
- if (r_disable_drive_letter < 0) {
- ldm_error("r_disable_drive_letter %d < 0",
- r_disable_drive_letter);
- return false;
- }
- r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter);
- if (r_child < 0) {
- ldm_error("r_child %d < 0", r_child);
- return false;
- }
- r_size = ldm_relative(buffer, buflen, 0x3D, r_child);
- if (r_size < 0) {
- ldm_error("r_size %d < 0", r_size);
- return false;
- }
- if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) {
- r_id1 = ldm_relative(buffer, buflen, 0x52, r_size);
- if (r_id1 < 0) {
- ldm_error("r_id1 %d < 0", r_id1);
- return false;
- }
- } else
- r_id1 = r_size;
- if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) {
- r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1);
- if (r_id2 < 0) {
- ldm_error("r_id2 %d < 0", r_id2);
- return false;
- }
- } else
- r_id2 = r_id1;
- if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) {
- r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2);
- if (r_size2 < 0) {
- ldm_error("r_size2 %d < 0", r_size2);
- return false;
- }
- } else
- r_size2 = r_id2;
- if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) {
- r_drive = ldm_relative(buffer, buflen, 0x52, r_size2);
- if (r_drive < 0) {
- ldm_error("r_drive %d < 0", r_drive);
- return false;
- }
- } else
- r_drive = r_size2;
- len = r_drive;
- if (len < 0) {
- ldm_error("len %d < 0", len);
- return false;
- }
- len += VBLK_SIZE_VOL5;
- if (len > get_unaligned_be32(buffer + 0x14)) {
- ldm_error("len %d > BE32(buffer + 0x14) %d", len,
- get_unaligned_be32(buffer + 0x14));
- return false;
- }
- volu = &vb->vblk.volu;
- ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type,
- sizeof(volu->volume_type));
- memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter,
- sizeof(volu->volume_state));
- volu->size = ldm_get_vnum(buffer + 0x3D + r_child);
- volu->partition_type = buffer[0x41 + r_size];
- memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid));
- if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) {
- ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint,
- sizeof(volu->drive_hint));
- }
- return true;
-}
-
-/**
- * ldm_parse_vblk - Read a raw VBLK object into a vblk structure
- * @buf: Block of data being worked on
- * @len: Size of the block of data
- * @vb: In-memory vblk in which to return information
- *
- * Read a raw VBLK object into a vblk structure. This function just reads the
- * information common to all VBLK types, then delegates the rest of the work to
- * helper functions: ldm_parse_*.
- *
- * Return: 'true' @vb contains a VBLK
- * 'false' @vb contents are not defined
- */
-static bool ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb)
-{
- bool result = false;
- int r_objid;
-
- BUG_ON (!buf || !vb);
-
- r_objid = ldm_relative (buf, len, 0x18, 0);
- if (r_objid < 0) {
- ldm_error ("VBLK header is corrupt.");
- return false;
- }
-
- vb->flags = buf[0x12];
- vb->type = buf[0x13];
- vb->obj_id = ldm_get_vnum (buf + 0x18);
- ldm_get_vstr (buf+0x18+r_objid, vb->name, sizeof (vb->name));
-
- switch (vb->type) {
- case VBLK_CMP3: result = ldm_parse_cmp3 (buf, len, vb); break;
- case VBLK_DSK3: result = ldm_parse_dsk3 (buf, len, vb); break;
- case VBLK_DSK4: result = ldm_parse_dsk4 (buf, len, vb); break;
- case VBLK_DGR3: result = ldm_parse_dgr3 (buf, len, vb); break;
- case VBLK_DGR4: result = ldm_parse_dgr4 (buf, len, vb); break;
- case VBLK_PRT3: result = ldm_parse_prt3 (buf, len, vb); break;
- case VBLK_VOL5: result = ldm_parse_vol5 (buf, len, vb); break;
- }
-
- if (result)
- ldm_debug ("Parsed VBLK 0x%llx (type: 0x%02x) ok.",
- (unsigned long long) vb->obj_id, vb->type);
- else
- ldm_error ("Failed to parse VBLK 0x%llx (type: 0x%02x).",
- (unsigned long long) vb->obj_id, vb->type);
-
- return result;
-}
-
-
-/**
- * ldm_ldmdb_add - Adds a raw VBLK entry to the ldmdb database
- * @data: Raw VBLK to add to the database
- * @len: Size of the raw VBLK
- * @ldb: Cache of the database structures
- *
- * The VBLKs are sorted into categories. Partitions are also sorted by offset.
- *
- * N.B. This function does not check the validity of the VBLKs.
- *
- * Return: 'true' The VBLK was added
- * 'false' An error occurred
- */
-static bool ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb)
-{
- struct vblk *vb;
- struct list_head *item;
-
- BUG_ON (!data || !ldb);
-
- vb = kmalloc (sizeof (*vb), GFP_KERNEL);
- if (!vb) {
- ldm_crit ("Out of memory.");
- return false;
- }
-
- if (!ldm_parse_vblk (data, len, vb)) {
- kfree(vb);
- return false; /* Already logged */
- }
-
- /* Put vblk into the correct list. */
- switch (vb->type) {
- case VBLK_DGR3:
- case VBLK_DGR4:
- list_add (&vb->list, &ldb->v_dgrp);
- break;
- case VBLK_DSK3:
- case VBLK_DSK4:
- list_add (&vb->list, &ldb->v_disk);
- break;
- case VBLK_VOL5:
- list_add (&vb->list, &ldb->v_volu);
- break;
- case VBLK_CMP3:
- list_add (&vb->list, &ldb->v_comp);
- break;
- case VBLK_PRT3:
- /* Sort by the partition's start sector. */
- list_for_each (item, &ldb->v_part) {
- struct vblk *v = list_entry (item, struct vblk, list);
- if ((v->vblk.part.disk_id == vb->vblk.part.disk_id) &&
- (v->vblk.part.start > vb->vblk.part.start)) {
- list_add_tail (&vb->list, &v->list);
- return true;
- }
- }
- list_add_tail (&vb->list, &ldb->v_part);
- break;
- }
- return true;
-}
-
-/**
- * ldm_frag_add - Add a VBLK fragment to a list
- * @data: Raw fragment to be added to the list
- * @size: Size of the raw fragment
- * @frags: Linked list of VBLK fragments
- *
- * Fragmented VBLKs may not be consecutive in the database, so they are placed
- * in a list so they can be pieced together later.
- *
- * Return: 'true' Success, the VBLK was added to the list
- * 'false' Error, a problem occurred
- */
-static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
-{
- struct frag *f;
- struct list_head *item;
- int rec, num, group;
-
- BUG_ON (!data || !frags);
-
- if (size < 2 * VBLK_SIZE_HEAD) {
- ldm_error("Value of size is to small.");
- return false;
- }
-
- group = get_unaligned_be32(data + 0x08);
- rec = get_unaligned_be16(data + 0x0C);
- num = get_unaligned_be16(data + 0x0E);
- if ((num < 1) || (num > 4)) {
- ldm_error ("A VBLK claims to have %d parts.", num);
- return false;
- }
- if (rec >= num) {
- ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num);
- return false;
- }
-
- list_for_each (item, frags) {
- f = list_entry (item, struct frag, list);
- if (f->group == group)
- goto found;
- }
-
- f = kmalloc (sizeof (*f) + size*num, GFP_KERNEL);
- if (!f) {
- ldm_crit ("Out of memory.");
- return false;
- }
-
- f->group = group;
- f->num = num;
- f->rec = rec;
- f->map = 0xFF << num;
-
- list_add_tail (&f->list, frags);
-found:
- if (rec >= f->num) {
- ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num);
- return false;
- }
-
- if (f->map & (1 << rec)) {
- ldm_error ("Duplicate VBLK, part %d.", rec);
- f->map &= 0x7F; /* Mark the group as broken */
- return false;
- }
-
- f->map |= (1 << rec);
-
- data += VBLK_SIZE_HEAD;
- size -= VBLK_SIZE_HEAD;
-
- memcpy (f->data+rec*(size-VBLK_SIZE_HEAD)+VBLK_SIZE_HEAD, data, size);
-
- return true;
-}
-
-/**
- * ldm_frag_free - Free a linked list of VBLK fragments
- * @list: Linked list of fragments
- *
- * Free a linked list of VBLK fragments
- *
- * Return: none
- */
-static void ldm_frag_free (struct list_head *list)
-{
- struct list_head *item, *tmp;
-
- BUG_ON (!list);
-
- list_for_each_safe (item, tmp, list)
- kfree (list_entry (item, struct frag, list));
-}
-
-/**
- * ldm_frag_commit - Validate fragmented VBLKs and add them to the database
- * @frags: Linked list of VBLK fragments
- * @ldb: Cache of the database structures
- *
- * Now that all the fragmented VBLKs have been collected, they must be added to
- * the database for later use.
- *
- * Return: 'true' All the fragments we added successfully
- * 'false' One or more of the fragments we invalid
- */
-static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
-{
- struct frag *f;
- struct list_head *item;
-
- BUG_ON (!frags || !ldb);
-
- list_for_each (item, frags) {
- f = list_entry (item, struct frag, list);
-
- if (f->map != 0xFF) {
- ldm_error ("VBLK group %d is incomplete (0x%02x).",
- f->group, f->map);
- return false;
- }
-
- if (!ldm_ldmdb_add (f->data, f->num*ldb->vm.vblk_size, ldb))
- return false; /* Already logged */
- }
- return true;
-}
-
-/**
- * ldm_get_vblks - Read the on-disk database of VBLKs into memory
- * @state: Partition check state including device holding the LDM Database
- * @base: Offset, into @state->bdev, of the database
- * @ldb: Cache of the database structures
- *
- * To use the information from the VBLKs, they need to be read from the disk,
- * unpacked and validated. We cache them in @ldb according to their type.
- *
- * Return: 'true' All the VBLKs were read successfully
- * 'false' An error occurred
- */
-static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
- struct ldmdb *ldb)
-{
- int size, perbuf, skip, finish, s, v, recs;
- u8 *data = NULL;
- Sector sect;
- bool result = false;
- LIST_HEAD (frags);
-
- BUG_ON(!state || !ldb);
-
- size = ldb->vm.vblk_size;
- perbuf = 512 / size;
- skip = ldb->vm.vblk_offset >> 9; /* Bytes to sectors */
- finish = (size * ldb->vm.last_vblk_seq) >> 9;
-
- for (s = skip; s < finish; s++) { /* For each sector */
- data = read_part_sector(state, base + OFF_VMDB + s, &sect);
- if (!data) {
- ldm_crit ("Disk read failed.");
- goto out;
- }
-
- for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */
- if (MAGIC_VBLK != get_unaligned_be32(data)) {
- ldm_error ("Expected to find a VBLK.");
- goto out;
- }
-
- recs = get_unaligned_be16(data + 0x0E); /* Number of records */
- if (recs == 1) {
- if (!ldm_ldmdb_add (data, size, ldb))
- goto out; /* Already logged */
- } else if (recs > 1) {
- if (!ldm_frag_add (data, size, &frags))
- goto out; /* Already logged */
- }
- /* else Record is not in use, ignore it. */
- }
- put_dev_sector (sect);
- data = NULL;
- }
-
- result = ldm_frag_commit (&frags, ldb); /* Failures, already logged */
-out:
- if (data)
- put_dev_sector (sect);
- ldm_frag_free (&frags);
-
- return result;
-}
-
-/**
- * ldm_free_vblks - Free a linked list of vblk's
- * @lh: Head of a linked list of struct vblk
- *
- * Free a list of vblk's and free the memory used to maintain the list.
- *
- * Return: none
- */
-static void ldm_free_vblks (struct list_head *lh)
-{
- struct list_head *item, *tmp;
-
- BUG_ON (!lh);
-
- list_for_each_safe (item, tmp, lh)
- kfree (list_entry (item, struct vblk, list));
-}
-
-
-/**
- * ldm_partition - Find out whether a device is a dynamic disk and handle it
- * @state: Partition check state including device holding the LDM Database
- *
- * This determines whether the device @bdev is a dynamic disk and if so creates
- * the partitions necessary in the gendisk structure pointed to by @hd.
- *
- * We create a dummy device 1, which contains the LDM database, and then create
- * each partition described by the LDM database in sequence as devices 2+. For
- * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
- * and so on: the actual data containing partitions.
- *
- * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
- * 0 Success, @state->bdev is not a dynamic disk
- * -1 An error occurred before enough information had been read
- * Or @state->bdev is a dynamic disk, but it may be corrupted
- */
-int ldm_partition(struct parsed_partitions *state)
-{
- struct ldmdb *ldb;
- unsigned long base;
- int result = -1;
-
- BUG_ON(!state);
-
- /* Look for signs of a Dynamic Disk */
- if (!ldm_validate_partition_table(state))
- return 0;
-
- ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
- if (!ldb) {
- ldm_crit ("Out of memory.");
- goto out;
- }
-
- /* Parse and check privheads. */
- if (!ldm_validate_privheads(state, &ldb->ph))
- goto out; /* Already logged */
-
- /* All further references are relative to base (database start). */
- base = ldb->ph.config_start;
-
- /* Parse and check tocs and vmdb. */
- if (!ldm_validate_tocblocks(state, base, ldb) ||
- !ldm_validate_vmdb(state, base, ldb))
- goto out; /* Already logged */
-
- /* Initialize vblk lists in ldmdb struct */
- INIT_LIST_HEAD (&ldb->v_dgrp);
- INIT_LIST_HEAD (&ldb->v_disk);
- INIT_LIST_HEAD (&ldb->v_volu);
- INIT_LIST_HEAD (&ldb->v_comp);
- INIT_LIST_HEAD (&ldb->v_part);
-
- if (!ldm_get_vblks(state, base, ldb)) {
- ldm_crit ("Failed to read the VBLKs from the database.");
- goto cleanup;
- }
-
- /* Finally, create the data partition devices. */
- if (ldm_create_data_partitions(state, ldb)) {
- ldm_debug ("Parsed LDM database successfully.");
- result = 1;
- }
- /* else Already logged */
-
-cleanup:
- ldm_free_vblks (&ldb->v_dgrp);
- ldm_free_vblks (&ldb->v_disk);
- ldm_free_vblks (&ldb->v_volu);
- ldm_free_vblks (&ldb->v_comp);
- ldm_free_vblks (&ldb->v_part);
-out:
- kfree (ldb);
- return result;
-}
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
deleted file mode 100644
index 374242c0971a..000000000000
--- a/fs/partitions/ldm.h
+++ /dev/null
@@ -1,215 +0,0 @@
-/**
- * ldm - Part of the Linux-NTFS project.
- *
- * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
- * Copyright (c) 2001-2007 Anton Altaparmakov
- * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
- *
- * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program (in the main directory of the Linux-NTFS source
- * in the file COPYING); if not, write to the Free Software Foundation,
- * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _FS_PT_LDM_H_
-#define _FS_PT_LDM_H_
-
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/genhd.h>
-#include <linux/fs.h>
-#include <asm/unaligned.h>
-#include <asm/byteorder.h>
-
-struct parsed_partitions;
-
-/* Magic numbers in CPU format. */
-#define MAGIC_VMDB 0x564D4442 /* VMDB */
-#define MAGIC_VBLK 0x56424C4B /* VBLK */
-#define MAGIC_PRIVHEAD 0x5052495648454144ULL /* PRIVHEAD */
-#define MAGIC_TOCBLOCK 0x544F43424C4F434BULL /* TOCBLOCK */
-
-/* The defined vblk types. */
-#define VBLK_VOL5 0x51 /* Volume, version 5 */
-#define VBLK_CMP3 0x32 /* Component, version 3 */
-#define VBLK_PRT3 0x33 /* Partition, version 3 */
-#define VBLK_DSK3 0x34 /* Disk, version 3 */
-#define VBLK_DSK4 0x44 /* Disk, version 4 */
-#define VBLK_DGR3 0x35 /* Disk Group, version 3 */
-#define VBLK_DGR4 0x45 /* Disk Group, version 4 */
-
-/* vblk flags indicating extra information will be present */
-#define VBLK_FLAG_COMP_STRIPE 0x10
-#define VBLK_FLAG_PART_INDEX 0x08
-#define VBLK_FLAG_DGR3_IDS 0x08
-#define VBLK_FLAG_DGR4_IDS 0x08
-#define VBLK_FLAG_VOLU_ID1 0x08
-#define VBLK_FLAG_VOLU_ID2 0x20
-#define VBLK_FLAG_VOLU_SIZE 0x80
-#define VBLK_FLAG_VOLU_DRIVE 0x02
-
-/* size of a vblk's static parts */
-#define VBLK_SIZE_HEAD 16
-#define VBLK_SIZE_CMP3 22 /* Name and version */
-#define VBLK_SIZE_DGR3 12
-#define VBLK_SIZE_DGR4 44
-#define VBLK_SIZE_DSK3 12
-#define VBLK_SIZE_DSK4 45
-#define VBLK_SIZE_PRT3 28
-#define VBLK_SIZE_VOL5 58
-
-/* component types */
-#define COMP_STRIPE 0x01 /* Stripe-set */
-#define COMP_BASIC 0x02 /* Basic disk */
-#define COMP_RAID 0x03 /* Raid-set */
-
-/* Other constants. */
-#define LDM_DB_SIZE 2048 /* Size in sectors (= 1MiB). */
-
-#define OFF_PRIV1 6 /* Offset of the first privhead
- relative to the start of the
- device in sectors */
-
-/* Offsets to structures within the LDM Database in sectors. */
-#define OFF_PRIV2 1856 /* Backup private headers. */
-#define OFF_PRIV3 2047
-
-#define OFF_TOCB1 1 /* Tables of contents. */
-#define OFF_TOCB2 2
-#define OFF_TOCB3 2045
-#define OFF_TOCB4 2046
-
-#define OFF_VMDB 17 /* List of partitions. */
-
-#define LDM_PARTITION 0x42 /* Formerly SFS (Landis). */
-
-#define TOC_BITMAP1 "config" /* Names of the two defined */
-#define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */
-
-/* Borrowed from msdos.c */
-#define SYS_IND(p) (get_unaligned(&(p)->sys_ind))
-
-struct frag { /* VBLK Fragment handling */
- struct list_head list;
- u32 group;
- u8 num; /* Total number of records */
- u8 rec; /* This is record number n */
- u8 map; /* Which portions are in use */
- u8 data[0];
-};
-
-/* In memory LDM database structures. */
-
-#define GUID_SIZE 16
-
-struct privhead { /* Offsets and sizes are in sectors. */
- u16 ver_major;
- u16 ver_minor;
- u64 logical_disk_start;
- u64 logical_disk_size;
- u64 config_start;
- u64 config_size;
- u8 disk_id[GUID_SIZE];
-};
-
-struct tocblock { /* We have exactly two bitmaps. */
- u8 bitmap1_name[16];
- u64 bitmap1_start;
- u64 bitmap1_size;
- u8 bitmap2_name[16];
- u64 bitmap2_start;
- u64 bitmap2_size;
-};
-
-struct vmdb { /* VMDB: The database header */
- u16 ver_major;
- u16 ver_minor;
- u32 vblk_size;
- u32 vblk_offset;
- u32 last_vblk_seq;
-};
-
-struct vblk_comp { /* VBLK Component */
- u8 state[16];
- u64 parent_id;
- u8 type;
- u8 children;
- u16 chunksize;
-};
-
-struct vblk_dgrp { /* VBLK Disk Group */
- u8 disk_id[64];
-};
-
-struct vblk_disk { /* VBLK Disk */
- u8 disk_id[GUID_SIZE];
- u8 alt_name[128];
-};
-
-struct vblk_part { /* VBLK Partition */
- u64 start;
- u64 size; /* start, size and vol_off in sectors */
- u64 volume_offset;
- u64 parent_id;
- u64 disk_id;
- u8 partnum;
-};
-
-struct vblk_volu { /* VBLK Volume */
- u8 volume_type[16];
- u8 volume_state[16];
- u8 guid[16];
- u8 drive_hint[4];
- u64 size;
- u8 partition_type;
-};
-
-struct vblk_head { /* VBLK standard header */
- u32 group;
- u16 rec;
- u16 nrec;
-};
-
-struct vblk { /* Generalised VBLK */
- u8 name[64];
- u64 obj_id;
- u32 sequence;
- u8 flags;
- u8 type;
- union {
- struct vblk_comp comp;
- struct vblk_dgrp dgrp;
- struct vblk_disk disk;
- struct vblk_part part;
- struct vblk_volu volu;
- } vblk;
- struct list_head list;
-};
-
-struct ldmdb { /* Cache of the database */
- struct privhead ph;
- struct tocblock toc;
- struct vmdb vm;
- struct list_head v_dgrp;
- struct list_head v_disk;
- struct list_head v_volu;
- struct list_head v_comp;
- struct list_head v_part;
-};
-
-int ldm_partition(struct parsed_partitions *state);
-
-#endif /* _FS_PT_LDM_H_ */
-
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
deleted file mode 100644
index 11f688bd76c5..000000000000
--- a/fs/partitions/mac.c
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * fs/partitions/mac.c
- *
- * Code extracted from drivers/block/genhd.c
- * Copyright (C) 1991-1998 Linus Torvalds
- * Re-organised Feb 1998 Russell King
- */
-
-#include <linux/ctype.h>
-#include "check.h"
-#include "mac.h"
-
-#ifdef CONFIG_PPC_PMAC
-#include <asm/machdep.h>
-extern void note_bootable_part(dev_t dev, int part, int goodness);
-#endif
-
-/*
- * Code to understand MacOS partition tables.
- */
-
-static inline void mac_fix_string(char *stg, int len)
-{
- int i;
-
- for (i = len - 1; i >= 0 && stg[i] == ' '; i--)
- stg[i] = 0;
-}
-
-int mac_partition(struct parsed_partitions *state)
-{
- Sector sect;
- unsigned char *data;
- int slot, blocks_in_map;
- unsigned secsize;
-#ifdef CONFIG_PPC_PMAC
- int found_root = 0;
- int found_root_goodness = 0;
-#endif
- struct mac_partition *part;
- struct mac_driver_desc *md;
-
- /* Get 0th block and look at the first partition map entry. */
- md = read_part_sector(state, 0, &sect);
- if (!md)
- return -1;
- if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
- put_dev_sector(sect);
- return 0;
- }
- secsize = be16_to_cpu(md->block_size);
- put_dev_sector(sect);
- data = read_part_sector(state, secsize/512, &sect);
- if (!data)
- return -1;
- part = (struct mac_partition *) (data + secsize%512);
- if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) {
- put_dev_sector(sect);
- return 0; /* not a MacOS disk */
- }
- blocks_in_map = be32_to_cpu(part->map_count);
- if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) {
- put_dev_sector(sect);
- return 0;
- }
- strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
- for (slot = 1; slot <= blocks_in_map; ++slot) {
- int pos = slot * secsize;
- put_dev_sector(sect);
- data = read_part_sector(state, pos/512, &sect);
- if (!data)
- return -1;
- part = (struct mac_partition *) (data + pos%512);
- if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC)
- break;
- put_partition(state, slot,
- be32_to_cpu(part->start_block) * (secsize/512),
- be32_to_cpu(part->block_count) * (secsize/512));
-
- if (!strnicmp(part->type, "Linux_RAID", 10))
- state->parts[slot].flags = ADDPART_FLAG_RAID;
-#ifdef CONFIG_PPC_PMAC
- /*
- * If this is the first bootable partition, tell the
- * setup code, in case it wants to make this the root.
- */
- if (machine_is(powermac)) {
- int goodness = 0;
-
- mac_fix_string(part->processor, 16);
- mac_fix_string(part->name, 32);
- mac_fix_string(part->type, 32);
-
- if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE)
- && strcasecmp(part->processor, "powerpc") == 0)
- goodness++;
-
- if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0
- || (strnicmp(part->type, "Linux", 5) == 0
- && strcasecmp(part->type, "Linux_swap") != 0)) {
- int i, l;
-
- goodness++;
- l = strlen(part->name);
- if (strcmp(part->name, "/") == 0)
- goodness++;
- for (i = 0; i <= l - 4; ++i) {
- if (strnicmp(part->name + i, "root",
- 4) == 0) {
- goodness += 2;
- break;
- }
- }
- if (strnicmp(part->name, "swap", 4) == 0)
- goodness--;
- }
-
- if (goodness > found_root_goodness) {
- found_root = slot;
- found_root_goodness = goodness;
- }
- }
-#endif /* CONFIG_PPC_PMAC */
- }
-#ifdef CONFIG_PPC_PMAC
- if (found_root_goodness)
- note_bootable_part(state->bdev->bd_dev, found_root,
- found_root_goodness);
-#endif
-
- put_dev_sector(sect);
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
- return 1;
-}
diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h
deleted file mode 100644
index 3c7d98436380..000000000000
--- a/fs/partitions/mac.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * fs/partitions/mac.h
- */
-
-#define MAC_PARTITION_MAGIC 0x504d
-
-/* type field value for A/UX or other Unix partitions */
-#define APPLE_AUX_TYPE "Apple_UNIX_SVR2"
-
-struct mac_partition {
- __be16 signature; /* expected to be MAC_PARTITION_MAGIC */
- __be16 res1;
- __be32 map_count; /* # blocks in partition map */
- __be32 start_block; /* absolute starting block # of partition */
- __be32 block_count; /* number of blocks in partition */
- char name[32]; /* partition name */
- char type[32]; /* string type description */
- __be32 data_start; /* rel block # of first data block */
- __be32 data_count; /* number of data blocks */
- __be32 status; /* partition status bits */
- __be32 boot_start;
- __be32 boot_size;
- __be32 boot_load;
- __be32 boot_load2;
- __be32 boot_entry;
- __be32 boot_entry2;
- __be32 boot_cksum;
- char processor[16]; /* identifies ISA of boot */
- /* there is more stuff after this that we don't need */
-};
-
-#define MAC_STATUS_BOOTABLE 8 /* partition is bootable */
-
-#define MAC_DRIVER_MAGIC 0x4552
-
-/* Driver descriptor structure, in block 0 */
-struct mac_driver_desc {
- __be16 signature; /* expected to be MAC_DRIVER_MAGIC */
- __be16 block_size;
- __be32 block_count;
- /* ... more stuff */
-};
-
-int mac_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
deleted file mode 100644
index 5f79a6677c69..000000000000
--- a/fs/partitions/msdos.c
+++ /dev/null
@@ -1,552 +0,0 @@
-/*
- * fs/partitions/msdos.c
- *
- * Code extracted from drivers/block/genhd.c
- * Copyright (C) 1991-1998 Linus Torvalds
- *
- * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug
- * in the early extended-partition checks and added DM partitions
- *
- * Support for DiskManager v6.0x added by Mark Lord,
- * with information provided by OnTrack. This now works for linux fdisk
- * and LILO, as well as loadlin and bootln. Note that disks other than
- * /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1).
- *
- * More flexible handling of extended partitions - aeb, 950831
- *
- * Check partition table on IDE disks for common CHS translations
- *
- * Re-organised Feb 1998 Russell King
- */
-#include <linux/msdos_fs.h>
-
-#include "check.h"
-#include "msdos.h"
-#include "efi.h"
-
-/*
- * Many architectures don't like unaligned accesses, while
- * the nr_sects and start_sect partition table entries are
- * at a 2 (mod 4) address.
- */
-#include <asm/unaligned.h>
-
-#define SYS_IND(p) get_unaligned(&p->sys_ind)
-
-static inline sector_t nr_sects(struct partition *p)
-{
- return (sector_t)get_unaligned_le32(&p->nr_sects);
-}
-
-static inline sector_t start_sect(struct partition *p)
-{
- return (sector_t)get_unaligned_le32(&p->start_sect);
-}
-
-static inline int is_extended_partition(struct partition *p)
-{
- return (SYS_IND(p) == DOS_EXTENDED_PARTITION ||
- SYS_IND(p) == WIN98_EXTENDED_PARTITION ||
- SYS_IND(p) == LINUX_EXTENDED_PARTITION);
-}
-
-#define MSDOS_LABEL_MAGIC1 0x55
-#define MSDOS_LABEL_MAGIC2 0xAA
-
-static inline int
-msdos_magic_present(unsigned char *p)
-{
- return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2);
-}
-
-/* Value is EBCDIC 'IBMA' */
-#define AIX_LABEL_MAGIC1 0xC9
-#define AIX_LABEL_MAGIC2 0xC2
-#define AIX_LABEL_MAGIC3 0xD4
-#define AIX_LABEL_MAGIC4 0xC1
-static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
-{
- struct partition *pt = (struct partition *) (p + 0x1be);
- Sector sect;
- unsigned char *d;
- int slot, ret = 0;
-
- if (!(p[0] == AIX_LABEL_MAGIC1 &&
- p[1] == AIX_LABEL_MAGIC2 &&
- p[2] == AIX_LABEL_MAGIC3 &&
- p[3] == AIX_LABEL_MAGIC4))
- return 0;
- /* Assume the partition table is valid if Linux partitions exists */
- for (slot = 1; slot <= 4; slot++, pt++) {
- if (pt->sys_ind == LINUX_SWAP_PARTITION ||
- pt->sys_ind == LINUX_RAID_PARTITION ||
- pt->sys_ind == LINUX_DATA_PARTITION ||
- pt->sys_ind == LINUX_LVM_PARTITION ||
- is_extended_partition(pt))
- return 0;
- }
- d = read_part_sector(state, 7, &sect);
- if (d) {
- if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
- ret = 1;
- put_dev_sector(sect);
- };
- return ret;
-}
-
-/*
- * Create devices for each logical partition in an extended partition.
- * The logical partitions form a linked list, with each entry being
- * a partition table with two entries. The first entry
- * is the real data partition (with a start relative to the partition
- * table start). The second is a pointer to the next logical partition
- * (with a start relative to the entire extended partition).
- * We do not create a Linux partition for the partition tables, but
- * only for the actual data partitions.
- */
-
-static void parse_extended(struct parsed_partitions *state,
- sector_t first_sector, sector_t first_size)
-{
- struct partition *p;
- Sector sect;
- unsigned char *data;
- sector_t this_sector, this_size;
- sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
- int loopct = 0; /* number of links followed
- without finding a data partition */
- int i;
-
- this_sector = first_sector;
- this_size = first_size;
-
- while (1) {
- if (++loopct > 100)
- return;
- if (state->next == state->limit)
- return;
- data = read_part_sector(state, this_sector, &sect);
- if (!data)
- return;
-
- if (!msdos_magic_present(data + 510))
- goto done;
-
- p = (struct partition *) (data + 0x1be);
-
- /*
- * Usually, the first entry is the real data partition,
- * the 2nd entry is the next extended partition, or empty,
- * and the 3rd and 4th entries are unused.
- * However, DRDOS sometimes has the extended partition as
- * the first entry (when the data partition is empty),
- * and OS/2 seems to use all four entries.
- */
-
- /*
- * First process the data partition(s)
- */
- for (i=0; i<4; i++, p++) {
- sector_t offs, size, next;
- if (!nr_sects(p) || is_extended_partition(p))
- continue;
-
- /* Check the 3rd and 4th entries -
- these sometimes contain random garbage */
- offs = start_sect(p)*sector_size;
- size = nr_sects(p)*sector_size;
- next = this_sector + offs;
- if (i >= 2) {
- if (offs + size > this_size)
- continue;
- if (next < first_sector)
- continue;
- if (next + size > first_sector + first_size)
- continue;
- }
-
- put_partition(state, state->next, next, size);
- if (SYS_IND(p) == LINUX_RAID_PARTITION)
- state->parts[state->next].flags = ADDPART_FLAG_RAID;
- loopct = 0;
- if (++state->next == state->limit)
- goto done;
- }
- /*
- * Next, process the (first) extended partition, if present.
- * (So far, there seems to be no reason to make
- * parse_extended() recursive and allow a tree
- * of extended partitions.)
- * It should be a link to the next logical partition.
- */
- p -= 4;
- for (i=0; i<4; i++, p++)
- if (nr_sects(p) && is_extended_partition(p))
- break;
- if (i == 4)
- goto done; /* nothing left to do */
-
- this_sector = first_sector + start_sect(p) * sector_size;
- this_size = nr_sects(p) * sector_size;
- put_dev_sector(sect);
- }
-done:
- put_dev_sector(sect);
-}
-
-/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
- indicates linux swap. Be careful before believing this is Solaris. */
-
-static void parse_solaris_x86(struct parsed_partitions *state,
- sector_t offset, sector_t size, int origin)
-{
-#ifdef CONFIG_SOLARIS_X86_PARTITION
- Sector sect;
- struct solaris_x86_vtoc *v;
- int i;
- short max_nparts;
-
- v = read_part_sector(state, offset + 1, &sect);
- if (!v)
- return;
- if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
- put_dev_sector(sect);
- return;
- }
- {
- char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];
-
- snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin);
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- }
- if (le32_to_cpu(v->v_version) != 1) {
- char tmp[64];
-
- snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n",
- le32_to_cpu(v->v_version));
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- put_dev_sector(sect);
- return;
- }
- /* Ensure we can handle previous case of VTOC with 8 entries gracefully */
- max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
- for (i=0; i<max_nparts && state->next<state->limit; i++) {
- struct solaris_x86_slice *s = &v->v_slice[i];
- char tmp[3 + 10 + 1 + 1];
-
- if (s->s_size == 0)
- continue;
- snprintf(tmp, sizeof(tmp), " [s%d]", i);
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- /* solaris partitions are relative to current MS-DOS
- * one; must add the offset of the current partition */
- put_partition(state, state->next++,
- le32_to_cpu(s->s_start)+offset,
- le32_to_cpu(s->s_size));
- }
- put_dev_sector(sect);
- strlcat(state->pp_buf, " >\n", PAGE_SIZE);
-#endif
-}
-
-#if defined(CONFIG_BSD_DISKLABEL)
-/*
- * Create devices for BSD partitions listed in a disklabel, under a
- * dos-like partition. See parse_extended() for more information.
- */
-static void parse_bsd(struct parsed_partitions *state,
- sector_t offset, sector_t size, int origin, char *flavour,
- int max_partitions)
-{
- Sector sect;
- struct bsd_disklabel *l;
- struct bsd_partition *p;
- char tmp[64];
-
- l = read_part_sector(state, offset + 1, &sect);
- if (!l)
- return;
- if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
- put_dev_sector(sect);
- return;
- }
-
- snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour);
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
-
- if (le16_to_cpu(l->d_npartitions) < max_partitions)
- max_partitions = le16_to_cpu(l->d_npartitions);
- for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
- sector_t bsd_start, bsd_size;
-
- if (state->next == state->limit)
- break;
- if (p->p_fstype == BSD_FS_UNUSED)
- continue;
- bsd_start = le32_to_cpu(p->p_offset);
- bsd_size = le32_to_cpu(p->p_size);
- if (offset == bsd_start && size == bsd_size)
- /* full parent partition, we have it already */
- continue;
- if (offset > bsd_start || offset+size < bsd_start+bsd_size) {
- strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE);
- continue;
- }
- put_partition(state, state->next++, bsd_start, bsd_size);
- }
- put_dev_sector(sect);
- if (le16_to_cpu(l->d_npartitions) > max_partitions) {
- snprintf(tmp, sizeof(tmp), " (ignored %d more)",
- le16_to_cpu(l->d_npartitions) - max_partitions);
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- }
- strlcat(state->pp_buf, " >\n", PAGE_SIZE);
-}
-#endif
-
-static void parse_freebsd(struct parsed_partitions *state,
- sector_t offset, sector_t size, int origin)
-{
-#ifdef CONFIG_BSD_DISKLABEL
- parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
-#endif
-}
-
-static void parse_netbsd(struct parsed_partitions *state,
- sector_t offset, sector_t size, int origin)
-{
-#ifdef CONFIG_BSD_DISKLABEL
- parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
-#endif
-}
-
-static void parse_openbsd(struct parsed_partitions *state,
- sector_t offset, sector_t size, int origin)
-{
-#ifdef CONFIG_BSD_DISKLABEL
- parse_bsd(state, offset, size, origin, "openbsd",
- OPENBSD_MAXPARTITIONS);
-#endif
-}
-
-/*
- * Create devices for Unixware partitions listed in a disklabel, under a
- * dos-like partition. See parse_extended() for more information.
- */
-static void parse_unixware(struct parsed_partitions *state,
- sector_t offset, sector_t size, int origin)
-{
-#ifdef CONFIG_UNIXWARE_DISKLABEL
- Sector sect;
- struct unixware_disklabel *l;
- struct unixware_slice *p;
-
- l = read_part_sector(state, offset + 29, &sect);
- if (!l)
- return;
- if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
- le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) {
- put_dev_sector(sect);
- return;
- }
- {
- char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1];
-
- snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin);
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- }
- p = &l->vtoc.v_slice[1];
- /* I omit the 0th slice as it is the same as whole disk. */
- while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
- if (state->next == state->limit)
- break;
-
- if (p->s_label != UNIXWARE_FS_UNUSED)
- put_partition(state, state->next++,
- le32_to_cpu(p->start_sect),
- le32_to_cpu(p->nr_sects));
- p++;
- }
- put_dev_sector(sect);
- strlcat(state->pp_buf, " >\n", PAGE_SIZE);
-#endif
-}
-
-/*
- * Minix 2.0.0/2.0.2 subpartition support.
- * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
- * Rajeev V. Pillai <rajeevvp@yahoo.com>
- */
-static void parse_minix(struct parsed_partitions *state,
- sector_t offset, sector_t size, int origin)
-{
-#ifdef CONFIG_MINIX_SUBPARTITION
- Sector sect;
- unsigned char *data;
- struct partition *p;
- int i;
-
- data = read_part_sector(state, offset, &sect);
- if (!data)
- return;
-
- p = (struct partition *)(data + 0x1be);
-
- /* The first sector of a Minix partition can have either
- * a secondary MBR describing its subpartitions, or
- * the normal boot sector. */
- if (msdos_magic_present (data + 510) &&
- SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
- char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
-
- snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) {
- if (state->next == state->limit)
- break;
- /* add each partition in use */
- if (SYS_IND(p) == MINIX_PARTITION)
- put_partition(state, state->next++,
- start_sect(p), nr_sects(p));
- }
- strlcat(state->pp_buf, " >\n", PAGE_SIZE);
- }
- put_dev_sector(sect);
-#endif /* CONFIG_MINIX_SUBPARTITION */
-}
-
-static struct {
- unsigned char id;
- void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
-} subtypes[] = {
- {FREEBSD_PARTITION, parse_freebsd},
- {NETBSD_PARTITION, parse_netbsd},
- {OPENBSD_PARTITION, parse_openbsd},
- {MINIX_PARTITION, parse_minix},
- {UNIXWARE_PARTITION, parse_unixware},
- {SOLARIS_X86_PARTITION, parse_solaris_x86},
- {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86},
- {0, NULL},
-};
-
-int msdos_partition(struct parsed_partitions *state)
-{
- sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
- Sector sect;
- unsigned char *data;
- struct partition *p;
- struct fat_boot_sector *fb;
- int slot;
-
- data = read_part_sector(state, 0, &sect);
- if (!data)
- return -1;
- if (!msdos_magic_present(data + 510)) {
- put_dev_sector(sect);
- return 0;
- }
-
- if (aix_magic_present(state, data)) {
- put_dev_sector(sect);
- strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
- return 0;
- }
-
- /*
- * Now that the 55aa signature is present, this is probably
- * either the boot sector of a FAT filesystem or a DOS-type
- * partition table. Reject this in case the boot indicator
- * is not 0 or 0x80.
- */
- p = (struct partition *) (data + 0x1be);
- for (slot = 1; slot <= 4; slot++, p++) {
- if (p->boot_ind != 0 && p->boot_ind != 0x80) {
- /*
- * Even without a valid boot inidicator value
- * its still possible this is valid FAT filesystem
- * without a partition table.
- */
- fb = (struct fat_boot_sector *) data;
- if (slot == 1 && fb->reserved && fb->fats
- && fat_valid_media(fb->media)) {
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
- put_dev_sector(sect);
- return 1;
- } else {
- put_dev_sector(sect);
- return 0;
- }
- }
- }
-
-#ifdef CONFIG_EFI_PARTITION
- p = (struct partition *) (data + 0x1be);
- for (slot = 1 ; slot <= 4 ; slot++, p++) {
- /* If this is an EFI GPT disk, msdos should ignore it. */
- if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) {
- put_dev_sector(sect);
- return 0;
- }
- }
-#endif
- p = (struct partition *) (data + 0x1be);
-
- /*
- * Look for partitions in two passes:
- * First find the primary and DOS-type extended partitions.
- * On the second pass look inside *BSD, Unixware and Solaris partitions.
- */
-
- state->next = 5;
- for (slot = 1 ; slot <= 4 ; slot++, p++) {
- sector_t start = start_sect(p)*sector_size;
- sector_t size = nr_sects(p)*sector_size;
- if (!size)
- continue;
- if (is_extended_partition(p)) {
- /*
- * prevent someone doing mkfs or mkswap on an
- * extended partition, but leave room for LILO
- * FIXME: this uses one logical sector for > 512b
- * sector, although it may not be enough/proper.
- */
- sector_t n = 2;
- n = min(size, max(sector_size, n));
- put_partition(state, slot, start, n);
-
- strlcat(state->pp_buf, " <", PAGE_SIZE);
- parse_extended(state, start, size);
- strlcat(state->pp_buf, " >", PAGE_SIZE);
- continue;
- }
- put_partition(state, slot, start, size);
- if (SYS_IND(p) == LINUX_RAID_PARTITION)
- state->parts[slot].flags = ADDPART_FLAG_RAID;
- if (SYS_IND(p) == DM6_PARTITION)
- strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
- if (SYS_IND(p) == EZD_PARTITION)
- strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
- }
-
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
-
- /* second pass - output for each on a separate line */
- p = (struct partition *) (0x1be + data);
- for (slot = 1 ; slot <= 4 ; slot++, p++) {
- unsigned char id = SYS_IND(p);
- int n;
-
- if (!nr_sects(p))
- continue;
-
- for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
- ;
-
- if (!subtypes[n].parse)
- continue;
- subtypes[n].parse(state, start_sect(p) * sector_size,
- nr_sects(p) * sector_size, slot);
- }
- put_dev_sector(sect);
- return 1;
-}
diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h
deleted file mode 100644
index 38c781c490b3..000000000000
--- a/fs/partitions/msdos.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/*
- * fs/partitions/msdos.h
- */
-
-#define MSDOS_LABEL_MAGIC 0xAA55
-
-int msdos_partition(struct parsed_partitions *state);
-
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
deleted file mode 100644
index 764b86a01965..000000000000
--- a/fs/partitions/osf.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * fs/partitions/osf.c
- *
- * Code extracted from drivers/block/genhd.c
- *
- * Copyright (C) 1991-1998 Linus Torvalds
- * Re-organised Feb 1998 Russell King
- */
-
-#include "check.h"
-#include "osf.h"
-
-#define MAX_OSF_PARTITIONS 18
-
-int osf_partition(struct parsed_partitions *state)
-{
- int i;
- int slot = 1;
- unsigned int npartitions;
- Sector sect;
- unsigned char *data;
- struct disklabel {
- __le32 d_magic;
- __le16 d_type,d_subtype;
- u8 d_typename[16];
- u8 d_packname[16];
- __le32 d_secsize;
- __le32 d_nsectors;
- __le32 d_ntracks;
- __le32 d_ncylinders;
- __le32 d_secpercyl;
- __le32 d_secprtunit;
- __le16 d_sparespertrack;
- __le16 d_sparespercyl;
- __le32 d_acylinders;
- __le16 d_rpm, d_interleave, d_trackskew, d_cylskew;
- __le32 d_headswitch, d_trkseek, d_flags;
- __le32 d_drivedata[5];
- __le32 d_spare[5];
- __le32 d_magic2;
- __le16 d_checksum;
- __le16 d_npartitions;
- __le32 d_bbsize, d_sbsize;
- struct d_partition {
- __le32 p_size;
- __le32 p_offset;
- __le32 p_fsize;
- u8 p_fstype;
- u8 p_frag;
- __le16 p_cpg;
- } d_partitions[MAX_OSF_PARTITIONS];
- } * label;
- struct d_partition * partition;
-
- data = read_part_sector(state, 0, &sect);
- if (!data)
- return -1;
-
- label = (struct disklabel *) (data+64);
- partition = label->d_partitions;
- if (le32_to_cpu(label->d_magic) != DISKLABELMAGIC) {
- put_dev_sector(sect);
- return 0;
- }
- if (le32_to_cpu(label->d_magic2) != DISKLABELMAGIC) {
- put_dev_sector(sect);
- return 0;
- }
- npartitions = le16_to_cpu(label->d_npartitions);
- if (npartitions > MAX_OSF_PARTITIONS) {
- put_dev_sector(sect);
- return 0;
- }
- for (i = 0 ; i < npartitions; i++, partition++) {
- if (slot == state->limit)
- break;
- if (le32_to_cpu(partition->p_size))
- put_partition(state, slot,
- le32_to_cpu(partition->p_offset),
- le32_to_cpu(partition->p_size));
- slot++;
- }
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
- put_dev_sector(sect);
- return 1;
-}
diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h
deleted file mode 100644
index 20ed2315ec16..000000000000
--- a/fs/partitions/osf.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/*
- * fs/partitions/osf.h
- */
-
-#define DISKLABELMAGIC (0x82564557UL)
-
-int osf_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
deleted file mode 100644
index ea8a86dceaf4..000000000000
--- a/fs/partitions/sgi.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * fs/partitions/sgi.c
- *
- * Code extracted from drivers/block/genhd.c
- */
-
-#include "check.h"
-#include "sgi.h"
-
-struct sgi_disklabel {
- __be32 magic_mushroom; /* Big fat spliff... */
- __be16 root_part_num; /* Root partition number */
- __be16 swap_part_num; /* Swap partition number */
- s8 boot_file[16]; /* Name of boot file for ARCS */
- u8 _unused0[48]; /* Device parameter useless crapola.. */
- struct sgi_volume {
- s8 name[8]; /* Name of volume */
- __be32 block_num; /* Logical block number */
- __be32 num_bytes; /* How big, in bytes */
- } volume[15];
- struct sgi_partition {
- __be32 num_blocks; /* Size in logical blocks */
- __be32 first_block; /* First logical block */
- __be32 type; /* Type of this partition */
- } partitions[16];
- __be32 csum; /* Disk label checksum */
- __be32 _unused1; /* Padding */
-};
-
-int sgi_partition(struct parsed_partitions *state)
-{
- int i, csum;
- __be32 magic;
- int slot = 1;
- unsigned int start, blocks;
- __be32 *ui, cs;
- Sector sect;
- struct sgi_disklabel *label;
- struct sgi_partition *p;
- char b[BDEVNAME_SIZE];
-
- label = read_part_sector(state, 0, &sect);
- if (!label)
- return -1;
- p = &label->partitions[0];
- magic = label->magic_mushroom;
- if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) {
- /*printk("Dev %s SGI disklabel: bad magic %08x\n",
- bdevname(bdev, b), be32_to_cpu(magic));*/
- put_dev_sector(sect);
- return 0;
- }
- ui = ((__be32 *) (label + 1)) - 1;
- for(csum = 0; ui >= ((__be32 *) label);) {
- cs = *ui--;
- csum += be32_to_cpu(cs);
- }
- if(csum) {
- printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
- bdevname(state->bdev, b));
- put_dev_sector(sect);
- return 0;
- }
- /* All SGI disk labels have 16 partitions, disks under Linux only
- * have 15 minor's. Luckily there are always a few zero length
- * partitions which we don't care about so we never overflow the
- * current_minor.
- */
- for(i = 0; i < 16; i++, p++) {
- blocks = be32_to_cpu(p->num_blocks);
- start = be32_to_cpu(p->first_block);
- if (blocks) {
- put_partition(state, slot, start, blocks);
- if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION)
- state->parts[slot].flags = ADDPART_FLAG_RAID;
- }
- slot++;
- }
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
- put_dev_sector(sect);
- return 1;
-}
diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h
deleted file mode 100644
index b9553ebdd5a9..000000000000
--- a/fs/partitions/sgi.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/*
- * fs/partitions/sgi.h
- */
-
-extern int sgi_partition(struct parsed_partitions *state);
-
-#define SGI_LABEL_MAGIC 0x0be5a941
-
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
deleted file mode 100644
index b5b6fcfb3d36..000000000000
--- a/fs/partitions/sun.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * fs/partitions/sun.c
- *
- * Code extracted from drivers/block/genhd.c
- *
- * Copyright (C) 1991-1998 Linus Torvalds
- * Re-organised Feb 1998 Russell King
- */
-
-#include "check.h"
-#include "sun.h"
-
-int sun_partition(struct parsed_partitions *state)
-{
- int i;
- __be16 csum;
- int slot = 1;
- __be16 *ush;
- Sector sect;
- struct sun_disklabel {
- unsigned char info[128]; /* Informative text string */
- struct sun_vtoc {
- __be32 version; /* Layout version */
- char volume[8]; /* Volume name */
- __be16 nparts; /* Number of partitions */
- struct sun_info { /* Partition hdrs, sec 2 */
- __be16 id;
- __be16 flags;
- } infos[8];
- __be16 padding; /* Alignment padding */
- __be32 bootinfo[3]; /* Info needed by mboot */
- __be32 sanity; /* To verify vtoc sanity */
- __be32 reserved[10]; /* Free space */
- __be32 timestamp[8]; /* Partition timestamp */
- } vtoc;
- __be32 write_reinstruct; /* sectors to skip, writes */
- __be32 read_reinstruct; /* sectors to skip, reads */
- unsigned char spare[148]; /* Padding */
- __be16 rspeed; /* Disk rotational speed */
- __be16 pcylcount; /* Physical cylinder count */
- __be16 sparecyl; /* extra sects per cylinder */
- __be16 obs1; /* gap1 */
- __be16 obs2; /* gap2 */
- __be16 ilfact; /* Interleave factor */
- __be16 ncyl; /* Data cylinder count */
- __be16 nacyl; /* Alt. cylinder count */
- __be16 ntrks; /* Tracks per cylinder */
- __be16 nsect; /* Sectors per track */
- __be16 obs3; /* bhead - Label head offset */
- __be16 obs4; /* ppart - Physical Partition */
- struct sun_partition {
- __be32 start_cylinder;
- __be32 num_sectors;
- } partitions[8];
- __be16 magic; /* Magic number */
- __be16 csum; /* Label xor'd checksum */
- } * label;
- struct sun_partition *p;
- unsigned long spc;
- char b[BDEVNAME_SIZE];
- int use_vtoc;
- int nparts;
-
- label = read_part_sector(state, 0, &sect);
- if (!label)
- return -1;
-
- p = label->partitions;
- if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) {
-/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n",
- bdevname(bdev, b), be16_to_cpu(label->magic)); */
- put_dev_sector(sect);
- return 0;
- }
- /* Look at the checksum */
- ush = ((__be16 *) (label+1)) - 1;
- for (csum = 0; ush >= ((__be16 *) label);)
- csum ^= *ush--;
- if (csum) {
- printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
- bdevname(state->bdev, b));
- put_dev_sector(sect);
- return 0;
- }
-
- /* Check to see if we can use the VTOC table */
- use_vtoc = ((be32_to_cpu(label->vtoc.sanity) == SUN_VTOC_SANITY) &&
- (be32_to_cpu(label->vtoc.version) == 1) &&
- (be16_to_cpu(label->vtoc.nparts) <= 8));
-
- /* Use 8 partition entries if not specified in validated VTOC */
- nparts = (use_vtoc) ? be16_to_cpu(label->vtoc.nparts) : 8;
-
- /*
- * So that old Linux-Sun partitions continue to work,
- * alow the VTOC to be used under the additional condition ...
- */
- use_vtoc = use_vtoc || !(label->vtoc.sanity ||
- label->vtoc.version || label->vtoc.nparts);
- spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect);
- for (i = 0; i < nparts; i++, p++) {
- unsigned long st_sector;
- unsigned int num_sectors;
-
- st_sector = be32_to_cpu(p->start_cylinder) * spc;
- num_sectors = be32_to_cpu(p->num_sectors);
- if (num_sectors) {
- put_partition(state, slot, st_sector, num_sectors);
- state->parts[slot].flags = 0;
- if (use_vtoc) {
- if (be16_to_cpu(label->vtoc.infos[i].id) == LINUX_RAID_PARTITION)
- state->parts[slot].flags |= ADDPART_FLAG_RAID;
- else if (be16_to_cpu(label->vtoc.infos[i].id) == SUN_WHOLE_DISK)
- state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK;
- }
- }
- slot++;
- }
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
- put_dev_sector(sect);
- return 1;
-}
diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h
deleted file mode 100644
index 2424baa8319f..000000000000
--- a/fs/partitions/sun.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/*
- * fs/partitions/sun.h
- */
-
-#define SUN_LABEL_MAGIC 0xDABE
-#define SUN_VTOC_SANITY 0x600DDEEE
-
-int sun_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
deleted file mode 100644
index 9627ccffc1c4..000000000000
--- a/fs/partitions/sysv68.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * fs/partitions/sysv68.c
- *
- * Copyright (C) 2007 Philippe De Muyter <phdm@macqel.be>
- */
-
-#include "check.h"
-#include "sysv68.h"
-
-/*
- * Volume ID structure: on first 256-bytes sector of disk
- */
-
-struct volumeid {
- u8 vid_unused[248];
- u8 vid_mac[8]; /* ASCII string "MOTOROLA" */
-};
-
-/*
- * config block: second 256-bytes sector on disk
- */
-
-struct dkconfig {
- u8 ios_unused0[128];
- __be32 ios_slcblk; /* Slice table block number */
- __be16 ios_slccnt; /* Number of entries in slice table */
- u8 ios_unused1[122];
-};
-
-/*
- * combined volumeid and dkconfig block
- */
-
-struct dkblk0 {
- struct volumeid dk_vid;
- struct dkconfig dk_ios;
-};
-
-/*
- * Slice Table Structure
- */
-
-struct slice {
- __be32 nblocks; /* slice size (in blocks) */
- __be32 blkoff; /* block offset of slice */
-};
-
-
-int sysv68_partition(struct parsed_partitions *state)
-{
- int i, slices;
- int slot = 1;
- Sector sect;
- unsigned char *data;
- struct dkblk0 *b;
- struct slice *slice;
- char tmp[64];
-
- data = read_part_sector(state, 0, &sect);
- if (!data)
- return -1;
-
- b = (struct dkblk0 *)data;
- if (memcmp(b->dk_vid.vid_mac, "MOTOROLA", sizeof(b->dk_vid.vid_mac))) {
- put_dev_sector(sect);
- return 0;
- }
- slices = be16_to_cpu(b->dk_ios.ios_slccnt);
- i = be32_to_cpu(b->dk_ios.ios_slcblk);
- put_dev_sector(sect);
-
- data = read_part_sector(state, i, &sect);
- if (!data)
- return -1;
-
- slices -= 1; /* last slice is the whole disk */
- snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices);
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- slice = (struct slice *)data;
- for (i = 0; i < slices; i++, slice++) {
- if (slot == state->limit)
- break;
- if (be32_to_cpu(slice->nblocks)) {
- put_partition(state, slot,
- be32_to_cpu(slice->blkoff),
- be32_to_cpu(slice->nblocks));
- snprintf(tmp, sizeof(tmp), "(s%u)", i);
- strlcat(state->pp_buf, tmp, PAGE_SIZE);
- }
- slot++;
- }
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
- put_dev_sector(sect);
- return 1;
-}
diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h
deleted file mode 100644
index bf2f5ffa97ac..000000000000
--- a/fs/partitions/sysv68.h
+++ /dev/null
@@ -1 +0,0 @@
-extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
deleted file mode 100644
index 8dbaf9f77a99..000000000000
--- a/fs/partitions/ultrix.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * fs/partitions/ultrix.c
- *
- * Code extracted from drivers/block/genhd.c
- *
- * Re-organised Jul 1999 Russell King
- */
-
-#include "check.h"
-#include "ultrix.h"
-
-int ultrix_partition(struct parsed_partitions *state)
-{
- int i;
- Sector sect;
- unsigned char *data;
- struct ultrix_disklabel {
- s32 pt_magic; /* magic no. indicating part. info exits */
- s32 pt_valid; /* set by driver if pt is current */
- struct pt_info {
- s32 pi_nblocks; /* no. of sectors */
- u32 pi_blkoff; /* block offset for start */
- } pt_part[8];
- } *label;
-
-#define PT_MAGIC 0x032957 /* Partition magic number */
-#define PT_VALID 1 /* Indicates if struct is valid */
-
- data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
- if (!data)
- return -1;
-
- label = (struct ultrix_disklabel *)(data + 512 - sizeof(*label));
-
- if (label->pt_magic == PT_MAGIC && label->pt_valid == PT_VALID) {
- for (i=0; i<8; i++)
- if (label->pt_part[i].pi_nblocks)
- put_partition(state, i+1,
- label->pt_part[i].pi_blkoff,
- label->pt_part[i].pi_nblocks);
- put_dev_sector(sect);
- strlcat(state->pp_buf, "\n", PAGE_SIZE);
- return 1;
- } else {
- put_dev_sector(sect);
- return 0;
- }
-}
diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h
deleted file mode 100644
index a3cc00b2bded..000000000000
--- a/fs/partitions/ultrix.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/*
- * fs/partitions/ultrix.h
- */
-
-int ultrix_partition(struct parsed_partitions *state);
diff --git a/fs/pipe.c b/fs/pipe.c
index 4065f07366b3..a932ced92a16 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
if (nr_pages < pipe->nrbufs)
return -EBUSY;
- bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
+ bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
if (unlikely(!bufs))
return -ENOMEM;
@@ -1290,11 +1290,4 @@ static int __init init_pipe_fs(void)
return err;
}
-static void __exit exit_pipe_fs(void)
-{
- kern_unmount(pipe_mnt);
- unregister_filesystem(&pipe_fs_type);
-}
-
fs_initcall(init_pipe_fs);
-module_exit(exit_pipe_fs);
diff --git a/fs/pnode.c b/fs/pnode.c
index d42514e32380..ab5fa9e1a79a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -13,45 +13,30 @@
#include "pnode.h"
/* return the next shared peer mount of @p */
-static inline struct vfsmount *next_peer(struct vfsmount *p)
+static inline struct mount *next_peer(struct mount *p)
{
- return list_entry(p->mnt_share.next, struct vfsmount, mnt_share);
+ return list_entry(p->mnt_share.next, struct mount, mnt_share);
}
-static inline struct vfsmount *first_slave(struct vfsmount *p)
+static inline struct mount *first_slave(struct mount *p)
{
- return list_entry(p->mnt_slave_list.next, struct vfsmount, mnt_slave);
+ return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
}
-static inline struct vfsmount *next_slave(struct vfsmount *p)
+static inline struct mount *next_slave(struct mount *p)
{
- return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave);
+ return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
}
-/*
- * Return true if path is reachable from root
- *
- * namespace_sem is held, and mnt is attached
- */
-static bool is_path_reachable(struct vfsmount *mnt, struct dentry *dentry,
- const struct path *root)
-{
- while (mnt != root->mnt && mnt->mnt_parent != mnt) {
- dentry = mnt->mnt_mountpoint;
- mnt = mnt->mnt_parent;
- }
- return mnt == root->mnt && is_subdir(dentry, root->dentry);
-}
-
-static struct vfsmount *get_peer_under_root(struct vfsmount *mnt,
- struct mnt_namespace *ns,
- const struct path *root)
+static struct mount *get_peer_under_root(struct mount *mnt,
+ struct mnt_namespace *ns,
+ const struct path *root)
{
- struct vfsmount *m = mnt;
+ struct mount *m = mnt;
do {
/* Check the namespace first for optimization */
- if (m->mnt_ns == ns && is_path_reachable(m, m->mnt_root, root))
+ if (m->mnt_ns == ns && is_path_reachable(m, m->mnt.mnt_root, root))
return m;
m = next_peer(m);
@@ -66,12 +51,12 @@ static struct vfsmount *get_peer_under_root(struct vfsmount *mnt,
*
* Caller must hold namespace_sem
*/
-int get_dominating_id(struct vfsmount *mnt, const struct path *root)
+int get_dominating_id(struct mount *mnt, const struct path *root)
{
- struct vfsmount *m;
+ struct mount *m;
for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) {
- struct vfsmount *d = get_peer_under_root(m, mnt->mnt_ns, root);
+ struct mount *d = get_peer_under_root(m, mnt->mnt_ns, root);
if (d)
return d->mnt_group_id;
}
@@ -79,10 +64,10 @@ int get_dominating_id(struct vfsmount *mnt, const struct path *root)
return 0;
}
-static int do_make_slave(struct vfsmount *mnt)
+static int do_make_slave(struct mount *mnt)
{
- struct vfsmount *peer_mnt = mnt, *master = mnt->mnt_master;
- struct vfsmount *slave_mnt;
+ struct mount *peer_mnt = mnt, *master = mnt->mnt_master;
+ struct mount *slave_mnt;
/*
* slave 'mnt' to a peer mount that has the
@@ -90,7 +75,7 @@ static int do_make_slave(struct vfsmount *mnt)
* slave it to anything that is available.
*/
while ((peer_mnt = next_peer(peer_mnt)) != mnt &&
- peer_mnt->mnt_root != mnt->mnt_root) ;
+ peer_mnt->mnt.mnt_root != mnt->mnt.mnt_root) ;
if (peer_mnt == mnt) {
peer_mnt = next_peer(mnt);
@@ -116,7 +101,7 @@ static int do_make_slave(struct vfsmount *mnt)
struct list_head *p = &mnt->mnt_slave_list;
while (!list_empty(p)) {
slave_mnt = list_first_entry(p,
- struct vfsmount, mnt_slave);
+ struct mount, mnt_slave);
list_del_init(&slave_mnt->mnt_slave);
slave_mnt->mnt_master = NULL;
}
@@ -129,7 +114,7 @@ static int do_make_slave(struct vfsmount *mnt)
/*
* vfsmount lock must be held for write
*/
-void change_mnt_propagation(struct vfsmount *mnt, int type)
+void change_mnt_propagation(struct mount *mnt, int type)
{
if (type == MS_SHARED) {
set_mnt_shared(mnt);
@@ -140,9 +125,9 @@ void change_mnt_propagation(struct vfsmount *mnt, int type)
list_del_init(&mnt->mnt_slave);
mnt->mnt_master = NULL;
if (type == MS_UNBINDABLE)
- mnt->mnt_flags |= MNT_UNBINDABLE;
+ mnt->mnt.mnt_flags |= MNT_UNBINDABLE;
else
- mnt->mnt_flags &= ~MNT_UNBINDABLE;
+ mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE;
}
}
@@ -156,20 +141,19 @@ void change_mnt_propagation(struct vfsmount *mnt, int type)
* vfsmount found while iterating with propagation_next() is
* a peer of one we'd found earlier.
*/
-static struct vfsmount *propagation_next(struct vfsmount *m,
- struct vfsmount *origin)
+static struct mount *propagation_next(struct mount *m,
+ struct mount *origin)
{
/* are there any slaves of this mount? */
if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
return first_slave(m);
while (1) {
- struct vfsmount *next;
- struct vfsmount *master = m->mnt_master;
+ struct mount *master = m->mnt_master;
if (master == origin->mnt_master) {
- next = next_peer(m);
- return ((next == origin) ? NULL : next);
+ struct mount *next = next_peer(m);
+ return (next == origin) ? NULL : next;
} else if (m->mnt_slave.next != &master->mnt_slave_list)
return next_slave(m);
@@ -187,13 +171,13 @@ static struct vfsmount *propagation_next(struct vfsmount *m,
* @type return CL_SLAVE if the new mount has to be
* cloned as a slave.
*/
-static struct vfsmount *get_source(struct vfsmount *dest,
- struct vfsmount *last_dest,
- struct vfsmount *last_src,
- int *type)
+static struct mount *get_source(struct mount *dest,
+ struct mount *last_dest,
+ struct mount *last_src,
+ int *type)
{
- struct vfsmount *p_last_src = NULL;
- struct vfsmount *p_last_dest = NULL;
+ struct mount *p_last_src = NULL;
+ struct mount *p_last_dest = NULL;
while (last_dest != dest->mnt_master) {
p_last_dest = last_dest;
@@ -233,33 +217,33 @@ static struct vfsmount *get_source(struct vfsmount *dest,
* @source_mnt: source mount.
* @tree_list : list of heads of trees to be attached.
*/
-int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
- struct vfsmount *source_mnt, struct list_head *tree_list)
+int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
+ struct mount *source_mnt, struct list_head *tree_list)
{
- struct vfsmount *m, *child;
+ struct mount *m, *child;
int ret = 0;
- struct vfsmount *prev_dest_mnt = dest_mnt;
- struct vfsmount *prev_src_mnt = source_mnt;
+ struct mount *prev_dest_mnt = dest_mnt;
+ struct mount *prev_src_mnt = source_mnt;
LIST_HEAD(tmp_list);
LIST_HEAD(umount_list);
for (m = propagation_next(dest_mnt, dest_mnt); m;
m = propagation_next(m, dest_mnt)) {
int type;
- struct vfsmount *source;
+ struct mount *source;
if (IS_MNT_NEW(m))
continue;
source = get_source(m, prev_dest_mnt, prev_src_mnt, &type);
- if (!(child = copy_tree(source, source->mnt_root, type))) {
+ if (!(child = copy_tree(source, source->mnt.mnt_root, type))) {
ret = -ENOMEM;
list_splice(tree_list, tmp_list.prev);
goto out;
}
- if (is_subdir(dest_dentry, m->mnt_root)) {
+ if (is_subdir(dest_dentry, m->mnt.mnt_root)) {
mnt_set_mountpoint(m, dest_dentry, child);
list_add_tail(&child->mnt_hash, tree_list);
} else {
@@ -275,7 +259,7 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
out:
br_write_lock(vfsmount_lock);
while (!list_empty(&tmp_list)) {
- child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
+ child = list_first_entry(&tmp_list, struct mount, mnt_hash);
umount_tree(child, 0, &umount_list);
}
br_write_unlock(vfsmount_lock);
@@ -286,7 +270,7 @@ out:
/*
* return true if the refcount is greater than count
*/
-static inline int do_refcount_check(struct vfsmount *mnt, int count)
+static inline int do_refcount_check(struct mount *mnt, int count)
{
int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts;
return (mycount > count);
@@ -302,10 +286,10 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
*
* vfsmount lock must be held for write
*/
-int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
+int propagate_mount_busy(struct mount *mnt, int refcnt)
{
- struct vfsmount *m, *child;
- struct vfsmount *parent = mnt->mnt_parent;
+ struct mount *m, *child;
+ struct mount *parent = mnt->mnt_parent;
int ret = 0;
if (mnt == parent)
@@ -321,7 +305,7 @@ int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
for (m = propagation_next(parent, parent); m;
m = propagation_next(m, parent)) {
- child = __lookup_mnt(m, mnt->mnt_mountpoint, 0);
+ child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint, 0);
if (child && list_empty(&child->mnt_mounts) &&
(ret = do_refcount_check(child, 1)))
break;
@@ -333,17 +317,17 @@ int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
* NOTE: unmounting 'mnt' naturally propagates to all other mounts its
* parent propagates to.
*/
-static void __propagate_umount(struct vfsmount *mnt)
+static void __propagate_umount(struct mount *mnt)
{
- struct vfsmount *parent = mnt->mnt_parent;
- struct vfsmount *m;
+ struct mount *parent = mnt->mnt_parent;
+ struct mount *m;
BUG_ON(parent == mnt);
for (m = propagation_next(parent, parent); m;
m = propagation_next(m, parent)) {
- struct vfsmount *child = __lookup_mnt(m,
+ struct mount *child = __lookup_mnt(&m->mnt,
mnt->mnt_mountpoint, 0);
/*
* umount the child only if the child has no
@@ -363,7 +347,7 @@ static void __propagate_umount(struct vfsmount *mnt)
*/
int propagate_umount(struct list_head *list)
{
- struct vfsmount *mnt;
+ struct mount *mnt;
list_for_each_entry(mnt, list, mnt_hash)
__propagate_umount(mnt);
diff --git a/fs/pnode.h b/fs/pnode.h
index 1ea4ae1efcd3..65c60979d541 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -9,13 +9,13 @@
#define _LINUX_PNODE_H
#include <linux/list.h>
-#include <linux/mount.h>
+#include "mount.h"
-#define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED)
-#define IS_MNT_SLAVE(mnt) (mnt->mnt_master)
-#define IS_MNT_NEW(mnt) (!mnt->mnt_ns)
-#define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~MNT_SHARED)
-#define IS_MNT_UNBINDABLE(mnt) (mnt->mnt_flags & MNT_UNBINDABLE)
+#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
+#define IS_MNT_SLAVE(m) ((m)->mnt_master)
+#define IS_MNT_NEW(m) (!(m)->mnt_ns)
+#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
+#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
#define CL_EXPIRE 0x01
#define CL_SLAVE 0x02
@@ -23,17 +23,25 @@
#define CL_MAKE_SHARED 0x08
#define CL_PRIVATE 0x10
-static inline void set_mnt_shared(struct vfsmount *mnt)
+static inline void set_mnt_shared(struct mount *mnt)
{
- mnt->mnt_flags &= ~MNT_SHARED_MASK;
- mnt->mnt_flags |= MNT_SHARED;
+ mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK;
+ mnt->mnt.mnt_flags |= MNT_SHARED;
}
-void change_mnt_propagation(struct vfsmount *, int);
-int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *,
+void change_mnt_propagation(struct mount *, int);
+int propagate_mnt(struct mount *, struct dentry *, struct mount *,
struct list_head *);
int propagate_umount(struct list_head *);
-int propagate_mount_busy(struct vfsmount *, int);
-void mnt_release_group_id(struct vfsmount *);
-int get_dominating_id(struct vfsmount *mnt, const struct path *root);
+int propagate_mount_busy(struct mount *, int);
+void mnt_release_group_id(struct mount *);
+int get_dominating_id(struct mount *mnt, const struct path *root);
+unsigned int mnt_get_count(struct mount *mnt);
+void mnt_set_mountpoint(struct mount *, struct dentry *,
+ struct mount *);
+void release_mounts(struct list_head *);
+void umount_tree(struct mount *, int, struct list_head *);
+struct mount *copy_tree(struct mount *, struct dentry *, int);
+bool is_path_reachable(struct mount *, struct dentry *,
+ const struct path *root);
#endif /* _LINUX_PNODE_H */
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3a1dafd228d1..c602b8d20f06 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -380,7 +380,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
state = *get_task_state(task);
vsize = eip = esp = 0;
- permitted = ptrace_may_access(task, PTRACE_MODE_READ);
+ permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
mm = get_task_mm(task);
if (mm) {
vsize = task_vsize(mm);
@@ -394,8 +394,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
sigemptyset(&sigign);
sigemptyset(&sigcatch);
- cutime = cstime = utime = stime = cputime_zero;
- cgtime = gtime = cputime_zero;
+ cutime = cstime = utime = stime = 0;
+ cgtime = gtime = 0;
if (lock_task_sighand(task, &flags)) {
struct signal_struct *sig = task->signal;
@@ -423,14 +423,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
do {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
- gtime = cputime_add(gtime, t->gtime);
+ gtime += t->gtime;
t = next_thread(t);
} while (t != task);
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
thread_group_times(task, &utime, &stime);
- gtime = cputime_add(gtime, sig->gtime);
+ gtime += sig->gtime;
}
sid = task_session_nr_ns(task, ns);
@@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n",
pid_nr_ns(pid, ns),
tcomm,
state,
@@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
task->policy,
(unsigned long long)delayacct_blkio_ticks(task),
cputime_to_clock_t(gtime),
- cputime_to_clock_t(cgtime));
+ cputime_to_clock_t(cgtime),
+ (mm && permitted) ? mm->start_data : 0,
+ (mm && permitted) ? mm->end_data : 0,
+ (mm && permitted) ? mm->start_brk : 0);
if (mm)
mmput(mm);
return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 851ba3dcdc29..d4548dd49b02 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -83,9 +83,11 @@
#include <linux/pid_namespace.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
+#include <linux/flex_array.h>
#ifdef CONFIG_HARDWALL
#include <asm/hardwall.h>
#endif
+#include <trace/events/oom.h>
#include "internal.h"
/* NOTE:
@@ -101,7 +103,7 @@
struct pid_entry {
char *name;
int len;
- mode_t mode;
+ umode_t mode;
const struct inode_operations *iop;
const struct file_operations *fop;
union proc_op op;
@@ -133,6 +135,8 @@ struct pid_entry {
NULL, &proc_single_file_operations, \
{ .proc_show = show } )
+static int proc_fd_permission(struct inode *inode, int mask);
+
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
* and .. links.
@@ -165,9 +169,9 @@ static int get_task_root(struct task_struct *task, struct path *root)
return result;
}
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
{
- struct task_struct *task = get_proc_task(inode);
+ struct task_struct *task = get_proc_task(dentry->d_inode);
int result = -ENOENT;
if (task) {
@@ -182,9 +186,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
return result;
}
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path)
{
- struct task_struct *task = get_proc_task(inode);
+ struct task_struct *task = get_proc_task(dentry->d_inode);
int result = -ENOENT;
if (task) {
@@ -194,82 +198,9 @@ static int proc_root_link(struct inode *inode, struct path *path)
return result;
}
-static struct mm_struct *__check_mem_permission(struct task_struct *task)
-{
- struct mm_struct *mm;
-
- mm = get_task_mm(task);
- if (!mm)
- return ERR_PTR(-EINVAL);
-
- /*
- * A task can always look at itself, in case it chooses
- * to use system calls instead of load instructions.
- */
- if (task == current)
- return mm;
-
- /*
- * If current is actively ptrace'ing, and would also be
- * permitted to freshly attach with ptrace now, permit it.
- */
- if (task_is_stopped_or_traced(task)) {
- int match;
- rcu_read_lock();
- match = (ptrace_parent(task) == current);
- rcu_read_unlock();
- if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
- return mm;
- }
-
- /*
- * No one else is allowed.
- */
- mmput(mm);
- return ERR_PTR(-EPERM);
-}
-
-/*
- * If current may access user memory in @task return a reference to the
- * corresponding mm, otherwise ERR_PTR.
- */
-static struct mm_struct *check_mem_permission(struct task_struct *task)
-{
- struct mm_struct *mm;
- int err;
-
- /*
- * Avoid racing if task exec's as we might get a new mm but validate
- * against old credentials.
- */
- err = mutex_lock_killable(&task->signal->cred_guard_mutex);
- if (err)
- return ERR_PTR(err);
-
- mm = __check_mem_permission(task);
- mutex_unlock(&task->signal->cred_guard_mutex);
-
- return mm;
-}
-
struct mm_struct *mm_for_maps(struct task_struct *task)
{
- struct mm_struct *mm;
- int err;
-
- err = mutex_lock_killable(&task->signal->cred_guard_mutex);
- if (err)
- return ERR_PTR(err);
-
- mm = get_task_mm(task);
- if (mm && mm != current->mm &&
- !ptrace_may_access(task, PTRACE_MODE_READ)) {
- mmput(mm);
- mm = ERR_PTR(-EACCES);
- }
- mutex_unlock(&task->signal->cred_guard_mutex);
-
- return mm;
+ return mm_access(task, PTRACE_MODE_READ);
}
static int proc_pid_cmdline(struct task_struct *task, char * buffer)
@@ -627,122 +558,54 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
return 0;
}
-static const struct inode_operations proc_def_inode_operations = {
- .setattr = proc_setattr,
-};
-
-static int mounts_open_common(struct inode *inode, struct file *file,
- const struct seq_operations *op)
+/*
+ * May current process learn task's sched/cmdline info (for hide_pid_min=1)
+ * or euid/egid (for hide_pid_min=2)?
+ */
+static bool has_pid_permissions(struct pid_namespace *pid,
+ struct task_struct *task,
+ int hide_pid_min)
{
- struct task_struct *task = get_proc_task(inode);
- struct nsproxy *nsp;
- struct mnt_namespace *ns = NULL;
- struct path root;
- struct proc_mounts *p;
- int ret = -EINVAL;
-
- if (task) {
- rcu_read_lock();
- nsp = task_nsproxy(task);
- if (nsp) {
- ns = nsp->mnt_ns;
- if (ns)
- get_mnt_ns(ns);
- }
- rcu_read_unlock();
- if (ns && get_task_root(task, &root) == 0)
- ret = 0;
- put_task_struct(task);
- }
-
- if (!ns)
- goto err;
- if (ret)
- goto err_put_ns;
-
- ret = -ENOMEM;
- p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
- if (!p)
- goto err_put_path;
-
- file->private_data = &p->m;
- ret = seq_open(file, op);
- if (ret)
- goto err_free;
-
- p->m.private = p;
- p->ns = ns;
- p->root = root;
- p->m.poll_event = ns->event;
-
- return 0;
-
- err_free:
- kfree(p);
- err_put_path:
- path_put(&root);
- err_put_ns:
- put_mnt_ns(ns);
- err:
- return ret;
+ if (pid->hide_pid < hide_pid_min)
+ return true;
+ if (in_group_p(pid->pid_gid))
+ return true;
+ return ptrace_may_access(task, PTRACE_MODE_READ);
}
-static int mounts_release(struct inode *inode, struct file *file)
-{
- struct proc_mounts *p = file->private_data;
- path_put(&p->root);
- put_mnt_ns(p->ns);
- return seq_release(inode, file);
-}
-static unsigned mounts_poll(struct file *file, poll_table *wait)
+static int proc_pid_permission(struct inode *inode, int mask)
{
- struct proc_mounts *p = file->private_data;
- unsigned res = POLLIN | POLLRDNORM;
-
- poll_wait(file, &p->ns->poll, wait);
- if (mnt_had_events(p))
- res |= POLLERR | POLLPRI;
-
- return res;
-}
+ struct pid_namespace *pid = inode->i_sb->s_fs_info;
+ struct task_struct *task;
+ bool has_perms;
-static int mounts_open(struct inode *inode, struct file *file)
-{
- return mounts_open_common(inode, file, &mounts_op);
-}
+ task = get_proc_task(inode);
+ if (!task)
+ return -ESRCH;
+ has_perms = has_pid_permissions(pid, task, 1);
+ put_task_struct(task);
-static const struct file_operations proc_mounts_operations = {
- .open = mounts_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = mounts_release,
- .poll = mounts_poll,
-};
+ if (!has_perms) {
+ if (pid->hide_pid == 2) {
+ /*
+ * Let's make getdents(), stat(), and open()
+ * consistent with each other. If a process
+ * may not stat() a file, it shouldn't be seen
+ * in procfs at all.
+ */
+ return -ENOENT;
+ }
-static int mountinfo_open(struct inode *inode, struct file *file)
-{
- return mounts_open_common(inode, file, &mountinfo_op);
+ return -EPERM;
+ }
+ return generic_permission(inode, mask);
}
-static const struct file_operations proc_mountinfo_operations = {
- .open = mountinfo_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = mounts_release,
- .poll = mounts_poll,
-};
-static int mountstats_open(struct inode *inode, struct file *file)
-{
- return mounts_open_common(inode, file, &mountstats_op);
-}
-static const struct file_operations proc_mountstats_operations = {
- .open = mountstats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = mounts_release,
+static const struct inode_operations proc_def_inode_operations = {
+ .setattr = proc_setattr,
};
#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */
@@ -816,133 +679,96 @@ static const struct file_operations proc_single_file_operations = {
static int mem_open(struct inode* inode, struct file* file)
{
- file->private_data = (void*)((long)current->self_exec_id);
- /* OK to pass negative loff_t, we can catch out-of-range */
- file->f_mode |= FMODE_UNSIGNED_OFFSET;
- return 0;
-}
-
-static ssize_t mem_read(struct file * file, char __user * buf,
- size_t count, loff_t *ppos)
-{
struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
- char *page;
- unsigned long src = *ppos;
- int ret = -ESRCH;
struct mm_struct *mm;
if (!task)
- goto out_no_task;
+ return -ESRCH;
- ret = -ENOMEM;
- page = (char *)__get_free_page(GFP_TEMPORARY);
- if (!page)
- goto out;
+ mm = mm_access(task, PTRACE_MODE_ATTACH);
+ put_task_struct(task);
- mm = check_mem_permission(task);
- ret = PTR_ERR(mm);
if (IS_ERR(mm))
- goto out_free;
+ return PTR_ERR(mm);
- ret = -EIO;
-
- if (file->private_data != (void*)((long)current->self_exec_id))
- goto out_put;
-
- ret = 0;
-
- while (count > 0) {
- int this_len, retval;
-
- this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
- retval = access_remote_vm(mm, src, page, this_len, 0);
- if (!retval) {
- if (!ret)
- ret = -EIO;
- break;
- }
-
- if (copy_to_user(buf, page, retval)) {
- ret = -EFAULT;
- break;
- }
-
- ret += retval;
- src += retval;
- buf += retval;
- count -= retval;
+ if (mm) {
+ /* ensure this mm_struct can't be freed */
+ atomic_inc(&mm->mm_count);
+ /* but do not pin its memory */
+ mmput(mm);
}
- *ppos = src;
-out_put:
- mmput(mm);
-out_free:
- free_page((unsigned long) page);
-out:
- put_task_struct(task);
-out_no_task:
- return ret;
+ /* OK to pass negative loff_t, we can catch out-of-range */
+ file->f_mode |= FMODE_UNSIGNED_OFFSET;
+ file->private_data = mm;
+
+ return 0;
}
-static ssize_t mem_write(struct file * file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t mem_rw(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos, int write)
{
- int copied;
+ struct mm_struct *mm = file->private_data;
+ unsigned long addr = *ppos;
+ ssize_t copied;
char *page;
- struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
- unsigned long dst = *ppos;
- struct mm_struct *mm;
- copied = -ESRCH;
- if (!task)
- goto out_no_task;
+ if (!mm)
+ return 0;
- copied = -ENOMEM;
page = (char *)__get_free_page(GFP_TEMPORARY);
if (!page)
- goto out_task;
-
- mm = check_mem_permission(task);
- copied = PTR_ERR(mm);
- if (IS_ERR(mm))
- goto out_free;
-
- copied = -EIO;
- if (file->private_data != (void *)((long)current->self_exec_id))
- goto out_mm;
+ return -ENOMEM;
copied = 0;
+ if (!atomic_inc_not_zero(&mm->mm_users))
+ goto free;
+
while (count > 0) {
- int this_len, retval;
+ int this_len = min_t(int, count, PAGE_SIZE);
- this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
- if (copy_from_user(page, buf, this_len)) {
+ if (write && copy_from_user(page, buf, this_len)) {
copied = -EFAULT;
break;
}
- retval = access_remote_vm(mm, dst, page, this_len, 1);
- if (!retval) {
+
+ this_len = access_remote_vm(mm, addr, page, this_len, write);
+ if (!this_len) {
if (!copied)
copied = -EIO;
break;
}
- copied += retval;
- buf += retval;
- dst += retval;
- count -= retval;
+
+ if (!write && copy_to_user(buf, page, this_len)) {
+ copied = -EFAULT;
+ break;
+ }
+
+ buf += this_len;
+ addr += this_len;
+ copied += this_len;
+ count -= this_len;
}
- *ppos = dst;
+ *ppos = addr;
-out_mm:
mmput(mm);
-out_free:
+free:
free_page((unsigned long) page);
-out_task:
- put_task_struct(task);
-out_no_task:
return copied;
}
+static ssize_t mem_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return mem_rw(file, buf, count, ppos, 0);
+}
+
+static ssize_t mem_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return mem_rw(file, (char __user*)buf, count, ppos, 1);
+}
+
loff_t mem_lseek(struct file *file, loff_t offset, int orig)
{
switch (orig) {
@@ -959,11 +785,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
return file->f_pos;
}
+static int mem_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+ if (mm)
+ mmdrop(mm);
+ return 0;
+}
+
static const struct file_operations proc_mem_operations = {
.llseek = mem_lseek,
.read = mem_read,
.write = mem_write,
.open = mem_open,
+ .release = mem_release,
};
static ssize_t environ_read(struct file *file, char __user *buf,
@@ -1124,6 +959,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
else
task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
-OOM_DISABLE;
+ trace_oom_score_adj_update(task);
err_sighand:
unlock_task_sighand(task, &flags);
err_task_lock:
@@ -1211,6 +1047,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
task->signal->oom_score_adj = oom_score_adj;
if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
task->signal->oom_score_adj_min = oom_score_adj;
+ trace_oom_score_adj_update(task);
/*
* Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
* always attainable.
@@ -1261,9 +1098,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
ssize_t length;
uid_t loginuid;
- if (!capable(CAP_AUDIT_CONTROL))
- return -EPERM;
-
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
rcu_read_unlock();
@@ -1292,7 +1126,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
goto out_free_page;
}
- length = audit_set_loginuid(current, loginuid);
+ length = audit_set_loginuid(loginuid);
if (likely(length == 0))
length = count;
@@ -1567,13 +1401,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
.release = single_release,
};
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
{
struct task_struct *task;
struct mm_struct *mm;
struct file *exe_file;
- task = get_proc_task(inode);
+ task = get_proc_task(dentry->d_inode);
if (!task)
return -ENOENT;
mm = get_task_mm(task);
@@ -1603,7 +1437,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
if (!proc_fd_access_allowed(inode))
goto out;
- error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+ error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path);
out:
return ERR_PTR(error);
}
@@ -1642,7 +1476,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
if (!proc_fd_access_allowed(inode))
goto out;
- error = PROC_I(inode)->op.proc_get_link(inode, &path);
+ error = PROC_I(inode)->op.proc_get_link(dentry, &path);
if (error)
goto out;
@@ -1723,6 +1557,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
struct inode *inode = dentry->d_inode;
struct task_struct *task;
const struct cred *cred;
+ struct pid_namespace *pid = dentry->d_sb->s_fs_info;
generic_fillattr(inode, stat);
@@ -1731,6 +1566,14 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
stat->gid = 0;
task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
+ if (!has_pid_permissions(pid, task, 2)) {
+ rcu_read_unlock();
+ /*
+ * This doesn't prevent learning whether PID exists,
+ * it only makes getattr() consistent with readdir().
+ */
+ return -ENOENT;
+ }
if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
task_dumpable(task)) {
cred = __task_cred(task);
@@ -1934,9 +1777,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
return -ENOENT;
}
-static int proc_fd_link(struct inode *inode, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct path *path)
{
- return proc_fd_info(inode, path, NULL);
+ return proc_fd_info(dentry->d_inode, path, NULL);
}
static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
@@ -2157,6 +2000,355 @@ static const struct file_operations proc_fd_operations = {
.llseek = default_llseek,
};
+#ifdef CONFIG_CHECKPOINT_RESTORE
+
+/*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+ unsigned long *start, unsigned long *end)
+{
+ if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+ unsigned long vm_start, vm_end;
+ bool exact_vma_exists = false;
+ struct mm_struct *mm = NULL;
+ struct task_struct *task;
+ const struct cred *cred;
+ struct inode *inode;
+ int status = 0;
+
+ if (nd && nd->flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ status = -EACCES;
+ goto out_notask;
+ }
+
+ inode = dentry->d_inode;
+ task = get_proc_task(inode);
+ if (!task)
+ goto out_notask;
+
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto out;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out;
+
+ if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+ down_read(&mm->mmap_sem);
+ exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+ up_read(&mm->mmap_sem);
+ }
+
+ mmput(mm);
+
+ if (exact_vma_exists) {
+ if (task_dumpable(task)) {
+ rcu_read_lock();
+ cred = __task_cred(task);
+ inode->i_uid = cred->euid;
+ inode->i_gid = cred->egid;
+ rcu_read_unlock();
+ } else {
+ inode->i_uid = 0;
+ inode->i_gid = 0;
+ }
+ security_task_to_inode(task, inode);
+ status = 1;
+ }
+
+out:
+ put_task_struct(task);
+
+out_notask:
+ if (status <= 0)
+ d_drop(dentry);
+
+ return status;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+ .d_revalidate = map_files_d_revalidate,
+ .d_delete = pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+ unsigned long vm_start, vm_end;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ int rc;
+
+ rc = -ENOENT;
+ task = get_proc_task(dentry->d_inode);
+ if (!task)
+ goto out;
+
+ mm = get_task_mm(task);
+ put_task_struct(task);
+ if (!mm)
+ goto out;
+
+ rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+ if (rc)
+ goto out_mmput;
+
+ down_read(&mm->mmap_sem);
+ vma = find_exact_vma(mm, vm_start, vm_end);
+ if (vma && vma->vm_file) {
+ *path = vma->vm_file->f_path;
+ path_get(path);
+ rc = 0;
+ }
+ up_read(&mm->mmap_sem);
+
+out_mmput:
+ mmput(mm);
+out:
+ return rc;
+}
+
+struct map_files_info {
+ struct file *file;
+ unsigned long len;
+ unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
+{
+ const struct file *file = ptr;
+ struct proc_inode *ei;
+ struct inode *inode;
+
+ if (!file)
+ return ERR_PTR(-ENOENT);
+
+ inode = proc_pid_make_inode(dir->i_sb, task);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+
+ ei = PROC_I(inode);
+ ei->op.proc_get_link = proc_map_files_get_link;
+
+ inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_size = 64;
+ inode->i_mode = S_IFLNK;
+
+ if (file->f_mode & FMODE_READ)
+ inode->i_mode |= S_IRUSR;
+ if (file->f_mode & FMODE_WRITE)
+ inode->i_mode |= S_IWUSR;
+
+ d_set_d_op(dentry, &tid_map_files_dentry_operations);
+ d_add(dentry, inode);
+
+ return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+ struct dentry *dentry, struct nameidata *nd)
+{
+ unsigned long vm_start, vm_end;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct dentry *result;
+ struct mm_struct *mm;
+
+ result = ERR_PTR(-EACCES);
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ result = ERR_PTR(-ENOENT);
+ task = get_proc_task(dir);
+ if (!task)
+ goto out;
+
+ result = ERR_PTR(-EACCES);
+ if (lock_trace(task))
+ goto out_put_task;
+
+ result = ERR_PTR(-ENOENT);
+ if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+ goto out_unlock;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_unlock;
+
+ down_read(&mm->mmap_sem);
+ vma = find_exact_vma(mm, vm_start, vm_end);
+ if (!vma)
+ goto out_no_vma;
+
+ result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+out_unlock:
+ unlock_trace(task);
+out_put_task:
+ put_task_struct(task);
+out:
+ return result;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+ .lookup = proc_map_files_lookup,
+ .permission = proc_fd_permission,
+ .setattr = proc_setattr,
+};
+
+static int
+proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ ino_t ino;
+ int ret;
+
+ ret = -EACCES;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ ret = -ENOENT;
+ task = get_proc_task(inode);
+ if (!task)
+ goto out;
+
+ ret = -EACCES;
+ if (lock_trace(task))
+ goto out_put_task;
+
+ ret = 0;
+ switch (filp->f_pos) {
+ case 0:
+ ino = inode->i_ino;
+ if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+ goto out_unlock;
+ filp->f_pos++;
+ case 1:
+ ino = parent_ino(dentry);
+ if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+ goto out_unlock;
+ filp->f_pos++;
+ default:
+ {
+ unsigned long nr_files, pos, i;
+ struct flex_array *fa = NULL;
+ struct map_files_info info;
+ struct map_files_info *p;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_unlock;
+ down_read(&mm->mmap_sem);
+
+ nr_files = 0;
+
+ /*
+ * We need two passes here:
+ *
+ * 1) Collect vmas of mapped files with mmap_sem taken
+ * 2) Release mmap_sem and instantiate entries
+ *
+ * otherwise we get lockdep complained, since filldir()
+ * routine might require mmap_sem taken in might_fault().
+ */
+
+ for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+ if (vma->vm_file && ++pos > filp->f_pos)
+ nr_files++;
+ }
+
+ if (nr_files) {
+ fa = flex_array_alloc(sizeof(info), nr_files,
+ GFP_KERNEL);
+ if (!fa || flex_array_prealloc(fa, 0, nr_files,
+ GFP_KERNEL)) {
+ ret = -ENOMEM;
+ if (fa)
+ flex_array_free(fa);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ goto out_unlock;
+ }
+ for (i = 0, vma = mm->mmap, pos = 2; vma;
+ vma = vma->vm_next) {
+ if (!vma->vm_file)
+ continue;
+ if (++pos <= filp->f_pos)
+ continue;
+
+ get_file(vma->vm_file);
+ info.file = vma->vm_file;
+ info.len = snprintf(info.name,
+ sizeof(info.name), "%lx-%lx",
+ vma->vm_start, vma->vm_end);
+ if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+ BUG();
+ }
+ }
+ up_read(&mm->mmap_sem);
+
+ for (i = 0; i < nr_files; i++) {
+ p = flex_array_get(fa, i);
+ ret = proc_fill_cache(filp, dirent, filldir,
+ p->name, p->len,
+ proc_map_files_instantiate,
+ task, p->file);
+ if (ret)
+ break;
+ filp->f_pos++;
+ fput(p->file);
+ }
+ for (; i < nr_files; i++) {
+ /*
+ * In case of error don't forget
+ * to put rest of file refs.
+ */
+ p = flex_array_get(fa, i);
+ fput(p->file);
+ }
+ if (fa)
+ flex_array_free(fa);
+ mmput(mm);
+ }
+ }
+
+out_unlock:
+ unlock_trace(task);
+out_put_task:
+ put_task_struct(task);
+out:
+ return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+ .read = generic_read_dir,
+ .readdir = proc_map_files_readdir,
+ .llseek = default_llseek,
+};
+
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
/*
* /proc/pid/fd needs a special permission handler so that a process can still
* access /proc/self/fd after it has executed a setuid().
@@ -2772,6 +2964,9 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
+#endif
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
@@ -2875,6 +3070,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
.lookup = proc_tgid_base_lookup,
.getattr = pid_getattr,
.setattr = proc_setattr,
+ .permission = proc_pid_permission,
};
static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
@@ -3078,6 +3274,12 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi
proc_pid_instantiate, iter.task, NULL);
}
+static int fake_filldir(void *buf, const char *name, int namelen,
+ loff_t offset, u64 ino, unsigned d_type)
+{
+ return 0;
+}
+
/* for the /proc/ directory itself, after non-process stuff has been done */
int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
@@ -3085,6 +3287,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
struct task_struct *reaper;
struct tgid_iter iter;
struct pid_namespace *ns;
+ filldir_t __filldir;
if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
goto out_no_task;
@@ -3106,8 +3309,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
for (iter = next_tgid(ns, iter);
iter.task;
iter.tgid += 1, iter = next_tgid(ns, iter)) {
+ if (has_pid_permissions(ns, iter.task, 2))
+ __filldir = filldir;
+ else
+ __filldir = fake_filldir;
+
filp->f_pos = iter.tgid + TGID_OFFSET;
- if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
+ if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) {
put_task_struct(iter.task);
goto out;
}
@@ -3442,6 +3650,7 @@ static const struct inode_operations proc_task_inode_operations = {
.lookup = proc_task_lookup,
.getattr = proc_task_getattr,
.setattr = proc_setattr,
+ .permission = proc_pid_permission,
};
static const struct file_operations proc_task_operations = {
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 10090d9c7ad5..2edf34f2eb61 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -597,7 +597,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
const char *name,
- mode_t mode,
+ umode_t mode,
nlink_t nlink)
{
struct proc_dir_entry *ent = NULL;
@@ -659,7 +659,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
}
EXPORT_SYMBOL(proc_symlink);
-struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
+struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
struct proc_dir_entry *parent)
{
struct proc_dir_entry *ent;
@@ -699,7 +699,7 @@ struct proc_dir_entry *proc_mkdir(const char *name,
}
EXPORT_SYMBOL(proc_mkdir);
-struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
+struct proc_dir_entry *create_proc_entry(const char *name, umode_t mode,
struct proc_dir_entry *parent)
{
struct proc_dir_entry *ent;
@@ -728,7 +728,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
}
EXPORT_SYMBOL(create_proc_entry);
-struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
+struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct file_operations *proc_fops,
void *data)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 7737c5468a40..84fd3235a590 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -7,6 +7,7 @@
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/kernel.h>
+#include <linux/pid_namespace.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
@@ -17,7 +18,9 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>
+#include <linux/seq_file.h>
#include <linux/slab.h>
+#include <linux/mount.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -77,7 +80,6 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
static void proc_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(proc_inode_cachep, PROC_I(inode));
}
@@ -102,12 +104,27 @@ void __init proc_init_inodecache(void)
init_once);
}
+static int proc_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct super_block *sb = root->d_sb;
+ struct pid_namespace *pid = sb->s_fs_info;
+
+ if (pid->pid_gid)
+ seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid);
+ if (pid->hide_pid != 0)
+ seq_printf(seq, ",hidepid=%u", pid->hide_pid);
+
+ return 0;
+}
+
static const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
.destroy_inode = proc_destroy_inode,
.drop_inode = generic_delete_inode,
.evict_inode = proc_evict_inode,
.statfs = simple_statfs,
+ .remount_fs = proc_remount,
+ .show_options = proc_show_options,
};
static void __pde_users_dec(struct proc_dir_entry *pde)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7838e5cfec14..292577531ad1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -117,6 +117,7 @@ void pde_put(struct proc_dir_entry *pde);
int proc_fill_super(struct super_block *);
struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
+int proc_remount(struct super_block *sb, int *flags, char *data);
/*
* These are generic /proc routines that use the internal
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index be177f702acb..27da860115c6 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -9,7 +9,6 @@
#include <linux/file.h>
#include <linux/utsname.h>
#include <net/net_namespace.h>
-#include <linux/mnt_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/pid_namespace.h>
#include "internal.h"
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index f738024ccc8e..06e1cc17caf6 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -179,7 +179,7 @@ const struct file_operations proc_net_operations = {
struct proc_dir_entry *proc_net_fops_create(struct net *net,
- const char *name, mode_t mode, const struct file_operations *fops)
+ const char *name, umode_t mode, const struct file_operations *fops)
{
return proc_create(name, mode, net->proc_net, fops);
}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 03102d978180..46a15d8a29ca 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -18,6 +18,7 @@
#include <linux/bitops.h>
#include <linux/mount.h>
#include <linux/pid_namespace.h>
+#include <linux/parser.h>
#include "internal.h"
@@ -36,6 +37,63 @@ static int proc_set_super(struct super_block *sb, void *data)
return err;
}
+enum {
+ Opt_gid, Opt_hidepid, Opt_err,
+};
+
+static const match_table_t tokens = {
+ {Opt_hidepid, "hidepid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_err, NULL},
+};
+
+static int proc_parse_options(char *options, struct pid_namespace *pid)
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+
+ if (!options)
+ return 1;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+
+ args[0].to = args[0].from = 0;
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_gid:
+ if (match_int(&args[0], &option))
+ return 0;
+ pid->pid_gid = option;
+ break;
+ case Opt_hidepid:
+ if (match_int(&args[0], &option))
+ return 0;
+ if (option < 0 || option > 2) {
+ pr_err("proc: hidepid value must be between 0 and 2.\n");
+ return 0;
+ }
+ pid->hide_pid = option;
+ break;
+ default:
+ pr_err("proc: unrecognized mount option \"%s\" "
+ "or missing value\n", p);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+int proc_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct pid_namespace *pid = sb->s_fs_info;
+ return !proc_parse_options(data, pid);
+}
+
static struct dentry *proc_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
@@ -43,11 +101,15 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
struct super_block *sb;
struct pid_namespace *ns;
struct proc_inode *ei;
+ char *options;
- if (flags & MS_KERNMOUNT)
+ if (flags & MS_KERNMOUNT) {
ns = (struct pid_namespace *)data;
- else
+ options = NULL;
+ } else {
ns = current->nsproxy->pid_ns;
+ options = data;
+ }
sb = sget(fs_type, proc_test_super, proc_set_super, ns);
if (IS_ERR(sb))
@@ -55,6 +117,10 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
if (!sb->s_root) {
sb->s_flags = flags;
+ if (!proc_parse_options(options, ns)) {
+ deactivate_locked_super(sb);
+ return ERR_PTR(-EINVAL);
+ }
err = proc_fill_super(sb);
if (err) {
deactivate_locked_super(sb);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 0855e6f20391..121f77cfef76 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -22,29 +22,27 @@
#define arch_idle_time(cpu) 0
#endif
-static cputime64_t get_idle_time(int cpu)
+static u64 get_idle_time(int cpu)
{
- u64 idle_time = get_cpu_idle_time_us(cpu, NULL);
- cputime64_t idle;
+ u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL);
if (idle_time == -1ULL) {
/* !NO_HZ so we can rely on cpustat.idle */
- idle = kstat_cpu(cpu).cpustat.idle;
- idle = cputime64_add(idle, arch_idle_time(cpu));
+ idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ idle += arch_idle_time(cpu);
} else
idle = usecs_to_cputime64(idle_time);
return idle;
}
-static cputime64_t get_iowait_time(int cpu)
+static u64 get_iowait_time(int cpu)
{
- u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL);
- cputime64_t iowait;
+ u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL);
if (iowait_time == -1ULL)
/* !NO_HZ so we can rely on cpustat.iowait */
- iowait = kstat_cpu(cpu).cpustat.iowait;
+ iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
else
iowait = usecs_to_cputime64(iowait_time);
@@ -55,31 +53,30 @@ static int show_stat(struct seq_file *p, void *v)
{
int i, j;
unsigned long jif;
- cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
- cputime64_t guest, guest_nice;
+ u64 user, nice, system, idle, iowait, irq, softirq, steal;
+ u64 guest, guest_nice;
u64 sum = 0;
u64 sum_softirq = 0;
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
struct timespec boottime;
user = nice = system = idle = iowait =
- irq = softirq = steal = cputime64_zero;
- guest = guest_nice = cputime64_zero;
+ irq = softirq = steal = 0;
+ guest = guest_nice = 0;
getboottime(&boottime);
jif = boottime.tv_sec;
for_each_possible_cpu(i) {
- user = cputime64_add(user, kstat_cpu(i).cpustat.user);
- nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
- system = cputime64_add(system, kstat_cpu(i).cpustat.system);
- idle = cputime64_add(idle, get_idle_time(i));
- iowait = cputime64_add(iowait, get_iowait_time(i));
- irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
- softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
- steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
- guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
- guest_nice = cputime64_add(guest_nice,
- kstat_cpu(i).cpustat.guest_nice);
+ user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
+ nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
+ system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
+ idle += get_idle_time(i);
+ iowait += get_iowait_time(i);
+ irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
+ softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
+ steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
+ guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
+ guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
@@ -106,16 +103,16 @@ static int show_stat(struct seq_file *p, void *v)
(unsigned long long)cputime64_to_clock_t(guest_nice));
for_each_online_cpu(i) {
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kstat_cpu(i).cpustat.user;
- nice = kstat_cpu(i).cpustat.nice;
- system = kstat_cpu(i).cpustat.system;
+ user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
+ nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
+ system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
idle = get_idle_time(i);
iowait = get_iowait_time(i);
- irq = kstat_cpu(i).cpustat.irq;
- softirq = kstat_cpu(i).cpustat.softirq;
- steal = kstat_cpu(i).cpustat.steal;
- guest = kstat_cpu(i).cpustat.guest;
- guest_nice = kstat_cpu(i).cpustat.guest_nice;
+ irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
+ softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
+ steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
+ guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
+ guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
seq_printf(p,
"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
"%llu\n",
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e418c5abdb0e..7dcd2a250495 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -518,6 +518,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
if (!page)
continue;
+ if (PageReserved(page))
+ continue;
+
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
ClearPageReferenced(page);
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 766b1d456050..9610ac772d7e 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -11,15 +11,20 @@ static int uptime_proc_show(struct seq_file *m, void *v)
{
struct timespec uptime;
struct timespec idle;
+ u64 idletime;
+ u64 nsec;
+ u32 rem;
int i;
- cputime_t idletime = cputime_zero;
+ idletime = 0;
for_each_possible_cpu(i)
- idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle);
+ idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
do_posix_clock_monotonic_gettime(&uptime);
monotonic_to_bootbased(&uptime);
- cputime_to_timespec(idletime, &idle);
+ nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
+ idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+ idle.tv_nsec = rem;
seq_printf(m, "%lu.%02lu %lu.%02lu\n",
(unsigned long) uptime.tv_sec,
(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
new file mode 100644
index 000000000000..12412852d88a
--- /dev/null
+++ b/fs/proc_namespace.c
@@ -0,0 +1,333 @@
+/*
+ * fs/proc_namespace.c - handling of /proc/<pid>/{mounts,mountinfo,mountstats}
+ *
+ * In fact, that's a piece of procfs; it's *almost* isolated from
+ * the rest of fs/proc, but has rather close relationships with
+ * fs/namespace.c, thus here instead of fs/proc
+ *
+ */
+#include <linux/mnt_namespace.h>
+#include <linux/nsproxy.h>
+#include <linux/security.h>
+#include <linux/fs_struct.h>
+#include "proc/internal.h" /* only for get_proc_task() in ->open() */
+
+#include "pnode.h"
+#include "internal.h"
+
+static unsigned mounts_poll(struct file *file, poll_table *wait)
+{
+ struct proc_mounts *p = file->private_data;
+ struct mnt_namespace *ns = p->ns;
+ unsigned res = POLLIN | POLLRDNORM;
+
+ poll_wait(file, &p->ns->poll, wait);
+
+ br_read_lock(vfsmount_lock);
+ if (p->m.poll_event != ns->event) {
+ p->m.poll_event = ns->event;
+ res |= POLLERR | POLLPRI;
+ }
+ br_read_unlock(vfsmount_lock);
+
+ return res;
+}
+
+struct proc_fs_info {
+ int flag;
+ const char *str;
+};
+
+static int show_sb_opts(struct seq_file *m, struct super_block *sb)
+{
+ static const struct proc_fs_info fs_info[] = {
+ { MS_SYNCHRONOUS, ",sync" },
+ { MS_DIRSYNC, ",dirsync" },
+ { MS_MANDLOCK, ",mand" },
+ { 0, NULL }
+ };
+ const struct proc_fs_info *fs_infop;
+
+ for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
+ if (sb->s_flags & fs_infop->flag)
+ seq_puts(m, fs_infop->str);
+ }
+
+ return security_sb_show_options(m, sb);
+}
+
+static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
+{
+ static const struct proc_fs_info mnt_info[] = {
+ { MNT_NOSUID, ",nosuid" },
+ { MNT_NODEV, ",nodev" },
+ { MNT_NOEXEC, ",noexec" },
+ { MNT_NOATIME, ",noatime" },
+ { MNT_NODIRATIME, ",nodiratime" },
+ { MNT_RELATIME, ",relatime" },
+ { 0, NULL }
+ };
+ const struct proc_fs_info *fs_infop;
+
+ for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
+ if (mnt->mnt_flags & fs_infop->flag)
+ seq_puts(m, fs_infop->str);
+ }
+}
+
+static inline void mangle(struct seq_file *m, const char *s)
+{
+ seq_escape(m, s, " \t\n\\");
+}
+
+static void show_type(struct seq_file *m, struct super_block *sb)
+{
+ mangle(m, sb->s_type->name);
+ if (sb->s_subtype && sb->s_subtype[0]) {
+ seq_putc(m, '.');
+ mangle(m, sb->s_subtype);
+ }
+}
+
+static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
+{
+ struct mount *r = real_mount(mnt);
+ int err = 0;
+ struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+ struct super_block *sb = mnt_path.dentry->d_sb;
+
+ if (sb->s_op->show_devname) {
+ err = sb->s_op->show_devname(m, mnt_path.dentry);
+ if (err)
+ goto out;
+ } else {
+ mangle(m, r->mnt_devname ? r->mnt_devname : "none");
+ }
+ seq_putc(m, ' ');
+ seq_path(m, &mnt_path, " \t\n\\");
+ seq_putc(m, ' ');
+ show_type(m, sb);
+ seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
+ err = show_sb_opts(m, sb);
+ if (err)
+ goto out;
+ show_mnt_opts(m, mnt);
+ if (sb->s_op->show_options)
+ err = sb->s_op->show_options(m, mnt_path.dentry);
+ seq_puts(m, " 0 0\n");
+out:
+ return err;
+}
+
+static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
+{
+ struct proc_mounts *p = m->private;
+ struct mount *r = real_mount(mnt);
+ struct super_block *sb = mnt->mnt_sb;
+ struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+ struct path root = p->root;
+ int err = 0;
+
+ seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id,
+ MAJOR(sb->s_dev), MINOR(sb->s_dev));
+ if (sb->s_op->show_path)
+ err = sb->s_op->show_path(m, mnt->mnt_root);
+ else
+ seq_dentry(m, mnt->mnt_root, " \t\n\\");
+ if (err)
+ goto out;
+ seq_putc(m, ' ');
+
+ /* mountpoints outside of chroot jail will give SEQ_SKIP on this */
+ err = seq_path_root(m, &mnt_path, &root, " \t\n\\");
+ if (err)
+ goto out;
+
+ seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
+ show_mnt_opts(m, mnt);
+
+ /* Tagged fields ("foo:X" or "bar") */
+ if (IS_MNT_SHARED(r))
+ seq_printf(m, " shared:%i", r->mnt_group_id);
+ if (IS_MNT_SLAVE(r)) {
+ int master = r->mnt_master->mnt_group_id;
+ int dom = get_dominating_id(r, &p->root);
+ seq_printf(m, " master:%i", master);
+ if (dom && dom != master)
+ seq_printf(m, " propagate_from:%i", dom);
+ }
+ if (IS_MNT_UNBINDABLE(r))
+ seq_puts(m, " unbindable");
+
+ /* Filesystem specific data */
+ seq_puts(m, " - ");
+ show_type(m, sb);
+ seq_putc(m, ' ');
+ if (sb->s_op->show_devname)
+ err = sb->s_op->show_devname(m, mnt->mnt_root);
+ else
+ mangle(m, r->mnt_devname ? r->mnt_devname : "none");
+ if (err)
+ goto out;
+ seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
+ err = show_sb_opts(m, sb);
+ if (err)
+ goto out;
+ if (sb->s_op->show_options)
+ err = sb->s_op->show_options(m, mnt->mnt_root);
+ seq_putc(m, '\n');
+out:
+ return err;
+}
+
+static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
+{
+ struct mount *r = real_mount(mnt);
+ struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+ struct super_block *sb = mnt_path.dentry->d_sb;
+ int err = 0;
+
+ /* device */
+ if (sb->s_op->show_devname) {
+ seq_puts(m, "device ");
+ err = sb->s_op->show_devname(m, mnt_path.dentry);
+ } else {
+ if (r->mnt_devname) {
+ seq_puts(m, "device ");
+ mangle(m, r->mnt_devname);
+ } else
+ seq_puts(m, "no device");
+ }
+
+ /* mount point */
+ seq_puts(m, " mounted on ");
+ seq_path(m, &mnt_path, " \t\n\\");
+ seq_putc(m, ' ');
+
+ /* file system type */
+ seq_puts(m, "with fstype ");
+ show_type(m, sb);
+
+ /* optional statistics */
+ if (sb->s_op->show_stats) {
+ seq_putc(m, ' ');
+ if (!err)
+ err = sb->s_op->show_stats(m, mnt_path.dentry);
+ }
+
+ seq_putc(m, '\n');
+ return err;
+}
+
+static int mounts_open_common(struct inode *inode, struct file *file,
+ int (*show)(struct seq_file *, struct vfsmount *))
+{
+ struct task_struct *task = get_proc_task(inode);
+ struct nsproxy *nsp;
+ struct mnt_namespace *ns = NULL;
+ struct path root;
+ struct proc_mounts *p;
+ int ret = -EINVAL;
+
+ if (!task)
+ goto err;
+
+ rcu_read_lock();
+ nsp = task_nsproxy(task);
+ if (!nsp) {
+ rcu_read_unlock();
+ put_task_struct(task);
+ goto err;
+ }
+ ns = nsp->mnt_ns;
+ if (!ns) {
+ rcu_read_unlock();
+ put_task_struct(task);
+ goto err;
+ }
+ get_mnt_ns(ns);
+ rcu_read_unlock();
+ task_lock(task);
+ if (!task->fs) {
+ task_unlock(task);
+ put_task_struct(task);
+ ret = -ENOENT;
+ goto err_put_ns;
+ }
+ get_fs_root(task->fs, &root);
+ task_unlock(task);
+ put_task_struct(task);
+
+ ret = -ENOMEM;
+ p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
+ if (!p)
+ goto err_put_path;
+
+ file->private_data = &p->m;
+ ret = seq_open(file, &mounts_op);
+ if (ret)
+ goto err_free;
+
+ p->m.private = p;
+ p->ns = ns;
+ p->root = root;
+ p->m.poll_event = ns->event;
+ p->show = show;
+
+ return 0;
+
+ err_free:
+ kfree(p);
+ err_put_path:
+ path_put(&root);
+ err_put_ns:
+ put_mnt_ns(ns);
+ err:
+ return ret;
+}
+
+static int mounts_release(struct inode *inode, struct file *file)
+{
+ struct proc_mounts *p = file->private_data;
+ path_put(&p->root);
+ put_mnt_ns(p->ns);
+ return seq_release(inode, file);
+}
+
+static int mounts_open(struct inode *inode, struct file *file)
+{
+ return mounts_open_common(inode, file, show_vfsmnt);
+}
+
+static int mountinfo_open(struct inode *inode, struct file *file)
+{
+ return mounts_open_common(inode, file, show_mountinfo);
+}
+
+static int mountstats_open(struct inode *inode, struct file *file)
+{
+ return mounts_open_common(inode, file, show_vfsstat);
+}
+
+const struct file_operations proc_mounts_operations = {
+ .open = mounts_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = mounts_release,
+ .poll = mounts_poll,
+};
+
+const struct file_operations proc_mountinfo_operations = {
+ .open = mountinfo_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = mounts_release,
+ .poll = mounts_poll,
+};
+
+const struct file_operations proc_mountstats_operations = {
+ .open = mountstats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = mounts_release,
+};
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 379a02dc1217..b3b426edb2fd 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -80,7 +80,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
{
struct pstore_private *p = dentry->d_inode->i_private;
- p->psi->erase(p->type, p->id, p->psi);
+ if (p->psi->erase)
+ p->psi->erase(p->type, p->id, p->psi);
return simple_unlink(dir, dentry);
}
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 57bbf9078ac8..9ec22d3b4293 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -122,7 +122,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
memcpy(dst, s1 + s1_start, l1_cpy);
memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
- ret = psinfo->write(PSTORE_TYPE_DMESG, &id, part,
+ ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
hsize + l1_cpy + l2_cpy, psinfo);
if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
pstore_new_entry = 1;
@@ -207,8 +207,7 @@ void pstore_get_records(int quiet)
return;
mutex_lock(&psi->read_mutex);
- rc = psi->open(psi);
- if (rc)
+ if (psi->open && psi->open(psi))
goto out;
while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) {
@@ -219,7 +218,8 @@ void pstore_get_records(int quiet)
if (rc && (rc != -EEXIST || !quiet))
failed++;
}
- psi->close(psi);
+ if (psi->close)
+ psi->close(psi);
out:
mutex_unlock(&psi->read_mutex);
@@ -243,33 +243,5 @@ static void pstore_timefunc(unsigned long dummy)
mod_timer(&pstore_timer, jiffies + PSTORE_INTERVAL);
}
-/*
- * Call platform driver to write a record to the
- * persistent store.
- */
-int pstore_write(enum pstore_type_id type, char *buf, size_t size)
-{
- u64 id;
- int ret;
- unsigned long flags;
-
- if (!psinfo)
- return -ENODEV;
-
- if (size > psinfo->bufsize)
- return -EFBIG;
-
- spin_lock_irqsave(&psinfo->buf_lock, flags);
- memcpy(psinfo->buf, buf, size);
- ret = psinfo->write(type, &id, 0, size, psinfo);
- if (ret == 0 && pstore_is_mounted())
- pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf,
- size, CURRENT_TIME, psinfo);
- spin_unlock_irqrestore(&psinfo->buf_lock, flags);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(pstore_write);
-
module_param(backend, charp, 0444);
MODULE_PARM_DESC(backend, "Pstore backend to use");
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 3bdd21418432..6b009548d2e0 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -179,46 +179,33 @@ static const char *qnx4_checkroot(struct super_block *sb)
struct qnx4_inode_entry *rootdir;
int rd, rl;
int i, j;
- int found = 0;
- if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') {
+ if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/')
return "no qnx4 filesystem (no root dir).";
- } else {
- QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
- rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
- rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
- for (j = 0; j < rl; j++) {
- bh = sb_bread(sb, rd + j); /* root dir, first block */
- if (bh == NULL) {
- return "unable to read root entry.";
- }
- for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) {
- rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
- if (rootdir->di_fname != NULL) {
- QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
- if (!strcmp(rootdir->di_fname,
- QNX4_BMNAME)) {
- found = 1;
- qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL );
- if (!qnx4_sb(sb)->BitMap) {
- brelse (bh);
- return "not enough memory for bitmap inode";
- }
- memcpy( qnx4_sb(sb)->BitMap, rootdir, sizeof( struct qnx4_inode_entry ) ); /* keep bitmap inode known */
- break;
- }
- }
- }
+ QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
+ rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
+ rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
+ for (j = 0; j < rl; j++) {
+ bh = sb_bread(sb, rd + j); /* root dir, first block */
+ if (bh == NULL)
+ return "unable to read root entry.";
+ rootdir = (struct qnx4_inode_entry *) bh->b_data;
+ for (i = 0; i < QNX4_INODES_PER_BLOCK; i++, rootdir++) {
+ QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
+ if (strcmp(rootdir->di_fname, QNX4_BMNAME) != 0)
+ continue;
+ qnx4_sb(sb)->BitMap = kmemdup(rootdir,
+ sizeof(struct qnx4_inode_entry),
+ GFP_KERNEL);
brelse(bh);
- if (found != 0) {
- break;
- }
- }
- if (found == 0) {
- return "bitmap file not found.";
+ if (!qnx4_sb(sb)->BitMap)
+ return "not enough memory for bitmap inode";
+ /* keep bitmap inode known */
+ return NULL;
}
+ brelse(bh);
}
- return NULL;
+ return "bitmap file not found.";
}
static int qnx4_fill_super(struct super_block *s, void *data, int silent)
@@ -269,7 +256,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
if (IS_ERR(root)) {
printk(KERN_ERR "qnx4: get inode failed\n");
ret = PTR_ERR(root);
- goto out;
+ goto outb;
}
ret = -ENOMEM;
@@ -282,6 +269,8 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
outi:
iput(root);
+ outb:
+ kfree(qs->BitMap);
out:
brelse(bh);
outnobh:
@@ -427,7 +416,6 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb)
static void qnx4_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 5b572c89e6c4..46741970371b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -73,7 +73,6 @@
#include <linux/security.h>
#include <linux/kmod.h>
#include <linux/namei.h>
-#include <linux/buffer_head.h>
#include <linux/capability.h>
#include <linux/quotaops.h>
#include "../internal.h" /* ugh */
@@ -2126,6 +2125,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
mutex_unlock(&dqopt->dqio_mutex);
goto out_file_init;
}
+ if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+ dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
mutex_unlock(&dqopt->dqio_mutex);
spin_lock(&dq_state_lock);
dqopt->flags |= dquot_state_flag(flags, type);
@@ -2199,7 +2200,7 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id,
if (error)
return error;
/* Quota file not on the same filesystem? */
- if (path->mnt->mnt_sb != sb)
+ if (path->dentry->d_sb != sb)
error = -EXDEV;
else
error = vfs_load_quota_inode(path->dentry->d_inode, type,
@@ -2465,7 +2466,7 @@ int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
spin_lock(&dq_data_lock);
ii->dqi_bgrace = mi->dqi_bgrace;
ii->dqi_igrace = mi->dqi_igrace;
- ii->dqi_flags = mi->dqi_flags & DQF_MASK;
+ ii->dqi_flags = mi->dqi_flags & DQF_GETINFO_MASK;
ii->dqi_valid = IIF_ALL;
spin_unlock(&dq_data_lock);
mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
@@ -2491,8 +2492,8 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
if (ii->dqi_valid & IIF_IGRACE)
mi->dqi_igrace = ii->dqi_igrace;
if (ii->dqi_valid & IIF_FLAGS)
- mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) |
- (ii->dqi_flags & DQF_MASK);
+ mi->dqi_flags = (mi->dqi_flags & ~DQF_SETINFO_MASK) |
+ (ii->dqi_flags & DQF_SETINFO_MASK);
spin_unlock(&dq_data_lock);
mark_info_dirty(sb, type);
/* Force write to disk */
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 35f4b0ecdeb3..fc2c4388d126 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -13,7 +13,6 @@
#include <linux/kernel.h>
#include <linux/security.h>
#include <linux/syscalls.h>
-#include <linux/buffer_head.h>
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/types.h>
@@ -293,11 +292,26 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
}
}
+/* Return 1 if 'cmd' will block on frozen filesystem */
+static int quotactl_cmd_write(int cmd)
+{
+ switch (cmd) {
+ case Q_GETFMT:
+ case Q_GETINFO:
+ case Q_SYNC:
+ case Q_XGETQSTAT:
+ case Q_XGETQUOTA:
+ case Q_XQUOTASYNC:
+ return 0;
+ }
+ return 1;
+}
+
/*
* look up a superblock on which quota ops will be performed
* - use the name of a block device to find the superblock thereon
*/
-static struct super_block *quotactl_block(const char __user *special)
+static struct super_block *quotactl_block(const char __user *special, int cmd)
{
#ifdef CONFIG_BLOCK
struct block_device *bdev;
@@ -310,7 +324,10 @@ static struct super_block *quotactl_block(const char __user *special)
putname(tmp);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
- sb = get_super(bdev);
+ if (quotactl_cmd_write(cmd))
+ sb = get_super_thawed(bdev);
+ else
+ sb = get_super(bdev);
bdput(bdev);
if (!sb)
return ERR_PTR(-ENODEV);
@@ -362,7 +379,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
pathp = &path;
}
- sb = quotactl_block(special);
+ sb = quotactl_block(special, cmds);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
goto out;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 462ceb38fec6..aec766abe3af 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -52,7 +52,7 @@ static struct backing_dev_info ramfs_backing_dev_info = {
};
struct inode *ramfs_get_inode(struct super_block *sb,
- const struct inode *dir, int mode, dev_t dev)
+ const struct inode *dir, umode_t mode, dev_t dev)
{
struct inode * inode = new_inode(sb);
@@ -92,7 +92,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
*/
/* SMP-safe */
static int
-ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
int error = -ENOSPC;
@@ -106,7 +106,7 @@ ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
return error;
}
-static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
{
int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0);
if (!retval)
@@ -114,7 +114,7 @@ static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
return retval;
}
-static int ramfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
+static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd)
{
return ramfs_mknod(dir, dentry, mode | S_IFREG, 0);
}
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index d1aca1df4f92..70de42f09f1d 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -13,6 +13,7 @@
#include <linux/reiserfs_fs_sb.h>
#include <linux/reiserfs_fs_i.h>
#include <linux/quotaops.h>
+#include <linux/seq_file.h>
#define PREALLOCATION_SIZE 9
@@ -634,6 +635,96 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options)
return 0;
}
+static void print_sep(struct seq_file *seq, int *first)
+{
+ if (!*first)
+ seq_puts(seq, ":");
+ else
+ *first = 0;
+}
+
+void show_alloc_options(struct seq_file *seq, struct super_block *s)
+{
+ int first = 1;
+
+ if (SB_ALLOC_OPTS(s) == ((1 << _ALLOC_skip_busy) |
+ (1 << _ALLOC_dirid_groups) | (1 << _ALLOC_packing_groups)))
+ return;
+
+ seq_puts(seq, ",alloc=");
+
+ if (TEST_OPTION(concentrating_formatted_nodes, s)) {
+ print_sep(seq, &first);
+ if (REISERFS_SB(s)->s_alloc_options.border != 10) {
+ seq_printf(seq, "concentrating_formatted_nodes=%d",
+ 100 / REISERFS_SB(s)->s_alloc_options.border);
+ } else
+ seq_puts(seq, "concentrating_formatted_nodes");
+ }
+ if (TEST_OPTION(displacing_large_files, s)) {
+ print_sep(seq, &first);
+ if (REISERFS_SB(s)->s_alloc_options.large_file_size != 16) {
+ seq_printf(seq, "displacing_large_files=%lu",
+ REISERFS_SB(s)->s_alloc_options.large_file_size);
+ } else
+ seq_puts(seq, "displacing_large_files");
+ }
+ if (TEST_OPTION(displacing_new_packing_localities, s)) {
+ print_sep(seq, &first);
+ seq_puts(seq, "displacing_new_packing_localities");
+ }
+ if (TEST_OPTION(old_hashed_relocation, s)) {
+ print_sep(seq, &first);
+ seq_puts(seq, "old_hashed_relocation");
+ }
+ if (TEST_OPTION(new_hashed_relocation, s)) {
+ print_sep(seq, &first);
+ seq_puts(seq, "new_hashed_relocation");
+ }
+ if (TEST_OPTION(dirid_groups, s)) {
+ print_sep(seq, &first);
+ seq_puts(seq, "dirid_groups");
+ }
+ if (TEST_OPTION(oid_groups, s)) {
+ print_sep(seq, &first);
+ seq_puts(seq, "oid_groups");
+ }
+ if (TEST_OPTION(packing_groups, s)) {
+ print_sep(seq, &first);
+ seq_puts(seq, "packing_groups");
+ }
+ if (TEST_OPTION(hashed_formatted_nodes, s)) {
+ print_sep(seq, &first);
+ seq_puts(seq, "hashed_formatted_nodes");
+ }
+ if (TEST_OPTION(skip_busy, s)) {
+ print_sep(seq, &first);
+ seq_puts(seq, "skip_busy");
+ }
+ if (TEST_OPTION(hundredth_slices, s)) {
+ print_sep(seq, &first);
+ seq_puts(seq, "hundredth_slices");
+ }
+ if (TEST_OPTION(old_way, s)) {
+ print_sep(seq, &first);
+ seq_puts(seq, "old_way");
+ }
+ if (TEST_OPTION(displace_based_on_dirid, s)) {
+ print_sep(seq, &first);
+ seq_puts(seq, "displace_based_on_dirid");
+ }
+ if (REISERFS_SB(s)->s_alloc_options.preallocmin != 0) {
+ print_sep(seq, &first);
+ seq_printf(seq, "preallocmin=%d",
+ REISERFS_SB(s)->s_alloc_options.preallocmin);
+ }
+ if (REISERFS_SB(s)->s_alloc_options.preallocsize != 17) {
+ print_sep(seq, &first);
+ seq_printf(seq, "preallocsize=%d",
+ REISERFS_SB(s)->s_alloc_options.preallocsize);
+ }
+}
+
static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint)
{
char *hash_in;
@@ -1273,10 +1364,7 @@ int reiserfs_init_bitmap_cache(struct super_block *sb)
struct reiserfs_bitmap_info *bitmap;
unsigned int bmap_nr = reiserfs_bmap_count(sb);
- /* Avoid lock recursion in fault case */
- reiserfs_write_unlock(sb);
bitmap = vmalloc(sizeof(*bitmap) * bmap_nr);
- reiserfs_write_lock(sb);
if (bitmap == NULL)
return -ENOMEM;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 950f13af0951..9e8cd5acd79c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1766,7 +1766,7 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i
for the fresh inode. This can only be done outside a transaction, so
if we return non-zero, we also end the transaction. */
int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
- struct inode *dir, int mode, const char *symname,
+ struct inode *dir, umode_t mode, const char *symname,
/* 0 for regular, EMTRY_DIR_SIZE for dirs,
strlen (symname) for symlinks) */
loff_t i_size, struct dentry *dentry,
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 4e153051bc75..950e3d1b5c9e 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -55,7 +55,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
break;
}
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
break;
@@ -96,7 +96,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
setflags_out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
break;
}
case REISERFS_IOC_GETVERSION:
@@ -107,7 +107,7 @@ setflags_out:
err = -EPERM;
break;
}
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
break;
if (get_user(inode->i_generation, (int __user *)arg)) {
@@ -117,7 +117,7 @@ setflags_out:
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
setversion_out:
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
break;
default:
err = -ENOTTY;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index eb711060a6f2..c3cf54fd4de3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2678,16 +2678,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
char b[BDEVNAME_SIZE];
int ret;
- /*
- * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS
- * dependency inversion warnings.
- */
- reiserfs_write_unlock(sb);
journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
if (!journal) {
reiserfs_warning(sb, "journal-1256",
"unable to get memory for journal structure");
- reiserfs_write_lock(sb);
return 1;
}
INIT_LIST_HEAD(&journal->j_bitmap_nodes);
@@ -2695,10 +2689,8 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
INIT_LIST_HEAD(&journal->j_working_list);
INIT_LIST_HEAD(&journal->j_journal_list);
journal->j_persistent_trans = 0;
- ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
- reiserfs_bmap_count(sb));
- reiserfs_write_lock(sb);
- if (ret)
+ if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
+ reiserfs_bmap_count(sb)))
goto free_and_return;
allocate_bitmap_nodes(sb);
@@ -2727,27 +2719,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
goto free_and_return;
}
- /*
- * We need to unlock here to avoid creating the following
- * dependency:
- * reiserfs_lock -> sysfs_mutex
- * Because the reiserfs mmap path creates the following dependency:
- * mm->mmap -> reiserfs_lock, hence we have
- * mm->mmap -> reiserfs_lock ->sysfs_mutex
- * This would ends up in a circular dependency with sysfs readdir path
- * which does sysfs_mutex -> mm->mmap_sem
- * This is fine because the reiserfs lock is useless in mount path,
- * at least until we call journal_begin. We keep it for paranoid
- * reasons.
- */
- reiserfs_write_unlock(sb);
if (journal_init_dev(sb, journal, j_dev_name) != 0) {
- reiserfs_write_lock(sb);
reiserfs_warning(sb, "sh-462",
"unable to initialize jornal device");
goto free_and_return;
}
- reiserfs_write_lock(sb);
rs = SB_DISK_SUPER_BLOCK(sb);
@@ -2829,9 +2805,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
journal->j_mount_id = 10;
journal->j_state = 0;
atomic_set(&(journal->j_jlock), 0);
- reiserfs_write_unlock(sb);
journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
- reiserfs_write_lock(sb);
journal->j_cnode_free_orig = journal->j_cnode_free_list;
journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
journal->j_cnode_used = 0;
@@ -2848,24 +2822,37 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
init_journal_hash(sb);
jl = journal->j_current_jl;
+
+ /*
+ * get_list_bitmap() may call flush_commit_list() which
+ * requires the lock. Calling flush_commit_list() shouldn't happen
+ * this early but I like to be paranoid.
+ */
+ reiserfs_write_lock(sb);
jl->j_list_bitmap = get_list_bitmap(sb, jl);
+ reiserfs_write_unlock(sb);
if (!jl->j_list_bitmap) {
reiserfs_warning(sb, "journal-2005",
"get_list_bitmap failed for journal list 0");
goto free_and_return;
}
- if (journal_read(sb) < 0) {
+
+ /*
+ * Journal_read needs to be inspected in order to push down
+ * the lock further inside (or even remove it).
+ */
+ reiserfs_write_lock(sb);
+ ret = journal_read(sb);
+ reiserfs_write_unlock(sb);
+ if (ret < 0) {
reiserfs_warning(sb, "reiserfs-2006",
"Replay Failure, unable to mount");
goto free_and_return;
}
reiserfs_mounted_fs_count++;
- if (reiserfs_mounted_fs_count <= 1) {
- reiserfs_write_unlock(sb);
+ if (reiserfs_mounted_fs_count <= 1)
commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
- reiserfs_write_lock(sb);
- }
INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
journal->j_work_sb = sb;
@@ -2896,14 +2883,13 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
journal->j_cnode_free < (journal->j_trans_max * 3)) {
return 1;
}
- /* protected by the BKL here */
+
journal->j_len_alloc += new_alloc;
th->t_blocks_allocated += new_alloc ;
return 0;
}
-/* this must be called inside a transaction, and requires the
-** kernel_lock to be held
+/* this must be called inside a transaction
*/
void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
{
@@ -2914,8 +2900,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
return;
}
-/* this must be called without a transaction started, and does not
-** require BKL
+/* this must be called without a transaction started
*/
void reiserfs_allow_writes(struct super_block *s)
{
@@ -2924,8 +2909,7 @@ void reiserfs_allow_writes(struct super_block *s)
wake_up(&journal->j_join_wait);
}
-/* this must be called without a transaction started, and does not
-** require BKL
+/* this must be called without a transaction started
*/
void reiserfs_wait_on_write_block(struct super_block *s)
{
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 80058e8ce361..146378865239 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -559,7 +559,7 @@ static int drop_new_inode(struct inode *inode)
** outside of a transaction, so we had to pull some bits of
** reiserfs_new_inode out into this func.
*/
-static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
+static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
{
/* Make inode invalid - just in case we are going to drop it before
* the initialization happens */
@@ -572,7 +572,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
return 0;
}
-static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
+static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
int retval;
@@ -643,7 +643,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
return retval;
}
-static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
int retval;
@@ -721,7 +721,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
return retval;
}
-static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int retval;
struct inode *inode;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 14363b96b6af..e12d8b97cd4d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -28,6 +28,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/crc32.h>
+#include <linux/seq_file.h>
struct file_system_type reiserfs_fs_type;
@@ -61,6 +62,7 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
static int reiserfs_remount(struct super_block *s, int *flags, char *data);
static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
+void show_alloc_options(struct seq_file *seq, struct super_block *s);
static int reiserfs_sync_fs(struct super_block *s, int wait)
{
@@ -453,16 +455,20 @@ int remove_save_link(struct inode *inode, int truncate)
static void reiserfs_kill_sb(struct super_block *s)
{
if (REISERFS_SB(s)) {
- if (REISERFS_SB(s)->xattr_root) {
- d_invalidate(REISERFS_SB(s)->xattr_root);
- dput(REISERFS_SB(s)->xattr_root);
- REISERFS_SB(s)->xattr_root = NULL;
- }
- if (REISERFS_SB(s)->priv_root) {
- d_invalidate(REISERFS_SB(s)->priv_root);
- dput(REISERFS_SB(s)->priv_root);
- REISERFS_SB(s)->priv_root = NULL;
- }
+ /*
+ * Force any pending inode evictions to occur now. Any
+ * inodes to be removed that have extended attributes
+ * associated with them need to clean them up before
+ * we can release the extended attribute root dentries.
+ * shrink_dcache_for_umount will BUG if we don't release
+ * those before it's called so ->put_super is too late.
+ */
+ shrink_dcache_sb(s);
+
+ dput(REISERFS_SB(s)->xattr_root);
+ REISERFS_SB(s)->xattr_root = NULL;
+ dput(REISERFS_SB(s)->priv_root);
+ REISERFS_SB(s)->priv_root = NULL;
}
kill_block_super(s);
@@ -532,7 +538,6 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
static void reiserfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
}
@@ -597,6 +602,82 @@ out:
reiserfs_write_unlock_once(inode->i_sb, lock_depth);
}
+static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct super_block *s = root->d_sb;
+ struct reiserfs_journal *journal = SB_JOURNAL(s);
+ long opts = REISERFS_SB(s)->s_mount_opt;
+
+ if (opts & (1 << REISERFS_LARGETAIL))
+ seq_puts(seq, ",tails=on");
+ else if (!(opts & (1 << REISERFS_SMALLTAIL)))
+ seq_puts(seq, ",notail");
+ /* tails=small is default so we don't show it */
+
+ if (!(opts & (1 << REISERFS_BARRIER_FLUSH)))
+ seq_puts(seq, ",barrier=none");
+ /* barrier=flush is default so we don't show it */
+
+ if (opts & (1 << REISERFS_ERROR_CONTINUE))
+ seq_puts(seq, ",errors=continue");
+ else if (opts & (1 << REISERFS_ERROR_PANIC))
+ seq_puts(seq, ",errors=panic");
+ /* errors=ro is default so we don't show it */
+
+ if (opts & (1 << REISERFS_DATA_LOG))
+ seq_puts(seq, ",data=journal");
+ else if (opts & (1 << REISERFS_DATA_WRITEBACK))
+ seq_puts(seq, ",data=writeback");
+ /* data=ordered is default so we don't show it */
+
+ if (opts & (1 << REISERFS_ATTRS))
+ seq_puts(seq, ",attrs");
+
+ if (opts & (1 << REISERFS_XATTRS_USER))
+ seq_puts(seq, ",user_xattr");
+
+ if (opts & (1 << REISERFS_EXPOSE_PRIVROOT))
+ seq_puts(seq, ",expose_privroot");
+
+ if (opts & (1 << REISERFS_POSIXACL))
+ seq_puts(seq, ",acl");
+
+ if (REISERFS_SB(s)->s_jdev)
+ seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev);
+
+ if (journal->j_max_commit_age != journal->j_default_max_commit_age)
+ seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
+
+#ifdef CONFIG_QUOTA
+ if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
+ seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]);
+ else if (opts & (1 << REISERFS_USRQUOTA))
+ seq_puts(seq, ",usrquota");
+ if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
+ seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
+ else if (opts & (1 << REISERFS_GRPQUOTA))
+ seq_puts(seq, ",grpquota");
+ if (REISERFS_SB(s)->s_jquota_fmt) {
+ if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_OLD)
+ seq_puts(seq, ",jqfmt=vfsold");
+ else if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_V0)
+ seq_puts(seq, ",jqfmt=vfsv0");
+ }
+#endif
+
+ /* Block allocator options */
+ if (opts & (1 << REISERFS_NO_BORDER))
+ seq_puts(seq, ",block-allocator=noborder");
+ if (opts & (1 << REISERFS_NO_UNHASHED_RELOCATION))
+ seq_puts(seq, ",block-allocator=no_unhashed_relocation");
+ if (opts & (1 << REISERFS_HASHED_RELOCATION))
+ seq_puts(seq, ",block-allocator=hashed_relocation");
+ if (opts & (1 << REISERFS_TEST4))
+ seq_puts(seq, ",block-allocator=test4");
+ show_alloc_options(seq, s);
+ return 0;
+}
+
#ifdef CONFIG_QUOTA
static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
size_t, loff_t);
@@ -617,7 +698,7 @@ static const struct super_operations reiserfs_sops = {
.unfreeze_fs = reiserfs_unfreeze,
.statfs = reiserfs_statfs,
.remount_fs = reiserfs_remount,
- .show_options = generic_show_options,
+ .show_options = reiserfs_show_options,
#ifdef CONFIG_QUOTA
.quota_read = reiserfs_quota_read,
.quota_write = reiserfs_quota_write,
@@ -915,9 +996,9 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
{"jdev",.arg_required = 'j',.values = NULL},
{"nolargeio",.arg_required = 'w',.values = NULL},
{"commit",.arg_required = 'c',.values = NULL},
- {"usrquota",.setmask = 1 << REISERFS_QUOTA},
- {"grpquota",.setmask = 1 << REISERFS_QUOTA},
- {"noquota",.clrmask = 1 << REISERFS_QUOTA},
+ {"usrquota",.setmask = 1 << REISERFS_USRQUOTA},
+ {"grpquota",.setmask = 1 << REISERFS_GRPQUOTA},
+ {"noquota",.clrmask = 1 << REISERFS_USRQUOTA | 1 << REISERFS_GRPQUOTA},
{"errors",.arg_required = 'e',.values = error_actions},
{"usrjquota",.arg_required =
'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
@@ -1031,12 +1112,19 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
return 0;
}
strcpy(qf_names[qtype], arg);
- *mount_options |= 1 << REISERFS_QUOTA;
+ if (qtype == USRQUOTA)
+ *mount_options |= 1 << REISERFS_USRQUOTA;
+ else
+ *mount_options |= 1 << REISERFS_GRPQUOTA;
} else {
if (qf_names[qtype] !=
REISERFS_SB(s)->s_qf_names[qtype])
kfree(qf_names[qtype]);
qf_names[qtype] = NULL;
+ if (qtype == USRQUOTA)
+ *mount_options &= ~(1 << REISERFS_USRQUOTA);
+ else
+ *mount_options &= ~(1 << REISERFS_GRPQUOTA);
}
}
if (c == 'f') {
@@ -1075,9 +1163,10 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
"journaled quota format not specified.");
return 0;
}
- /* This checking is not precise wrt the quota type but for our purposes it is sufficient */
- if (!(*mount_options & (1 << REISERFS_QUOTA))
- && sb_any_quota_loaded(s)) {
+ if ((!(*mount_options & (1 << REISERFS_USRQUOTA)) &&
+ sb_has_quota_loaded(s, USRQUOTA)) ||
+ (!(*mount_options & (1 << REISERFS_GRPQUOTA)) &&
+ sb_has_quota_loaded(s, GRPQUOTA))) {
reiserfs_warning(s, "super-6516", "quota options must "
"be present when quota is turned on.");
return 0;
@@ -1164,7 +1253,8 @@ static void handle_quota_files(struct super_block *s, char **qf_names,
kfree(REISERFS_SB(s)->s_qf_names[i]);
REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
}
- REISERFS_SB(s)->s_jquota_fmt = *qfmt;
+ if (*qfmt)
+ REISERFS_SB(s)->s_jquota_fmt = *qfmt;
}
#endif
@@ -1225,7 +1315,8 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
safe_mask |= 1 << REISERFS_ERROR_RO;
safe_mask |= 1 << REISERFS_ERROR_CONTINUE;
safe_mask |= 1 << REISERFS_ERROR_PANIC;
- safe_mask |= 1 << REISERFS_QUOTA;
+ safe_mask |= 1 << REISERFS_USRQUOTA;
+ safe_mask |= 1 << REISERFS_GRPQUOTA;
/* Update the bitmask, taking care to keep
* the bits we're not allowed to change here */
@@ -1428,9 +1519,7 @@ static int read_super_block(struct super_block *s, int offset)
static int reread_meta_blocks(struct super_block *s)
{
ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
- reiserfs_write_unlock(s);
wait_on_buffer(SB_BUFFER_WITH_SB(s));
- reiserfs_write_lock(s);
if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
reiserfs_warning(s, "reiserfs-2504", "error reading the super");
return 1;
@@ -1655,22 +1744,19 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
mutex_init(&REISERFS_SB(s)->lock);
REISERFS_SB(s)->lock_depth = -1;
- /*
- * This function is called with the bkl, which also was the old
- * locking used here.
- * do_journal_begin() will soon check if we hold the lock (ie: was the
- * bkl). This is likely because do_journal_begin() has several another
- * callers because at this time, it doesn't seem to be necessary to
- * protect against anything.
- * Anyway, let's be conservative and lock for now.
- */
- reiserfs_write_lock(s);
-
jdev_name = NULL;
if (reiserfs_parse_options
(s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
&commit_max_age, qf_names, &qfmt) == 0) {
- goto error;
+ goto error_unlocked;
+ }
+ if (jdev_name && jdev_name[0]) {
+ REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
+ if (!REISERFS_SB(s)->s_jdev) {
+ SWARN(silent, s, "", "Cannot allocate memory for "
+ "journal device name");
+ goto error;
+ }
}
#ifdef CONFIG_QUOTA
handle_quota_files(s, qf_names, &qfmt);
@@ -1678,7 +1764,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if (blocks) {
SWARN(silent, s, "jmacd-7", "resize option for remount only");
- goto error;
+ goto error_unlocked;
}
/* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */
@@ -1688,7 +1774,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
reiserfs_bdevname(s));
- goto error;
+ goto error_unlocked;
}
rs = SB_DISK_SUPER_BLOCK(s);
@@ -1704,7 +1790,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
"or increase size of your LVM partition");
SWARN(silent, s, "", "Or may be you forgot to "
"reboot after fdisk when it told you to");
- goto error;
+ goto error_unlocked;
}
sbi->s_mount_state = SB_REISERFS_STATE(s);
@@ -1712,8 +1798,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if ((errval = reiserfs_init_bitmap_cache(s))) {
SWARN(silent, s, "jmacd-8", "unable to read bitmap");
- goto error;
+ goto error_unlocked;
}
+
errval = -EINVAL;
#ifdef CONFIG_REISERFS_CHECK
SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON");
@@ -1736,24 +1823,26 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if (reiserfs_barrier_flush(s)) {
printk("reiserfs: using flush barriers\n");
}
+
// set_device_ro(s->s_dev, 1) ;
if (journal_init(s, jdev_name, old_format, commit_max_age)) {
SWARN(silent, s, "sh-2022",
"unable to initialize journal space");
- goto error;
+ goto error_unlocked;
} else {
jinit_done = 1; /* once this is set, journal_release must be called
** if we error out of the mount
*/
}
+
if (reread_meta_blocks(s)) {
SWARN(silent, s, "jmacd-9",
"unable to reread meta blocks after journal init");
- goto error;
+ goto error_unlocked;
}
if (replay_only(s))
- goto error;
+ goto error_unlocked;
if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) {
SWARN(silent, s, "clm-7000",
@@ -1767,9 +1856,19 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
reiserfs_init_locked_inode, (void *)(&args));
if (!root_inode) {
SWARN(silent, s, "jmacd-10", "get root inode failed");
- goto error;
+ goto error_unlocked;
}
+ /*
+ * This path assumed to be called with the BKL in the old times.
+ * Now we have inherited the big reiserfs lock from it and many
+ * reiserfs helpers called in the mount path and elsewhere require
+ * this lock to be held even if it's not always necessary. Let's be
+ * conservative and hold it early. The window can be reduced after
+ * careful review of the code.
+ */
+ reiserfs_write_lock(s);
+
if (root_inode->i_state & I_NEW) {
reiserfs_read_locked_inode(root_inode, &args);
unlock_new_inode(root_inode);
@@ -1896,12 +1995,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
return (0);
error:
- if (jinit_done) { /* kill the commit thread, free journal ram */
+ reiserfs_write_unlock(s);
+
+error_unlocked:
+ /* kill the commit thread, free journal ram */
+ if (jinit_done) {
+ reiserfs_write_lock(s);
journal_release_error(NULL, s);
+ reiserfs_write_unlock(s);
}
- reiserfs_write_unlock(s);
-
reiserfs_free_bitmap_cache(s);
if (SB_BUFFER_WITH_SB(s))
brelse(SB_BUFFER_WITH_SB(s));
@@ -2054,12 +2157,13 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
int err;
struct inode *inode;
struct reiserfs_transaction_handle th;
+ int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA;
- if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
+ if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt)))
return -EINVAL;
/* Quotafile not on the same filesystem? */
- if (path->mnt->mnt_sb != sb) {
+ if (path->dentry->d_sb != sb) {
err = -EXDEV;
goto out;
}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6bc346c160e7..c24deda8a8bc 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -66,7 +66,7 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
}
#endif
-static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
BUG_ON(!mutex_is_locked(&dir->i_mutex));
return dir->i_op->mkdir(dir, dentry, mode);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index eed99428f104..e1a7779dd3cb 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -28,9 +28,10 @@ static unsigned long romfs_get_unmapped_area(struct file *file,
struct inode *inode = file->f_mapping->host;
struct mtd_info *mtd = inode->i_sb->s_mtd;
unsigned long isize, offset, maxpages, lpages;
+ int ret;
if (!mtd)
- goto cant_map_directly;
+ return (unsigned long) -ENOSYS;
/* the mapping mustn't extend beyond the EOF */
lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -41,23 +42,20 @@ static unsigned long romfs_get_unmapped_area(struct file *file,
if ((pgoff >= maxpages) || (maxpages - pgoff < lpages))
return (unsigned long) -EINVAL;
- /* we need to call down to the MTD layer to do the actual mapping */
- if (mtd->get_unmapped_area) {
- if (addr != 0)
- return (unsigned long) -EINVAL;
-
- if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
- return (unsigned long) -EINVAL;
+ if (addr != 0)
+ return (unsigned long) -EINVAL;
- offset += ROMFS_I(inode)->i_dataoffset;
- if (offset > mtd->size - len)
- return (unsigned long) -EINVAL;
+ if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
+ return (unsigned long) -EINVAL;
- return mtd->get_unmapped_area(mtd, len, offset, flags);
- }
+ offset += ROMFS_I(inode)->i_dataoffset;
+ if (offset > mtd->size - len)
+ return (unsigned long) -EINVAL;
-cant_map_directly:
- return (unsigned long) -ENOSYS;
+ ret = mtd_get_unmapped_area(mtd, len, offset, flags);
+ if (ret == -EOPNOTSUPP)
+ ret = -ENOSYS;
+ return (unsigned long) ret;
}
/*
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 8b4089f30408..bb36ab74eb45 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -403,7 +403,6 @@ static struct inode *romfs_alloc_inode(struct super_block *sb)
static void romfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
}
diff --git a/fs/select.c b/fs/select.c
index d33418fdc858..e782258d0de3 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -912,7 +912,7 @@ static long do_restart_poll(struct restart_block *restart_block)
}
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
- long, timeout_msecs)
+ int, timeout_msecs)
{
struct timespec end_time, *to = NULL;
int ret;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index dba43c3ea3af..4023d6be939b 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -397,7 +397,7 @@ EXPORT_SYMBOL(seq_printf);
* Returns pointer past last written character in @s, or NULL in case of
* failure.
*/
-char *mangle_path(char *s, char *p, char *esc)
+char *mangle_path(char *s, const char *p, const char *esc)
{
while (s <= p) {
char c = *p++;
@@ -427,7 +427,7 @@ EXPORT_SYMBOL(mangle_path);
* return the absolute path of 'path', as represented by the
* dentry / mnt pair in the path parameter.
*/
-int seq_path(struct seq_file *m, struct path *path, char *esc)
+int seq_path(struct seq_file *m, const struct path *path, const char *esc)
{
char *buf;
size_t size = seq_get_buf(m, &buf);
@@ -450,8 +450,8 @@ EXPORT_SYMBOL(seq_path);
/*
* Same as seq_path, but relative to supplied root.
*/
-int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
- char *esc)
+int seq_path_root(struct seq_file *m, const struct path *path,
+ const struct path *root, const char *esc)
{
char *buf;
size_t size = seq_get_buf(m, &buf);
@@ -480,7 +480,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
/*
* returns the path of the 'dentry' from the root of its filesystem.
*/
-int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
+int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
{
char *buf;
size_t size = seq_get_buf(m, &buf);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 492465b451dd..7ae2a574cb25 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -30,6 +30,21 @@
#include <linux/signalfd.h>
#include <linux/syscalls.h>
+void signalfd_cleanup(struct sighand_struct *sighand)
+{
+ wait_queue_head_t *wqh = &sighand->signalfd_wqh;
+ /*
+ * The lockless check can race with remove_wait_queue() in progress,
+ * but in this case its caller should run under rcu_read_lock() and
+ * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return.
+ */
+ if (likely(!waitqueue_active(wqh)))
+ return;
+
+ /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */
+ wake_up_poll(wqh, POLLHUP | POLLFREE);
+}
+
struct signalfd_ctx {
sigset_t sigmask;
};
diff --git a/fs/splice.c b/fs/splice.c
index fa2defa8afcf..1ec0493266b3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -25,7 +25,6 @@
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
-#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index f744be98cd5a..af0b73802592 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -70,11 +70,15 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
spin_lock(&cache->lock);
while (1) {
- for (i = 0; i < cache->entries; i++)
- if (cache->entry[i].block == block)
+ for (i = cache->curr_blk, n = 0; n < cache->entries; n++) {
+ if (cache->entry[i].block == block) {
+ cache->curr_blk = i;
break;
+ }
+ i = (i + 1) % cache->entries;
+ }
- if (i == cache->entries) {
+ if (n == cache->entries) {
/*
* Block not in cache, if all cache entries are used
* go to sleep waiting for one to become available.
@@ -245,6 +249,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
goto cleanup;
}
+ cache->curr_blk = 0;
cache->next_blk = 0;
cache->unused = entries;
cache->entries = entries;
@@ -332,17 +337,20 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer,
u64 *block, int *offset, int length)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
- int bytes, copied = length;
+ int bytes, res = length;
struct squashfs_cache_entry *entry;
TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
while (length) {
entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
- if (entry->error)
- return entry->error;
- else if (*offset >= entry->length)
- return -EIO;
+ if (entry->error) {
+ res = entry->error;
+ goto error;
+ } else if (*offset >= entry->length) {
+ res = -EIO;
+ goto error;
+ }
bytes = squashfs_copy_data(buffer, entry, *offset, length);
if (buffer)
@@ -358,7 +366,11 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer,
squashfs_cache_put(entry);
}
- return copied;
+ return res;
+
+error:
+ squashfs_cache_put(entry);
+ return res;
}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index fd7b3b3bda13..81afbccfa843 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -208,8 +208,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_op = &squashfs_inode_ops;
inode->i_fop = &generic_ro_fops;
inode->i_mode |= S_IFREG;
- inode->i_blocks = ((inode->i_size -
- le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1;
+ inode->i_blocks = (inode->i_size -
+ le64_to_cpu(sqsh_ino->sparse) + 511) >> 9;
squashfs_i(inode)->fragment_block = frag_blk;
squashfs_i(inode)->fragment_size = frag_size;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 651f0b31d296..52934a22f296 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -28,6 +28,7 @@
struct squashfs_cache {
char *name;
int entries;
+ int curr_blk;
int next_blk;
int num_waiters;
int unused;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 2da1715452ac..ecaa2f7bdb8f 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -290,7 +290,7 @@ handle_fragments:
check_directory_table:
/* Sanity check directory_table */
- if (msblk->directory_table >= next_table) {
+ if (msblk->directory_table > next_table) {
err = -EINVAL;
goto failed_mount;
}
@@ -464,7 +464,6 @@ static struct inode *squashfs_alloc_inode(struct super_block *sb)
static void squashfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
}
diff --git a/fs/statfs.c b/fs/statfs.c
index 9cf04a118965..2aa6a22e0be2 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -7,6 +7,7 @@
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/uaccess.h>
+#include "internal.h"
static int flags_by_mnt(int mnt_flags)
{
@@ -45,7 +46,7 @@ static int calculate_f_flags(struct vfsmount *mnt)
flags_by_sb(mnt->mnt_sb->s_flags);
}
-int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
+static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
{
int retval;
@@ -205,19 +206,23 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user
return error;
}
-SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
+int vfs_ustat(dev_t dev, struct kstatfs *sbuf)
{
- struct super_block *s;
- struct ustat tmp;
- struct kstatfs sbuf;
+ struct super_block *s = user_get_super(dev);
int err;
-
- s = user_get_super(new_decode_dev(dev));
if (!s)
return -EINVAL;
- err = statfs_by_dentry(s->s_root, &sbuf);
+ err = statfs_by_dentry(s->s_root, sbuf);
drop_super(s);
+ return err;
+}
+
+SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
+{
+ struct ustat tmp;
+ struct kstatfs sbuf;
+ int err = vfs_ustat(new_decode_dev(dev), &sbuf);
if (err)
return err;
diff --git a/fs/super.c b/fs/super.c
index afd0f1ad45e0..6277ec6cb60a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -136,12 +136,13 @@ static struct super_block *alloc_super(struct file_system_type *type)
INIT_LIST_HEAD(&s->s_files);
#endif
s->s_bdi = &default_backing_dev_info;
- INIT_LIST_HEAD(&s->s_instances);
+ INIT_HLIST_NODE(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
INIT_LIST_HEAD(&s->s_dentry_lru);
INIT_LIST_HEAD(&s->s_inode_lru);
spin_lock_init(&s->s_inode_lru_lock);
+ INIT_LIST_HEAD(&s->s_mounts);
init_rwsem(&s->s_umount);
mutex_init(&s->s_lock);
lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -200,6 +201,7 @@ static inline void destroy_super(struct super_block *s)
free_percpu(s->s_files);
#endif
security_sb_free(s);
+ WARN_ON(!list_empty(&s->s_mounts));
kfree(s->s_subtype);
kfree(s->s_options);
kfree(s);
@@ -210,7 +212,7 @@ static inline void destroy_super(struct super_block *s)
/*
* Drop a superblock's refcount. The caller must hold sb_lock.
*/
-void __put_super(struct super_block *sb)
+static void __put_super(struct super_block *sb)
{
if (!--sb->s_count) {
list_del_init(&sb->s_list);
@@ -225,7 +227,7 @@ void __put_super(struct super_block *sb)
* Drops a temporary reference, frees superblock if there's no
* references left.
*/
-void put_super(struct super_block *sb)
+static void put_super(struct super_block *sb)
{
spin_lock(&sb_lock);
__put_super(sb);
@@ -328,7 +330,7 @@ static int grab_super(struct super_block *s) __releases(sb_lock)
bool grab_super_passive(struct super_block *sb)
{
spin_lock(&sb_lock);
- if (list_empty(&sb->s_instances)) {
+ if (hlist_unhashed(&sb->s_instances)) {
spin_unlock(&sb_lock);
return false;
}
@@ -337,7 +339,7 @@ bool grab_super_passive(struct super_block *sb)
spin_unlock(&sb_lock);
if (down_read_trylock(&sb->s_umount)) {
- if (sb->s_root)
+ if (sb->s_root && (sb->s_flags & MS_BORN))
return true;
up_read(&sb->s_umount);
}
@@ -400,7 +402,7 @@ void generic_shutdown_super(struct super_block *sb)
}
spin_lock(&sb_lock);
/* should be initialized for __put_super_and_need_restart() */
- list_del_init(&sb->s_instances);
+ hlist_del_init(&sb->s_instances);
spin_unlock(&sb_lock);
up_write(&sb->s_umount);
}
@@ -420,13 +422,14 @@ struct super_block *sget(struct file_system_type *type,
void *data)
{
struct super_block *s = NULL;
+ struct hlist_node *node;
struct super_block *old;
int err;
retry:
spin_lock(&sb_lock);
if (test) {
- list_for_each_entry(old, &type->fs_supers, s_instances) {
+ hlist_for_each_entry(old, node, &type->fs_supers, s_instances) {
if (!test(old, data))
continue;
if (!grab_super(old))
@@ -462,7 +465,7 @@ retry:
s->s_type = type;
strlcpy(s->s_id, type->name, sizeof(s->s_id));
list_add_tail(&s->s_list, &super_blocks);
- list_add(&s->s_instances, &type->fs_supers);
+ hlist_add_head(&s->s_instances, &type->fs_supers);
spin_unlock(&sb_lock);
get_filesystem(type);
register_shrinker(&s->s_shrink);
@@ -497,14 +500,14 @@ void sync_supers(void)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- if (list_empty(&sb->s_instances))
+ if (hlist_unhashed(&sb->s_instances))
continue;
if (sb->s_op->write_super && sb->s_dirt) {
sb->s_count++;
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
- if (sb->s_root && sb->s_dirt)
+ if (sb->s_root && sb->s_dirt && (sb->s_flags & MS_BORN))
sb->s_op->write_super(sb);
up_read(&sb->s_umount);
@@ -533,13 +536,13 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- if (list_empty(&sb->s_instances))
+ if (hlist_unhashed(&sb->s_instances))
continue;
sb->s_count++;
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
- if (sb->s_root)
+ if (sb->s_root && (sb->s_flags & MS_BORN))
f(sb, arg);
up_read(&sb->s_umount);
@@ -566,14 +569,15 @@ void iterate_supers_type(struct file_system_type *type,
void (*f)(struct super_block *, void *), void *arg)
{
struct super_block *sb, *p = NULL;
+ struct hlist_node *node;
spin_lock(&sb_lock);
- list_for_each_entry(sb, &type->fs_supers, s_instances) {
+ hlist_for_each_entry(sb, node, &type->fs_supers, s_instances) {
sb->s_count++;
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
- if (sb->s_root)
+ if (sb->s_root && (sb->s_flags & MS_BORN))
f(sb, arg);
up_read(&sb->s_umount);
@@ -607,14 +611,14 @@ struct super_block *get_super(struct block_device *bdev)
spin_lock(&sb_lock);
rescan:
list_for_each_entry(sb, &super_blocks, s_list) {
- if (list_empty(&sb->s_instances))
+ if (hlist_unhashed(&sb->s_instances))
continue;
if (sb->s_bdev == bdev) {
sb->s_count++;
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
/* still alive? */
- if (sb->s_root)
+ if (sb->s_root && (sb->s_flags & MS_BORN))
return sb;
up_read(&sb->s_umount);
/* nope, got unmounted */
@@ -630,6 +634,28 @@ rescan:
EXPORT_SYMBOL(get_super);
/**
+ * get_super_thawed - get thawed superblock of a device
+ * @bdev: device to get the superblock for
+ *
+ * Scans the superblock list and finds the superblock of the file system
+ * mounted on the device. The superblock is returned once it is thawed
+ * (or immediately if it was not frozen). %NULL is returned if no match
+ * is found.
+ */
+struct super_block *get_super_thawed(struct block_device *bdev)
+{
+ while (1) {
+ struct super_block *s = get_super(bdev);
+ if (!s || s->s_frozen == SB_UNFROZEN)
+ return s;
+ up_read(&s->s_umount);
+ vfs_check_frozen(s, SB_FREEZE_WRITE);
+ put_super(s);
+ }
+}
+EXPORT_SYMBOL(get_super_thawed);
+
+/**
* get_active_super - get an active reference to the superblock of a device
* @bdev: device to get the superblock for
*
@@ -647,7 +673,7 @@ struct super_block *get_active_super(struct block_device *bdev)
restart:
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- if (list_empty(&sb->s_instances))
+ if (hlist_unhashed(&sb->s_instances))
continue;
if (sb->s_bdev == bdev) {
if (grab_super(sb)) /* drops sb_lock */
@@ -667,14 +693,14 @@ struct super_block *user_get_super(dev_t dev)
spin_lock(&sb_lock);
rescan:
list_for_each_entry(sb, &super_blocks, s_list) {
- if (list_empty(&sb->s_instances))
+ if (hlist_unhashed(&sb->s_instances))
continue;
if (sb->s_dev == dev) {
sb->s_count++;
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
/* still alive? */
- if (sb->s_root)
+ if (sb->s_root && (sb->s_flags & MS_BORN))
return sb;
up_read(&sb->s_umount);
/* nope, got unmounted */
@@ -719,23 +745,29 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
/* If we are remounting RDONLY and current sb is read/write,
make sure there are no rw files opened */
if (remount_ro) {
- if (force)
+ if (force) {
mark_files_ro(sb);
- else if (!fs_may_remount_ro(sb))
- return -EBUSY;
+ } else {
+ retval = sb_prepare_remount_readonly(sb);
+ if (retval)
+ return retval;
+ }
}
if (sb->s_op->remount_fs) {
retval = sb->s_op->remount_fs(sb, &flags, data);
if (retval) {
if (!force)
- return retval;
+ goto cancel_readonly;
/* If forced remount, go ahead despite any errors */
WARN(1, "forced remount of a %s fs returned %i\n",
sb->s_type->name, retval);
}
}
sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
+ /* Needs to be ordered wrt mnt_is_readonly() */
+ smp_wmb();
+ sb->s_readonly_remount = 0;
/*
* Some filesystems modify their metadata via some other path than the
@@ -748,6 +780,10 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
if (remount_ro && sb->s_bdev)
invalidate_bdev(sb->s_bdev);
return 0;
+
+cancel_readonly:
+ sb->s_readonly_remount = 0;
+ return retval;
}
static void do_emergency_remount(struct work_struct *work)
@@ -756,12 +792,13 @@ static void do_emergency_remount(struct work_struct *work)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- if (list_empty(&sb->s_instances))
+ if (hlist_unhashed(&sb->s_instances))
continue;
sb->s_count++;
spin_unlock(&sb_lock);
down_write(&sb->s_umount);
- if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
+ if (sb->s_root && sb->s_bdev && (sb->s_flags & MS_BORN) &&
+ !(sb->s_flags & MS_RDONLY)) {
/*
* What lock protects sb->s_flags??
*/
@@ -1144,6 +1181,11 @@ int freeze_super(struct super_block *sb)
return -EBUSY;
}
+ if (!(sb->s_flags & MS_BORN)) {
+ up_write(&sb->s_umount);
+ return 0; /* sic - it's "nothing to do" */
+ }
+
if (sb->s_flags & MS_RDONLY) {
sb->s_frozen = SB_FREEZE_TRANS;
smp_wmb();
@@ -1166,6 +1208,8 @@ int freeze_super(struct super_block *sb)
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
sb->s_frozen = SB_UNFROZEN;
+ smp_wmb();
+ wake_up(&sb->s_wait_unfrozen);
deactivate_locked_super(sb);
return ret;
}
diff --git a/fs/sync.c b/fs/sync.c
index 101b8ef901d7..f3501ef39235 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -14,7 +14,6 @@
#include <linux/linkage.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
-#include <linux/buffer_head.h>
#include <linux/backing-dev.h>
#include "internal.h"
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index d4e6080b4b20..00012e31829d 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -493,6 +493,12 @@ int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
const void *ns = NULL;
int err;
+ if (!dir_sd) {
+ WARN(1, KERN_ERR "sysfs: kobject %s without dirent\n",
+ kobject_name(kobj));
+ return -ENOENT;
+ }
+
err = 0;
if (!sysfs_ns_type(dir_sd))
goto out;
@@ -518,7 +524,7 @@ out:
}
int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
- const struct attribute *attr, int type, mode_t amode)
+ const struct attribute *attr, int type, umode_t amode)
{
umode_t mode = (amode & S_IALLUGO) | S_IFREG;
struct sysfs_addrm_cxt acxt;
@@ -618,7 +624,7 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
*
*/
int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
- mode_t mode)
+ umode_t mode)
{
struct sysfs_dirent *sd;
struct iattr newattrs;
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 194414f8298c..dd1701caecc9 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -33,7 +33,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
int error = 0, i;
for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
- mode_t mode = 0;
+ umode_t mode = 0;
/* in update mode, we're changing the permissions or
* visibility. Do this by first removing then
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index c81b22f3ace1..85eb81683a29 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -187,7 +187,7 @@ out:
return error;
}
-static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
+static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
{
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -318,8 +318,11 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
struct sysfs_addrm_cxt acxt;
struct sysfs_dirent *sd;
- if (!dir_sd)
+ if (!dir_sd) {
+ WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
+ name);
return -ENOENT;
+ }
sysfs_addrm_start(&acxt, dir_sd);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index ce29e28b766d..7484a36ee678 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -79,7 +79,7 @@ struct sysfs_dirent {
};
unsigned int s_flags;
- unsigned short s_mode;
+ umode_t s_mode;
ino_t s_ino;
struct sysfs_inode_attrs *s_iattr;
};
@@ -229,7 +229,7 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd,
const struct attribute *attr, int type);
int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
- const struct attribute *attr, int type, mode_t amode);
+ const struct attribute *attr, int type, umode_t amode);
/*
* bin.c
*/
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 0c96c98bd1db..8233b02eccae 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -132,7 +132,7 @@ void sysv_free_inode(struct inode * inode)
brelse(bh);
}
-struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
+struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct sysv_sb_info *sbi = SYSV_SB(sb);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 25ffb3e9a3f8..3da5ce25faf0 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -336,7 +336,6 @@ static struct inode *sysv_alloc_inode(struct super_block *sb)
static void sysv_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
}
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index fa8d43c92bb8..90b54b438789 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -442,7 +442,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size)
int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
- struct super_block *s = mnt->mnt_sb;
+ struct super_block *s = dentry->d_sb;
generic_fillattr(dentry->d_inode, stat);
stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size);
stat->blksize = s->s_blocksize;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e474fbcf8bde..b217797e621b 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -61,7 +61,7 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st
return NULL;
}
-static int sysv_mknod(struct inode * dir, struct dentry * dentry, int mode, dev_t rdev)
+static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, dev_t rdev)
{
struct inode * inode;
int err;
@@ -80,7 +80,7 @@ static int sysv_mknod(struct inode * dir, struct dentry * dentry, int mode, dev_
return err;
}
-static int sysv_create(struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd)
+static int sysv_create(struct inode * dir, struct dentry * dentry, umode_t mode, struct nameidata *nd)
{
return sysv_mknod(dir, dentry, mode, 0);
}
@@ -131,7 +131,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
return add_nondir(dentry, inode);
}
-static int sysv_mkdir(struct inode * dir, struct dentry *dentry, int mode)
+static int sysv_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode)
{
struct inode * inode;
int err = -EMLINK;
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index bb55cdb394bf..0e4b821c5691 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -125,7 +125,7 @@ static inline void dirty_sb(struct super_block *sb)
/* ialloc.c */
extern struct sysv_inode *sysv_raw_inode(struct super_block *, unsigned,
struct buffer_head **);
-extern struct inode * sysv_new_inode(const struct inode *, mode_t);
+extern struct inode * sysv_new_inode(const struct inode *, umode_t);
extern void sysv_free_inode(struct inode *);
extern unsigned long sysv_count_free_inodes(struct super_block *);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index b09ba2dd8b62..f922cbacdb96 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -38,9 +38,6 @@
DEFINE_SPINLOCK(dbg_lock);
-static char dbg_key_buf0[128];
-static char dbg_key_buf1[128];
-
static const char *get_key_fmt(int fmt)
{
switch (fmt) {
@@ -103,8 +100,8 @@ static const char *get_dent_type(int type)
}
}
-static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
- char *buffer)
+const char *dbg_snprintf_key(const struct ubifs_info *c,
+ const union ubifs_key *key, char *buffer, int len)
{
char *p = buffer;
int type = key_type(c, key);
@@ -112,45 +109,34 @@ static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
switch (type) {
case UBIFS_INO_KEY:
- sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key),
- get_key_type(type));
+ len -= snprintf(p, len, "(%lu, %s)",
+ (unsigned long)key_inum(c, key),
+ get_key_type(type));
break;
case UBIFS_DENT_KEY:
case UBIFS_XENT_KEY:
- sprintf(p, "(%lu, %s, %#08x)",
- (unsigned long)key_inum(c, key),
- get_key_type(type), key_hash(c, key));
+ len -= snprintf(p, len, "(%lu, %s, %#08x)",
+ (unsigned long)key_inum(c, key),
+ get_key_type(type), key_hash(c, key));
break;
case UBIFS_DATA_KEY:
- sprintf(p, "(%lu, %s, %u)",
- (unsigned long)key_inum(c, key),
- get_key_type(type), key_block(c, key));
+ len -= snprintf(p, len, "(%lu, %s, %u)",
+ (unsigned long)key_inum(c, key),
+ get_key_type(type), key_block(c, key));
break;
case UBIFS_TRUN_KEY:
- sprintf(p, "(%lu, %s)",
- (unsigned long)key_inum(c, key),
- get_key_type(type));
+ len -= snprintf(p, len, "(%lu, %s)",
+ (unsigned long)key_inum(c, key),
+ get_key_type(type));
break;
default:
- sprintf(p, "(bad key type: %#08x, %#08x)",
- key->u32[0], key->u32[1]);
+ len -= snprintf(p, len, "(bad key type: %#08x, %#08x)",
+ key->u32[0], key->u32[1]);
}
} else
- sprintf(p, "bad key format %d", c->key_fmt);
-}
-
-const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key)
-{
- /* dbg_lock must be held */
- sprintf_key(c, key, dbg_key_buf0);
- return dbg_key_buf0;
-}
-
-const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key)
-{
- /* dbg_lock must be held */
- sprintf_key(c, key, dbg_key_buf1);
- return dbg_key_buf1;
+ len -= snprintf(p, len, "bad key format %d", c->key_fmt);
+ ubifs_assert(len > 0);
+ return p;
}
const char *dbg_ntype(int type)
@@ -319,6 +305,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
int i, n;
union ubifs_key key;
const struct ubifs_ch *ch = node;
+ char key_buf[DBG_KEY_BUF_LEN];
if (dbg_is_tst_rcvry(c))
return;
@@ -474,7 +461,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
const struct ubifs_ino_node *ino = node;
key_read(c, &ino->key, &key);
- printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+ printk(KERN_DEBUG "\tkey %s\n",
+ dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
printk(KERN_DEBUG "\tcreat_sqnum %llu\n",
(unsigned long long)le64_to_cpu(ino->creat_sqnum));
printk(KERN_DEBUG "\tsize %llu\n",
@@ -517,7 +505,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
int nlen = le16_to_cpu(dent->nlen);
key_read(c, &dent->key, &key);
- printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+ printk(KERN_DEBUG "\tkey %s\n",
+ dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
printk(KERN_DEBUG "\tinum %llu\n",
(unsigned long long)le64_to_cpu(dent->inum));
printk(KERN_DEBUG "\ttype %d\n", (int)dent->type);
@@ -541,7 +530,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
key_read(c, &dn->key, &key);
- printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+ printk(KERN_DEBUG "\tkey %s\n",
+ dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
printk(KERN_DEBUG "\tsize %u\n",
le32_to_cpu(dn->size));
printk(KERN_DEBUG "\tcompr_typ %d\n",
@@ -582,7 +572,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
key_read(c, &br->key, &key);
printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n",
i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
- le32_to_cpu(br->len), DBGKEY(&key));
+ le32_to_cpu(br->len),
+ dbg_snprintf_key(c, &key, key_buf,
+ DBG_KEY_BUF_LEN));
}
break;
}
@@ -934,6 +926,7 @@ void dbg_dump_znode(const struct ubifs_info *c,
{
int n;
const struct ubifs_zbranch *zbr;
+ char key_buf[DBG_KEY_BUF_LEN];
spin_lock(&dbg_lock);
if (znode->parent)
@@ -958,12 +951,16 @@ void dbg_dump_znode(const struct ubifs_info *c,
printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key "
"%s\n", n, zbr->znode, zbr->lnum,
zbr->offs, zbr->len,
- DBGKEY(&zbr->key));
+ dbg_snprintf_key(c, &zbr->key,
+ key_buf,
+ DBG_KEY_BUF_LEN));
else
printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key "
"%s\n", n, zbr->znode, zbr->lnum,
zbr->offs, zbr->len,
- DBGKEY(&zbr->key));
+ dbg_snprintf_key(c, &zbr->key,
+ key_buf,
+ DBG_KEY_BUF_LEN));
}
spin_unlock(&dbg_lock);
}
@@ -1260,6 +1257,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
int err, nlen1, nlen2, cmp;
struct ubifs_dent_node *dent1, *dent2;
union ubifs_key key;
+ char key_buf[DBG_KEY_BUF_LEN];
ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
@@ -1290,9 +1288,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
key_read(c, &dent1->key, &key);
if (keys_cmp(c, &zbr1->key, &key)) {
dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
- zbr1->offs, DBGKEY(&key));
+ zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
+ DBG_KEY_BUF_LEN));
dbg_err("but it should have key %s according to tnc",
- DBGKEY(&zbr1->key));
+ dbg_snprintf_key(c, &zbr1->key, key_buf,
+ DBG_KEY_BUF_LEN));
dbg_dump_node(c, dent1);
goto out_free;
}
@@ -1300,9 +1300,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
key_read(c, &dent2->key, &key);
if (keys_cmp(c, &zbr2->key, &key)) {
dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
- zbr1->offs, DBGKEY(&key));
+ zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
+ DBG_KEY_BUF_LEN));
dbg_err("but it should have key %s according to tnc",
- DBGKEY(&zbr2->key));
+ dbg_snprintf_key(c, &zbr2->key, key_buf,
+ DBG_KEY_BUF_LEN));
dbg_dump_node(c, dent2);
goto out_free;
}
@@ -1319,7 +1321,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
dbg_err("2 xent/dent nodes with the same name");
else
dbg_err("bad order of colliding key %s",
- DBGKEY(&key));
+ dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
dbg_dump_node(c, dent1);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 8d9c46810189..ad1a6fee6010 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -169,40 +169,39 @@ struct ubifs_global_debug_info {
spin_unlock(&dbg_lock); \
} while (0)
-const char *dbg_key_str0(const struct ubifs_info *c,
- const union ubifs_key *key);
-const char *dbg_key_str1(const struct ubifs_info *c,
- const union ubifs_key *key);
-
-/*
- * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message
- * macros.
- */
-#define DBGKEY(key) dbg_key_str0(c, (key))
-#define DBGKEY1(key) dbg_key_str1(c, (key))
-
-extern spinlock_t dbg_lock;
-
-#define ubifs_dbg_msg(type, fmt, ...) do { \
- spin_lock(&dbg_lock); \
- pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \
- spin_unlock(&dbg_lock); \
+#define ubifs_dbg_msg(type, fmt, ...) \
+ pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)
+
+#define DBG_KEY_BUF_LEN 32
+#define ubifs_dbg_msg_key(type, key, fmt, ...) do { \
+ char __tmp_key_buf[DBG_KEY_BUF_LEN]; \
+ pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__, \
+ dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN)); \
} while (0)
/* Just a debugging messages not related to any specific UBIFS subsystem */
-#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__)
+#define dbg_msg(fmt, ...) \
+ printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
+ __func__, ##__VA_ARGS__)
+
/* General messages */
#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
/* Additional journal messages */
#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
+#define dbg_jnlk(key, fmt, ...) \
+ ubifs_dbg_msg_key("jnl", key, fmt, ##__VA_ARGS__)
/* Additional TNC messages */
#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
+#define dbg_tnck(key, fmt, ...) \
+ ubifs_dbg_msg_key("tnc", key, fmt, ##__VA_ARGS__)
/* Additional lprops messages */
#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
/* Additional LEB find messages */
#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
/* Additional mount messages */
#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
+#define dbg_mntk(key, fmt, ...) \
+ ubifs_dbg_msg_key("mnt", key, fmt, ##__VA_ARGS__)
/* Additional I/O messages */
#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
/* Additional commit messages */
@@ -218,6 +217,7 @@ extern spinlock_t dbg_lock;
/* Additional recovery messages */
#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
+extern spinlock_t dbg_lock;
extern struct ubifs_global_debug_info ubifs_dbg;
static inline int dbg_is_chk_gen(const struct ubifs_info *c)
@@ -258,6 +258,8 @@ const char *dbg_cstate(int cmt_state);
const char *dbg_jhead(int jhead);
const char *dbg_get_key_dump(const struct ubifs_info *c,
const union ubifs_key *key);
+const char *dbg_snprintf_key(const struct ubifs_info *c,
+ const union ubifs_key *key, char *buffer, int len);
void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode);
void dbg_dump_node(const struct ubifs_info *c, const void *node);
void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
@@ -345,20 +347,23 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
#define dbg_dump_stack()
#define ubifs_assert_cmt_locked(c)
-#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_jnlk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_tnck(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_mntk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; }
static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; }
@@ -368,6 +373,10 @@ static inline const char *dbg_jhead(int jhead) { return ""; }
static inline const char *
dbg_get_key_dump(const struct ubifs_info *c,
const union ubifs_key *key) { return ""; }
+static inline const char *
+dbg_snprintf_key(const struct ubifs_info *c,
+ const union ubifs_key *key, char *buffer,
+ int len) { return ""; }
static inline void dbg_dump_inode(struct ubifs_info *c,
const struct inode *inode) { return; }
static inline void dbg_dump_node(const struct ubifs_info *c,
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 683492043317..d6fe1c79f18b 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -56,7 +56,7 @@
*
* This function returns the inherited flags.
*/
-static int inherit_flags(const struct inode *dir, int mode)
+static int inherit_flags(const struct inode *dir, umode_t mode)
{
int flags;
const struct ubifs_inode *ui = ubifs_inode(dir);
@@ -86,7 +86,7 @@ static int inherit_flags(const struct inode *dir, int mode)
* case of failure.
*/
struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
- int mode)
+ umode_t mode)
{
struct inode *inode;
struct ubifs_inode *ui;
@@ -253,7 +253,7 @@ out:
return ERR_PTR(err);
}
-static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode,
+static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
struct inode *inode;
@@ -268,7 +268,7 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode,
* parent directory inode.
*/
- dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
+ dbg_gen("dent '%.*s', mode %#hx in dir ino %lu",
dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
err = ubifs_budget_space(c, &req);
@@ -712,7 +712,7 @@ out_cancel:
return err;
}
-static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct ubifs_inode *dir_ui = ubifs_inode(dir);
@@ -725,7 +725,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
* directory inode.
*/
- dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
+ dbg_gen("dent '%.*s', mode %#hx in dir ino %lu",
dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
err = ubifs_budget_space(c, &req);
@@ -769,7 +769,7 @@ out_budg:
}
static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
struct inode *inode;
struct ubifs_inode *ui;
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 548acf494afd..1a7e2d8bdbe9 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -173,12 +173,12 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
* Make sure the file-system is read-write and make sure it
* will not become read-only while we are changing the flags.
*/
- err = mnt_want_write(file->f_path.mnt);
+ err = mnt_want_write_file(file);
if (err)
return err;
dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags);
err = setflags(inode, flags);
- mnt_drop_write(file->f_path.mnt);
+ mnt_drop_write_file(file);
return err;
}
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index cef0460f4c54..2f438ab2e7a2 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -697,9 +697,8 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
struct ubifs_inode *ui = ubifs_inode(inode);
- dbg_jnl("ino %lu, blk %u, len %d, key %s",
- (unsigned long)key_inum(c, key), key_block(c, key), len,
- DBGKEY(key));
+ dbg_jnlk(key, "ino %lu, blk %u, len %d, key ",
+ (unsigned long)key_inum(c, key), key_block(c, key), len);
ubifs_assert(len <= UBIFS_BLOCK_SIZE);
data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
@@ -1177,7 +1176,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
blk = new_size >> UBIFS_BLOCK_SHIFT;
data_key_init(c, &key, inum, blk);
- dbg_jnl("last block key %s", DBGKEY(&key));
+ dbg_jnlk(&key, "last block key ");
err = ubifs_tnc_lookup(c, &key, dn);
if (err == -ENOENT)
dlen = 0; /* Not found (so it is a hole) */
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 6189c74d97f0..66d59d0a1402 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1986,12 +1986,11 @@ again:
if (path[h].in_tree)
continue;
- nnode = kmalloc(sz, GFP_NOFS);
+ nnode = kmemdup(&path[h].nnode, sz, GFP_NOFS);
if (!nnode) {
err = -ENOMEM;
goto out;
}
- memcpy(nnode, &path[h].nnode, sz);
parent = nnode->parent;
parent->nbranch[nnode->iip].nnode = nnode;
path[h].ptr.nnode = nnode;
@@ -2004,12 +2003,11 @@ again:
const size_t sz = sizeof(struct ubifs_pnode);
struct ubifs_nnode *parent;
- pnode = kmalloc(sz, GFP_NOFS);
+ pnode = kmemdup(&path[h].pnode, sz, GFP_NOFS);
if (!pnode) {
err = -ENOMEM;
goto out;
}
- memcpy(pnode, &path[h].pnode, sz);
parent = pnode->parent;
parent->nbranch[pnode->iip].pnode = pnode;
path[h].ptr.pnode = pnode;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ccabaf1164b3..b007637f0406 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -221,8 +221,8 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
{
int err;
- dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum,
- r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key));
+ dbg_mntk(&r->key, "LEB %d:%d len %d deletion %d sqnum %llu key ",
+ r->lnum, r->offs, r->len, r->deletion, r->sqnum);
/* Set c->replay_sqnum to help deal with dangling branches. */
c->replay_sqnum = r->sqnum;
@@ -361,7 +361,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
{
struct replay_entry *r;
- dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+ dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
if (key_inum(c, key) >= c->highest_inum)
c->highest_inum = key_inum(c, key);
@@ -409,7 +409,7 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
struct replay_entry *r;
char *nbuf;
- dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+ dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
if (key_inum(c, key) >= c->highest_inum)
c->highest_inum = key_inum(c, key);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index ae0e76bb6ebf..63765d58445b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -276,7 +276,6 @@ static void ubifs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
struct ubifs_inode *ui = ubifs_inode(inode);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(ubifs_inode_slab, ui);
}
@@ -420,9 +419,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
+static int ubifs_show_options(struct seq_file *s, struct dentry *root)
{
- struct ubifs_info *c = mnt->mnt_sb->s_fs_info;
+ struct ubifs_info *c = root->d_sb->s_fs_info;
if (c->mount_opts.unmount_mode == 2)
seq_printf(s, ",fast_unmount");
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 066738647685..16ad84d8402f 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -344,12 +344,11 @@ static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr,
return err;
}
- lnc_node = kmalloc(zbr->len, GFP_NOFS);
+ lnc_node = kmemdup(node, zbr->len, GFP_NOFS);
if (!lnc_node)
/* We don't have to have the cache, so no error */
return 0;
- memcpy(lnc_node, node, zbr->len);
zbr->leaf = lnc_node;
return 0;
}
@@ -506,7 +505,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
{
int ret;
- dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key));
+ dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs);
ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
zbr->offs);
@@ -520,8 +519,8 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
ret = 0;
}
if (ret == 0 && c->replaying)
- dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
- zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
+ dbg_mntk(key, "dangling branch LEB %d:%d len %d, key ",
+ zbr->lnum, zbr->offs, zbr->len);
return ret;
}
@@ -996,9 +995,9 @@ static int fallible_resolve_collision(struct ubifs_info *c,
if (adding || !o_znode)
return 0;
- dbg_mnt("dangling match LEB %d:%d len %d %s",
+ dbg_mntk(key, "dangling match LEB %d:%d len %d key ",
o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs,
- o_znode->zbranch[o_n].len, DBGKEY(key));
+ o_znode->zbranch[o_n].len);
*zn = o_znode;
*n = o_n;
return 1;
@@ -1180,7 +1179,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
unsigned long time = get_seconds();
- dbg_tnc("search key %s", DBGKEY(key));
+ dbg_tnck(key, "search key ");
ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
znode = c->zroot.znode;
@@ -1316,7 +1315,7 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
unsigned long time = get_seconds();
- dbg_tnc("search and dirty key %s", DBGKEY(key));
+ dbg_tnck(key, "search and dirty key ");
znode = c->zroot.znode;
if (unlikely(!znode)) {
@@ -1723,8 +1722,8 @@ static int validate_data_node(struct ubifs_info *c, void *buf,
if (!keys_eq(c, &zbr->key, &key1)) {
ubifs_err("bad key in node at LEB %d:%d",
zbr->lnum, zbr->offs);
- dbg_tnc("looked for key %s found node's key %s",
- DBGKEY(&zbr->key), DBGKEY1(&key1));
+ dbg_tnck(&zbr->key, "looked for key ");
+ dbg_tnck(&key1, "found node's key ");
goto out_err;
}
@@ -1777,7 +1776,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
ubifs_err("failed to read from LEB %d:%d, error %d",
lnum, offs, err);
dbg_dump_stack();
- dbg_tnc("key %s", DBGKEY(&bu->key));
+ dbg_tnck(&bu->key, "key ");
return err;
}
@@ -1812,7 +1811,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
int found, n, err;
struct ubifs_znode *znode;
- dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
+ dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name);
mutex_lock(&c->tnc_mutex);
found = ubifs_lookup_level0(c, key, &znode, &n);
if (!found) {
@@ -1986,8 +1985,7 @@ again:
zp = znode->parent;
if (znode->child_cnt < c->fanout) {
ubifs_assert(n != c->fanout);
- dbg_tnc("inserted at %d level %d, key %s", n, znode->level,
- DBGKEY(key));
+ dbg_tnck(key, "inserted at %d level %d, key ", n, znode->level);
insert_zbranch(znode, zbr, n);
@@ -2002,7 +2000,7 @@ again:
* Unfortunately, @znode does not have more empty slots and we have to
* split it.
*/
- dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key));
+ dbg_tnck(key, "splitting level %d, key ", znode->level);
if (znode->alt)
/*
@@ -2096,7 +2094,7 @@ do_split:
}
/* Insert new key and branch */
- dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key));
+ dbg_tnck(key, "inserting at %d level %d, key ", n, zn->level);
insert_zbranch(zi, zbr, n);
@@ -2172,7 +2170,7 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key));
+ dbg_tnck(key, "%d:%d, len %d, key ", lnum, offs, len);
found = lookup_level0_dirty(c, key, &znode, &n);
if (!found) {
struct ubifs_zbranch zbr;
@@ -2221,8 +2219,8 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum,
- old_offs, lnum, offs, len, DBGKEY(key));
+ dbg_tnck(key, "old LEB %d:%d, new LEB %d:%d, len %d, key ", old_lnum,
+ old_offs, lnum, offs, len);
found = lookup_level0_dirty(c, key, &znode, &n);
if (found < 0) {
err = found;
@@ -2304,8 +2302,8 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name,
- DBGKEY(key));
+ dbg_tnck(key, "LEB %d:%d, name '%.*s', key ",
+ lnum, offs, nm->len, nm->name);
found = lookup_level0_dirty(c, key, &znode, &n);
if (found < 0) {
err = found;
@@ -2398,7 +2396,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
/* Delete without merge for now */
ubifs_assert(znode->level == 0);
ubifs_assert(n >= 0 && n < c->fanout);
- dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key));
+ dbg_tnck(&znode->zbranch[n].key, "deleting key ");
zbr = &znode->zbranch[n];
lnc_free(zbr);
@@ -2508,7 +2506,7 @@ int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key)
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("key %s", DBGKEY(key));
+ dbg_tnck(key, "key ");
found = lookup_level0_dirty(c, key, &znode, &n);
if (found < 0) {
err = found;
@@ -2539,7 +2537,7 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key));
+ dbg_tnck(key, "%.*s, key ", nm->len, nm->name);
err = lookup_level0_dirty(c, key, &znode, &n);
if (err < 0)
goto out_unlock;
@@ -2654,7 +2652,7 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
dbg_dump_znode(c, znode);
goto out_unlock;
}
- dbg_tnc("removing %s", DBGKEY(key));
+ dbg_tnck(key, "removing key ");
}
if (k) {
for (i = n + 1 + k; i < znode->child_cnt; i++)
@@ -2774,7 +2772,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
struct ubifs_zbranch *zbr;
union ubifs_key *dkey;
- dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key));
+ dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)");
ubifs_assert(is_hash_key(c, key));
mutex_lock(&c->tnc_mutex);
@@ -3333,9 +3331,9 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
out_dump:
block = key_block(c, key);
- ubifs_err("inode %lu has size %lld, but there are data at offset %lld "
- "(data key %s)", (unsigned long)inode->i_ino, size,
- ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key));
+ ubifs_err("inode %lu has size %lld, but there are data at offset %lld",
+ (unsigned long)inode->i_ino, size,
+ ((loff_t)block) << UBIFS_BLOCK_SHIFT);
mutex_unlock(&c->tnc_mutex);
dbg_dump_inode(c, inode);
dbg_dump_stack();
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index b48db999903e..dc28fe6ec07a 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -328,8 +328,8 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
case UBIFS_XENT_KEY:
break;
default:
- dbg_msg("bad key type at slot %d: %s", i,
- DBGKEY(&zbr->key));
+ dbg_msg("bad key type at slot %d: %d",
+ i, key_type(c, &zbr->key));
err = 3;
goto out_dump;
}
@@ -475,7 +475,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
zbr->offs);
if (err) {
- dbg_tnc("key %s", DBGKEY(key));
+ dbg_tnck(key, "key ");
return err;
}
@@ -484,8 +484,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
if (!keys_eq(c, key, &key1)) {
ubifs_err("bad key in node at LEB %d:%d",
zbr->lnum, zbr->offs);
- dbg_tnc("looked for key %s found node's key %s",
- DBGKEY(key), DBGKEY1(&key1));
+ dbg_tnck(key, "looked for key ");
+ dbg_tnck(&key1, "but found node's key ");
dbg_dump_node(c, node);
return -EINVAL;
}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 27f22551f805..12e94774aa88 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1734,7 +1734,7 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
/* dir.c */
struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
- int mode);
+ umode_t mode);
int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index bf18f7a04544..85b272268754 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -138,12 +138,11 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
ui = ubifs_inode(inode);
ui->xattr = 1;
ui->flags |= UBIFS_XATTR_FL;
- ui->data = kmalloc(size, GFP_NOFS);
+ ui->data = kmemdup(value, size, GFP_NOFS);
if (!ui->data) {
err = -ENOMEM;
goto out_free;
}
- memcpy(ui->data, value, size);
inode->i_size = ui->ui_size = size;
ui->data_len = size;
@@ -204,12 +203,11 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
return err;
kfree(ui->data);
- ui->data = kmalloc(size, GFP_NOFS);
+ ui->data = kmemdup(value, size, GFP_NOFS);
if (!ui->data) {
err = -ENOMEM;
goto out_free;
}
- memcpy(ui->data, value, size);
inode->i_size = ui->ui_size = size;
ui->data_len = size;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index d8ffa7cc661d..d567b8448dfc 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -125,7 +125,6 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
err = udf_expand_file_adinicb(inode);
if (err) {
udf_debug("udf_expand_adinicb: err=%d\n", err);
- up_write(&iinfo->i_data_sem);
return err;
}
} else {
@@ -133,9 +132,10 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
iinfo->i_lenAlloc = pos + count;
else
iinfo->i_lenAlloc = inode->i_size;
+ up_write(&iinfo->i_data_sem);
}
- }
- up_write(&iinfo->i_data_sem);
+ } else
+ up_write(&iinfo->i_data_sem);
retval = generic_file_aio_write(iocb, iov, nr_segs, ppos);
if (retval > 0)
@@ -201,12 +201,10 @@ out:
static int udf_release_file(struct inode *inode, struct file *filp)
{
if (filp->f_mode & FMODE_WRITE) {
- mutex_lock(&inode->i_mutex);
down_write(&UDF_I(inode)->i_data_sem);
udf_discard_prealloc(inode);
udf_truncate_tail_extent(inode);
up_write(&UDF_I(inode)->i_data_sem);
- mutex_unlock(&inode->i_mutex);
}
return 0;
}
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 6fb7e0adcda0..05ab48195be9 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -46,7 +46,7 @@ void udf_free_inode(struct inode *inode)
udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
}
-struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
+struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
{
struct super_block *sb = dir->i_sb;
struct udf_sb_info *sbi = UDF_SB(sb);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 4fd1d809738c..7699df7b3198 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -48,13 +48,12 @@ MODULE_LICENSE("GPL");
#define EXTENT_MERGE_SIZE 5
-static mode_t udf_convert_permissions(struct fileEntry *);
+static umode_t udf_convert_permissions(struct fileEntry *);
static int udf_update_inode(struct inode *, int);
static void udf_fill_inode(struct inode *, struct buffer_head *);
static int udf_sync_inode(struct inode *inode);
static int udf_alloc_i_data(struct inode *inode, size_t size);
-static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
- sector_t *, int *);
+static sector_t inode_getblk(struct inode *, sector_t, int *, int *);
static int8_t udf_insert_aext(struct inode *, struct extent_position,
struct kernel_lb_addr, uint32_t);
static void udf_split_extents(struct inode *, int *, int, int,
@@ -151,6 +150,12 @@ const struct address_space_operations udf_aops = {
.bmap = udf_bmap,
};
+/*
+ * Expand file stored in ICB to a normal one-block-file
+ *
+ * This function requires i_data_sem for writing and releases it.
+ * This function requires i_mutex held
+ */
int udf_expand_file_adinicb(struct inode *inode)
{
struct page *page;
@@ -169,9 +174,15 @@ int udf_expand_file_adinicb(struct inode *inode)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
/* from now on we have normal address_space methods */
inode->i_data.a_ops = &udf_aops;
+ up_write(&iinfo->i_data_sem);
mark_inode_dirty(inode);
return 0;
}
+ /*
+ * Release i_data_sem so that we can lock a page - page lock ranks
+ * above i_data_sem. i_mutex still protects us against file changes.
+ */
+ up_write(&iinfo->i_data_sem);
page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
if (!page)
@@ -187,6 +198,7 @@ int udf_expand_file_adinicb(struct inode *inode)
SetPageUptodate(page);
kunmap(page);
}
+ down_write(&iinfo->i_data_sem);
memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00,
iinfo->i_lenAlloc);
iinfo->i_lenAlloc = 0;
@@ -196,17 +208,20 @@ int udf_expand_file_adinicb(struct inode *inode)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
/* from now on we have normal address_space methods */
inode->i_data.a_ops = &udf_aops;
+ up_write(&iinfo->i_data_sem);
err = inode->i_data.a_ops->writepage(page, &udf_wbc);
if (err) {
/* Restore everything back so that we don't lose data... */
lock_page(page);
kaddr = kmap(page);
+ down_write(&iinfo->i_data_sem);
memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
inode->i_size);
kunmap(page);
unlock_page(page);
iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
inode->i_data.a_ops = &udf_adinicb_aops;
+ up_write(&iinfo->i_data_sem);
}
page_cache_release(page);
mark_inode_dirty(inode);
@@ -310,7 +325,6 @@ static int udf_get_block(struct inode *inode, sector_t block,
struct buffer_head *bh_result, int create)
{
int err, new;
- struct buffer_head *bh;
sector_t phys = 0;
struct udf_inode_info *iinfo;
@@ -323,7 +337,6 @@ static int udf_get_block(struct inode *inode, sector_t block,
err = -EIO;
new = 0;
- bh = NULL;
iinfo = UDF_I(inode);
down_write(&iinfo->i_data_sem);
@@ -332,13 +345,10 @@ static int udf_get_block(struct inode *inode, sector_t block,
iinfo->i_next_alloc_goal++;
}
- err = 0;
- bh = inode_getblk(inode, block, &err, &phys, &new);
- BUG_ON(bh);
- if (err)
+ phys = inode_getblk(inode, block, &err, &new);
+ if (!phys)
goto abort;
- BUG_ON(!phys);
if (new)
set_buffer_new(bh_result);
@@ -547,11 +557,10 @@ out:
return err;
}
-static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
- int *err, sector_t *phys, int *new)
+static sector_t inode_getblk(struct inode *inode, sector_t block,
+ int *err, int *new)
{
static sector_t last_block;
- struct buffer_head *result = NULL;
struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
struct extent_position prev_epos, cur_epos, next_epos;
int count = 0, startnum = 0, endnum = 0;
@@ -566,6 +575,8 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
int goal = 0, pgoal = iinfo->i_location.logicalBlockNum;
int lastblock = 0;
+ *err = 0;
+ *new = 0;
prev_epos.offset = udf_file_entry_alloc_offset(inode);
prev_epos.block = iinfo->i_location;
prev_epos.bh = NULL;
@@ -635,8 +646,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
brelse(cur_epos.bh);
brelse(next_epos.bh);
newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
- *phys = newblock;
- return NULL;
+ return newblock;
}
last_block = block;
@@ -664,7 +674,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
brelse(cur_epos.bh);
brelse(next_epos.bh);
*err = ret;
- return NULL;
+ return 0;
}
c = 0;
offset = 0;
@@ -729,7 +739,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
if (!newblocknum) {
brelse(prev_epos.bh);
*err = -ENOSPC;
- return NULL;
+ return 0;
}
iinfo->i_lenExtents += inode->i_sb->s_blocksize;
}
@@ -761,10 +771,10 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
newblock = udf_get_pblock(inode->i_sb, newblocknum,
iinfo->i_location.partitionReferenceNum, 0);
- if (!newblock)
- return NULL;
- *phys = newblock;
- *err = 0;
+ if (!newblock) {
+ *err = -EIO;
+ return 0;
+ }
*new = 1;
iinfo->i_next_alloc_block = block;
iinfo->i_next_alloc_goal = newblocknum;
@@ -775,7 +785,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
else
mark_inode_dirty(inode);
- return result;
+ return newblock;
}
static void udf_split_extents(struct inode *inode, int *c, int offset,
@@ -1111,10 +1121,9 @@ int udf_setsize(struct inode *inode, loff_t newsize)
if (bsize <
(udf_file_entry_alloc_offset(inode) + newsize)) {
err = udf_expand_file_adinicb(inode);
- if (err) {
- up_write(&iinfo->i_data_sem);
+ if (err)
return err;
- }
+ down_write(&iinfo->i_data_sem);
} else
iinfo->i_lenAlloc = newsize;
}
@@ -1452,9 +1461,9 @@ static int udf_alloc_i_data(struct inode *inode, size_t size)
return 0;
}
-static mode_t udf_convert_permissions(struct fileEntry *fe)
+static umode_t udf_convert_permissions(struct fileEntry *fe)
{
- mode_t mode;
+ umode_t mode;
uint32_t permissions;
uint32_t flags;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 4639e137222f..08bf46edf9c4 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -552,7 +552,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
}
-static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
+static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
struct udf_fileident_bh fibh;
@@ -596,7 +596,7 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
return 0;
}
-static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
+static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
struct inode *inode;
@@ -640,7 +640,7 @@ out:
return err;
}
-static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct udf_fileident_bh fibh;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e185253470df..c09a84daaf50 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -89,7 +89,7 @@ static void udf_open_lvid(struct super_block *);
static void udf_close_lvid(struct super_block *);
static unsigned int udf_count_free(struct super_block *);
static int udf_statfs(struct dentry *, struct kstatfs *);
-static int udf_show_options(struct seq_file *, struct vfsmount *);
+static int udf_show_options(struct seq_file *, struct dentry *);
struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
{
@@ -138,7 +138,6 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
static void udf_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(udf_inode_cachep, UDF_I(inode));
}
@@ -196,11 +195,11 @@ struct udf_options {
unsigned int fileset;
unsigned int rootdir;
unsigned int flags;
- mode_t umask;
+ umode_t umask;
gid_t gid;
uid_t uid;
- mode_t fmode;
- mode_t dmode;
+ umode_t fmode;
+ umode_t dmode;
struct nls_table *nls_map;
};
@@ -250,9 +249,9 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count)
return 0;
}
-static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
+static int udf_show_options(struct seq_file *seq, struct dentry *root)
{
- struct super_block *sb = mnt->mnt_sb;
+ struct super_block *sb = root->d_sb;
struct udf_sb_info *sbi = UDF_SB(sb);
if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
@@ -280,11 +279,11 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET))
seq_printf(seq, ",gid=%u", sbi->s_gid);
if (sbi->s_umask != 0)
- seq_printf(seq, ",umask=%o", sbi->s_umask);
+ seq_printf(seq, ",umask=%ho", sbi->s_umask);
if (sbi->s_fmode != UDF_INVALID_MODE)
- seq_printf(seq, ",mode=%o", sbi->s_fmode);
+ seq_printf(seq, ",mode=%ho", sbi->s_fmode);
if (sbi->s_dmode != UDF_INVALID_MODE)
- seq_printf(seq, ",dmode=%o", sbi->s_dmode);
+ seq_printf(seq, ",dmode=%ho", sbi->s_dmode);
if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET))
seq_printf(seq, ",session=%u", sbi->s_session);
if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET))
@@ -1799,6 +1798,12 @@ static void udf_close_lvid(struct super_block *sb)
le16_to_cpu(lvid->descTag.descCRCLength)));
lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+ /*
+ * We set buffer uptodate unconditionally here to avoid spurious
+ * warnings from mark_buffer_dirty() when previous EIO has marked
+ * the buffer as !uptodate
+ */
+ set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
mutex_unlock(&sbi->s_alloc_mutex);
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index b1d4488b0f14..d7c6dbe4194b 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -41,10 +41,16 @@ static void udf_pc_to_char(struct super_block *sb, unsigned char *from,
pc = (struct pathComponent *)(from + elen);
switch (pc->componentType) {
case 1:
- if (pc->lengthComponentIdent == 0) {
- p = to;
- *p++ = '/';
- }
+ /*
+ * Symlink points to some place which should be agreed
+ * upon between originator and receiver of the media. Ignore.
+ */
+ if (pc->lengthComponentIdent > 0)
+ break;
+ /* Fall through */
+ case 2:
+ p = to;
+ *p++ = '/';
break;
case 3:
memcpy(p, "../", 3);
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 5142a82e3276..42ad69ac9576 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -50,7 +50,7 @@
#define UDF_SPARABLE_MAP15 0x1522U
#define UDF_METADATA_MAP25 0x2511U
-#define UDF_INVALID_MODE ((mode_t)-1)
+#define UDF_INVALID_MODE ((umode_t)-1)
#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
@@ -127,11 +127,11 @@ struct udf_sb_info {
struct buffer_head *s_lvid_bh;
/* Default permissions */
- mode_t s_umask;
+ umode_t s_umask;
gid_t s_gid;
uid_t s_uid;
- mode_t s_fmode;
- mode_t s_dmode;
+ umode_t s_fmode;
+ umode_t s_dmode;
/* Lock protecting consistency of above permission settings */
rwlock_t s_cred_lock;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index f34e6fc0cdaa..ebe10314e512 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -215,7 +215,7 @@ extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
/* ialloc.c */
extern void udf_free_inode(struct inode *);
-extern struct inode *udf_new_inode(struct inode *, int, int *);
+extern struct inode *udf_new_inode(struct inode *, umode_t, int *);
/* truncate.c */
extern void udf_truncate_tail_extent(struct inode *);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 78a4c70d46b5..4ec5c1085a87 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -170,7 +170,7 @@ static void ufs2_init_inodes_chunk(struct super_block *sb,
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
-struct inode * ufs_new_inode(struct inode * dir, int mode)
+struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
{
struct super_block * sb;
struct ufs_sb_info * sbi;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 879b13436fa4..9094e1d917be 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -583,7 +583,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
struct super_block *sb = inode->i_sb;
- mode_t mode;
+ umode_t mode;
/*
* Copy data to the in-core inode.
@@ -630,7 +630,7 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
struct super_block *sb = inode->i_sb;
- mode_t mode;
+ umode_t mode;
UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino);
/*
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 639d49162241..38cac199edff 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -70,7 +70,7 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
* If the create succeeds, we fill in the inode information
* with d_instantiate().
*/
-static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
+static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode,
struct nameidata *nd)
{
struct inode *inode;
@@ -94,7 +94,7 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
return err;
}
-static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev)
+static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct inode *inode;
int err;
@@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
return error;
}
-static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
{
struct inode * inode;
int err = -EMLINK;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 3915ade6f9a8..5246ee3e5607 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1351,9 +1351,9 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
return 0;
}
-static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int ufs_show_options(struct seq_file *seq, struct dentry *root)
{
- struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb);
+ struct ufs_sb_info *sbi = UFS_SB(root->d_sb);
unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
const struct match_token *tp = tokens;
@@ -1425,7 +1425,6 @@ static struct inode *ufs_alloc_inode(struct super_block *sb)
static void ufs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
}
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index c26f2bcec264..528750b7e701 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -104,7 +104,7 @@ extern const struct address_space_operations ufs_aops;
/* ialloc.c */
extern void ufs_free_inode (struct inode *inode);
-extern struct inode * ufs_new_inode (struct inode *, int);
+extern struct inode * ufs_new_inode (struct inode *, umode_t);
/* inode.c */
extern struct inode *ufs_iget(struct super_block *, unsigned long);
diff --git a/fs/xattr.c b/fs/xattr.c
index 67583de8218c..82f43376c7cd 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -397,7 +397,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
error = mnt_want_write_file(f);
if (!error) {
error = setxattr(dentry, name, value, size, flags);
- mnt_drop_write(f->f_path.mnt);
+ mnt_drop_write_file(f);
}
fput(f);
return error;
@@ -624,7 +624,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
error = mnt_want_write_file(f);
if (!error) {
error = removexattr(dentry, name);
- mnt_drop_write(f->f_path.mnt);
+ mnt_drop_write_file(f);
}
fput(f);
return error;
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 292eff198030..ab7c53fe346e 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -110,10 +110,4 @@ kmem_zone_destroy(kmem_zone_t *zone)
extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
-static inline int
-kmem_shake_allow(gfp_t gfp_mask)
-{
- return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS));
-}
-
#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 76e4266d2e7e..ac702a6eab9b 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -39,7 +39,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
struct posix_acl_entry *acl_e;
struct posix_acl *acl;
struct xfs_acl_entry *ace;
- int count, i;
+ unsigned int count, i;
count = be32_to_cpu(aclp->acl_cnt);
if (count > XFS_ACL_MAX_ENTRIES)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 574d4ee9b625..74b9baf36ac3 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -111,8 +111,7 @@ xfs_ioend_new_eof(
xfs_fsize_t bsize;
bsize = ioend->io_offset + ioend->io_size;
- isize = MAX(ip->i_size, ip->i_new_size);
- isize = MIN(isize, bsize);
+ isize = MIN(i_size_read(VFS_I(ip)), bsize);
return isize > ip->i_d.di_size ? isize : 0;
}
@@ -126,11 +125,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
}
/*
- * Update on-disk file size now that data has been written to disk. The
- * current in-memory file size is i_size. If a write is beyond eof i_new_size
- * will be the intended file size until i_size is updated. If this write does
- * not extend all the way to the valid file size then restrict this update to
- * the end of the write.
+ * Update on-disk file size now that data has been written to disk.
*
* This function does not block as blocking on the inode lock in IO completion
* can lead to IO completion order dependency deadlocks.. If it can't get the
@@ -1279,6 +1274,15 @@ xfs_end_io_direct_write(
struct xfs_ioend *ioend = iocb->private;
/*
+ * While the generic direct I/O code updates the inode size, it does
+ * so only after the end_io handler is called, which means our
+ * end_io handler thinks the on-disk size is outside the in-core
+ * size. To prevent this just update it a little bit earlier here.
+ */
+ if (offset + size > i_size_read(ioend->io_inode))
+ i_size_write(ioend->io_inode, offset + size);
+
+ /*
* blockdev_direct_IO can return an error even after the I/O
* completion handler was called. Thus we need to protect
* against double-freeing.
@@ -1340,12 +1344,11 @@ xfs_vm_write_failed(
if (to > inode->i_size) {
/*
- * punch out the delalloc blocks we have already allocated. We
- * don't call xfs_setattr() to do this as we may be in the
- * middle of a multi-iovec write and so the vfs inode->i_size
- * will not match the xfs ip->i_size and so it will zero too
- * much. Hence we jus truncate the page cache to zero what is
- * necessary and punch the delalloc blocks directly.
+ * Punch out the delalloc blocks we have already allocated.
+ *
+ * Don't bother with xfs_setattr given that nothing can have
+ * made it to disk yet as the page is still locked at this
+ * point.
*/
struct xfs_inode *ip = XFS_I(inode);
xfs_fileoff_t start_fsb;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 1e5d97f86ea8..08b9ac644c31 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -827,10 +827,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
if (error)
goto out;
- /*
- * Commit the last in the sequence of transactions.
- */
- xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index c1b55e596551..d25eafd4d28d 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -271,10 +271,6 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
dp = args->dp;
mp = dp->i_mount;
dp->i_d.di_forkoff = forkoff;
- dp->i_df.if_ext_max =
- XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
- dp->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
ifp = dp->i_afp;
ASSERT(ifp->if_flags & XFS_IFINLINE);
@@ -326,7 +322,6 @@ xfs_attr_fork_reset(
ASSERT(ip->i_d.di_anextents == 0);
ASSERT(ip->i_afp == NULL);
- ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -389,10 +384,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
(args->op_flags & XFS_DA_OP_ADDNAME) ||
!(mp->m_flags & XFS_MOUNT_ATTR2) ||
dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
- dp->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
- dp->i_df.if_ext_max =
- XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
xfs_trans_log_inode(args->trans, dp,
XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index d0ab78837057..188ef2fbd628 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -249,7 +249,27 @@ xfs_bmbt_lookup_ge(
}
/*
-* Update the record referred to by cur to the value given
+ * Check if the inode needs to be converted to btree format.
+ */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+ return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >
+ XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Check if the inode should be converted to extent format.
+ */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+ return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) <=
+ XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Update the record referred to by cur to the value given
* by [off, bno, len, state].
* This either works (return 0) or gets an EFSCORRUPTED error.
*/
@@ -683,8 +703,8 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist,
&bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
@@ -767,8 +787,8 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur, 1,
&tmp_rval, XFS_DATA_FORK);
@@ -836,8 +856,8 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur,
1, &tmp_rval, XFS_DATA_FORK);
@@ -884,8 +904,7 @@ xfs_bmap_add_extent_delay_real(
}
/* convert to a btree if necessary */
- if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
@@ -1421,8 +1440,7 @@ xfs_bmap_add_extent_unwritten_real(
}
/* convert to a btree if necessary */
- if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(cur == NULL);
@@ -1812,8 +1830,7 @@ xfs_bmap_add_extent_hole_real(
}
/* convert to a btree if necessary */
- if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
@@ -3037,8 +3054,7 @@ xfs_bmap_extents_to_btree(
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
/*
* Make space in the inode incore.
*/
@@ -3184,13 +3200,8 @@ xfs_bmap_forkoff_reset(
ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
- if (dfl_forkoff > ip->i_d.di_forkoff) {
+ if (dfl_forkoff > ip->i_d.di_forkoff)
ip->i_d.di_forkoff = dfl_forkoff;
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
- ip->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
- }
}
}
@@ -3430,8 +3441,6 @@ xfs_bmap_add_attrfork(
int error; /* error return value */
ASSERT(XFS_IFORK_Q(ip) == 0);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
mp = ip->i_mount;
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
@@ -3486,12 +3495,9 @@ xfs_bmap_add_attrfork(
error = XFS_ERROR(EINVAL);
goto error1;
}
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+
ASSERT(ip->i_afp == NULL);
ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
- ip->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
ip->i_afp->if_flags = XFS_IFEXTENTS;
logflags = 0;
xfs_bmap_init(&flist, &firstblock);
@@ -3535,20 +3541,17 @@ xfs_bmap_add_attrfork(
} else
spin_unlock(&mp->m_sb_lock);
}
- if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
+
+ error = xfs_bmap_finish(&tp, &flist, &committed);
+ if (error)
goto error2;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
- return error;
+ return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
error2:
xfs_bmap_cancel(&flist);
error1:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
error0:
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
return error;
}
@@ -3994,11 +3997,8 @@ xfs_bmap_one_block(
xfs_bmbt_irec_t s; /* internal version of extent */
#ifndef DEBUG
- if (whichfork == XFS_DATA_FORK) {
- return S_ISREG(ip->i_d.di_mode) ?
- (ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
- (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
- }
+ if (whichfork == XFS_DATA_FORK)
+ return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
#endif /* !DEBUG */
if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
return 0;
@@ -4010,7 +4010,7 @@ xfs_bmap_one_block(
xfs_bmbt_get_all(ep, &s);
rval = s.br_startoff == 0 && s.br_blockcount == 1;
if (rval && whichfork == XFS_DATA_FORK)
- ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize);
+ ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
return rval;
}
@@ -4379,8 +4379,6 @@ xfs_bmapi_read(
XFS_STATS_INC(xs_blk_mapr);
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, whichfork);
@@ -4871,8 +4869,6 @@ xfs_bmapi_write(
return XFS_ERROR(EIO);
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
XFS_STATS_INC(xs_blk_mapw);
@@ -4981,8 +4977,7 @@ xfs_bmapi_write(
/*
* Transform from btree to extents, give it cur.
*/
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+ if (xfs_bmap_wants_extents(ip, whichfork)) {
int tmp_logflags = 0;
ASSERT(bma.cur);
@@ -4992,10 +4987,10 @@ xfs_bmapi_write(
if (error)
goto error0;
}
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
- XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
+ XFS_IFORK_NEXTENTS(ip, whichfork) >
+ XFS_IFORK_MAXEXT(ip, whichfork));
error = 0;
error0:
/*
@@ -5095,8 +5090,7 @@ xfs_bunmapi(
ASSERT(len > 0);
ASSERT(nexts >= 0);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
if (!(ifp->if_flags & XFS_IFEXTENTS) &&
(error = xfs_iread_extents(tp, ip, whichfork)))
return error;
@@ -5322,7 +5316,8 @@ xfs_bunmapi(
*/
if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
+ XFS_IFORK_MAXEXT(ip, whichfork) &&
del.br_startoff > got.br_startoff &&
del.br_startoff + del.br_blockcount <
got.br_startoff + got.br_blockcount) {
@@ -5353,13 +5348,11 @@ nodelete:
}
}
*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
/*
* Convert to a btree if necessary.
*/
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
ASSERT(cur == NULL);
error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
&cur, 0, &tmp_logflags, whichfork);
@@ -5370,8 +5363,7 @@ nodelete:
/*
* transform from btree to extents, give it cur
*/
- else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+ else if (xfs_bmap_wants_extents(ip, whichfork)) {
ASSERT(cur != NULL);
error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
whichfork);
@@ -5382,8 +5374,6 @@ nodelete:
/*
* transform from extents to local?
*/
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
error = 0;
error0:
/*
@@ -5434,7 +5424,7 @@ xfs_getbmapx_fix_eof_hole(
if (startblock == HOLESTARTBLOCK) {
mp = ip->i_mount;
out->bmv_block = -1;
- fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size));
+ fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
fixlen -= out->bmv_offset;
if (prealloced && out->bmv_offset + out->bmv_length == end) {
/* Came to hole at EOF. Trim it. */
@@ -5522,7 +5512,7 @@ xfs_getbmap(
fixlen = XFS_MAXIOFFSET(mp);
} else {
prealloced = 0;
- fixlen = ip->i_size;
+ fixlen = XFS_ISIZE(ip);
}
}
@@ -5551,7 +5541,7 @@ xfs_getbmap(
xfs_ilock(ip, XFS_IOLOCK_SHARED);
if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
- if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) {
+ if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
if (error)
goto out_unlock_iolock;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index cf0ac056815f..4dff85c7d7eb 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1370,7 +1370,7 @@ restart:
goto restart;
}
/*
- * clear the LRU reference count so the bufer doesn't get
+ * clear the LRU reference count so the buffer doesn't get
* ignored in xfs_buf_rele().
*/
atomic_set(&bp->b_lru_ref, 0);
@@ -1701,12 +1701,8 @@ xfsbufd(
struct list_head tmp;
struct blk_plug plug;
- if (unlikely(freezing(current))) {
- set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
- refrigerator();
- } else {
- clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
- }
+ if (unlikely(freezing(current)))
+ try_to_freeze();
/* sleep for a long time if there is nothing to do. */
if (list_empty(&target->bt_delwri_queue))
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 5bab046e859f..df7ffb0affe7 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -90,8 +90,7 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_DELWRI_Q, "DELWRI_Q" }
typedef enum {
- XBT_FORCE_SLEEP = 0,
- XBT_FORCE_FLUSH = 1,
+ XBT_FORCE_FLUSH = 0,
} xfs_buftarg_flags_t;
typedef struct xfs_buftarg {
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 654dc6f05bac..dd974a55c77d 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -163,12 +163,14 @@ xfs_swap_extents_check_format(
/* Check temp in extent form to max in target */
if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
+ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
+ XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
return EINVAL;
/* Check target in extent form to max in temp */
if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
+ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
+ XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
return EINVAL;
/*
@@ -180,18 +182,25 @@ xfs_swap_extents_check_format(
* (a common defrag case) which will occur when the temp inode is in
* extent format...
*/
- if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
- ((XFS_IFORK_BOFF(ip) &&
- tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
- XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
- return EINVAL;
+ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ if (XFS_IFORK_BOFF(ip) &&
+ tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+ return EINVAL;
+ if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
+ XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+ return EINVAL;
+ }
/* Reciprocal target->temp btree format checks */
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
- ((XFS_IFORK_BOFF(tip) &&
- ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
- return EINVAL;
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ if (XFS_IFORK_BOFF(tip) &&
+ ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+ return EINVAL;
+
+ if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
+ XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+ return EINVAL;
+ }
return 0;
}
@@ -349,16 +358,6 @@ xfs_swap_extents(
*tifp = *tempifp; /* struct copy */
/*
- * Fix the in-memory data fork values that are dependent on the fork
- * offset in the inode. We can't assume they remain the same as attr2
- * has dynamic fork offsets.
- */
- ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
- (uint)sizeof(xfs_bmbt_rec_t);
- tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
- (uint)sizeof(xfs_bmbt_rec_t);
-
- /*
* Fix the on-disk inode values
*/
tmp = (__uint64_t)ip->i_d.di_nblocks;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 8a24f0c6c860..286a051f12cf 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -68,7 +68,7 @@ xfs_trim_extents(
* Look up the longest btree in the AGF and start with it.
*/
error = xfs_alloc_lookup_le(cur, 0,
- XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
+ be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
if (error)
goto out_del_cursor;
@@ -84,7 +84,7 @@ xfs_trim_extents(
if (error)
goto out_del_cursor;
XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
- ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
+ ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
/*
* Too small? Give up.
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 25d7280e9f6b..53db20ee3e77 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -39,20 +39,19 @@
#include "xfs_qm.h"
#include "xfs_trace.h"
-
/*
- LOCK ORDER
-
- inode lock (ilock)
- dquot hash-chain lock (hashlock)
- xqm dquot freelist lock (freelistlock
- mount's dquot list lock (mplistlock)
- user dquot lock - lock ordering among dquots is based on the uid or gid
- group dquot lock - similar to udquots. Between the two dquots, the udquot
- has to be locked first.
- pin lock - the dquot lock must be held to take this lock.
- flush lock - ditto.
-*/
+ * Lock order:
+ *
+ * ip->i_lock
+ * qh->qh_lock
+ * qi->qi_dqlist_lock
+ * dquot->q_qlock (xfs_dqlock() and friends)
+ * dquot->q_flush (xfs_dqflock() and friends)
+ * xfs_Gqm->qm_dqfrlist_lock
+ *
+ * If two dquots need to be locked the order is user before group/project,
+ * otherwise by the lowest id first, see xfs_dqlock2.
+ */
#ifdef DEBUG
xfs_buftarg_t *xfs_dqerror_target;
@@ -64,82 +63,6 @@ int xfs_dqerror_mod = 33;
static struct lock_class_key xfs_dquot_other_class;
/*
- * Allocate and initialize a dquot. We don't always allocate fresh memory;
- * we try to reclaim a free dquot if the number of incore dquots are above
- * a threshold.
- * The only field inside the core that gets initialized at this point
- * is the d_id field. The idea is to fill in the entire q_core
- * when we read in the on disk dquot.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqinit(
- xfs_mount_t *mp,
- xfs_dqid_t id,
- uint type)
-{
- xfs_dquot_t *dqp;
- boolean_t brandnewdquot;
-
- brandnewdquot = xfs_qm_dqalloc_incore(&dqp);
- dqp->dq_flags = type;
- dqp->q_core.d_id = cpu_to_be32(id);
- dqp->q_mount = mp;
-
- /*
- * No need to re-initialize these if this is a reclaimed dquot.
- */
- if (brandnewdquot) {
- INIT_LIST_HEAD(&dqp->q_freelist);
- mutex_init(&dqp->q_qlock);
- init_waitqueue_head(&dqp->q_pinwait);
-
- /*
- * Because we want to use a counting completion, complete
- * the flush completion once to allow a single access to
- * the flush completion without blocking.
- */
- init_completion(&dqp->q_flush);
- complete(&dqp->q_flush);
-
- trace_xfs_dqinit(dqp);
- } else {
- /*
- * Only the q_core portion was zeroed in dqreclaim_one().
- * So, we need to reset others.
- */
- dqp->q_nrefs = 0;
- dqp->q_blkno = 0;
- INIT_LIST_HEAD(&dqp->q_mplist);
- INIT_LIST_HEAD(&dqp->q_hashlist);
- dqp->q_bufoffset = 0;
- dqp->q_fileoffset = 0;
- dqp->q_transp = NULL;
- dqp->q_gdquot = NULL;
- dqp->q_res_bcount = 0;
- dqp->q_res_icount = 0;
- dqp->q_res_rtbcount = 0;
- atomic_set(&dqp->q_pincount, 0);
- dqp->q_hash = NULL;
- ASSERT(list_empty(&dqp->q_freelist));
-
- trace_xfs_dqreuse(dqp);
- }
-
- /*
- * In either case we need to make sure group quotas have a different
- * lock class than user quotas, to make sure lockdep knows we can
- * locks of one of each at the same time.
- */
- if (!(type & XFS_DQ_USER))
- lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
-
- /*
- * log item gets initialized later
- */
- return (dqp);
-}
-
-/*
* This is called to free all the memory associated with a dquot
*/
void
@@ -155,24 +78,6 @@ xfs_qm_dqdestroy(
}
/*
- * This is what a 'fresh' dquot inside a dquot chunk looks like on disk.
- */
-STATIC void
-xfs_qm_dqinit_core(
- xfs_dqid_t id,
- uint type,
- xfs_dqblk_t *d)
-{
- /*
- * Caller has zero'd the entire dquot 'chunk' already.
- */
- d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
- d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
- d->dd_diskdq.d_id = cpu_to_be32(id);
- d->dd_diskdq.d_flags = type;
-}
-
-/*
* If default limits are in force, push them into the dquot now.
* We overwrite the dquot limits only if they are zero and this
* is not the root dquot.
@@ -234,10 +139,10 @@ xfs_qm_adjust_dqtimers(
if (!d->d_btimer) {
if ((d->d_blk_softlimit &&
- (be64_to_cpu(d->d_bcount) >=
+ (be64_to_cpu(d->d_bcount) >
be64_to_cpu(d->d_blk_softlimit))) ||
(d->d_blk_hardlimit &&
- (be64_to_cpu(d->d_bcount) >=
+ (be64_to_cpu(d->d_bcount) >
be64_to_cpu(d->d_blk_hardlimit)))) {
d->d_btimer = cpu_to_be32(get_seconds() +
mp->m_quotainfo->qi_btimelimit);
@@ -246,10 +151,10 @@ xfs_qm_adjust_dqtimers(
}
} else {
if ((!d->d_blk_softlimit ||
- (be64_to_cpu(d->d_bcount) <
+ (be64_to_cpu(d->d_bcount) <=
be64_to_cpu(d->d_blk_softlimit))) &&
(!d->d_blk_hardlimit ||
- (be64_to_cpu(d->d_bcount) <
+ (be64_to_cpu(d->d_bcount) <=
be64_to_cpu(d->d_blk_hardlimit)))) {
d->d_btimer = 0;
}
@@ -257,10 +162,10 @@ xfs_qm_adjust_dqtimers(
if (!d->d_itimer) {
if ((d->d_ino_softlimit &&
- (be64_to_cpu(d->d_icount) >=
+ (be64_to_cpu(d->d_icount) >
be64_to_cpu(d->d_ino_softlimit))) ||
(d->d_ino_hardlimit &&
- (be64_to_cpu(d->d_icount) >=
+ (be64_to_cpu(d->d_icount) >
be64_to_cpu(d->d_ino_hardlimit)))) {
d->d_itimer = cpu_to_be32(get_seconds() +
mp->m_quotainfo->qi_itimelimit);
@@ -269,10 +174,10 @@ xfs_qm_adjust_dqtimers(
}
} else {
if ((!d->d_ino_softlimit ||
- (be64_to_cpu(d->d_icount) <
+ (be64_to_cpu(d->d_icount) <=
be64_to_cpu(d->d_ino_softlimit))) &&
(!d->d_ino_hardlimit ||
- (be64_to_cpu(d->d_icount) <
+ (be64_to_cpu(d->d_icount) <=
be64_to_cpu(d->d_ino_hardlimit)))) {
d->d_itimer = 0;
}
@@ -280,10 +185,10 @@ xfs_qm_adjust_dqtimers(
if (!d->d_rtbtimer) {
if ((d->d_rtb_softlimit &&
- (be64_to_cpu(d->d_rtbcount) >=
+ (be64_to_cpu(d->d_rtbcount) >
be64_to_cpu(d->d_rtb_softlimit))) ||
(d->d_rtb_hardlimit &&
- (be64_to_cpu(d->d_rtbcount) >=
+ (be64_to_cpu(d->d_rtbcount) >
be64_to_cpu(d->d_rtb_hardlimit)))) {
d->d_rtbtimer = cpu_to_be32(get_seconds() +
mp->m_quotainfo->qi_rtbtimelimit);
@@ -292,10 +197,10 @@ xfs_qm_adjust_dqtimers(
}
} else {
if ((!d->d_rtb_softlimit ||
- (be64_to_cpu(d->d_rtbcount) <
+ (be64_to_cpu(d->d_rtbcount) <=
be64_to_cpu(d->d_rtb_softlimit))) &&
(!d->d_rtb_hardlimit ||
- (be64_to_cpu(d->d_rtbcount) <
+ (be64_to_cpu(d->d_rtbcount) <=
be64_to_cpu(d->d_rtb_hardlimit)))) {
d->d_rtbtimer = 0;
}
@@ -328,8 +233,13 @@ xfs_qm_init_dquot_blk(
curid = id - (id % q->qi_dqperchunk);
ASSERT(curid >= 0);
memset(d, 0, BBTOB(q->qi_dqchunklen));
- for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
- xfs_qm_dqinit_core(curid, type, d);
+ for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) {
+ d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+ d->dd_diskdq.d_id = cpu_to_be32(curid);
+ d->dd_diskdq.d_flags = type;
+ }
+
xfs_trans_dquot_buf(tp, bp,
(type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
@@ -564,36 +474,87 @@ xfs_qm_dqtobp(
* Read in the ondisk dquot using dqtobp() then copy it to an incore version,
* and release the buffer immediately.
*
+ * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed.
*/
-/* ARGSUSED */
-STATIC int
+int
xfs_qm_dqread(
- xfs_trans_t **tpp,
- xfs_dqid_t id,
- xfs_dquot_t *dqp, /* dquot to get filled in */
- uint flags)
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ uint flags,
+ struct xfs_dquot **O_dqpp)
{
- xfs_disk_dquot_t *ddqp;
- xfs_buf_t *bp;
- int error;
- xfs_trans_t *tp;
+ struct xfs_dquot *dqp;
+ struct xfs_disk_dquot *ddqp;
+ struct xfs_buf *bp;
+ struct xfs_trans *tp = NULL;
+ int error;
+ int cancelflags = 0;
+
+
+ dqp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
+
+ dqp->dq_flags = type;
+ dqp->q_core.d_id = cpu_to_be32(id);
+ dqp->q_mount = mp;
+ INIT_LIST_HEAD(&dqp->q_freelist);
+ mutex_init(&dqp->q_qlock);
+ init_waitqueue_head(&dqp->q_pinwait);
- ASSERT(tpp);
+ /*
+ * Because we want to use a counting completion, complete
+ * the flush completion once to allow a single access to
+ * the flush completion without blocking.
+ */
+ init_completion(&dqp->q_flush);
+ complete(&dqp->q_flush);
+
+ /*
+ * Make sure group quotas have a different lock class than user
+ * quotas.
+ */
+ if (!(type & XFS_DQ_USER))
+ lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
+
+ atomic_inc(&xfs_Gqm->qm_totaldquots);
trace_xfs_dqread(dqp);
+ if (flags & XFS_QMOPT_DQALLOC) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
+ error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
+ XFS_WRITE_LOG_RES(mp) +
+ /*
+ * Round the chunklen up to the next multiple
+ * of 128 (buf log item chunk size)).
+ */
+ BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128,
+ 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_WRITE_LOG_COUNT);
+ if (error)
+ goto error1;
+ cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+ }
+
/*
* get a pointer to the on-disk dquot and the buffer containing it
* dqp already knows its own type (GROUP/USER).
*/
- if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) {
- return (error);
+ error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags);
+ if (error) {
+ /*
+ * This can happen if quotas got turned off (ESRCH),
+ * or if the dquot didn't exist on disk and we ask to
+ * allocate (ENOENT).
+ */
+ trace_xfs_dqread_fail(dqp);
+ cancelflags |= XFS_TRANS_ABORT;
+ goto error1;
}
- tp = *tpp;
/* copy everything from disk dquot to the incore dquot */
memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
- ASSERT(be32_to_cpu(dqp->q_core.d_id) == id);
xfs_qm_dquot_logitem_init(dqp);
/*
@@ -622,77 +583,22 @@ xfs_qm_dqread(
ASSERT(xfs_buf_islocked(bp));
xfs_trans_brelse(tp, bp);
- return (error);
-}
-
-
-/*
- * allocate an incore dquot from the kernel heap,
- * and fill its core with quota information kept on disk.
- * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk
- * if it wasn't already allocated.
- */
-STATIC int
-xfs_qm_idtodq(
- xfs_mount_t *mp,
- xfs_dqid_t id, /* gid or uid, depending on type */
- uint type, /* UDQUOT or GDQUOT */
- uint flags, /* DQALLOC, DQREPAIR */
- xfs_dquot_t **O_dqpp)/* OUT : incore dquot, not locked */
-{
- xfs_dquot_t *dqp;
- int error;
- xfs_trans_t *tp;
- int cancelflags=0;
-
- dqp = xfs_qm_dqinit(mp, id, type);
- tp = NULL;
- if (flags & XFS_QMOPT_DQALLOC) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
- error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
- XFS_WRITE_LOG_RES(mp) +
- BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
- 128,
- 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT);
- if (error) {
- cancelflags = 0;
- goto error0;
- }
- cancelflags = XFS_TRANS_RELEASE_LOG_RES;
- }
-
- /*
- * Read it from disk; xfs_dqread() takes care of
- * all the necessary initialization of dquot's fields (locks, etc)
- */
- if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) {
- /*
- * This can happen if quotas got turned off (ESRCH),
- * or if the dquot didn't exist on disk and we ask to
- * allocate (ENOENT).
- */
- trace_xfs_dqread_fail(dqp);
- cancelflags |= XFS_TRANS_ABORT;
- goto error0;
- }
if (tp) {
- if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES)))
- goto error1;
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto error0;
}
*O_dqpp = dqp;
- return (0);
+ return error;
- error0:
- ASSERT(error);
+error1:
if (tp)
xfs_trans_cancel(tp, cancelflags);
- error1:
+error0:
xfs_qm_dqdestroy(dqp);
*O_dqpp = NULL;
- return (error);
+ return error;
}
/*
@@ -710,12 +616,9 @@ xfs_qm_dqlookup(
xfs_dquot_t **O_dqpp)
{
xfs_dquot_t *dqp;
- uint flist_locked;
ASSERT(mutex_is_locked(&qh->qh_lock));
- flist_locked = B_FALSE;
-
/*
* Traverse the hashchain looking for a match
*/
@@ -725,70 +628,31 @@ xfs_qm_dqlookup(
* dqlock to look at the id field of the dquot, since the
* id can't be modified without the hashlock anyway.
*/
- if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) {
- trace_xfs_dqlookup_found(dqp);
+ if (be32_to_cpu(dqp->q_core.d_id) != id || dqp->q_mount != mp)
+ continue;
- /*
- * All in core dquots must be on the dqlist of mp
- */
- ASSERT(!list_empty(&dqp->q_mplist));
-
- xfs_dqlock(dqp);
- if (dqp->q_nrefs == 0) {
- ASSERT(!list_empty(&dqp->q_freelist));
- if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
- trace_xfs_dqlookup_want(dqp);
-
- /*
- * We may have raced with dqreclaim_one()
- * (and lost). So, flag that we don't
- * want the dquot to be reclaimed.
- */
- dqp->dq_flags |= XFS_DQ_WANT;
- xfs_dqunlock(dqp);
- mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
- xfs_dqlock(dqp);
- dqp->dq_flags &= ~(XFS_DQ_WANT);
- }
- flist_locked = B_TRUE;
- }
+ trace_xfs_dqlookup_found(dqp);
- /*
- * id couldn't have changed; we had the hashlock all
- * along
- */
- ASSERT(be32_to_cpu(dqp->q_core.d_id) == id);
-
- if (flist_locked) {
- if (dqp->q_nrefs != 0) {
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- flist_locked = B_FALSE;
- } else {
- /* take it off the freelist */
- trace_xfs_dqlookup_freelist(dqp);
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
- }
- }
+ xfs_dqlock(dqp);
+ if (dqp->dq_flags & XFS_DQ_FREEING) {
+ *O_dqpp = NULL;
+ xfs_dqunlock(dqp);
+ return -1;
+ }
- XFS_DQHOLD(dqp);
+ dqp->q_nrefs++;
- if (flist_locked)
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- /*
- * move the dquot to the front of the hashchain
- */
- ASSERT(mutex_is_locked(&qh->qh_lock));
- list_move(&dqp->q_hashlist, &qh->qh_list);
- trace_xfs_dqlookup_done(dqp);
- *O_dqpp = dqp;
- return 0;
- }
+ /*
+ * move the dquot to the front of the hashchain
+ */
+ list_move(&dqp->q_hashlist, &qh->qh_list);
+ trace_xfs_dqlookup_done(dqp);
+ *O_dqpp = dqp;
+ return 0;
}
*O_dqpp = NULL;
- ASSERT(mutex_is_locked(&qh->qh_lock));
- return (1);
+ return 1;
}
/*
@@ -829,11 +693,7 @@ xfs_qm_dqget(
return (EIO);
}
}
-#endif
- again:
-
-#ifdef DEBUG
ASSERT(type == XFS_DQ_USER ||
type == XFS_DQ_PROJ ||
type == XFS_DQ_GROUP);
@@ -845,13 +705,21 @@ xfs_qm_dqget(
ASSERT(ip->i_gdquot == NULL);
}
#endif
+
+restart:
mutex_lock(&h->qh_lock);
/*
* Look in the cache (hashtable).
* The chain is kept locked during lookup.
*/
- if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) {
+ switch (xfs_qm_dqlookup(mp, id, h, O_dqpp)) {
+ case -1:
+ XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
+ mutex_unlock(&h->qh_lock);
+ delay(1);
+ goto restart;
+ case 0:
XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
/*
* The dquot was found, moved to the front of the chain,
@@ -862,9 +730,11 @@ xfs_qm_dqget(
ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
mutex_unlock(&h->qh_lock);
trace_xfs_dqget_hit(*O_dqpp);
- return (0); /* success */
+ return 0; /* success */
+ default:
+ XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
+ break;
}
- XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
/*
* Dquot cache miss. We don't want to keep the inode lock across
@@ -882,41 +752,18 @@ xfs_qm_dqget(
version = h->qh_version;
mutex_unlock(&h->qh_lock);
- /*
- * Allocate the dquot on the kernel heap, and read the ondisk
- * portion off the disk. Also, do all the necessary initialization
- * This can return ENOENT if dquot didn't exist on disk and we didn't
- * ask it to allocate; ESRCH if quotas got turned off suddenly.
- */
- if ((error = xfs_qm_idtodq(mp, id, type,
- flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR|
- XFS_QMOPT_DOWARN),
- &dqp))) {
- if (ip)
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- return (error);
- }
+ error = xfs_qm_dqread(mp, id, type, flags, &dqp);
- /*
- * See if this is mount code calling to look at the overall quota limits
- * which are stored in the id == 0 user or group's dquot.
- * Since we may not have done a quotacheck by this point, just return
- * the dquot without attaching it to any hashtables, lists, etc, or even
- * taking a reference.
- * The caller must dqdestroy this once done.
- */
- if (flags & XFS_QMOPT_DQSUSER) {
- ASSERT(id == 0);
- ASSERT(! ip);
- goto dqret;
- }
+ if (ip)
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ if (error)
+ return error;
/*
* Dquot lock comes after hashlock in the lock ordering
*/
if (ip) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
/*
* A dquot could be attached to this inode by now, since
* we had dropped the ilock.
@@ -961,16 +808,21 @@ xfs_qm_dqget(
* lock order between the two dquots here since dqp isn't
* on any findable lists yet.
*/
- if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) {
+ switch (xfs_qm_dqlookup(mp, id, h, &tmpdqp)) {
+ case 0:
+ case -1:
/*
- * Duplicate found. Just throw away the new dquot
- * and start over.
+ * Duplicate found, either in cache or on its way out.
+ * Just throw away the new dquot and start over.
*/
- xfs_qm_dqput(tmpdqp);
+ if (tmpdqp)
+ xfs_qm_dqput(tmpdqp);
mutex_unlock(&h->qh_lock);
xfs_qm_dqdestroy(dqp);
XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
- goto again;
+ goto restart;
+ default:
+ break;
}
}
@@ -1015,67 +867,49 @@ xfs_qm_dqget(
*/
void
xfs_qm_dqput(
- xfs_dquot_t *dqp)
+ struct xfs_dquot *dqp)
{
- xfs_dquot_t *gdqp;
+ struct xfs_dquot *gdqp;
ASSERT(dqp->q_nrefs > 0);
ASSERT(XFS_DQ_IS_LOCKED(dqp));
trace_xfs_dqput(dqp);
- if (dqp->q_nrefs != 1) {
- dqp->q_nrefs--;
+recurse:
+ if (--dqp->q_nrefs > 0) {
xfs_dqunlock(dqp);
return;
}
+ trace_xfs_dqput_free(dqp);
+
+ mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
+ if (list_empty(&dqp->q_freelist)) {
+ list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
+ xfs_Gqm->qm_dqfrlist_cnt++;
+ }
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+
/*
- * drop the dqlock and acquire the freelist and dqlock
- * in the right order; but try to get it out-of-order first
+ * If we just added a udquot to the freelist, then we want to release
+ * the gdquot reference that it (probably) has. Otherwise it'll keep
+ * the gdquot from getting reclaimed.
*/
- if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
- trace_xfs_dqput_wait(dqp);
- xfs_dqunlock(dqp);
- mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
- xfs_dqlock(dqp);
+ gdqp = dqp->q_gdquot;
+ if (gdqp) {
+ xfs_dqlock(gdqp);
+ dqp->q_gdquot = NULL;
}
+ xfs_dqunlock(dqp);
- while (1) {
- gdqp = NULL;
-
- /* We can't depend on nrefs being == 1 here */
- if (--dqp->q_nrefs == 0) {
- trace_xfs_dqput_free(dqp);
-
- list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
- xfs_Gqm->qm_dqfrlist_cnt++;
-
- /*
- * If we just added a udquot to the freelist, then
- * we want to release the gdquot reference that
- * it (probably) has. Otherwise it'll keep the
- * gdquot from getting reclaimed.
- */
- if ((gdqp = dqp->q_gdquot)) {
- /*
- * Avoid a recursive dqput call
- */
- xfs_dqlock(gdqp);
- dqp->q_gdquot = NULL;
- }
- }
- xfs_dqunlock(dqp);
-
- /*
- * If we had a group quota inside the user quota as a hint,
- * release it now.
- */
- if (! gdqp)
- break;
+ /*
+ * If we had a group quota hint, release it now.
+ */
+ if (gdqp) {
dqp = gdqp;
+ goto recurse;
}
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
}
/*
@@ -1169,7 +1003,7 @@ xfs_qm_dqflush(
* If not dirty, or it's pinned and we are not supposed to block, nada.
*/
if (!XFS_DQ_IS_DIRTY(dqp) ||
- (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
+ ((flags & SYNC_TRYLOCK) && atomic_read(&dqp->q_pincount) > 0)) {
xfs_dqfunlock(dqp);
return 0;
}
@@ -1257,40 +1091,17 @@ xfs_qm_dqflush(
}
-int
-xfs_qm_dqlock_nowait(
- xfs_dquot_t *dqp)
-{
- return mutex_trylock(&dqp->q_qlock);
-}
-
-void
-xfs_dqlock(
- xfs_dquot_t *dqp)
-{
- mutex_lock(&dqp->q_qlock);
-}
-
void
xfs_dqunlock(
xfs_dquot_t *dqp)
{
- mutex_unlock(&(dqp->q_qlock));
+ xfs_dqunlock_nonotify(dqp);
if (dqp->q_logitem.qli_dquot == dqp) {
- /* Once was dqp->q_mount, but might just have been cleared */
xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
- (xfs_log_item_t*)&(dqp->q_logitem));
+ &dqp->q_logitem.qli_item);
}
}
-
-void
-xfs_dqunlock_nonotify(
- xfs_dquot_t *dqp)
-{
- mutex_unlock(&(dqp->q_qlock));
-}
-
/*
* Lock two xfs_dquot structures.
*
@@ -1319,43 +1130,18 @@ xfs_dqlock2(
}
}
-
/*
- * Take a dquot out of the mount's dqlist as well as the hashlist.
- * This is called via unmount as well as quotaoff, and the purge
- * will always succeed unless there are soft (temp) references
- * outstanding.
- *
- * This returns 0 if it was purged, 1 if it wasn't. It's not an error code
- * that we're returning! XXXsup - not cool.
+ * Take a dquot out of the mount's dqlist as well as the hashlist. This is
+ * called via unmount as well as quotaoff, and the purge will always succeed.
*/
-/* ARGSUSED */
-int
+void
xfs_qm_dqpurge(
- xfs_dquot_t *dqp)
+ struct xfs_dquot *dqp)
{
- xfs_dqhash_t *qh = dqp->q_hash;
- xfs_mount_t *mp = dqp->q_mount;
-
- ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
- ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_dqhash *qh = dqp->q_hash;
xfs_dqlock(dqp);
- /*
- * We really can't afford to purge a dquot that is
- * referenced, because these are hard refs.
- * It shouldn't happen in general because we went thru _all_ inodes in
- * dqrele_all_inodes before calling this and didn't let the mountlock go.
- * However it is possible that we have dquots with temporary
- * references that are not attached to an inode. e.g. see xfs_setattr().
- */
- if (dqp->q_nrefs != 0) {
- xfs_dqunlock(dqp);
- mutex_unlock(&dqp->q_hash->qh_lock);
- return (1);
- }
-
- ASSERT(!list_empty(&dqp->q_freelist));
/*
* If we're turning off quotas, we have to make sure that, for
@@ -1370,23 +1156,18 @@ xfs_qm_dqpurge(
* Block on the flush lock after nudging dquot buffer,
* if it is incore.
*/
- xfs_qm_dqflock_pushbuf_wait(dqp);
+ xfs_dqflock_pushbuf_wait(dqp);
}
/*
- * XXXIf we're turning this type of quotas off, we don't care
+ * If we are turning this type of quotas off, we don't care
* about the dirty metadata sitting in this dquot. OTOH, if
* we're unmounting, we do care, so we flush it and wait.
*/
if (XFS_DQ_IS_DIRTY(dqp)) {
int error;
- /* dqflush unlocks dqflock */
/*
- * Given that dqpurge is a very rare occurrence, it is OK
- * that we're holding the hashlist and mplist locks
- * across the disk write. But, ... XXXsup
- *
* We don't care about getting disk errors here. We need
* to purge this dquot anyway, so we go ahead regardless.
*/
@@ -1396,38 +1177,44 @@ xfs_qm_dqpurge(
__func__, dqp);
xfs_dqflock(dqp);
}
+
ASSERT(atomic_read(&dqp->q_pincount) == 0);
ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
!(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
+ xfs_dqfunlock(dqp);
+ xfs_dqunlock(dqp);
+
+ mutex_lock(&qh->qh_lock);
list_del_init(&dqp->q_hashlist);
qh->qh_version++;
+ mutex_unlock(&qh->qh_lock);
+
+ mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
list_del_init(&dqp->q_mplist);
mp->m_quotainfo->qi_dqreclaims++;
mp->m_quotainfo->qi_dquots--;
+ mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+
/*
- * XXX Move this to the front of the freelist, if we can get the
- * freelist lock.
+ * We move dquots to the freelist as soon as their reference count
+ * hits zero, so it really should be on the freelist here.
*/
+ mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
ASSERT(!list_empty(&dqp->q_freelist));
+ list_del_init(&dqp->q_freelist);
+ xfs_Gqm->qm_dqfrlist_cnt--;
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- dqp->q_mount = NULL;
- dqp->q_hash = NULL;
- dqp->dq_flags = XFS_DQ_INACTIVE;
- memset(&dqp->q_core, 0, sizeof(dqp->q_core));
- xfs_dqfunlock(dqp);
- xfs_dqunlock(dqp);
- mutex_unlock(&qh->qh_lock);
- return (0);
+ xfs_qm_dqdestroy(dqp);
}
-
/*
* Give the buffer a little push if it is incore and
* wait on the flush lock.
*/
void
-xfs_qm_dqflock_pushbuf_wait(
+xfs_dqflock_pushbuf_wait(
xfs_dquot_t *dqp)
{
xfs_mount_t *mp = dqp->q_mount;
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 34b7e945dbfa..a1d91d8f1802 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -80,8 +80,6 @@ enum {
XFS_QLOCK_NESTED,
};
-#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++)
-
/*
* Manage the q_flush completion queue embedded in the dquot. This completion
* queue synchronizes processes attempting to flush the in-core dquot back to
@@ -102,6 +100,21 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
complete(&dqp->q_flush);
}
+static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp)
+{
+ return mutex_trylock(&dqp->q_qlock);
+}
+
+static inline void xfs_dqlock(struct xfs_dquot *dqp)
+{
+ mutex_lock(&dqp->q_qlock);
+}
+
+static inline void xfs_dqunlock_nonotify(struct xfs_dquot *dqp)
+{
+ mutex_unlock(&dqp->q_qlock);
+}
+
#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
@@ -116,12 +129,12 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
(XFS_IS_UQUOTA_ON((d)->q_mount)) : \
(XFS_IS_OQUOTA_ON((d)->q_mount))))
+extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
+ uint, struct xfs_dquot **);
extern void xfs_qm_dqdestroy(xfs_dquot_t *);
extern int xfs_qm_dqflush(xfs_dquot_t *, uint);
-extern int xfs_qm_dqpurge(xfs_dquot_t *);
+extern void xfs_qm_dqpurge(xfs_dquot_t *);
extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
-extern int xfs_qm_dqlock_nowait(xfs_dquot_t *);
-extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
xfs_disk_dquot_t *);
extern void xfs_qm_adjust_dqlimits(xfs_mount_t *,
@@ -129,9 +142,17 @@ extern void xfs_qm_adjust_dqlimits(xfs_mount_t *,
extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
xfs_dqid_t, uint, uint, xfs_dquot_t **);
extern void xfs_qm_dqput(xfs_dquot_t *);
-extern void xfs_dqlock(xfs_dquot_t *);
-extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *);
-extern void xfs_dqunlock(xfs_dquot_t *);
-extern void xfs_dqunlock_nonotify(xfs_dquot_t *);
+
+extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
+extern void xfs_dqunlock(struct xfs_dquot *);
+extern void xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp);
+
+static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
+{
+ xfs_dqlock(dqp);
+ dqp->q_nrefs++;
+ xfs_dqunlock(dqp);
+ return dqp;
+}
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 0dee0b71029d..34baeae45265 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -73,7 +73,6 @@ xfs_qm_dquot_logitem_format(
logvec->i_len = sizeof(xfs_disk_dquot_t);
logvec->i_type = XLOG_REG_TYPE_DQUOT;
- ASSERT(2 == lip->li_desc->lid_size);
qlip->qli_format.qlf_size = 2;
}
@@ -134,7 +133,7 @@ xfs_qm_dquot_logitem_push(
* lock without sleeping, then there must not have been
* anyone in the process of flushing the dquot.
*/
- error = xfs_qm_dqflush(dqp, 0);
+ error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK);
if (error)
xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
__func__, error, dqp);
@@ -237,7 +236,7 @@ xfs_qm_dquot_logitem_trylock(
if (atomic_read(&dqp->q_pincount) > 0)
return XFS_ITEM_PINNED;
- if (!xfs_qm_dqlock_nowait(dqp))
+ if (!xfs_dqlock_nowait(dqp))
return XFS_ITEM_LOCKED;
if (!xfs_dqflock_nowait(dqp)) {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 753ed9b5c70b..7e5bc872f2b4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -209,10 +209,10 @@ xfs_file_fsync(
/*
* First check if the VFS inode is marked dirty. All the dirtying
- * of non-transactional updates no goes through mark_inode_dirty*,
- * which allows us to distinguish beteeen pure timestamp updates
+ * of non-transactional updates do not go through mark_inode_dirty*,
+ * which allows us to distinguish between pure timestamp updates
* and i_size updates which need to be caught for fdatasync.
- * After that also theck for the dirty state in the XFS inode, which
+ * After that also check for the dirty state in the XFS inode, which
* might gets cleared when the inode gets written out via the AIL
* or xfs_iflush_cluster.
*/
@@ -327,7 +327,7 @@ xfs_file_aio_read(
mp->m_rtdev_targp : mp->m_ddev_targp;
if ((iocb->ki_pos & target->bt_smask) ||
(size & target->bt_smask)) {
- if (iocb->ki_pos == ip->i_size)
+ if (iocb->ki_pos == i_size_read(inode))
return 0;
return -XFS_ERROR(EINVAL);
}
@@ -412,51 +412,6 @@ xfs_file_splice_read(
return ret;
}
-STATIC void
-xfs_aio_write_isize_update(
- struct inode *inode,
- loff_t *ppos,
- ssize_t bytes_written)
-{
- struct xfs_inode *ip = XFS_I(inode);
- xfs_fsize_t isize = i_size_read(inode);
-
- if (bytes_written > 0)
- XFS_STATS_ADD(xs_write_bytes, bytes_written);
-
- if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
- *ppos > isize))
- *ppos = isize;
-
- if (*ppos > ip->i_size) {
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
- if (*ppos > ip->i_size)
- ip->i_size = *ppos;
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
- }
-}
-
-/*
- * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
- * part of the I/O may have been written to disk before the error occurred. In
- * this case the on-disk file size may have been adjusted beyond the in-memory
- * file size and now needs to be truncated back.
- */
-STATIC void
-xfs_aio_write_newsize_update(
- struct xfs_inode *ip,
- xfs_fsize_t new_size)
-{
- if (new_size == ip->i_new_size) {
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
- if (new_size == ip->i_new_size)
- ip->i_new_size = 0;
- if (ip->i_d.di_size > ip->i_size)
- ip->i_d.di_size = ip->i_size;
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
- }
-}
-
/*
* xfs_file_splice_write() does not use xfs_rw_ilock() because
* generic_file_splice_write() takes the i_mutex itself. This, in theory,
@@ -475,7 +430,6 @@ xfs_file_splice_write(
{
struct inode *inode = outfilp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- xfs_fsize_t new_size;
int ioflags = 0;
ssize_t ret;
@@ -489,19 +443,12 @@ xfs_file_splice_write(
xfs_ilock(ip, XFS_IOLOCK_EXCL);
- new_size = *ppos + count;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (new_size > ip->i_size)
- ip->i_new_size = new_size;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
+ if (ret > 0)
+ XFS_STATS_ADD(xs_write_bytes, ret);
- xfs_aio_write_isize_update(inode, ppos, ret);
- xfs_aio_write_newsize_update(ip, new_size);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
@@ -689,28 +636,26 @@ out_lock:
/*
* Common pre-write limit and setup checks.
*
- * Returns with iolock held according to @iolock.
+ * Called with the iolocked held either shared and exclusive according to
+ * @iolock, and returns with it held. Might upgrade the iolock to exclusive
+ * if called for a direct write beyond i_size.
*/
STATIC ssize_t
xfs_file_aio_write_checks(
struct file *file,
loff_t *pos,
size_t *count,
- xfs_fsize_t *new_sizep,
int *iolock)
{
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- xfs_fsize_t new_size;
int error = 0;
xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
- *new_sizep = 0;
restart:
error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
if (error) {
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
- *iolock = 0;
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -720,36 +665,21 @@ restart:
/*
* If the offset is beyond the size of the file, we need to zero any
* blocks that fall between the existing EOF and the start of this
- * write. There is no need to issue zeroing if another in-flght IO ends
- * at or before this one If zeronig is needed and we are currently
- * holding the iolock shared, we need to update it to exclusive which
- * involves dropping all locks and relocking to maintain correct locking
- * order. If we do this, restart the function to ensure all checks and
- * values are still valid.
+ * write. If zeroing is needed and we are currently holding the
+ * iolock shared, we need to update it to exclusive which involves
+ * dropping all locks and relocking to maintain correct locking order.
+ * If we do this, restart the function to ensure all checks and values
+ * are still valid.
*/
- if ((ip->i_new_size && *pos > ip->i_new_size) ||
- (!ip->i_new_size && *pos > ip->i_size)) {
+ if (*pos > i_size_read(inode)) {
if (*iolock == XFS_IOLOCK_SHARED) {
xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
*iolock = XFS_IOLOCK_EXCL;
xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
goto restart;
}
- error = -xfs_zero_eof(ip, *pos, ip->i_size);
+ error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
}
-
- /*
- * If this IO extends beyond EOF, we may need to update ip->i_new_size.
- * We have already zeroed space beyond EOF (if necessary). Only update
- * ip->i_new_size if this IO ends beyond any other in-flight writes.
- */
- new_size = *pos + *count;
- if (new_size > ip->i_size) {
- if (new_size > ip->i_new_size)
- ip->i_new_size = new_size;
- *new_sizep = new_size;
- }
-
xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
@@ -794,9 +724,7 @@ xfs_file_dio_aio_write(
const struct iovec *iovp,
unsigned long nr_segs,
loff_t pos,
- size_t ocount,
- xfs_fsize_t *new_size,
- int *iolock)
+ size_t ocount)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -806,10 +734,10 @@ xfs_file_dio_aio_write(
ssize_t ret = 0;
size_t count = ocount;
int unaligned_io = 0;
+ int iolock;
struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
- *iolock = 0;
if ((pos & target->bt_smask) || (count & target->bt_smask))
return -XFS_ERROR(EINVAL);
@@ -824,31 +752,31 @@ xfs_file_dio_aio_write(
* EOF zeroing cases and fill out the new inode size as appropriate.
*/
if (unaligned_io || mapping->nrpages)
- *iolock = XFS_IOLOCK_EXCL;
+ iolock = XFS_IOLOCK_EXCL;
else
- *iolock = XFS_IOLOCK_SHARED;
- xfs_rw_ilock(ip, *iolock);
+ iolock = XFS_IOLOCK_SHARED;
+ xfs_rw_ilock(ip, iolock);
/*
* Recheck if there are cached pages that need invalidate after we got
* the iolock to protect against other threads adding new pages while
* we were waiting for the iolock.
*/
- if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
+ if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
+ xfs_rw_iunlock(ip, iolock);
+ iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, iolock);
}
- ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
+ ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
if (ret)
- return ret;
+ goto out;
if (mapping->nrpages) {
ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
FI_REMAPF_LOCKED);
if (ret)
- return ret;
+ goto out;
}
/*
@@ -857,15 +785,18 @@ xfs_file_dio_aio_write(
*/
if (unaligned_io)
inode_dio_wait(inode);
- else if (*iolock == XFS_IOLOCK_EXCL) {
+ else if (iolock == XFS_IOLOCK_EXCL) {
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
- *iolock = XFS_IOLOCK_SHARED;
+ iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
ret = generic_file_direct_write(iocb, iovp,
&nr_segs, pos, &iocb->ki_pos, count, ocount);
+out:
+ xfs_rw_iunlock(ip, iolock);
+
/* No fallback to buffered IO on errors for XFS. */
ASSERT(ret < 0 || ret == count);
return ret;
@@ -877,9 +808,7 @@ xfs_file_buffered_aio_write(
const struct iovec *iovp,
unsigned long nr_segs,
loff_t pos,
- size_t ocount,
- xfs_fsize_t *new_size,
- int *iolock)
+ size_t ocount)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -887,14 +816,14 @@ xfs_file_buffered_aio_write(
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
int enospc = 0;
+ int iolock = XFS_IOLOCK_EXCL;
size_t count = ocount;
- *iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
+ xfs_rw_ilock(ip, iolock);
- ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
+ ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
if (ret)
- return ret;
+ goto out;
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
@@ -908,13 +837,15 @@ write_retry:
* page locks and retry *once*
*/
if (ret == -ENOSPC && !enospc) {
- ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
- if (ret)
- return ret;
enospc = 1;
- goto write_retry;
+ ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+ if (!ret)
+ goto write_retry;
}
+
current->backing_dev_info = NULL;
+out:
+ xfs_rw_iunlock(ip, iolock);
return ret;
}
@@ -930,9 +861,7 @@ xfs_file_aio_write(
struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- int iolock;
size_t ocount = 0;
- xfs_fsize_t new_size = 0;
XFS_STATS_INC(xs_write_calls);
@@ -951,33 +880,22 @@ xfs_file_aio_write(
return -EIO;
if (unlikely(file->f_flags & O_DIRECT))
- ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
- ocount, &new_size, &iolock);
+ ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
else
ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
- ocount, &new_size, &iolock);
-
- xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
+ ocount);
- if (ret <= 0)
- goto out_unlock;
+ if (ret > 0) {
+ ssize_t err;
- /* Handle various SYNC-type writes */
- if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
- loff_t end = pos + ret - 1;
- int error;
+ XFS_STATS_ADD(xs_write_bytes, ret);
- xfs_rw_iunlock(ip, iolock);
- error = xfs_file_fsync(file, pos, end,
- (file->f_flags & __O_SYNC) ? 0 : 1);
- xfs_rw_ilock(ip, iolock);
- if (error)
- ret = error;
+ /* Handle various SYNC-type writes */
+ err = generic_write_sync(file, pos, ret);
+ if (err < 0)
+ ret = err;
}
-out_unlock:
- xfs_aio_write_newsize_update(ip, new_size);
- xfs_rw_iunlock(ip, iolock);
return ret;
}
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index ed88ed16811c..652b875a9d4c 100644
--- a/fs/xfs/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
@@ -90,7 +90,7 @@ xfs_wait_on_pages(
if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
return -filemap_fdatawait_range(mapping, first,
- last == -1 ? ip->i_size - 1 : last);
+ last == -1 ? XFS_ISIZE(ip) - 1 : last);
}
return 0;
}
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 169380e66057..dad1a31aa4fc 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -447,7 +447,7 @@ STATIC xfs_buf_t * /* allocation group buffer */
xfs_ialloc_ag_select(
xfs_trans_t *tp, /* transaction pointer */
xfs_ino_t parent, /* parent directory inode number */
- mode_t mode, /* bits set to indicate file type */
+ umode_t mode, /* bits set to indicate file type */
int okalloc) /* ok to allocate more space */
{
xfs_buf_t *agbp; /* allocation group header buffer */
@@ -640,7 +640,7 @@ int
xfs_dialloc(
xfs_trans_t *tp, /* transaction pointer */
xfs_ino_t parent, /* parent inode (directory) */
- mode_t mode, /* mode bits for new inode */
+ umode_t mode, /* mode bits for new inode */
int okalloc, /* ok to allocate more space */
xfs_buf_t **IO_agbp, /* in/out ag header's buffer */
boolean_t *alloc_done, /* true if we needed to replenish
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index bb5385475e1f..666a037398d6 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -81,7 +81,7 @@ int /* error */
xfs_dialloc(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t parent, /* parent inode (directory) */
- mode_t mode, /* mode bits for new inode */
+ umode_t mode, /* mode bits for new inode */
int okalloc, /* ok to allocate more space */
struct xfs_buf **agbp, /* buf for a.g. inode header */
boolean_t *alloc_done, /* an allocation was done to replenish
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 0fa98b1c70ea..8c3e46394d48 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -77,7 +77,7 @@ xfs_inode_alloc(
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
- ASSERT(completion_done(&ip->i_flush));
+ ASSERT(!xfs_isiflocked(ip));
ASSERT(ip->i_ino == 0);
mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
@@ -94,8 +94,6 @@ xfs_inode_alloc(
ip->i_update_core = 0;
ip->i_delayed_blks = 0;
memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
- ip->i_size = 0;
- ip->i_new_size = 0;
return ip;
}
@@ -107,7 +105,6 @@ xfs_inode_free_callback(
struct inode *inode = container_of(head, struct inode, i_rcu);
struct xfs_inode *ip = XFS_I(inode);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_zone_free(xfs_inode_zone, ip);
}
@@ -151,7 +148,7 @@ xfs_inode_free(
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
- ASSERT(completion_done(&ip->i_flush));
+ ASSERT(!xfs_isiflocked(ip));
/*
* Because we use RCU freeing we need to ensure the inode always
@@ -451,8 +448,6 @@ again:
*ipp = ip;
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
/*
* If we have a real type for an on-disk inode, we can set ops(&unlock)
* now. If it's a new inode being created, xfs_ialloc will handle it.
@@ -716,3 +711,19 @@ xfs_isilocked(
return 0;
}
#endif
+
+void
+__xfs_iflock(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+ do {
+ prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_isiflocked(ip))
+ io_schedule();
+ } while (!xfs_iflock_nowait(ip));
+
+ finish_wait(wq, &wait.wait);
+}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 755ee8164880..b21022499c2e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -299,11 +299,8 @@ xfs_iformat(
{
xfs_attr_shortform_t *atp;
int size;
- int error;
+ int error = 0;
xfs_fsize_t di_size;
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
- error = 0;
if (unlikely(be32_to_cpu(dip->di_nextents) +
be16_to_cpu(dip->di_anextents) >
@@ -350,7 +347,6 @@ xfs_iformat(
return XFS_ERROR(EFSCORRUPTED);
}
ip->i_d.di_size = 0;
- ip->i_size = 0;
ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
break;
@@ -409,10 +405,10 @@ xfs_iformat(
}
if (!XFS_DFORK_Q(dip))
return 0;
+
ASSERT(ip->i_afp == NULL);
ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
- ip->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+
switch (dip->di_aformat) {
case XFS_DINODE_FMT_LOCAL:
atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
@@ -604,10 +600,11 @@ xfs_iformat_btree(
* or the number of extents is greater than the number of
* blocks.
*/
- if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
- || XFS_BMDR_SPACE_CALC(nrecs) >
- XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
- || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+ if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
+ XFS_IFORK_MAXEXT(ip, whichfork) ||
+ XFS_BMDR_SPACE_CALC(nrecs) >
+ XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
+ XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
(unsigned long long) ip->i_ino);
XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
@@ -835,12 +832,6 @@ xfs_iread(
* with the uninitialized part of it.
*/
ip->i_d.di_mode = 0;
- /*
- * Initialize the per-fork minima and maxima for a new
- * inode here. xfs_iformat will do it for old inodes.
- */
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
}
/*
@@ -861,7 +852,6 @@ xfs_iread(
}
ip->i_delayed_blks = 0;
- ip->i_size = ip->i_d.di_size;
/*
* Mark the buffer containing the inode as something to keep
@@ -961,7 +951,7 @@ int
xfs_ialloc(
xfs_trans_t *tp,
xfs_inode_t *pip,
- mode_t mode,
+ umode_t mode,
xfs_nlink_t nlink,
xfs_dev_t rdev,
prid_t prid,
@@ -1002,7 +992,7 @@ xfs_ialloc(
return error;
ASSERT(ip != NULL);
- ip->i_d.di_mode = (__uint16_t)mode;
+ ip->i_d.di_mode = mode;
ip->i_d.di_onlink = 0;
ip->i_d.di_nlink = nlink;
ASSERT(ip->i_d.di_nlink == nlink);
@@ -1051,7 +1041,6 @@ xfs_ialloc(
}
ip->i_d.di_size = 0;
- ip->i_size = 0;
ip->i_d.di_nextents = 0;
ASSERT(ip->i_d.di_nblocks == 0);
@@ -1166,52 +1155,6 @@ xfs_ialloc(
}
/*
- * Check to make sure that there are no blocks allocated to the
- * file beyond the size of the file. We don't check this for
- * files with fixed size extents or real time extents, but we
- * at least do it for regular files.
- */
-#ifdef DEBUG
-STATIC void
-xfs_isize_check(
- struct xfs_inode *ip,
- xfs_fsize_t isize)
-{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t map_first;
- int nimaps;
- xfs_bmbt_irec_t imaps[2];
- int error;
-
- if (!S_ISREG(ip->i_d.di_mode))
- return;
-
- if (XFS_IS_REALTIME_INODE(ip))
- return;
-
- if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
- return;
-
- nimaps = 2;
- map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
- /*
- * The filesystem could be shutting down, so bmapi may return
- * an error.
- */
- error = xfs_bmapi_read(ip, map_first,
- (XFS_B_TO_FSB(mp,
- (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
- imaps, &nimaps, XFS_BMAPI_ENTIRE);
- if (error)
- return;
- ASSERT(nimaps == 1);
- ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
-}
-#else /* DEBUG */
-#define xfs_isize_check(ip, isize)
-#endif /* DEBUG */
-
-/*
* Free up the underlying blocks past new_size. The new size must be smaller
* than the current size. This routine can be used both for the attribute and
* data fork, and does not modify the inode size, which is left to the caller.
@@ -1252,12 +1195,14 @@ xfs_itruncate_extents(
int done = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
- ASSERT(new_size <= ip->i_size);
+ ASSERT(new_size <= XFS_ISIZE(ip));
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(ip->i_itemp != NULL);
ASSERT(ip->i_itemp->ili_lock_flags == 0);
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+ trace_xfs_itruncate_extents_start(ip, new_size);
+
/*
* Since it is possible for space to become allocated beyond
* the end of the file (in a crash where the space is allocated
@@ -1325,6 +1270,14 @@ xfs_itruncate_extents(
goto out;
}
+ /*
+ * Always re-log the inode so that our permanent transaction can keep
+ * on rolling it forward in the log.
+ */
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ trace_xfs_itruncate_extents_end(ip, new_size);
+
out:
*tpp = tp;
return error;
@@ -1338,74 +1291,6 @@ out_bmap_cancel:
goto out;
}
-int
-xfs_itruncate_data(
- struct xfs_trans **tpp,
- struct xfs_inode *ip,
- xfs_fsize_t new_size)
-{
- int error;
-
- trace_xfs_itruncate_data_start(ip, new_size);
-
- /*
- * The first thing we do is set the size to new_size permanently on
- * disk. This way we don't have to worry about anyone ever being able
- * to look at the data being freed even in the face of a crash.
- * What we're getting around here is the case where we free a block, it
- * is allocated to another file, it is written to, and then we crash.
- * If the new data gets written to the file but the log buffers
- * containing the free and reallocation don't, then we'd end up with
- * garbage in the blocks being freed. As long as we make the new_size
- * permanent before actually freeing any blocks it doesn't matter if
- * they get written to.
- */
- if (ip->i_d.di_nextents > 0) {
- /*
- * If we are not changing the file size then do not update
- * the on-disk file size - we may be called from
- * xfs_inactive_free_eofblocks(). If we update the on-disk
- * file size and then the system crashes before the contents
- * of the file are flushed to disk then the files may be
- * full of holes (ie NULL files bug).
- */
- if (ip->i_size != new_size) {
- ip->i_d.di_size = new_size;
- ip->i_size = new_size;
- xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
- }
- }
-
- error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
- if (error)
- return error;
-
- /*
- * If we are not changing the file size then do not update the on-disk
- * file size - we may be called from xfs_inactive_free_eofblocks().
- * If we update the on-disk file size and then the system crashes
- * before the contents of the file are flushed to disk then the files
- * may be full of holes (ie NULL files bug).
- */
- xfs_isize_check(ip, new_size);
- if (ip->i_size != new_size) {
- ip->i_d.di_size = new_size;
- ip->i_size = new_size;
- }
-
- ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
- ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);
-
- /*
- * Always re-log the inode so that our permanent transaction can keep
- * on rolling it forward in the log.
- */
- xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
-
- trace_xfs_itruncate_data_end(ip, new_size);
- return 0;
-}
-
/*
* This is called when the inode's link count goes to 0.
* We place the on-disk inode on a list in the AGI. It
@@ -1824,8 +1709,7 @@ xfs_ifree(
ASSERT(ip->i_d.di_nlink == 0);
ASSERT(ip->i_d.di_nextents == 0);
ASSERT(ip->i_d.di_anextents == 0);
- ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
- (!S_ISREG(ip->i_d.di_mode)));
+ ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
ASSERT(ip->i_d.di_nblocks == 0);
/*
@@ -1844,8 +1728,6 @@ xfs_ifree(
ip->i_d.di_flags = 0;
ip->i_d.di_dmevmask = 0;
ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
/*
@@ -2151,7 +2033,7 @@ xfs_idestroy_fork(
* once someone is waiting for it to be unpinned.
*/
static void
-xfs_iunpin_nowait(
+xfs_iunpin(
struct xfs_inode *ip)
{
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
@@ -2163,14 +2045,29 @@ xfs_iunpin_nowait(
}
+static void
+__xfs_iunpin_wait(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
+
+ xfs_iunpin(ip);
+
+ do {
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_ipincount(ip))
+ io_schedule();
+ } while (xfs_ipincount(ip));
+ finish_wait(wq, &wait.wait);
+}
+
void
xfs_iunpin_wait(
struct xfs_inode *ip)
{
- if (xfs_ipincount(ip)) {
- xfs_iunpin_nowait(ip);
- wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
- }
+ if (xfs_ipincount(ip))
+ __xfs_iunpin_wait(ip);
}
/*
@@ -2510,9 +2407,9 @@ xfs_iflush(
XFS_STATS_INC(xs_iflush_count);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(!completion_done(&ip->i_flush));
+ ASSERT(xfs_isiflocked(ip));
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > ip->i_df.if_ext_max);
+ ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
iip = ip->i_itemp;
mp = ip->i_mount;
@@ -2529,7 +2426,7 @@ xfs_iflush(
* out for us if they occur after the log force completes.
*/
if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
- xfs_iunpin_nowait(ip);
+ xfs_iunpin(ip);
xfs_ifunlock(ip);
return EAGAIN;
}
@@ -2626,9 +2523,9 @@ xfs_iflush_int(
#endif
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(!completion_done(&ip->i_flush));
+ ASSERT(xfs_isiflocked(ip));
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > ip->i_df.if_ext_max);
+ ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
iip = ip->i_itemp;
mp = ip->i_mount;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index b4cd4739f98e..2f27b7454085 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -66,7 +66,6 @@ typedef struct xfs_ifork {
struct xfs_btree_block *if_broot; /* file's incore btree root */
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
- unsigned char if_ext_max; /* max # of extent records */
union {
xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
@@ -206,12 +205,12 @@ typedef struct xfs_icdinode {
((w) == XFS_DATA_FORK ? \
((ip)->i_d.di_nextents = (n)) : \
((ip)->i_d.di_anextents = (n)))
-
+#define XFS_IFORK_MAXEXT(ip, w) \
+ (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
#ifdef __KERNEL__
-struct bhv_desc;
struct xfs_buf;
struct xfs_bmap_free;
struct xfs_bmbt_irec;
@@ -220,12 +219,6 @@ struct xfs_mount;
struct xfs_trans;
struct xfs_dquot;
-typedef struct dm_attrs_s {
- __uint32_t da_dmevmask; /* DMIG event mask */
- __uint16_t da_dmstate; /* DMIG state info */
- __uint16_t da_pad; /* DMIG extra padding */
-} dm_attrs_t;
-
typedef struct xfs_inode {
/* Inode linking and identification information. */
struct xfs_mount *i_mount; /* fs mount struct ptr */
@@ -244,27 +237,19 @@ typedef struct xfs_inode {
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
mrlock_t i_iolock; /* inode IO lock */
- struct completion i_flush; /* inode flush completion q */
atomic_t i_pincount; /* inode pin count */
- wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
spinlock_t i_flags_lock; /* inode i_flags lock */
/* Miscellaneous state. */
- unsigned short i_flags; /* see defined flags below */
+ unsigned long i_flags; /* see defined flags below */
unsigned char i_update_core; /* timestamps/size is dirty */
unsigned int i_delayed_blks; /* count of delay alloc blks */
xfs_icdinode_t i_d; /* most of ondisk inode */
- xfs_fsize_t i_size; /* in-memory size */
- xfs_fsize_t i_new_size; /* size when write completes */
-
/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */
} xfs_inode_t;
-#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \
- (ip)->i_size : (ip)->i_d.di_size;
-
/* Convert from vfs inode to xfs inode */
static inline struct xfs_inode *XFS_I(struct inode *inode)
{
@@ -278,6 +263,18 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
}
/*
+ * For regular files we only update the on-disk filesize when actually
+ * writing data back to disk. Until then only the copy in the VFS inode
+ * is uptodate.
+ */
+static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
+{
+ if (S_ISREG(ip->i_d.di_mode))
+ return i_size_read(VFS_I(ip));
+ return ip->i_d.di_size;
+}
+
+/*
* i_flags helper functions
*/
static inline void
@@ -331,6 +328,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
return ret;
}
+static inline int
+xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
+{
+ int ret;
+
+ spin_lock(&ip->i_flags_lock);
+ ret = ip->i_flags & flags;
+ if (!ret)
+ ip->i_flags |= flags;
+ spin_unlock(&ip->i_flags_lock);
+ return ret;
+}
+
/*
* Project quota id helpers (previously projid was 16bit only
* and using two 16bit values to hold new 32bit projid was chosen
@@ -351,35 +361,19 @@ xfs_set_projid(struct xfs_inode *ip,
}
/*
- * Manage the i_flush queue embedded in the inode. This completion
- * queue synchronizes processes attempting to flush the in-core
- * inode back to disk.
- */
-static inline void xfs_iflock(xfs_inode_t *ip)
-{
- wait_for_completion(&ip->i_flush);
-}
-
-static inline int xfs_iflock_nowait(xfs_inode_t *ip)
-{
- return try_wait_for_completion(&ip->i_flush);
-}
-
-static inline void xfs_ifunlock(xfs_inode_t *ip)
-{
- complete(&ip->i_flush);
-}
-
-/*
* In-core inode flags.
*/
-#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
-#define XFS_ISTALE 0x0002 /* inode has been staled */
-#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
-#define XFS_INEW 0x0008 /* inode has just been allocated */
-#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
-#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
-#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
+#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
+#define XFS_ISTALE (1 << 1) /* inode has been staled */
+#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
+#define XFS_INEW (1 << 3) /* inode has just been allocated */
+#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */
+#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
+#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
+#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT)
+#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
+#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
/*
* Per-lifetime flags need to be reset when re-using a reclaimable inode during
@@ -392,6 +386,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
XFS_IFILESTREAM);
/*
+ * Synchronize processes attempting to flush the in-core inode back to disk.
+ */
+
+extern void __xfs_iflock(struct xfs_inode *ip);
+
+static inline int xfs_iflock_nowait(struct xfs_inode *ip)
+{
+ return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
+}
+
+static inline void xfs_iflock(struct xfs_inode *ip)
+{
+ if (!xfs_iflock_nowait(ip))
+ __xfs_iflock(ip);
+}
+
+static inline void xfs_ifunlock(struct xfs_inode *ip)
+{
+ xfs_iflags_clear(ip, XFS_IFLOCK);
+ wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
+}
+
+static inline int xfs_isiflocked(struct xfs_inode *ip)
+{
+ return xfs_iflags_test(ip, XFS_IFLOCK);
+}
+
+/*
* Flags for inode locking.
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
* 1<<16 - 1<<32-1 -- lockdep annotation (integers)
@@ -481,7 +503,7 @@ void xfs_inode_free(struct xfs_inode *ip);
/*
* xfs_inode.c prototypes.
*/
-int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
+int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
xfs_nlink_t, xfs_dev_t, prid_t, int,
struct xfs_buf **, boolean_t *, xfs_inode_t **);
@@ -491,8 +513,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
struct xfs_bmap_free *);
int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
int, xfs_fsize_t);
-int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *,
- xfs_fsize_t);
int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
void xfs_iext_realloc(xfs_inode_t *, int, int);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index abaafdbb3e65..91d71dcd4852 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -79,8 +79,6 @@ xfs_inode_item_size(
break;
case XFS_DINODE_FMT_BTREE:
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
iip->ili_format.ilf_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
XFS_ILOG_DEV | XFS_ILOG_UUID);
@@ -437,7 +435,6 @@ xfs_inode_item_format(
* Assert that no attribute-related log flags are set.
*/
if (!XFS_IFORK_Q(ip)) {
- ASSERT(nvecs == lip->li_desc->lid_size);
iip->ili_format.ilf_size = nvecs;
ASSERT(!(iip->ili_format.ilf_fields &
(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
@@ -521,7 +518,6 @@ xfs_inode_item_format(
break;
}
- ASSERT(nvecs == lip->li_desc->lid_size);
iip->ili_format.ilf_size = nvecs;
}
@@ -559,7 +555,7 @@ xfs_inode_item_unpin(
trace_xfs_inode_unpin(ip, _RET_IP_);
ASSERT(atomic_read(&ip->i_pincount) > 0);
if (atomic_dec_and_test(&ip->i_pincount))
- wake_up(&ip->i_ipin_wait);
+ wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
}
/*
@@ -721,7 +717,7 @@ xfs_inode_item_pushbuf(
* If a flush is not in progress anymore, chances are that the
* inode was taken off the AIL. So, just get out.
*/
- if (completion_done(&ip->i_flush) ||
+ if (!xfs_isiflocked(ip) ||
!(lip->li_flags & XFS_LI_IN_AIL)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return true;
@@ -754,7 +750,7 @@ xfs_inode_item_push(
struct xfs_inode *ip = iip->ili_inode;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
- ASSERT(!completion_done(&ip->i_flush));
+ ASSERT(xfs_isiflocked(ip));
/*
* Since we were able to lock the inode's flush lock and
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d99a90518909..76f3ca5cfc36 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -559,23 +559,23 @@ xfs_attrmulti_by_handle(
ops[i].am_flags);
break;
case ATTR_OP_SET:
- ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ ops[i].am_error = mnt_want_write_file(parfilp);
if (ops[i].am_error)
break;
ops[i].am_error = xfs_attrmulti_attr_set(
dentry->d_inode, attr_name,
ops[i].am_attrvalue, ops[i].am_length,
ops[i].am_flags);
- mnt_drop_write(parfilp->f_path.mnt);
+ mnt_drop_write_file(parfilp);
break;
case ATTR_OP_REMOVE:
- ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ ops[i].am_error = mnt_want_write_file(parfilp);
if (ops[i].am_error)
break;
ops[i].am_error = xfs_attrmulti_attr_remove(
dentry->d_inode, attr_name,
ops[i].am_flags);
- mnt_drop_write(parfilp->f_path.mnt);
+ mnt_drop_write_file(parfilp);
break;
default:
ops[i].am_error = EINVAL;
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 54e623bfbb85..f9ccb7b7c043 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -454,23 +454,23 @@ xfs_compat_attrmulti_by_handle(
&ops[i].am_length, ops[i].am_flags);
break;
case ATTR_OP_SET:
- ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ ops[i].am_error = mnt_want_write_file(parfilp);
if (ops[i].am_error)
break;
ops[i].am_error = xfs_attrmulti_attr_set(
dentry->d_inode, attr_name,
compat_ptr(ops[i].am_attrvalue),
ops[i].am_length, ops[i].am_flags);
- mnt_drop_write(parfilp->f_path.mnt);
+ mnt_drop_write_file(parfilp);
break;
case ATTR_OP_REMOVE:
- ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ ops[i].am_error = mnt_want_write_file(parfilp);
if (ops[i].am_error)
break;
ops[i].am_error = xfs_attrmulti_attr_remove(
dentry->d_inode, attr_name,
ops[i].am_flags);
- mnt_drop_write(parfilp->f_path.mnt);
+ mnt_drop_write_file(parfilp);
break;
default:
ops[i].am_error = EINVAL;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 9afa282aa937..246c7d57c6f9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb(
xfs_fileoff_t *last_fsb)
{
xfs_fileoff_t new_last_fsb = 0;
- xfs_extlen_t align;
+ xfs_extlen_t align = 0;
int eof, error;
- if (XFS_IS_REALTIME_INODE(ip))
- ;
- /*
- * If mounted with the "-o swalloc" option, roundup the allocation
- * request to a stripe width boundary if the file size is >=
- * stripe width and we are allocating past the allocation eof.
- */
- else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
- (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
- new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
- /*
- * Roundup the allocation request to a stripe unit (m_dalign) boundary
- * if the file size is >= stripe unit size, and we are allocating past
- * the allocation eof.
- */
- else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
- new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
+ if (!XFS_IS_REALTIME_INODE(ip)) {
+ /*
+ * Round up the allocation request to a stripe unit
+ * (m_dalign) boundary if the file size is >= stripe unit
+ * size, and we are allocating past the allocation eof.
+ *
+ * If mounted with the "-o swalloc" option the alignment is
+ * increased from the strip unit size to the stripe width.
+ */
+ if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+ align = mp->m_swidth;
+ else if (mp->m_dalign)
+ align = mp->m_dalign;
+
+ if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align))
+ new_last_fsb = roundup_64(*last_fsb, align);
+ }
/*
* Always round up the allocation request to an extent boundary
@@ -154,7 +154,7 @@ xfs_iomap_write_direct(
offset_fsb = XFS_B_TO_FSBT(mp, offset);
last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
- if ((offset + count) > ip->i_size) {
+ if ((offset + count) > XFS_ISIZE(ip)) {
error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
if (error)
goto error_out;
@@ -211,7 +211,7 @@ xfs_iomap_write_direct(
xfs_trans_ijoin(tp, ip, 0);
bmapi_flag = 0;
- if (offset < ip->i_size || extsz)
+ if (offset < XFS_ISIZE(ip) || extsz)
bmapi_flag |= XFS_BMAPI_PREALLOC;
/*
@@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate(
int found_delalloc = 0;
*prealloc = 0;
- if ((offset + count) <= ip->i_size)
+ if (offset + count <= XFS_ISIZE(ip))
return 0;
/*
@@ -340,7 +340,7 @@ xfs_iomap_prealloc_size(
* if we pass in alloc_blocks = 0. Hence the "+ 1" to
* ensure we always pass in a non-zero value.
*/
- alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
+ alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
rounddown_pow_of_two(alloc_blocks));
@@ -564,7 +564,7 @@ xfs_iomap_write_allocate(
* back....
*/
nimaps = 1;
- end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
+ end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
error = xfs_bmap_last_offset(NULL, ip, &last_block,
XFS_DATA_FORK);
if (error)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 23ce927973a4..ab302539e5b9 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -168,7 +168,7 @@ STATIC int
xfs_vn_mknod(
struct inode *dir,
struct dentry *dentry,
- int mode,
+ umode_t mode,
dev_t rdev)
{
struct inode *inode;
@@ -231,7 +231,7 @@ STATIC int
xfs_vn_create(
struct inode *dir,
struct dentry *dentry,
- int mode,
+ umode_t mode,
struct nameidata *nd)
{
return xfs_vn_mknod(dir, dentry, mode, 0);
@@ -241,7 +241,7 @@ STATIC int
xfs_vn_mkdir(
struct inode *dir,
struct dentry *dentry,
- int mode)
+ umode_t mode)
{
return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0);
}
@@ -366,7 +366,7 @@ xfs_vn_symlink(
struct xfs_inode *cip = NULL;
struct xfs_name name;
int error;
- mode_t mode;
+ umode_t mode;
mode = S_IFLNK |
(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
@@ -750,6 +750,7 @@ xfs_setattr_size(
struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
int mask = iattr->ia_valid;
+ xfs_off_t oldsize, newsize;
struct xfs_trans *tp;
int error;
uint lock_flags;
@@ -777,11 +778,13 @@ xfs_setattr_size(
lock_flags |= XFS_IOLOCK_EXCL;
xfs_ilock(ip, lock_flags);
+ oldsize = inode->i_size;
+ newsize = iattr->ia_size;
+
/*
* Short circuit the truncate case for zero length files.
*/
- if (iattr->ia_size == 0 &&
- ip->i_size == 0 && ip->i_d.di_nextents == 0) {
+ if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
goto out_unlock;
@@ -807,14 +810,14 @@ xfs_setattr_size(
* the inode to the transaction, because the inode cannot be unlocked
* once it is a part of the transaction.
*/
- if (iattr->ia_size > ip->i_size) {
+ if (newsize > oldsize) {
/*
* Do the first part of growing a file: zero any data in the
* last block that is beyond the old EOF. We need to do this
* before the inode is joined to the transaction to modify
* i_size.
*/
- error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
+ error = xfs_zero_eof(ip, newsize, oldsize);
if (error)
goto out_unlock;
}
@@ -833,8 +836,8 @@ xfs_setattr_size(
* here and prevents waiting for other data not within the range we
* care about here.
*/
- if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
- error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0,
+ if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
+ error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
FI_NONE);
if (error)
goto out_unlock;
@@ -845,8 +848,7 @@ xfs_setattr_size(
*/
inode_dio_wait(inode);
- error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
- xfs_get_blocks);
+ error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
if (error)
goto out_unlock;
@@ -857,7 +859,7 @@ xfs_setattr_size(
if (error)
goto out_trans_cancel;
- truncate_setsize(inode, iattr->ia_size);
+ truncate_setsize(inode, newsize);
commit_flags = XFS_TRANS_RELEASE_LOG_RES;
lock_flags |= XFS_ILOCK_EXCL;
@@ -876,19 +878,29 @@ xfs_setattr_size(
* these flags set. For all other operations the VFS set these flags
* explicitly if it wants a timestamp update.
*/
- if (iattr->ia_size != ip->i_size &&
- (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
+ if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
iattr->ia_ctime = iattr->ia_mtime =
current_fs_time(inode->i_sb);
mask |= ATTR_CTIME | ATTR_MTIME;
}
- if (iattr->ia_size > ip->i_size) {
- ip->i_d.di_size = iattr->ia_size;
- ip->i_size = iattr->ia_size;
- } else if (iattr->ia_size <= ip->i_size ||
- (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
- error = xfs_itruncate_data(&tp, ip, iattr->ia_size);
+ /*
+ * The first thing we do is set the size to new_size permanently on
+ * disk. This way we don't have to worry about anyone ever being able
+ * to look at the data being freed even in the face of a crash.
+ * What we're getting around here is the case where we free a block, it
+ * is allocated to another file, it is written to, and then we crash.
+ * If the new data gets written to the file but the log buffers
+ * containing the free and reallocation don't, then we'd end up with
+ * garbage in the blocks being freed. As long as we make the new size
+ * permanent before actually freeing any blocks it doesn't matter if
+ * they get written to.
+ */
+ ip->i_d.di_size = newsize;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (newsize <= oldsize) {
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
if (error)
goto out_trans_abort;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 34817adf4b9e..e2cc3568c299 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -760,38 +760,6 @@ xfs_log_item_init(
INIT_LIST_HEAD(&item->li_cil);
}
-/*
- * Write region vectors to log. The write happens using the space reservation
- * of the ticket (tic). It is not a requirement that all writes for a given
- * transaction occur with one call to xfs_log_write(). However, it is important
- * to note that the transaction reservation code makes an assumption about the
- * number of log headers a transaction requires that may be violated if you
- * don't pass all the transaction vectors in one call....
- */
-int
-xfs_log_write(
- struct xfs_mount *mp,
- struct xfs_log_iovec reg[],
- int nentries,
- struct xlog_ticket *tic,
- xfs_lsn_t *start_lsn)
-{
- struct log *log = mp->m_log;
- int error;
- struct xfs_log_vec vec = {
- .lv_niovecs = nentries,
- .lv_iovecp = reg,
- };
-
- if (XLOG_FORCED_SHUTDOWN(log))
- return XFS_ERROR(EIO);
-
- error = xlog_write(log, &vec, tic, start_lsn, NULL, 0);
- if (error)
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
- return error;
-}
-
void
xfs_log_move_tail(xfs_mount_t *mp,
xfs_lsn_t tail_lsn)
@@ -1685,7 +1653,7 @@ xlog_print_tic_res(
};
xfs_warn(mp,
- "xfs_log_write: reservation summary:\n"
+ "xlog_write: reservation summary:\n"
" trans type = %s (%u)\n"
" unit res = %d bytes\n"
" current res = %d bytes\n"
@@ -1714,7 +1682,7 @@ xlog_print_tic_res(
}
xfs_alert_tag(mp, XFS_PTAG_LOGRES,
- "xfs_log_write: reservation ran out. Need to up reservation");
+ "xlog_write: reservation ran out. Need to up reservation");
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
@@ -1968,23 +1936,21 @@ xlog_write(
*start_lsn = 0;
len = xlog_write_calc_vec_length(ticket, log_vector);
- if (log->l_cilp) {
- /*
- * Region headers and bytes are already accounted for.
- * We only need to take into account start records and
- * split regions in this function.
- */
- if (ticket->t_flags & XLOG_TIC_INITED)
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
- /*
- * Commit record headers need to be accounted for. These
- * come in as separate writes so are easy to detect.
- */
- if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
- } else
- ticket->t_curr_res -= len;
+ /*
+ * Region headers and bytes are already accounted for.
+ * We only need to take into account start records and
+ * split regions in this function.
+ */
+ if (ticket->t_flags & XLOG_TIC_INITED)
+ ticket->t_curr_res -= sizeof(xlog_op_header_t);
+
+ /*
+ * Commit record headers need to be accounted for. These
+ * come in as separate writes so are easy to detect.
+ */
+ if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
+ ticket->t_curr_res -= sizeof(xlog_op_header_t);
if (ticket->t_curr_res < 0)
xlog_print_tic_res(log->l_mp, ticket);
@@ -2931,8 +2897,7 @@ _xfs_log_force(
XFS_STATS_INC(xs_log_force);
- if (log->l_cilp)
- xlog_cil_force(log);
+ xlog_cil_force(log);
spin_lock(&log->l_icloglock);
@@ -3081,11 +3046,9 @@ _xfs_log_force_lsn(
XFS_STATS_INC(xs_log_force);
- if (log->l_cilp) {
- lsn = xlog_cil_force_lsn(log, lsn);
- if (lsn == NULLCOMMITLSN)
- return 0;
- }
+ lsn = xlog_cil_force_lsn(log, lsn);
+ if (lsn == NULLCOMMITLSN)
+ return 0;
try_again:
spin_lock(&log->l_icloglock);
@@ -3653,7 +3616,7 @@ xfs_log_force_umount(
* completed transactions are flushed to disk with the xfs_log_force()
* call below.
*/
- if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
+ if (!logerror)
xlog_cil_force(log);
/*
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 3f7bf451c034..2aee3b22d29c 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -174,11 +174,6 @@ int xfs_log_reserve(struct xfs_mount *mp,
__uint8_t clientid,
uint flags,
uint t_type);
-int xfs_log_write(struct xfs_mount *mp,
- xfs_log_iovec_t region[],
- int nentries,
- struct xlog_ticket *ticket,
- xfs_lsn_t *start_lsn);
int xfs_log_unmount_write(struct xfs_mount *mp);
void xfs_log_unmount(struct xfs_mount *mp);
int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
@@ -189,8 +184,7 @@ void xlog_iodone(struct xfs_buf *);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
-void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
- struct xfs_log_vec *log_vector,
+int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_lsn_t *commit_lsn, int flags);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index c7755d5a5fbe..d4fadbe8ac90 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -32,10 +32,7 @@
#include "xfs_discard.h"
/*
- * Perform initial CIL structure initialisation. If the CIL is not
- * enabled in this filesystem, ensure the log->l_cilp is null so
- * we can check this conditional to determine if we are doing delayed
- * logging or not.
+ * Perform initial CIL structure initialisation.
*/
int
xlog_cil_init(
@@ -44,10 +41,6 @@ xlog_cil_init(
struct xfs_cil *cil;
struct xfs_cil_ctx *ctx;
- log->l_cilp = NULL;
- if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
- return 0;
-
cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
if (!cil)
return ENOMEM;
@@ -80,9 +73,6 @@ void
xlog_cil_destroy(
struct log *log)
{
- if (!log->l_cilp)
- return;
-
if (log->l_cilp->xc_ctx) {
if (log->l_cilp->xc_ctx->ticket)
xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
@@ -137,9 +127,6 @@ void
xlog_cil_init_post_recovery(
struct log *log)
{
- if (!log->l_cilp)
- return;
-
log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
log->l_cilp->xc_ctx->sequence = 1;
log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
@@ -172,37 +159,73 @@ xlog_cil_init_post_recovery(
* format the regions into the iclog as though they are being formatted
* directly out of the objects themselves.
*/
-static void
-xlog_cil_format_items(
- struct log *log,
- struct xfs_log_vec *log_vector)
+static struct xfs_log_vec *
+xlog_cil_prepare_log_vecs(
+ struct xfs_trans *tp)
{
- struct xfs_log_vec *lv;
+ struct xfs_log_item_desc *lidp;
+ struct xfs_log_vec *lv = NULL;
+ struct xfs_log_vec *ret_lv = NULL;
- ASSERT(log_vector);
- for (lv = log_vector; lv; lv = lv->lv_next) {
+
+ /* Bail out if we didn't find a log item. */
+ if (list_empty(&tp->t_items)) {
+ ASSERT(0);
+ return NULL;
+ }
+
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+ struct xfs_log_vec *new_lv;
void *ptr;
int index;
int len = 0;
+ uint niovecs;
+
+ /* Skip items which aren't dirty in this transaction. */
+ if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ continue;
+
+ /* Skip items that do not have any vectors for writing */
+ niovecs = IOP_SIZE(lidp->lid_item);
+ if (!niovecs)
+ continue;
+
+ new_lv = kmem_zalloc(sizeof(*new_lv) +
+ niovecs * sizeof(struct xfs_log_iovec),
+ KM_SLEEP);
+
+ /* The allocated iovec region lies beyond the log vector. */
+ new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
+ new_lv->lv_niovecs = niovecs;
+ new_lv->lv_item = lidp->lid_item;
/* build the vector array and calculate it's length */
- IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
- for (index = 0; index < lv->lv_niovecs; index++)
- len += lv->lv_iovecp[index].i_len;
+ IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
+ for (index = 0; index < new_lv->lv_niovecs; index++)
+ len += new_lv->lv_iovecp[index].i_len;
- lv->lv_buf_len = len;
- lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
- ptr = lv->lv_buf;
+ new_lv->lv_buf_len = len;
+ new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len,
+ KM_SLEEP|KM_NOFS);
+ ptr = new_lv->lv_buf;
- for (index = 0; index < lv->lv_niovecs; index++) {
- struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
+ for (index = 0; index < new_lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index];
memcpy(ptr, vec->i_addr, vec->i_len);
vec->i_addr = ptr;
ptr += vec->i_len;
}
- ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+ ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
+
+ if (!ret_lv)
+ ret_lv = new_lv;
+ else
+ lv->lv_next = new_lv;
+ lv = new_lv;
}
+
+ return ret_lv;
}
/*
@@ -256,7 +279,7 @@ xfs_cil_prepare_item(
* Insert the log items into the CIL and calculate the difference in space
* consumed by the item. Add the space to the checkpoint ticket and calculate
* if the change requires additional log metadata. If it does, take that space
- * as well. Remove the amount of space we addded to the checkpoint ticket from
+ * as well. Remove the amount of space we added to the checkpoint ticket from
* the current transaction ticket so that the accounting works out correctly.
*/
static void
@@ -635,28 +658,30 @@ out_abort:
* background commit, returns without it held once background commits are
* allowed again.
*/
-void
+int
xfs_log_commit_cil(
struct xfs_mount *mp,
struct xfs_trans *tp,
- struct xfs_log_vec *log_vector,
xfs_lsn_t *commit_lsn,
int flags)
{
struct log *log = mp->m_log;
int log_flags = 0;
int push = 0;
+ struct xfs_log_vec *log_vector;
if (flags & XFS_TRANS_RELEASE_LOG_RES)
log_flags = XFS_LOG_REL_PERM_RESERV;
/*
- * do all the hard work of formatting items (including memory
+ * Do all the hard work of formatting items (including memory
* allocation) outside the CIL context lock. This prevents stalling CIL
* pushes when we are low on memory and a transaction commit spends a
* lot of time in memory reclaim.
*/
- xlog_cil_format_items(log, log_vector);
+ log_vector = xlog_cil_prepare_log_vecs(tp);
+ if (!log_vector)
+ return ENOMEM;
/* lock out background commit */
down_read(&log->l_cilp->xc_ctx_lock);
@@ -709,6 +734,7 @@ xfs_log_commit_cil(
*/
if (push)
xlog_cil_push(log, 0);
+ return 0;
}
/*
@@ -786,8 +812,6 @@ xfs_log_item_in_current_chkpt(
{
struct xfs_cil_ctx *ctx;
- if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
- return false;
if (list_empty(&lip->li_cil))
return false;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 541a508adea1..0ed9ee77937c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1489,7 +1489,7 @@ xlog_recover_add_to_cont_trans(
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
+ ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
memcpy(&ptr[old_len], dp, len); /* d, s, l */
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -1981,7 +1981,7 @@ xfs_qm_dqcheck(
if (!errs && ddq->d_id) {
if (ddq->d_blk_softlimit &&
- be64_to_cpu(ddq->d_bcount) >=
+ be64_to_cpu(ddq->d_bcount) >
be64_to_cpu(ddq->d_blk_softlimit)) {
if (!ddq->d_btimer) {
if (flags & XFS_QMOPT_DOWARN)
@@ -1992,7 +1992,7 @@ xfs_qm_dqcheck(
}
}
if (ddq->d_ino_softlimit &&
- be64_to_cpu(ddq->d_icount) >=
+ be64_to_cpu(ddq->d_icount) >
be64_to_cpu(ddq->d_ino_softlimit)) {
if (!ddq->d_itimer) {
if (flags & XFS_QMOPT_DOWARN)
@@ -2003,7 +2003,7 @@ xfs_qm_dqcheck(
}
}
if (ddq->d_rtb_softlimit &&
- be64_to_cpu(ddq->d_rtbcount) >=
+ be64_to_cpu(ddq->d_rtbcount) >
be64_to_cpu(ddq->d_rtb_softlimit)) {
if (!ddq->d_rtbtimer) {
if (flags & XFS_QMOPT_DOWARN)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index bb24dac42a25..19f69e232509 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -219,7 +219,6 @@ typedef struct xfs_mount {
#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
must be synchronous except
for space allocations */
-#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
operations, typically for
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 0bbb1a41998b..c436def733bf 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -50,7 +50,6 @@
*/
struct mutex xfs_Gqm_lock;
struct xfs_qm *xfs_Gqm;
-uint ndquot;
kmem_zone_t *qm_dqzone;
kmem_zone_t *qm_dqtrxzone;
@@ -93,7 +92,6 @@ xfs_Gqm_init(void)
goto out_free_udqhash;
hsize /= sizeof(xfs_dqhash_t);
- ndquot = hsize << 8;
xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
xqm->qm_dqhashmask = hsize - 1;
@@ -137,7 +135,6 @@ xfs_Gqm_init(void)
xqm->qm_dqtrxzone = qm_dqtrxzone;
atomic_set(&xqm->qm_totaldquots, 0);
- xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
xqm->qm_nrefs = 0;
return xqm;
@@ -154,12 +151,17 @@ STATIC void
xfs_qm_destroy(
struct xfs_qm *xqm)
{
- struct xfs_dquot *dqp, *n;
int hsize, i;
ASSERT(xqm != NULL);
ASSERT(xqm->qm_nrefs == 0);
+
unregister_shrinker(&xfs_qm_shaker);
+
+ mutex_lock(&xqm->qm_dqfrlist_lock);
+ ASSERT(list_empty(&xqm->qm_dqfrlist));
+ mutex_unlock(&xqm->qm_dqfrlist_lock);
+
hsize = xqm->qm_dqhashmask + 1;
for (i = 0; i < hsize; i++) {
xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
@@ -171,17 +173,6 @@ xfs_qm_destroy(
xqm->qm_grp_dqhtable = NULL;
xqm->qm_dqhashmask = 0;
- /* frlist cleanup */
- mutex_lock(&xqm->qm_dqfrlist_lock);
- list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
- xfs_dqlock(dqp);
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
- xfs_dqunlock(dqp);
- xfs_qm_dqdestroy(dqp);
- }
- mutex_unlock(&xqm->qm_dqfrlist_lock);
- mutex_destroy(&xqm->qm_dqfrlist_lock);
kmem_free(xqm);
}
@@ -232,34 +223,10 @@ STATIC void
xfs_qm_rele_quotafs_ref(
struct xfs_mount *mp)
{
- xfs_dquot_t *dqp, *n;
-
ASSERT(xfs_Gqm);
ASSERT(xfs_Gqm->qm_nrefs > 0);
/*
- * Go thru the freelist and destroy all inactive dquots.
- */
- mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-
- list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
- xfs_dqlock(dqp);
- if (dqp->dq_flags & XFS_DQ_INACTIVE) {
- ASSERT(dqp->q_mount == NULL);
- ASSERT(! XFS_DQ_IS_DIRTY(dqp));
- ASSERT(list_empty(&dqp->q_hashlist));
- ASSERT(list_empty(&dqp->q_mplist));
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
- xfs_dqunlock(dqp);
- xfs_qm_dqdestroy(dqp);
- } else {
- xfs_dqunlock(dqp);
- }
- }
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-
- /*
* Destroy the entire XQM. If somebody mounts with quotaon, this'll
* be restarted.
*/
@@ -415,8 +382,7 @@ xfs_qm_unmount_quotas(
*/
STATIC int
xfs_qm_dqflush_all(
- struct xfs_mount *mp,
- int sync_mode)
+ struct xfs_mount *mp)
{
struct xfs_quotainfo *q = mp->m_quotainfo;
int recl;
@@ -429,7 +395,8 @@ again:
mutex_lock(&q->qi_dqlist_lock);
list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
xfs_dqlock(dqp);
- if (! XFS_DQ_IS_DIRTY(dqp)) {
+ if ((dqp->dq_flags & XFS_DQ_FREEING) ||
+ !XFS_DQ_IS_DIRTY(dqp)) {
xfs_dqunlock(dqp);
continue;
}
@@ -444,14 +411,14 @@ again:
* out immediately. We'll be able to acquire
* the flush lock when the I/O completes.
*/
- xfs_qm_dqflock_pushbuf_wait(dqp);
+ xfs_dqflock_pushbuf_wait(dqp);
}
/*
* Let go of the mplist lock. We don't want to hold it
* across a disk write.
*/
mutex_unlock(&q->qi_dqlist_lock);
- error = xfs_qm_dqflush(dqp, sync_mode);
+ error = xfs_qm_dqflush(dqp, 0);
xfs_dqunlock(dqp);
if (error)
return error;
@@ -468,6 +435,7 @@ again:
/* return ! busy */
return 0;
}
+
/*
* Release the group dquot pointers the user dquots may be
* carrying around as a hint. mplist is locked on entry and exit.
@@ -478,31 +446,26 @@ xfs_qm_detach_gdquots(
{
struct xfs_quotainfo *q = mp->m_quotainfo;
struct xfs_dquot *dqp, *gdqp;
- int nrecl;
again:
ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
xfs_dqlock(dqp);
- if ((gdqp = dqp->q_gdquot)) {
- xfs_dqlock(gdqp);
- dqp->q_gdquot = NULL;
- }
- xfs_dqunlock(dqp);
-
- if (gdqp) {
- /*
- * Can't hold the mplist lock across a dqput.
- * XXXmust convert to marker based iterations here.
- */
- nrecl = q->qi_dqreclaims;
+ if (dqp->dq_flags & XFS_DQ_FREEING) {
+ xfs_dqunlock(dqp);
mutex_unlock(&q->qi_dqlist_lock);
- xfs_qm_dqput(gdqp);
-
+ delay(1);
mutex_lock(&q->qi_dqlist_lock);
- if (nrecl != q->qi_dqreclaims)
- goto again;
+ goto again;
}
+
+ gdqp = dqp->q_gdquot;
+ if (gdqp)
+ dqp->q_gdquot = NULL;
+ xfs_dqunlock(dqp);
+
+ if (gdqp)
+ xfs_qm_dqrele(gdqp);
}
}
@@ -520,8 +483,8 @@ xfs_qm_dqpurge_int(
struct xfs_quotainfo *q = mp->m_quotainfo;
struct xfs_dquot *dqp, *n;
uint dqtype;
- int nrecl;
- int nmisses;
+ int nmisses = 0;
+ LIST_HEAD (dispose_list);
if (!q)
return 0;
@@ -540,47 +503,26 @@ xfs_qm_dqpurge_int(
*/
xfs_qm_detach_gdquots(mp);
- again:
- nmisses = 0;
- ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
/*
- * Try to get rid of all of the unwanted dquots. The idea is to
- * get them off mplist and hashlist, but leave them on freelist.
+ * Try to get rid of all of the unwanted dquots.
*/
list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
- /*
- * It's OK to look at the type without taking dqlock here.
- * We're holding the mplist lock here, and that's needed for
- * a dqreclaim.
- */
- if ((dqp->dq_flags & dqtype) == 0)
- continue;
-
- if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
- nrecl = q->qi_dqreclaims;
- mutex_unlock(&q->qi_dqlist_lock);
- mutex_lock(&dqp->q_hash->qh_lock);
- mutex_lock(&q->qi_dqlist_lock);
-
- /*
- * XXXTheoretically, we can get into a very long
- * ping pong game here.
- * No one can be adding dquots to the mplist at
- * this point, but somebody might be taking things off.
- */
- if (nrecl != q->qi_dqreclaims) {
- mutex_unlock(&dqp->q_hash->qh_lock);
- goto again;
- }
+ xfs_dqlock(dqp);
+ if ((dqp->dq_flags & dqtype) != 0 &&
+ !(dqp->dq_flags & XFS_DQ_FREEING)) {
+ if (dqp->q_nrefs == 0) {
+ dqp->dq_flags |= XFS_DQ_FREEING;
+ list_move_tail(&dqp->q_mplist, &dispose_list);
+ } else
+ nmisses++;
}
-
- /*
- * Take the dquot off the mplist and hashlist. It may remain on
- * freelist in INACTIVE state.
- */
- nmisses += xfs_qm_dqpurge(dqp);
+ xfs_dqunlock(dqp);
}
mutex_unlock(&q->qi_dqlist_lock);
+
+ list_for_each_entry_safe(dqp, n, &dispose_list, q_mplist)
+ xfs_qm_dqpurge(dqp);
+
return nmisses;
}
@@ -648,12 +590,9 @@ xfs_qm_dqattach_one(
*/
dqp = udqhint->q_gdquot;
if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
- xfs_dqlock(dqp);
- XFS_DQHOLD(dqp);
ASSERT(*IO_idqpp == NULL);
- *IO_idqpp = dqp;
- xfs_dqunlock(dqp);
+ *IO_idqpp = xfs_qm_dqhold(dqp);
xfs_dqunlock(udqhint);
return 0;
}
@@ -693,11 +632,7 @@ xfs_qm_dqattach_one(
/*
* Given a udquot and gdquot, attach a ptr to the group dquot in the
- * udquot as a hint for future lookups. The idea sounds simple, but the
- * execution isn't, because the udquot might have a group dquot attached
- * already and getting rid of that gets us into lock ordering constraints.
- * The process is complicated more by the fact that the dquots may or may not
- * be locked on entry.
+ * udquot as a hint for future lookups.
*/
STATIC void
xfs_qm_dqattach_grouphint(
@@ -708,45 +643,17 @@ xfs_qm_dqattach_grouphint(
xfs_dqlock(udq);
- if ((tmp = udq->q_gdquot)) {
- if (tmp == gdq) {
- xfs_dqunlock(udq);
- return;
- }
+ tmp = udq->q_gdquot;
+ if (tmp) {
+ if (tmp == gdq)
+ goto done;
udq->q_gdquot = NULL;
- /*
- * We can't keep any dqlocks when calling dqrele,
- * because the freelist lock comes before dqlocks.
- */
- xfs_dqunlock(udq);
- /*
- * we took a hard reference once upon a time in dqget,
- * so give it back when the udquot no longer points at it
- * dqput() does the unlocking of the dquot.
- */
xfs_qm_dqrele(tmp);
-
- xfs_dqlock(udq);
- xfs_dqlock(gdq);
-
- } else {
- ASSERT(XFS_DQ_IS_LOCKED(udq));
- xfs_dqlock(gdq);
}
- ASSERT(XFS_DQ_IS_LOCKED(udq));
- ASSERT(XFS_DQ_IS_LOCKED(gdq));
- /*
- * Somebody could have attached a gdquot here,
- * when we dropped the uqlock. If so, just do nothing.
- */
- if (udq->q_gdquot == NULL) {
- XFS_DQHOLD(gdq);
- udq->q_gdquot = gdq;
- }
-
- xfs_dqunlock(gdq);
+ udq->q_gdquot = xfs_qm_dqhold(gdq);
+done:
xfs_dqunlock(udq);
}
@@ -813,17 +720,13 @@ xfs_qm_dqattach_locked(
ASSERT(ip->i_gdquot);
/*
- * We may or may not have the i_udquot locked at this point,
- * but this check is OK since we don't depend on the i_gdquot to
- * be accurate 100% all the time. It is just a hint, and this
- * will succeed in general.
+ * We do not have i_udquot locked at this point, but this check
+ * is OK since we don't depend on the i_gdquot to be accurate
+ * 100% all the time. It is just a hint, and this will
+ * succeed in general.
*/
- if (ip->i_udquot->q_gdquot == ip->i_gdquot)
- goto done;
- /*
- * Attach i_gdquot to the gdquot hint inside the i_udquot.
- */
- xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
+ if (ip->i_udquot->q_gdquot != ip->i_gdquot)
+ xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
}
done:
@@ -879,100 +782,6 @@ xfs_qm_dqdetach(
}
}
-int
-xfs_qm_sync(
- struct xfs_mount *mp,
- int flags)
-{
- struct xfs_quotainfo *q = mp->m_quotainfo;
- int recl, restarts;
- struct xfs_dquot *dqp;
- int error;
-
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
- return 0;
-
- restarts = 0;
-
- again:
- mutex_lock(&q->qi_dqlist_lock);
- /*
- * dqpurge_all() also takes the mplist lock and iterate thru all dquots
- * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
- * when we have the mplist lock, we know that dquots will be consistent
- * as long as we have it locked.
- */
- if (!XFS_IS_QUOTA_ON(mp)) {
- mutex_unlock(&q->qi_dqlist_lock);
- return 0;
- }
- ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
- list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
- /*
- * If this is vfs_sync calling, then skip the dquots that
- * don't 'seem' to be dirty. ie. don't acquire dqlock.
- * This is very similar to what xfs_sync does with inodes.
- */
- if (flags & SYNC_TRYLOCK) {
- if (!XFS_DQ_IS_DIRTY(dqp))
- continue;
- if (!xfs_qm_dqlock_nowait(dqp))
- continue;
- } else {
- xfs_dqlock(dqp);
- }
-
- /*
- * Now, find out for sure if this dquot is dirty or not.
- */
- if (! XFS_DQ_IS_DIRTY(dqp)) {
- xfs_dqunlock(dqp);
- continue;
- }
-
- /* XXX a sentinel would be better */
- recl = q->qi_dqreclaims;
- if (!xfs_dqflock_nowait(dqp)) {
- if (flags & SYNC_TRYLOCK) {
- xfs_dqunlock(dqp);
- continue;
- }
- /*
- * If we can't grab the flush lock then if the caller
- * really wanted us to give this our best shot, so
- * see if we can give a push to the buffer before we wait
- * on the flush lock. At this point, we know that
- * even though the dquot is being flushed,
- * it has (new) dirty data.
- */
- xfs_qm_dqflock_pushbuf_wait(dqp);
- }
- /*
- * Let go of the mplist lock. We don't want to hold it
- * across a disk write
- */
- mutex_unlock(&q->qi_dqlist_lock);
- error = xfs_qm_dqflush(dqp, flags);
- xfs_dqunlock(dqp);
- if (error && XFS_FORCED_SHUTDOWN(mp))
- return 0; /* Need to prevent umount failure */
- else if (error)
- return error;
-
- mutex_lock(&q->qi_dqlist_lock);
- if (recl != q->qi_dqreclaims) {
- if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
- break;
-
- mutex_unlock(&q->qi_dqlist_lock);
- goto again;
- }
- }
-
- mutex_unlock(&q->qi_dqlist_lock);
- return 0;
-}
-
/*
* The hash chains and the mplist use the same xfs_dqhash structure as
* their list head, but we can take the mplist qh_lock and one of the
@@ -1034,18 +843,21 @@ xfs_qm_init_quotainfo(
/*
* We try to get the limits from the superuser's limits fields.
* This is quite hacky, but it is standard quota practice.
+ *
* We look at the USR dquot with id == 0 first, but if user quotas
* are not enabled we goto the GRP dquot with id == 0.
* We don't really care to keep separate default limits for user
* and group quotas, at least not at this point.
+ *
+ * Since we may not have done a quotacheck by this point, just read
+ * the dquot without attaching it to any hashtables or lists.
*/
- error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0,
- XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
- (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
- XFS_DQ_PROJ),
- XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN,
- &dqp);
- if (! error) {
+ error = xfs_qm_dqread(mp, 0,
+ XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
+ (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
+ XFS_DQ_PROJ),
+ XFS_QMOPT_DOWARN, &dqp);
+ if (!error) {
xfs_disk_dquot_t *ddqp = &dqp->q_core;
/*
@@ -1072,11 +884,6 @@ xfs_qm_init_quotainfo(
qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
- /*
- * We sent the XFS_QMOPT_DQSUSER flag to dqget because
- * we don't want this dquot cached. We haven't done a
- * quotacheck yet, and quotacheck doesn't like incore dquots.
- */
xfs_qm_dqdestroy(dqp);
} else {
qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
@@ -1661,7 +1468,7 @@ xfs_qm_quotacheck(
* successfully.
*/
if (!error)
- error = xfs_qm_dqflush_all(mp, 0);
+ error = xfs_qm_dqflush_all(mp);
/*
* We can get this error if we couldn't do a dquot allocation inside
@@ -1790,257 +1597,150 @@ xfs_qm_init_quotainos(
return 0;
}
-
-
-/*
- * Just pop the least recently used dquot off the freelist and
- * recycle it. The returned dquot is locked.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqreclaim_one(void)
+STATIC void
+xfs_qm_dqfree_one(
+ struct xfs_dquot *dqp)
{
- xfs_dquot_t *dqpout;
- xfs_dquot_t *dqp;
- int restarts;
- int startagain;
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
- restarts = 0;
- dqpout = NULL;
+ mutex_lock(&dqp->q_hash->qh_lock);
+ list_del_init(&dqp->q_hashlist);
+ dqp->q_hash->qh_version++;
+ mutex_unlock(&dqp->q_hash->qh_lock);
- /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
-again:
- startagain = 0;
- mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
+ mutex_lock(&qi->qi_dqlist_lock);
+ list_del_init(&dqp->q_mplist);
+ qi->qi_dquots--;
+ qi->qi_dqreclaims++;
+ mutex_unlock(&qi->qi_dqlist_lock);
- list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
- struct xfs_mount *mp = dqp->q_mount;
- xfs_dqlock(dqp);
+ xfs_qm_dqdestroy(dqp);
+}
- /*
- * We are racing with dqlookup here. Naturally we don't
- * want to reclaim a dquot that lookup wants. We release the
- * freelist lock and start over, so that lookup will grab
- * both the dquot and the freelistlock.
- */
- if (dqp->dq_flags & XFS_DQ_WANT) {
- ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
-
- trace_xfs_dqreclaim_want(dqp);
- XQM_STATS_INC(xqmstats.xs_qm_dqwants);
- restarts++;
- startagain = 1;
- goto dqunlock;
- }
+STATIC void
+xfs_qm_dqreclaim_one(
+ struct xfs_dquot *dqp,
+ struct list_head *dispose_list)
+{
+ struct xfs_mount *mp = dqp->q_mount;
+ int error;
- /*
- * If the dquot is inactive, we are assured that it is
- * not on the mplist or the hashlist, and that makes our
- * life easier.
- */
- if (dqp->dq_flags & XFS_DQ_INACTIVE) {
- ASSERT(mp == NULL);
- ASSERT(! XFS_DQ_IS_DIRTY(dqp));
- ASSERT(list_empty(&dqp->q_hashlist));
- ASSERT(list_empty(&dqp->q_mplist));
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
- dqpout = dqp;
- XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
- goto dqunlock;
- }
+ if (!xfs_dqlock_nowait(dqp))
+ goto out_busy;
- ASSERT(dqp->q_hash);
- ASSERT(!list_empty(&dqp->q_mplist));
+ /*
+ * This dquot has acquired a reference in the meantime remove it from
+ * the freelist and try again.
+ */
+ if (dqp->q_nrefs) {
+ xfs_dqunlock(dqp);
- /*
- * Try to grab the flush lock. If this dquot is in the process
- * of getting flushed to disk, we don't want to reclaim it.
- */
- if (!xfs_dqflock_nowait(dqp))
- goto dqunlock;
+ trace_xfs_dqreclaim_want(dqp);
+ XQM_STATS_INC(xqmstats.xs_qm_dqwants);
- /*
- * We have the flush lock so we know that this is not in the
- * process of being flushed. So, if this is dirty, flush it
- * DELWRI so that we don't get a freelist infested with
- * dirty dquots.
- */
- if (XFS_DQ_IS_DIRTY(dqp)) {
- int error;
+ list_del_init(&dqp->q_freelist);
+ xfs_Gqm->qm_dqfrlist_cnt--;
+ return;
+ }
- trace_xfs_dqreclaim_dirty(dqp);
+ ASSERT(dqp->q_hash);
+ ASSERT(!list_empty(&dqp->q_mplist));
- /*
- * We flush it delayed write, so don't bother
- * releasing the freelist lock.
- */
- error = xfs_qm_dqflush(dqp, 0);
- if (error) {
- xfs_warn(mp, "%s: dquot %p flush failed",
- __func__, dqp);
- }
- goto dqunlock;
- }
+ /*
+ * Try to grab the flush lock. If this dquot is in the process of
+ * getting flushed to disk, we don't want to reclaim it.
+ */
+ if (!xfs_dqflock_nowait(dqp))
+ goto out_busy;
+
+ /*
+ * We have the flush lock so we know that this is not in the
+ * process of being flushed. So, if this is dirty, flush it
+ * DELWRI so that we don't get a freelist infested with
+ * dirty dquots.
+ */
+ if (XFS_DQ_IS_DIRTY(dqp)) {
+ trace_xfs_dqreclaim_dirty(dqp);
/*
- * We're trying to get the hashlock out of order. This races
- * with dqlookup; so, we giveup and goto the next dquot if
- * we couldn't get the hashlock. This way, we won't starve
- * a dqlookup process that holds the hashlock that is
- * waiting for the freelist lock.
+ * We flush it delayed write, so don't bother releasing the
+ * freelist lock.
*/
- if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
- restarts++;
- goto dqfunlock;
+ error = xfs_qm_dqflush(dqp, 0);
+ if (error) {
+ xfs_warn(mp, "%s: dquot %p flush failed",
+ __func__, dqp);
}
/*
- * This races with dquot allocation code as well as dqflush_all
- * and reclaim code. So, if we failed to grab the mplist lock,
- * giveup everything and start over.
+ * Give the dquot another try on the freelist, as the
+ * flushing will take some time.
*/
- if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
- restarts++;
- startagain = 1;
- goto qhunlock;
- }
-
- ASSERT(dqp->q_nrefs == 0);
- list_del_init(&dqp->q_mplist);
- mp->m_quotainfo->qi_dquots--;
- mp->m_quotainfo->qi_dqreclaims++;
- list_del_init(&dqp->q_hashlist);
- dqp->q_hash->qh_version++;
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
- dqpout = dqp;
- mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
-qhunlock:
- mutex_unlock(&dqp->q_hash->qh_lock);
-dqfunlock:
- xfs_dqfunlock(dqp);
-dqunlock:
- xfs_dqunlock(dqp);
- if (dqpout)
- break;
- if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
- break;
- if (startagain) {
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- goto again;
- }
+ goto out_busy;
}
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- return dqpout;
-}
+ xfs_dqfunlock(dqp);
-/*
- * Traverse the freelist of dquots and attempt to reclaim a maximum of
- * 'howmany' dquots. This operation races with dqlookup(), and attempts to
- * favor the lookup function ...
- */
-STATIC int
-xfs_qm_shake_freelist(
- int howmany)
-{
- int nreclaimed = 0;
- xfs_dquot_t *dqp;
+ /*
+ * Prevent lookups now that we are past the point of no return.
+ */
+ dqp->dq_flags |= XFS_DQ_FREEING;
+ xfs_dqunlock(dqp);
- if (howmany <= 0)
- return 0;
+ ASSERT(dqp->q_nrefs == 0);
+ list_move_tail(&dqp->q_freelist, dispose_list);
+ xfs_Gqm->qm_dqfrlist_cnt--;
- while (nreclaimed < howmany) {
- dqp = xfs_qm_dqreclaim_one();
- if (!dqp)
- return nreclaimed;
- xfs_qm_dqdestroy(dqp);
- nreclaimed++;
- }
- return nreclaimed;
+ trace_xfs_dqreclaim_done(dqp);
+ XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
+ return;
+
+out_busy:
+ xfs_dqunlock(dqp);
+
+ /*
+ * Move the dquot to the tail of the list so that we don't spin on it.
+ */
+ list_move_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
+
+ trace_xfs_dqreclaim_busy(dqp);
+ XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
}
-/*
- * The kmem_shake interface is invoked when memory is running low.
- */
-/* ARGSUSED */
STATIC int
xfs_qm_shake(
- struct shrinker *shrink,
- struct shrink_control *sc)
+ struct shrinker *shrink,
+ struct shrink_control *sc)
{
- int ndqused, nfree, n;
- gfp_t gfp_mask = sc->gfp_mask;
-
- if (!kmem_shake_allow(gfp_mask))
- return 0;
- if (!xfs_Gqm)
- return 0;
-
- nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
- /* incore dquots in all f/s's */
- ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
-
- ASSERT(ndqused >= 0);
+ int nr_to_scan = sc->nr_to_scan;
+ LIST_HEAD (dispose_list);
+ struct xfs_dquot *dqp;
- if (nfree <= ndqused && nfree < ndquot)
+ if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
return 0;
+ if (!nr_to_scan)
+ goto out;
- ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */
- n = nfree - ndqused - ndquot; /* # over target */
-
- return xfs_qm_shake_freelist(MAX(nfree, n));
-}
-
-
-/*------------------------------------------------------------------*/
-
-/*
- * Return a new incore dquot. Depending on the number of
- * dquots in the system, we either allocate a new one on the kernel heap,
- * or reclaim a free one.
- * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed
- * to reclaim an existing one from the freelist.
- */
-boolean_t
-xfs_qm_dqalloc_incore(
- xfs_dquot_t **O_dqpp)
-{
- xfs_dquot_t *dqp;
-
- /*
- * Check against high water mark to see if we want to pop
- * a nincompoop dquot off the freelist.
- */
- if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) {
- /*
- * Try to recycle a dquot from the freelist.
- */
- if ((dqp = xfs_qm_dqreclaim_one())) {
- XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
- /*
- * Just zero the core here. The rest will get
- * reinitialized by caller. XXX we shouldn't even
- * do this zero ...
- */
- memset(&dqp->q_core, 0, sizeof(dqp->q_core));
- *O_dqpp = dqp;
- return B_FALSE;
- }
- XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
+ mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
+ while (!list_empty(&xfs_Gqm->qm_dqfrlist)) {
+ if (nr_to_scan-- <= 0)
+ break;
+ dqp = list_first_entry(&xfs_Gqm->qm_dqfrlist, struct xfs_dquot,
+ q_freelist);
+ xfs_qm_dqreclaim_one(dqp, &dispose_list);
}
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- /*
- * Allocate a brand new dquot on the kernel heap and return it
- * to the caller to initialize.
- */
- ASSERT(xfs_Gqm->qm_dqzone != NULL);
- *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
- atomic_inc(&xfs_Gqm->qm_totaldquots);
-
- return B_TRUE;
+ while (!list_empty(&dispose_list)) {
+ dqp = list_first_entry(&dispose_list, struct xfs_dquot,
+ q_freelist);
+ list_del_init(&dqp->q_freelist);
+ xfs_qm_dqfree_one(dqp);
+ }
+out:
+ return (xfs_Gqm->qm_dqfrlist_cnt / 100) * sysctl_vfs_cache_pressure;
}
-
/*
* Start a transaction and write the incore superblock changes to
* disk. flags parameter indicates which fields have changed.
@@ -2151,10 +1851,7 @@ xfs_qm_vop_dqalloc(
* this to caller
*/
ASSERT(ip->i_udquot);
- uq = ip->i_udquot;
- xfs_dqlock(uq);
- XFS_DQHOLD(uq);
- xfs_dqunlock(uq);
+ uq = xfs_qm_dqhold(ip->i_udquot);
}
}
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
@@ -2175,10 +1872,7 @@ xfs_qm_vop_dqalloc(
xfs_ilock(ip, lockflags);
} else {
ASSERT(ip->i_gdquot);
- gq = ip->i_gdquot;
- xfs_dqlock(gq);
- XFS_DQHOLD(gq);
- xfs_dqunlock(gq);
+ gq = xfs_qm_dqhold(ip->i_gdquot);
}
} else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
if (xfs_get_projid(ip) != prid) {
@@ -2198,10 +1892,7 @@ xfs_qm_vop_dqalloc(
xfs_ilock(ip, lockflags);
} else {
ASSERT(ip->i_gdquot);
- gq = ip->i_gdquot;
- xfs_dqlock(gq);
- XFS_DQHOLD(gq);
- xfs_dqunlock(gq);
+ gq = xfs_qm_dqhold(ip->i_gdquot);
}
}
if (uq)
@@ -2251,14 +1942,10 @@ xfs_qm_vop_chown(
xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
/*
- * Take an extra reference, because the inode
- * is going to keep this dquot pointer even
- * after the trans_commit.
+ * Take an extra reference, because the inode is going to keep
+ * this dquot pointer even after the trans_commit.
*/
- xfs_dqlock(newdq);
- XFS_DQHOLD(newdq);
- xfs_dqunlock(newdq);
- *IO_olddq = newdq;
+ *IO_olddq = xfs_qm_dqhold(newdq);
return prevdq;
}
@@ -2390,25 +2077,21 @@ xfs_qm_vop_create_dqattach(
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
if (udqp) {
- xfs_dqlock(udqp);
- XFS_DQHOLD(udqp);
- xfs_dqunlock(udqp);
ASSERT(ip->i_udquot == NULL);
- ip->i_udquot = udqp;
ASSERT(XFS_IS_UQUOTA_ON(mp));
ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
+
+ ip->i_udquot = xfs_qm_dqhold(udqp);
xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (gdqp) {
- xfs_dqlock(gdqp);
- XFS_DQHOLD(gdqp);
- xfs_dqunlock(gdqp);
ASSERT(ip->i_gdquot == NULL);
- ip->i_gdquot = gdqp;
ASSERT(XFS_IS_OQUOTA_ON(mp));
ASSERT((XFS_IS_GQUOTA_ON(mp) ?
ip->i_d.di_gid : xfs_get_projid(ip)) ==
be32_to_cpu(gdqp->q_core.d_id));
+
+ ip->i_gdquot = xfs_qm_dqhold(gdqp);
xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 43b9abe1052c..9a9b997e1a0a 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -26,30 +26,12 @@
struct xfs_qm;
struct xfs_inode;
-extern uint ndquot;
extern struct mutex xfs_Gqm_lock;
extern struct xfs_qm *xfs_Gqm;
extern kmem_zone_t *qm_dqzone;
extern kmem_zone_t *qm_dqtrxzone;
/*
- * Used in xfs_qm_sync called by xfs_sync to count the max times that it can
- * iterate over the mountpt's dquot list in one call.
- */
-#define XFS_QM_SYNC_MAX_RESTARTS 7
-
-/*
- * Ditto, for xfs_qm_dqreclaim_one.
- */
-#define XFS_QM_RECLAIM_MAX_RESTARTS 4
-
-/*
- * Ideal ratio of free to in use dquots. Quota manager makes an attempt
- * to keep this balance.
- */
-#define XFS_QM_DQFREE_RATIO 2
-
-/*
* Dquot hashtable constants/threshold values.
*/
#define XFS_QM_HASHSIZE_LOW (PAGE_SIZE / sizeof(xfs_dqhash_t))
@@ -80,7 +62,6 @@ typedef struct xfs_qm {
int qm_dqfrlist_cnt;
atomic_t qm_totaldquots; /* total incore dquots */
uint qm_nrefs; /* file systems with quota on */
- int qm_dqfree_ratio;/* ratio of free to inuse dquots */
kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */
kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */
} xfs_qm_t;
@@ -149,7 +130,6 @@ extern int xfs_qm_quotacheck(xfs_mount_t *);
extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
/* dquot stuff */
-extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **);
extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint);
extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c
index 8671a0b32644..5729ba570877 100644
--- a/fs/xfs/xfs_qm_stats.c
+++ b/fs/xfs/xfs_qm_stats.c
@@ -42,9 +42,9 @@ static int xqm_proc_show(struct seq_file *m, void *v)
{
/* maximum; incore; ratio free to inuse; freelist */
seq_printf(m, "%d\t%d\t%d\t%u\n",
- ndquot,
+ 0,
xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
- xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
+ 0,
xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
return 0;
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 5cc3dde1bc90..711a86e39ff0 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -31,6 +31,7 @@
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
+#include "xfs_inode_item.h"
#include "xfs_itable.h"
#include "xfs_bmap.h"
#include "xfs_rtalloc.h"
@@ -263,13 +264,18 @@ xfs_qm_scall_trunc_qfile(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_itruncate_data(&tp, ip, 0);
+ ip->i_d.di_size = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
if (error) {
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
XFS_TRANS_ABORT);
goto out_unlock;
}
+ ASSERT(ip->i_d.di_nextents == 0);
+
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
@@ -807,11 +813,11 @@ xfs_qm_export_dquot(
(XFS_IS_OQUOTA_ENFORCED(mp) &&
(dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
dst->d_id != 0) {
- if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
+ if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) &&
(dst->d_blk_softlimit > 0)) {
ASSERT(dst->d_btimer != 0);
}
- if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) &&
+ if (((int) dst->d_icount > (int) dst->d_ino_softlimit) &&
(dst->d_ino_softlimit > 0)) {
ASSERT(dst->d_itimer != 0);
}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index a595f29567fe..8a0807e0f979 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -87,8 +87,7 @@ typedef struct xfs_dqblk {
#define XFS_DQ_PROJ 0x0002 /* project quota */
#define XFS_DQ_GROUP 0x0004 /* a group quota */
#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
-#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */
-#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */
+#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */
#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
@@ -97,8 +96,7 @@ typedef struct xfs_dqblk {
{ XFS_DQ_PROJ, "PROJ" }, \
{ XFS_DQ_GROUP, "GROUP" }, \
{ XFS_DQ_DIRTY, "DIRTY" }, \
- { XFS_DQ_WANT, "WANT" }, \
- { XFS_DQ_INACTIVE, "INACTIVE" }
+ { XFS_DQ_FREEING, "FREEING" }
/*
* In the worst case, when both user and group quotas are on,
@@ -199,7 +197,6 @@ typedef struct xfs_qoff_logformat {
#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
-#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */
#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
@@ -326,7 +323,6 @@ extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
extern void xfs_qm_dqdetach(struct xfs_inode *);
extern void xfs_qm_dqrele(struct xfs_dquot *);
extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
-extern int xfs_qm_sync(struct xfs_mount *, int);
extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *);
extern void xfs_qm_mount_quotas(struct xfs_mount *);
extern void xfs_qm_unmount(struct xfs_mount *);
@@ -366,10 +362,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
#define xfs_qm_dqdetach(ip)
#define xfs_qm_dqrele(d)
#define xfs_qm_statvfs(ip, s)
-static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
-{
- return 0;
-}
#define xfs_qm_newmount(mp, a, b) (0)
#define xfs_qm_mount_quotas(mp)
#define xfs_qm_unmount(mp)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8a899496fd5f..ee5b695c99a7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -199,7 +199,6 @@ xfs_parseargs(
mp->m_flags |= XFS_MOUNT_BARRIER;
mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
- mp->m_flags |= XFS_MOUNT_DELAYLOG;
/*
* These can be overridden by the mount option parsing.
@@ -353,11 +352,11 @@ xfs_parseargs(
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
mp->m_qflags &= ~XFS_OQUOTA_ENFD;
} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
- mp->m_flags |= XFS_MOUNT_DELAYLOG;
+ xfs_warn(mp,
+ "delaylog is the default now, option is deprecated.");
} else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
- mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
xfs_warn(mp,
- "nodelaylog is deprecated and will be removed in Linux 3.3");
+ "nodelaylog support has been removed, option is deprecated.");
} else if (!strcmp(this_char, MNTOPT_DISCARD)) {
mp->m_flags |= XFS_MOUNT_DISCARD;
} else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
@@ -395,13 +394,6 @@ xfs_parseargs(
return EINVAL;
}
- if ((mp->m_flags & XFS_MOUNT_DISCARD) &&
- !(mp->m_flags & XFS_MOUNT_DELAYLOG)) {
- xfs_warn(mp,
- "the discard option is incompatible with the nodelaylog option");
- return EINVAL;
- }
-
#ifndef CONFIG_XFS_QUOTA
if (XFS_IS_QUOTA_RUNNING(mp)) {
xfs_warn(mp, "quota support not available in this kernel.");
@@ -501,7 +493,6 @@ xfs_showargs(
{ XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 },
{ XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
{ XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
- { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
{ XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
{ 0, NULL }
};
@@ -837,14 +828,6 @@ xfs_fs_inode_init_once(
/* xfs inode */
atomic_set(&ip->i_pincount, 0);
spin_lock_init(&ip->i_flags_lock);
- init_waitqueue_head(&ip->i_ipin_wait);
- /*
- * Because we want to use a counting completion, complete
- * the flush completion once to allow a single access to
- * the flush completion without blocking.
- */
- init_completion(&ip->i_flush);
- complete(&ip->i_flush);
mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
"xfsino", ip->i_ino);
@@ -1014,17 +997,10 @@ xfs_fs_sync_fs(
int error;
/*
- * Not much we can do for the first async pass. Writing out the
- * superblock would be counter-productive as we are going to redirty
- * when writing out other data and metadata (and writing out a single
- * block is quite fast anyway).
- *
- * Try to asynchronously kick off quota syncing at least.
+ * Doing anything during the async pass would be counterproductive.
*/
- if (!wait) {
- xfs_qm_sync(mp, SYNC_TRYLOCK);
+ if (!wait)
return 0;
- }
error = xfs_quiesce_data(mp);
if (error)
@@ -1238,9 +1214,9 @@ xfs_fs_unfreeze(
STATIC int
xfs_fs_show_options(
struct seq_file *m,
- struct vfsmount *mnt)
+ struct dentry *root)
{
- return -xfs_showargs(XFS_M(mnt->mnt_sb), m);
+ return -xfs_showargs(XFS_M(root->d_sb), m);
}
/*
@@ -1621,12 +1597,12 @@ STATIC int __init
xfs_init_workqueues(void)
{
/*
- * max_active is set to 8 to give enough concurency to allow
- * multiple work operations on each CPU to run. This allows multiple
- * filesystems to be running sync work concurrently, and scales with
- * the number of CPUs in the system.
+ * We never want to the same work item to run twice, reclaiming inodes
+ * or idling the log is not going to get any faster by multiple CPUs
+ * competing for ressources. Use the default large max_active value
+ * so that even lots of filesystems can perform these task in parallel.
*/
- xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
+ xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
if (!xfs_syncd_wq)
return -ENOMEM;
return 0;
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index f0994aedcd15..40b75eecd2b4 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -395,10 +395,7 @@ xfs_quiesce_data(
*/
xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0);
- xfs_qm_sync(mp, SYNC_TRYLOCK);
- xfs_qm_sync(mp, SYNC_WAIT);
-
- /* force out the newly dirtied log buffers */
+ /* force out the log */
xfs_log_force(mp, XFS_LOG_SYNC);
/* write superblock and hoover up shutdown errors */
@@ -506,7 +503,6 @@ xfs_sync_worker(
error = xfs_fs_log_dummy(mp);
else
xfs_log_force(mp, 0);
- error = xfs_qm_sync(mp, SYNC_TRYLOCK);
/* start pushing all the metadata that is currently dirty */
xfs_ail_push_all(mp->m_ail);
@@ -711,14 +707,13 @@ xfs_reclaim_inode_grab(
return 1;
/*
- * do some unlocked checks first to avoid unnecessary lock traffic.
- * The first is a flush lock check, the second is a already in reclaim
- * check. Only do these checks if we are not going to block on locks.
+ * If we are asked for non-blocking operation, do unlocked checks to
+ * see if the inode already is being flushed or in reclaim to avoid
+ * lock traffic.
*/
if ((flags & SYNC_TRYLOCK) &&
- (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
+ __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
return 1;
- }
/*
* The radix tree lock here protects a thread in xfs_iget from racing
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 494035798873..bb134a819930 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -733,18 +733,15 @@ DEFINE_EVENT(xfs_dquot_class, name, \
DEFINE_DQUOT_EVENT(xfs_dqadjust);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
-DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_done);
DEFINE_DQUOT_EVENT(xfs_dqattach_found);
DEFINE_DQUOT_EVENT(xfs_dqattach_get);
-DEFINE_DQUOT_EVENT(xfs_dqinit);
-DEFINE_DQUOT_EVENT(xfs_dqreuse);
DEFINE_DQUOT_EVENT(xfs_dqalloc);
DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
DEFINE_DQUOT_EVENT(xfs_dqread);
DEFINE_DQUOT_EVENT(xfs_dqread_fail);
DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
DEFINE_DQUOT_EVENT(xfs_dqget_hit);
DEFINE_DQUOT_EVENT(xfs_dqget_miss);
@@ -893,7 +890,6 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(xfs_fsize_t, size)
- __field(xfs_fsize_t, new_size)
__field(loff_t, offset)
__field(size_t, count)
__field(int, flags)
@@ -902,17 +898,15 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->size = ip->i_d.di_size;
- __entry->new_size = ip->i_new_size;
__entry->offset = offset;
__entry->count = count;
__entry->flags = flags;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
"offset 0x%llx count 0x%zx ioflags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
- __entry->new_size,
__entry->offset,
__entry->count,
__print_flags(__entry->flags, "|", XFS_IO_FLAGS))
@@ -980,7 +974,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(loff_t, size)
- __field(loff_t, new_size)
__field(loff_t, offset)
__field(size_t, count)
__field(int, type)
@@ -992,7 +985,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->size = ip->i_d.di_size;
- __entry->new_size = ip->i_new_size;
__entry->offset = offset;
__entry->count = count;
__entry->type = type;
@@ -1000,13 +992,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->startblock = irec ? irec->br_startblock : 0;
__entry->blockcount = irec ? irec->br_blockcount : 0;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
- "offset 0x%llx count %zd type %s "
- "startoff 0x%llx startblock %lld blockcount 0x%llx",
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
+ "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
- __entry->new_size,
__entry->offset,
__entry->count,
__print_symbolic(__entry->type, XFS_IO_TYPES),
@@ -1033,26 +1023,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
__field(xfs_ino_t, ino)
__field(loff_t, isize)
__field(loff_t, disize)
- __field(loff_t, new_size)
__field(loff_t, offset)
__field(size_t, count)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->isize = ip->i_size;
+ __entry->isize = VFS_I(ip)->i_size;
__entry->disize = ip->i_d.di_size;
- __entry->new_size = ip->i_new_size;
__entry->offset = offset;
__entry->count = count;
),
- TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx "
+ TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
"offset 0x%llx count %zd",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->isize,
__entry->disize,
- __entry->new_size,
__entry->offset,
__entry->count)
);
@@ -1092,8 +1079,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class,
DEFINE_EVENT(xfs_itrunc_class, name, \
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
TP_ARGS(ip, new_size))
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start);
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
TRACE_EVENT(xfs_pagecache_inval,
TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
@@ -1570,7 +1557,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__field(xfs_ino_t, ino)
__field(int, format)
__field(int, nex)
- __field(int, max_nex)
__field(int, broot_size)
__field(int, fork_off)
),
@@ -1580,18 +1566,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__entry->ino = ip->i_ino;
__entry->format = ip->i_d.di_format;
__entry->nex = ip->i_d.di_nextents;
- __entry->max_nex = ip->i_df.if_ext_max;
__entry->broot_size = ip->i_df.if_broot_bytes;
__entry->fork_off = XFS_IFORK_BOFF(ip);
),
TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
- "Max in-fork extents %d, broot size %d, fork offset %d",
+ "broot size %d, fork offset %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
__print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
__entry->nex,
- __entry->max_nex,
__entry->broot_size,
__entry->fork_off)
)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 1f35b2feca97..7adcdf15ae0c 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1151,14 +1151,13 @@ xfs_trans_add_item(
{
struct xfs_log_item_desc *lidp;
- ASSERT(lip->li_mountp = tp->t_mountp);
- ASSERT(lip->li_ailp = tp->t_mountp->m_ail);
+ ASSERT(lip->li_mountp == tp->t_mountp);
+ ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
lidp->lid_item = lip;
lidp->lid_flags = 0;
- lidp->lid_size = 0;
list_add_tail(&lidp->lid_trans, &tp->t_items);
lip->li_desc = lidp;
@@ -1210,219 +1209,6 @@ xfs_trans_free_items(
}
}
-/*
- * Unlock the items associated with a transaction.
- *
- * Items which were not logged should be freed. Those which were logged must
- * still be tracked so they can be unpinned when the transaction commits.
- */
-STATIC void
-xfs_trans_unlock_items(
- struct xfs_trans *tp,
- xfs_lsn_t commit_lsn)
-{
- struct xfs_log_item_desc *lidp, *next;
-
- list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
- struct xfs_log_item *lip = lidp->lid_item;
-
- lip->li_desc = NULL;
-
- if (commit_lsn != NULLCOMMITLSN)
- IOP_COMMITTING(lip, commit_lsn);
- IOP_UNLOCK(lip);
-
- /*
- * Free the descriptor if the item is not dirty
- * within this transaction.
- */
- if (!(lidp->lid_flags & XFS_LID_DIRTY))
- xfs_trans_free_item_desc(lidp);
- }
-}
-
-/*
- * Total up the number of log iovecs needed to commit this
- * transaction. The transaction itself needs one for the
- * transaction header. Ask each dirty item in turn how many
- * it needs to get the total.
- */
-static uint
-xfs_trans_count_vecs(
- struct xfs_trans *tp)
-{
- int nvecs;
- struct xfs_log_item_desc *lidp;
-
- nvecs = 1;
-
- /* In the non-debug case we need to start bailing out if we
- * didn't find a log_item here, return zero and let trans_commit
- * deal with it.
- */
- if (list_empty(&tp->t_items)) {
- ASSERT(0);
- return 0;
- }
-
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- /*
- * Skip items which aren't dirty in this transaction.
- */
- if (!(lidp->lid_flags & XFS_LID_DIRTY))
- continue;
- lidp->lid_size = IOP_SIZE(lidp->lid_item);
- nvecs += lidp->lid_size;
- }
-
- return nvecs;
-}
-
-/*
- * Fill in the vector with pointers to data to be logged
- * by this transaction. The transaction header takes
- * the first vector, and then each dirty item takes the
- * number of vectors it indicated it needed in xfs_trans_count_vecs().
- *
- * As each item fills in the entries it needs, also pin the item
- * so that it cannot be flushed out until the log write completes.
- */
-static void
-xfs_trans_fill_vecs(
- struct xfs_trans *tp,
- struct xfs_log_iovec *log_vector)
-{
- struct xfs_log_item_desc *lidp;
- struct xfs_log_iovec *vecp;
- uint nitems;
-
- /*
- * Skip over the entry for the transaction header, we'll
- * fill that in at the end.
- */
- vecp = log_vector + 1;
-
- nitems = 0;
- ASSERT(!list_empty(&tp->t_items));
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- /* Skip items which aren't dirty in this transaction. */
- if (!(lidp->lid_flags & XFS_LID_DIRTY))
- continue;
-
- /*
- * The item may be marked dirty but not log anything. This can
- * be used to get called when a transaction is committed.
- */
- if (lidp->lid_size)
- nitems++;
- IOP_FORMAT(lidp->lid_item, vecp);
- vecp += lidp->lid_size;
- IOP_PIN(lidp->lid_item);
- }
-
- /*
- * Now that we've counted the number of items in this transaction, fill
- * in the transaction header. Note that the transaction header does not
- * have a log item.
- */
- tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
- tp->t_header.th_type = tp->t_type;
- tp->t_header.th_num_items = nitems;
- log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
- log_vector->i_len = sizeof(xfs_trans_header_t);
- log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
-}
-
-/*
- * The committed item processing consists of calling the committed routine of
- * each logged item, updating the item's position in the AIL if necessary, and
- * unpinning each item. If the committed routine returns -1, then do nothing
- * further with the item because it may have been freed.
- *
- * Since items are unlocked when they are copied to the incore log, it is
- * possible for two transactions to be completing and manipulating the same
- * item simultaneously. The AIL lock will protect the lsn field of each item.
- * The value of this field can never go backwards.
- *
- * We unpin the items after repositioning them in the AIL, because otherwise
- * they could be immediately flushed and we'd have to race with the flusher
- * trying to pull the item from the AIL as we add it.
- */
-static void
-xfs_trans_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn,
- int aborted)
-{
- xfs_lsn_t item_lsn;
- struct xfs_ail *ailp;
-
- if (aborted)
- lip->li_flags |= XFS_LI_ABORTED;
- item_lsn = IOP_COMMITTED(lip, commit_lsn);
-
- /* item_lsn of -1 means the item needs no further processing */
- if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
- return;
-
- /*
- * If the returned lsn is greater than what it contained before, update
- * the location of the item in the AIL. If it is not, then do nothing.
- * Items can never move backwards in the AIL.
- *
- * While the new lsn should usually be greater, it is possible that a
- * later transaction completing simultaneously with an earlier one
- * using the same item could complete first with a higher lsn. This
- * would cause the earlier transaction to fail the test below.
- */
- ailp = lip->li_ailp;
- spin_lock(&ailp->xa_lock);
- if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
- /*
- * This will set the item's lsn to item_lsn and update the
- * position of the item in the AIL.
- *
- * xfs_trans_ail_update() drops the AIL lock.
- */
- xfs_trans_ail_update(ailp, lip, item_lsn);
- } else {
- spin_unlock(&ailp->xa_lock);
- }
-
- /*
- * Now that we've repositioned the item in the AIL, unpin it so it can
- * be flushed. Pass information about buffer stale state down from the
- * log item flags, if anyone else stales the buffer we do not want to
- * pay any attention to it.
- */
- IOP_UNPIN(lip, 0);
-}
-
-/*
- * This is typically called by the LM when a transaction has been fully
- * committed to disk. It needs to unpin the items which have
- * been logged by the transaction and update their positions
- * in the AIL if necessary.
- *
- * This also gets called when the transactions didn't get written out
- * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
- */
-STATIC void
-xfs_trans_committed(
- void *arg,
- int abortflag)
-{
- struct xfs_trans *tp = arg;
- struct xfs_log_item_desc *lidp, *next;
-
- list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
- xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
- xfs_trans_free_item_desc(lidp);
- }
-
- xfs_trans_free(tp);
-}
-
static inline void
xfs_log_item_batch_insert(
struct xfs_ail *ailp,
@@ -1538,258 +1324,6 @@ xfs_trans_committed_bulk(
}
/*
- * Called from the trans_commit code when we notice that the filesystem is in
- * the middle of a forced shutdown.
- *
- * When we are called here, we have already pinned all the items in the
- * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called
- * so we can simply walk the items in the transaction, unpin them with an abort
- * flag and then free the items. Note that unpinning the items can result in
- * them being freed immediately, so we need to use a safe list traversal method
- * here.
- */
-STATIC void
-xfs_trans_uncommit(
- struct xfs_trans *tp,
- uint flags)
-{
- struct xfs_log_item_desc *lidp, *n;
-
- list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
- if (lidp->lid_flags & XFS_LID_DIRTY)
- IOP_UNPIN(lidp->lid_item, 1);
- }
-
- xfs_trans_unreserve_and_mod_sb(tp);
- xfs_trans_unreserve_and_mod_dquots(tp);
-
- xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
- xfs_trans_free(tp);
-}
-
-/*
- * Format the transaction direct to the iclog. This isolates the physical
- * transaction commit operation from the logical operation and hence allows
- * other methods to be introduced without affecting the existing commit path.
- */
-static int
-xfs_trans_commit_iclog(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- xfs_lsn_t *commit_lsn,
- int flags)
-{
- int shutdown;
- int error;
- int log_flags = 0;
- struct xlog_in_core *commit_iclog;
-#define XFS_TRANS_LOGVEC_COUNT 16
- struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
- struct xfs_log_iovec *log_vector;
- uint nvec;
-
-
- /*
- * Ask each log item how many log_vector entries it will
- * need so we can figure out how many to allocate.
- * Try to avoid the kmem_alloc() call in the common case
- * by using a vector from the stack when it fits.
- */
- nvec = xfs_trans_count_vecs(tp);
- if (nvec == 0) {
- return ENOMEM; /* triggers a shutdown! */
- } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
- log_vector = log_vector_fast;
- } else {
- log_vector = (xfs_log_iovec_t *)kmem_alloc(nvec *
- sizeof(xfs_log_iovec_t),
- KM_SLEEP);
- }
-
- /*
- * Fill in the log_vector and pin the logged items, and
- * then write the transaction to the log.
- */
- xfs_trans_fill_vecs(tp, log_vector);
-
- if (flags & XFS_TRANS_RELEASE_LOG_RES)
- log_flags = XFS_LOG_REL_PERM_RESERV;
-
- error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn));
-
- /*
- * The transaction is committed incore here, and can go out to disk
- * at any time after this call. However, all the items associated
- * with the transaction are still locked and pinned in memory.
- */
- *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
-
- tp->t_commit_lsn = *commit_lsn;
- trace_xfs_trans_commit_lsn(tp);
-
- if (nvec > XFS_TRANS_LOGVEC_COUNT)
- kmem_free(log_vector);
-
- /*
- * If we got a log write error. Unpin the logitems that we
- * had pinned, clean up, free trans structure, and return error.
- */
- if (error || *commit_lsn == -1) {
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
- xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
- return XFS_ERROR(EIO);
- }
-
- /*
- * Once the transaction has committed, unused
- * reservations need to be released and changes to
- * the superblock need to be reflected in the in-core
- * version. Do that now.
- */
- xfs_trans_unreserve_and_mod_sb(tp);
-
- /*
- * Tell the LM to call the transaction completion routine
- * when the log write with LSN commit_lsn completes (e.g.
- * when the transaction commit really hits the on-disk log).
- * After this call we cannot reference tp, because the call
- * can happen at any time and the call will free the transaction
- * structure pointed to by tp. The only case where we call
- * the completion routine (xfs_trans_committed) directly is
- * if the log is turned off on a debug kernel or we're
- * running in simulation mode (the log is explicitly turned
- * off).
- */
- tp->t_logcb.cb_func = xfs_trans_committed;
- tp->t_logcb.cb_arg = tp;
-
- /*
- * We need to pass the iclog buffer which was used for the
- * transaction commit record into this function, and attach
- * the callback to it. The callback must be attached before
- * the items are unlocked to avoid racing with other threads
- * waiting for an item to unlock.
- */
- shutdown = xfs_log_notify(mp, commit_iclog, &(tp->t_logcb));
-
- /*
- * Mark this thread as no longer being in a transaction
- */
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-
- /*
- * Once all the items of the transaction have been copied
- * to the in core log and the callback is attached, the
- * items can be unlocked.
- *
- * This will free descriptors pointing to items which were
- * not logged since there is nothing more to do with them.
- * For items which were logged, we will keep pointers to them
- * so they can be unpinned after the transaction commits to disk.
- * This will also stamp each modified meta-data item with
- * the commit lsn of this transaction for dependency tracking
- * purposes.
- */
- xfs_trans_unlock_items(tp, *commit_lsn);
-
- /*
- * If we detected a log error earlier, finish committing
- * the transaction now (unpin log items, etc).
- *
- * Order is critical here, to avoid using the transaction
- * pointer after its been freed (by xfs_trans_committed
- * either here now, or as a callback). We cannot do this
- * step inside xfs_log_notify as was done earlier because
- * of this issue.
- */
- if (shutdown)
- xfs_trans_committed(tp, XFS_LI_ABORTED);
-
- /*
- * Now that the xfs_trans_committed callback has been attached,
- * and the items are released we can finally allow the iclog to
- * go to disk.
- */
- return xfs_log_release_iclog(mp, commit_iclog);
-}
-
-/*
- * Walk the log items and allocate log vector structures for
- * each item large enough to fit all the vectors they require.
- * Note that this format differs from the old log vector format in
- * that there is no transaction header in these log vectors.
- */
-STATIC struct xfs_log_vec *
-xfs_trans_alloc_log_vecs(
- xfs_trans_t *tp)
-{
- struct xfs_log_item_desc *lidp;
- struct xfs_log_vec *lv = NULL;
- struct xfs_log_vec *ret_lv = NULL;
-
-
- /* Bail out if we didn't find a log item. */
- if (list_empty(&tp->t_items)) {
- ASSERT(0);
- return NULL;
- }
-
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- struct xfs_log_vec *new_lv;
-
- /* Skip items which aren't dirty in this transaction. */
- if (!(lidp->lid_flags & XFS_LID_DIRTY))
- continue;
-
- /* Skip items that do not have any vectors for writing */
- lidp->lid_size = IOP_SIZE(lidp->lid_item);
- if (!lidp->lid_size)
- continue;
-
- new_lv = kmem_zalloc(sizeof(*new_lv) +
- lidp->lid_size * sizeof(struct xfs_log_iovec),
- KM_SLEEP);
-
- /* The allocated iovec region lies beyond the log vector. */
- new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
- new_lv->lv_niovecs = lidp->lid_size;
- new_lv->lv_item = lidp->lid_item;
- if (!ret_lv)
- ret_lv = new_lv;
- else
- lv->lv_next = new_lv;
- lv = new_lv;
- }
-
- return ret_lv;
-}
-
-static int
-xfs_trans_commit_cil(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- xfs_lsn_t *commit_lsn,
- int flags)
-{
- struct xfs_log_vec *log_vector;
-
- /*
- * Get each log item to allocate a vector structure for
- * the log item to to pass to the log write code. The
- * CIL commit code will format the vector and save it away.
- */
- log_vector = xfs_trans_alloc_log_vecs(tp);
- if (!log_vector)
- return ENOMEM;
-
- xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
-
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
- xfs_trans_free(tp);
- return 0;
-}
-
-/*
* Commit the given transaction to the log.
*
* XFS disk error handling mechanism is not based on a typical
@@ -1845,17 +1379,16 @@ xfs_trans_commit(
xfs_trans_apply_sb_deltas(tp);
xfs_trans_apply_dquot_deltas(tp);
- if (mp->m_flags & XFS_MOUNT_DELAYLOG)
- error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
- else
- error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
-
+ error = xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
if (error == ENOMEM) {
xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
error = XFS_ERROR(EIO);
goto out_unreserve;
}
+ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ xfs_trans_free(tp);
+
/*
* If the transaction needs to be synchronous, then force the
* log out now and wait for it.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 3ae713c0abd9..f6118703f20d 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -163,9 +163,8 @@ typedef struct xfs_trans_header {
*/
struct xfs_log_item_desc {
struct xfs_log_item *lid_item;
- ushort lid_size;
- unsigned char lid_flags;
struct list_head lid_trans;
+ unsigned char lid_flags;
};
#define XFS_LID_DIRTY 0x1
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 4d00ee67792d..c4ba366d24e6 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -649,12 +649,12 @@ xfs_trans_dqresv(
* nblks.
*/
if (hardlimit > 0ULL &&
- hardlimit <= nblks + *resbcountp) {
+ hardlimit < nblks + *resbcountp) {
xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
goto error_return;
}
if (softlimit > 0ULL &&
- softlimit <= nblks + *resbcountp) {
+ softlimit < nblks + *resbcountp) {
if ((timer != 0 && get_seconds() > timer) ||
(warns != 0 && warns >= warnlimit)) {
xfs_quota_warn(mp, dqp,
@@ -677,11 +677,13 @@ xfs_trans_dqresv(
if (!softlimit)
softlimit = q->qi_isoftlimit;
- if (hardlimit > 0ULL && count >= hardlimit) {
+ if (hardlimit > 0ULL &&
+ hardlimit < ninos + count) {
xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
goto error_return;
}
- if (softlimit > 0ULL && count >= softlimit) {
+ if (softlimit > 0ULL &&
+ softlimit < ninos + count) {
if ((timer != 0 && get_seconds() > timer) ||
(warns != 0 && warns >= warnlimit)) {
xfs_quota_warn(mp, dqp,
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 8b32d1a4c5a1..89dbb4a50872 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -53,7 +53,7 @@ xfs_dir_ialloc(
output: may be a new transaction. */
xfs_inode_t *dp, /* directory within whose allocate
the inode. */
- mode_t mode,
+ umode_t mode,
xfs_nlink_t nlink,
xfs_dev_t rdev,
prid_t prid, /* project id */
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index 456fca314933..5eeab4690cfe 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -18,7 +18,7 @@
#ifndef __XFS_UTILS_H__
#define __XFS_UTILS_H__
-extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
+extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, umode_t, xfs_nlink_t,
xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index ce9268a2f56b..ebdb88840a47 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -131,7 +131,8 @@ xfs_readlink(
__func__, (unsigned long long) ip->i_ino,
(long long) pathlen);
ASSERT(0);
- return XFS_ERROR(EFSCORRUPTED);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto out;
}
@@ -175,7 +176,7 @@ xfs_free_eofblocks(
* Figure out if there are any blocks beyond the end
* of the file. If not, then there is nothing to do.
*/
- end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
if (last_fsb <= end_fsb)
return 0;
@@ -226,7 +227,14 @@ xfs_free_eofblocks(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_itruncate_data(&tp, ip, ip->i_size);
+ /*
+ * Do not update the on-disk file size. If we update the
+ * on-disk file size and then the system crashes before the
+ * contents of the file are flushed to disk then the files
+ * may be full of holes (ie NULL files bug).
+ */
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
+ XFS_ISIZE(ip));
if (error) {
/*
* If we get an error at this point we simply don't
@@ -540,8 +548,8 @@ xfs_release(
return 0;
if ((S_ISREG(ip->i_d.di_mode) &&
- ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
- ip->i_delayed_blks > 0)) &&
+ (VFS_I(ip)->i_size > 0 ||
+ (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
(ip->i_df.if_flags & XFS_IFEXTENTS)) &&
(!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
@@ -618,7 +626,7 @@ xfs_inactive(
* only one with a reference to the inode.
*/
truncate = ((ip->i_d.di_nlink == 0) &&
- ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
+ ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 ||
(ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
S_ISREG(ip->i_d.di_mode));
@@ -632,12 +640,12 @@ xfs_inactive(
if (ip->i_d.di_nlink != 0) {
if ((S_ISREG(ip->i_d.di_mode) &&
- ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
- ip->i_delayed_blks > 0)) &&
- (ip->i_df.if_flags & XFS_IFEXTENTS) &&
- (!(ip->i_d.di_flags &
+ (VFS_I(ip)->i_size > 0 ||
+ (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
+ (ip->i_df.if_flags & XFS_IFEXTENTS) &&
+ (!(ip->i_d.di_flags &
(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
- (ip->i_delayed_blks != 0)))) {
+ ip->i_delayed_blks != 0))) {
error = xfs_free_eofblocks(mp, ip, 0);
if (error)
return VN_INACTIVE_CACHE;
@@ -670,13 +678,18 @@ xfs_inactive(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_itruncate_data(&tp, ip, 0);
+ ip->i_d.di_size = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
if (error) {
xfs_trans_cancel(tp,
XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
return VN_INACTIVE_CACHE;
}
+
+ ASSERT(ip->i_d.di_nextents == 0);
} else if (S_ISLNK(ip->i_d.di_mode)) {
/*
@@ -822,7 +835,7 @@ int
xfs_create(
xfs_inode_t *dp,
struct xfs_name *name,
- mode_t mode,
+ umode_t mode,
xfs_dev_t rdev,
xfs_inode_t **ipp)
{
@@ -1481,7 +1494,7 @@ xfs_symlink(
xfs_inode_t *dp,
struct xfs_name *link_name,
const char *target_path,
- mode_t mode,
+ umode_t mode,
xfs_inode_t **ipp)
{
xfs_mount_t *mp = dp->i_mount;
@@ -1961,11 +1974,11 @@ xfs_zero_remaining_bytes(
* since nothing can read beyond eof. The space will
* be zeroed when the file is extended anyway.
*/
- if (startoff >= ip->i_size)
+ if (startoff >= XFS_ISIZE(ip))
return 0;
- if (endoff > ip->i_size)
- endoff = ip->i_size;
+ if (endoff > XFS_ISIZE(ip))
+ endoff = XFS_ISIZE(ip);
bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -2260,7 +2273,7 @@ xfs_change_file_space(
bf->l_start += offset;
break;
case 2: /*SEEK_END*/
- bf->l_start += ip->i_size;
+ bf->l_start += XFS_ISIZE(ip);
break;
default:
return XFS_ERROR(EINVAL);
@@ -2277,7 +2290,7 @@ xfs_change_file_space(
bf->l_whence = 0;
startoffset = bf->l_start;
- fsize = ip->i_size;
+ fsize = XFS_ISIZE(ip);
/*
* XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 35d3d513e1e9..0c877cbde142 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -26,7 +26,7 @@ int xfs_release(struct xfs_inode *ip);
int xfs_inactive(struct xfs_inode *ip);
int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode **ipp, struct xfs_name *ci_name);
-int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
+int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode,
xfs_dev_t rdev, struct xfs_inode **ipp);
int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode *ip);
@@ -35,7 +35,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
xfs_off_t *offset, filldir_t filldir);
int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
- const char *target_path, mode_t mode, struct xfs_inode **ipp);
+ const char *target_path, umode_t mode, struct xfs_inode **ipp);
int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
int xfs_change_file_space(struct xfs_inode *ip, int cmd,
xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);