author     Alex Shi <alex.shi@linaro.org>    2016-04-11 11:16:39 +0800
committer  Alex Shi <alex.shi@linaro.org>    2016-04-11 11:16:39 +0800
commit     d83731cf53fd3af7173cab5e3f573d06ebc852be (patch)
tree       d61002ed1d894fe2d9b26003a1230f05a61ed51b
parent     bfbd381ea0ca274e4c07ea62acaaeac31829cd5d (diff)
parent     017cfa86c2450b158d7d0336f2f3ea685da1230b (diff)
Merge branch 'linux-linaro-lsk-v4.1' into linux-linaro-lsk-v4.1-rt (tag: lsk-v4.1-16.04-rt)
-rw-r--r--  Documentation/cgroups/blkio-controller.txt | 83
-rw-r--r--  Documentation/cgroups/memory.txt | 1
-rw-r--r--  Documentation/devicetree/bindings/arm/omap/omap.txt | 1
-rw-r--r--  Documentation/devicetree/bindings/arm/psci.txt | 6
-rw-r--r--  Documentation/devicetree/bindings/opp/opp.txt | 519
-rw-r--r--  Documentation/devicetree/bindings/power/opp.txt | 25
-rw-r--r--  Documentation/filesystems/debugfs.txt | 2
-rw-r--r--  Documentation/kernel-parameters.txt | 4
-rw-r--r--  Documentation/virtual/kvm/mmu.txt | 3
-rw-r--r--  Makefile | 2
-rw-r--r--  arch/arm/Kconfig | 1
-rw-r--r--  arch/arm/boot/dts/dra7.dtsi | 10
-rw-r--r--  arch/arm/include/asm/cacheflush.h | 1
-rw-r--r--  arch/arm/include/asm/psci.h | 23
-rw-r--r--  arch/arm/kernel/Makefile | 2
-rw-r--r--  arch/arm/kernel/psci.c | 299
-rw-r--r--  arch/arm/kernel/psci_smp.c | 35
-rw-r--r--  arch/arm/kernel/setup.c | 3
-rw-r--r--  arch/arm/kvm/psci.c | 14
-rw-r--r--  arch/arm/mach-highbank/highbank.c | 2
-rw-r--r--  arch/arm/mach-highbank/pm.c | 16
-rw-r--r--  arch/arm/mach-imx/mach-imx6q.c | 2
-rw-r--r--  arch/arm/mach-omap2/omap-hotplug.c | 2
-rw-r--r--  arch/arm/mach-omap2/omap-wakeupgen.c | 2
-rw-r--r--  arch/arm/mach-omap2/omap_hwmod.c | 75
-rw-r--r--  arch/arm/mach-omap2/omap_hwmod.h | 6
-rw-r--r--  arch/arm/mach-prima2/hotplug.c | 2
-rw-r--r--  arch/arm/mach-qcom/platsmp.c | 2
-rw-r--r--  arch/arm/mach-realview/hotplug.c | 2
-rw-r--r--  arch/arm/mach-spear/hotplug.c | 2
-rw-r--r--  arch/arm/mach-tegra/hotplug.c | 2
-rw-r--r--  arch/arm/mach-ux500/hotplug.c | 2
-rw-r--r--  arch/arm/mach-vexpress/hotplug.c | 2
-rw-r--r--  arch/arm/mm/init.c | 92
-rw-r--r--  arch/arm/vdso/vdso.S | 3
-rw-r--r--  arch/arm64/Kconfig | 28
-rw-r--r--  arch/arm64/Makefile | 7
-rw-r--r--  arch/arm64/include/asm/acpi.h | 21
-rw-r--r--  arch/arm64/include/asm/assembler.h | 13
-rw-r--r--  arch/arm64/include/asm/barrier.h | 24
-rw-r--r--  arch/arm64/include/asm/cacheflush.h | 4
-rw-r--r--  arch/arm64/include/asm/cpu_ops.h | 27
-rw-r--r--  arch/arm64/include/asm/hardirq.h | 4
-rw-r--r--  arch/arm64/include/asm/irq_work.h | 11
-rw-r--r--  arch/arm64/include/asm/kasan.h | 38
-rw-r--r--  arch/arm64/include/asm/memory.h | 2
-rw-r--r--  arch/arm64/include/asm/percpu.h | 8
-rw-r--r--  arch/arm64/include/asm/pgalloc.h | 1
-rw-r--r--  arch/arm64/include/asm/pgtable.h | 14
-rw-r--r--  arch/arm64/include/asm/psci.h | 20
-rw-r--r--  arch/arm64/include/asm/ptrace.h | 4
-rw-r--r--  arch/arm64/include/asm/smp.h | 6
-rw-r--r--  arch/arm64/include/asm/smp_plat.h | 16
-rw-r--r--  arch/arm64/include/asm/string.h | 16
-rw-r--r--  arch/arm64/include/asm/topology.h | 9
-rw-r--r--  arch/arm64/kernel/Makefile | 5
-rw-r--r--  arch/arm64/kernel/acpi.c | 123
-rw-r--r--  arch/arm64/kernel/arm64ksyms.c | 3
-rw-r--r--  arch/arm64/kernel/cpu_ops.c | 74
-rw-r--r--  arch/arm64/kernel/cpuidle.c | 7
-rw-r--r--  arch/arm64/kernel/debug-monitors.c | 4
-rw-r--r--  arch/arm64/kernel/head.S | 23
-rw-r--r--  arch/arm64/kernel/irq.c | 2
-rw-r--r--  arch/arm64/kernel/module.c | 16
-rw-r--r--  arch/arm64/kernel/psci.c | 414
-rw-r--r--  arch/arm64/kernel/setup.c | 81
-rw-r--r--  arch/arm64/kernel/sleep.S | 14
-rw-r--r--  arch/arm64/kernel/smp.c | 240
-rw-r--r--  arch/arm64/kernel/smp_spin_table.c | 8
-rw-r--r--  arch/arm64/kernel/suspend.c | 3
-rw-r--r--  arch/arm64/kernel/time.c | 2
-rw-r--r--  arch/arm64/kernel/traps.c | 4
-rw-r--r--  arch/arm64/kernel/vdso/vdso.S | 3
-rw-r--r--  arch/arm64/kernel/vmlinux.lds.S | 11
-rw-r--r--  arch/arm64/lib/copy_template.S | 193
-rw-r--r--  arch/arm64/lib/memchr.S | 2
-rw-r--r--  arch/arm64/lib/memcmp.S | 2
-rw-r--r--  arch/arm64/lib/memcpy.S | 184
-rw-r--r--  arch/arm64/lib/memmove.S | 9
-rw-r--r--  arch/arm64/lib/memset.S | 5
-rw-r--r--  arch/arm64/lib/strcmp.S | 2
-rw-r--r--  arch/arm64/lib/strlen.S | 2
-rw-r--r--  arch/arm64/lib/strncmp.S | 2
-rw-r--r--  arch/arm64/mm/Makefile | 3
-rw-r--r--  arch/arm64/mm/cache.S | 10
-rw-r--r--  arch/arm64/mm/context.c | 16
-rw-r--r--  arch/arm64/mm/flush.c | 4
-rw-r--r--  arch/arm64/mm/init.c | 6
-rw-r--r--  arch/arm64/mm/kasan_init.c | 165
-rw-r--r--  arch/arm64/mm/pgd.c | 2
-rw-r--r--  arch/arm64/mm/proc.S | 4
-rw-r--r--  arch/mips/Kconfig | 13
-rw-r--r--  arch/parisc/include/asm/cache.h | 3
-rw-r--r--  arch/parisc/include/asm/cacheflush.h | 4
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 14
-rw-r--r--  arch/s390/include/asm/mmu_context.h | 16
-rw-r--r--  arch/s390/include/asm/pgalloc.h | 24
-rw-r--r--  arch/x86/Kconfig | 3
-rw-r--r--  arch/x86/Kconfig.debug | 17
-rw-r--r--  arch/x86/include/asm/cacheflush.h | 8
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 7
-rw-r--r--  arch/x86/include/asm/sections.h | 2
-rw-r--r--  arch/x86/kernel/ftrace.c | 6
-rw-r--r--  arch/x86/kernel/kgdb.c | 8
-rw-r--r--  arch/x86/kernel/test_nx.c | 2
-rw-r--r--  arch/x86/kernel/test_rodata.c | 2
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 25
-rw-r--r--  arch/x86/kvm/vmx.c | 43
-rw-r--r--  arch/x86/mm/init_32.c | 3
-rw-r--r--  arch/x86/mm/init_64.c | 3
-rw-r--r--  arch/x86/mm/pageattr.c | 2
-rw-r--r--  arch/x86/vdso/vdso2c.h | 2
-rw-r--r--  block/bio.c | 37
-rw-r--r--  block/blk-cgroup.c | 124
-rw-r--r--  block/blk-core.c | 70
-rw-r--r--  block/blk-integrity.c | 1
-rw-r--r--  block/blk-sysfs.c | 3
-rw-r--r--  block/blk-throttle.c | 2
-rw-r--r--  block/bounce.c | 4
-rw-r--r--  block/cfq-iosched.c | 2
-rw-r--r--  block/elevator.c | 2
-rw-r--r--  block/genhd.c | 1
-rw-r--r--  drivers/acpi/ec_sys.c | 2
-rw-r--r--  drivers/acpi/internal.h | 2
-rw-r--r--  drivers/base/power/Makefile | 2
-rw-r--r--  drivers/base/power/opp.c | 927
-rw-r--r--  drivers/base/power/opp/Makefile | 3
-rw-r--r--  drivers/base/power/opp/core.c | 2053
-rw-r--r--  drivers/base/power/opp/cpu.c | 271
-rw-r--r--  drivers/base/power/opp/debugfs.c | 218
-rw-r--r--  drivers/base/power/opp/opp.h | 207
-rw-r--r--  drivers/base/regmap/internal.h | 6
-rw-r--r--  drivers/base/regmap/regcache-lzo.c | 4
-rw-r--r--  drivers/base/regmap/regcache.c | 24
-rw-r--r--  drivers/block/drbd/drbd_int.h | 1
-rw-r--r--  drivers/block/drbd/drbd_main.c | 10
-rw-r--r--  drivers/block/pktcdvd.c | 1
-rw-r--r--  drivers/char/raw.c | 1
-rw-r--r--  drivers/cpufreq/Makefile | 1
-rw-r--r--  drivers/cpufreq/arm_big_little.h | 2
-rw-r--r--  drivers/cpufreq/arm_big_little_dt.c | 4
-rw-r--r--  drivers/cpufreq/cpufreq-dt.c | 362
-rw-r--r--  drivers/cpufreq/cpufreq.c | 68
-rw-r--r--  drivers/cpufreq/cpufreq_opp.c | 110
-rw-r--r--  drivers/cpufreq/exynos5440-cpufreq.c | 6
-rw-r--r--  drivers/cpufreq/freq_table.c | 15
-rw-r--r--  drivers/cpufreq/imx6q-cpufreq.c | 6
-rw-r--r--  drivers/cpuidle/cpuidle-calxeda.c | 15
-rw-r--r--  drivers/dma/at_xdmac.c | 42
-rw-r--r--  drivers/firmware/Kconfig | 3
-rw-r--r--  drivers/firmware/Makefile | 1
-rw-r--r--  drivers/firmware/efi/Makefile | 8
-rw-r--r--  drivers/firmware/psci.c | 470
-rw-r--r--  drivers/gpu/drm/radeon/atombios_dp.c | 118
-rw-r--r--  drivers/gpu/drm/radeon/radeon_dp_mst.c | 12
-rw-r--r--  drivers/gpu/drm/radeon/radeon_mode.h | 6
-rw-r--r--  drivers/gpu/drm/radeon/radeon_pm.c | 5
-rw-r--r--  drivers/iommu/amd_iommu_init.c | 2
-rw-r--r--  drivers/iommu/amd_iommu_types.h | 2
-rw-r--r--  drivers/md/bcache/request.c | 1
-rw-r--r--  drivers/md/dm.c | 2
-rw-r--r--  drivers/md/dm.h | 1
-rw-r--r--  drivers/md/md.h | 1
-rw-r--r--  drivers/md/raid1.c | 4
-rw-r--r--  drivers/md/raid10.c | 2
-rw-r--r--  drivers/misc/lkdtm.c | 29
-rw-r--r--  drivers/mtd/devices/block2mtd.c | 1
-rw-r--r--  drivers/net/can/usb/gs_usb.c | 24
-rw-r--r--  drivers/net/wireless/ath/ath10k/core.h | 2
-rw-r--r--  drivers/net/wireless/ath/ath5k/ath5k.h | 2
-rw-r--r--  drivers/net/wireless/ath/ath9k/hw.c | 2
-rw-r--r--  drivers/net/wireless/ath/ath9k/hw.h | 4
-rw-r--r--  drivers/net/wireless/b43/debugfs.c | 18
-rw-r--r--  drivers/net/wireless/b43/debugfs.h | 2
-rw-r--r--  drivers/net/wireless/b43legacy/debugfs.c | 10
-rw-r--r--  drivers/net/wireless/b43legacy/debugfs.h | 2
-rw-r--r--  drivers/net/wireless/iwlegacy/common.h | 6
-rw-r--r--  drivers/net/wireless/iwlwifi/mvm/mvm.h | 6
-rw-r--r--  drivers/net/wireless/iwlwifi/mvm/tx.c | 9
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h | 4
-rw-r--r--  drivers/target/target_core_tmr.c | 1
-rw-r--r--  drivers/uwb/uwb-debug.c | 2
-rw-r--r--  fs/block_dev.c | 9
-rw-r--r--  fs/btrfs/disk-io.c | 11
-rw-r--r--  fs/btrfs/extent_io.c | 2
-rw-r--r--  fs/buffer.c | 74
-rw-r--r--  fs/debugfs/file.c | 191
-rw-r--r--  fs/ext2/super.c | 1
-rw-r--r--  fs/ext4/ext4.h | 15
-rw-r--r--  fs/ext4/extents.c | 90
-rw-r--r--  fs/ext4/file.c | 2
-rw-r--r--  fs/ext4/inode.c | 71
-rw-r--r--  fs/ext4/mballoc.c | 1
-rw-r--r--  fs/ext4/page-io.c | 9
-rw-r--r--  fs/ext4/super.c | 4
-rw-r--r--  fs/ext4/truncate.h | 2
-rw-r--r--  fs/f2fs/node.c | 4
-rw-r--r--  fs/f2fs/segment.h | 3
-rw-r--r--  fs/fat/file.c | 1
-rw-r--r--  fs/fat/inode.c | 1
-rw-r--r--  fs/fs-writeback.c | 1168
-rw-r--r--  fs/fuse/file.c | 12
-rw-r--r--  fs/gfs2/super.c | 2
-rw-r--r--  fs/hfs/super.c | 1
-rw-r--r--  fs/hfsplus/super.c | 1
-rw-r--r--  fs/inode.c | 1
-rw-r--r--  fs/jffs2/dir.c | 11
-rw-r--r--  fs/mpage.c | 3
-rw-r--r--  fs/ncpfs/dir.c | 2
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 1
-rw-r--r--  fs/nfs/internal.h | 2
-rw-r--r--  fs/nfs/write.c | 3
-rw-r--r--  fs/nilfs2/segbuf.c | 12
-rw-r--r--  fs/ocfs2/file.c | 1
-rw-r--r--  fs/overlayfs/dir.c | 10
-rw-r--r--  fs/overlayfs/inode.c | 2
-rw-r--r--  fs/overlayfs/super.c | 12
-rw-r--r--  fs/reiserfs/super.c | 1
-rw-r--r--  fs/ufs/super.c | 1
-rw-r--r--  fs/xfs/xfs_aops.c | 12
-rw-r--r--  fs/xfs/xfs_file.c | 1
-rw-r--r--  include/asm-generic/early_ioremap.h | 6
-rw-r--r--  include/asm-generic/vmlinux.lds.h | 1
-rw-r--r--  include/linux/backing-dev-defs.h | 256
-rw-r--r--  include/linux/backing-dev.h | 561
-rw-r--r--  include/linux/bio.h | 3
-rw-r--r--  include/linux/blk-cgroup.h (renamed from block/blk-cgroup.h) | 32
-rw-r--r--  include/linux/blk_types.h | 1
-rw-r--r--  include/linux/blkdev.h | 21
-rw-r--r--  include/linux/cache.h | 14
-rw-r--r--  include/linux/cgroup.h | 25
-rw-r--r--  include/linux/cpufreq.h | 13
-rw-r--r--  include/linux/debugfs.h | 26
-rw-r--r--  include/linux/edac.h | 2
-rw-r--r--  include/linux/fault-inject.h | 2
-rw-r--r--  include/linux/fs.h | 26
-rw-r--r--  include/linux/init.h | 4
-rw-r--r--  include/linux/memcontrol.h | 29
-rw-r--r--  include/linux/mm.h | 8
-rw-r--r--  include/linux/pagemap.h | 3
-rw-r--r--  include/linux/pm_opp.h | 99
-rw-r--r--  include/linux/psci.h | 54
-rw-r--r--  include/linux/regulator/consumer.h | 10
-rw-r--r--  include/linux/thermal.h | 1
-rw-r--r--  include/linux/tracepoint.h | 17
-rw-r--r--  include/linux/writeback.h | 221
-rw-r--r--  include/net/iw_handler.h | 6
-rw-r--r--  include/trace/events/writeback.h | 15
-rw-r--r--  include/uapi/linux/psci.h | 18
-rw-r--r--  init/Kconfig | 5
-rw-r--r--  init/main.c | 27
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 4
-rw-r--r--  kernel/events/core.c | 20
-rw-r--r--  lib/dma-debug.c | 2
-rw-r--r--  mm/backing-dev.c | 662
-rw-r--r--  mm/early_ioremap.c | 22
-rw-r--r--  mm/fadvise.c | 2
-rw-r--r--  mm/failslab.c | 8
-rw-r--r--  mm/filemap.c | 34
-rw-r--r--  mm/madvise.c | 1
-rw-r--r--  mm/memcontrol.c | 223
-rw-r--r--  mm/page-writeback.c | 1231
-rw-r--r--  mm/page_alloc.c | 8
-rw-r--r--  mm/readahead.c | 2
-rw-r--r--  mm/rmap.c | 2
-rw-r--r--  mm/truncate.c | 18
-rw-r--r--  mm/vmscan.c | 81
-rw-r--r--  net/mac80211/agg-rx.c | 2
-rw-r--r--  net/mac80211/ieee80211_i.h | 2
-rw-r--r--  net/mac80211/rc80211_minstrel.c | 2
-rw-r--r--  net/mac80211/rc80211_minstrel_ht.c | 5
-rw-r--r--  net/mac80211/rx.c | 37
-rw-r--r--  net/wireless/core.c | 2
-rw-r--r--  net/wireless/wext-core.c | 52
-rw-r--r--  scripts/Makefile.kasan | 4
-rwxr-xr-x  scripts/ld-version.sh | 2
-rw-r--r--  sound/soc/codecs/wm8958-dsp2.c | 8
-rw-r--r--  sound/soc/codecs/wm8994.c | 4
-rw-r--r--  sound/soc/codecs/wm_adsp.c | 8
-rw-r--r--  sound/soc/samsung/i2s.c | 21
-rw-r--r--  sound/soc/soc-dapm.c | 8
281 files changed, 10060 insertions, 4420 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index cd556b914786..68b6a6a470b0 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -387,8 +387,81 @@ groups and put applications in that group which are not driving enough
IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle
on individual groups and throughput should improve.
-What works
-==========
-- Currently only sync IO queues are support. All the buffered writes are
- still system wide and not per group. Hence we will not see service
- differentiation between buffered writes between groups.
+Writeback
+=========
+
+Page cache is dirtied through buffered writes and shared mmaps and
+written asynchronously to the backing filesystem by the writeback
+mechanism. Writeback sits between the memory and IO domains and
+regulates the proportion of dirty memory by balancing dirtying and
+write IOs.
+
+On traditional cgroup hierarchies, relationships between different
+controllers cannot be established, making it impossible for writeback
+to account for cgroup resource restrictions; all writeback IOs are
+attributed to the root cgroup.
+
+If both the blkio and memory controllers are used on the v2 hierarchy
+and the filesystem supports cgroup writeback, writeback operations
+correctly follow the resource restrictions imposed by both memory and
+blkio controllers.
+
+Writeback examines both system-wide and per-cgroup dirty memory status
+and enforces the more restrictive of the two. Also, writeback control
+parameters which are absolute values - vm.dirty_bytes and
+vm.dirty_background_bytes - are distributed across cgroups according
+to their current writeback bandwidth.
+
+There's a peculiarity stemming from the discrepancy in ownership
+granularity between the memory controller and writeback. While the
+memory controller tracks ownership per page, writeback operates on a
+per-inode basis. cgroup writeback bridges the gap by tracking
+ownership per inode, but migrates that ownership if too many foreign
+pages (pages which don't match the current inode ownership) are
+encountered while writing back the inode.
+
+This is a conscious design choice, as writeback operations are
+inherently tied to inodes, which makes strictly following page
+ownership complicated and inefficient. The only use case which suffers
+from this compromise is multiple cgroups concurrently dirtying
+disjoint regions of the same inode, which is unlikely and was decided
+to be unsupported. Note that because the memory controller assigns
+page ownership on first use and doesn't update it until the page is
+released, multiple cgroups dirtying overlapping areas wouldn't work as
+expected even if cgroup writeback strictly followed page ownership.
+In general, write-sharing an inode across multiple cgroups is not well
+supported.
+
+Filesystem support for cgroup writeback
+---------------------------------------
+
+A filesystem can make writeback IOs cgroup-aware by updating
+address_space_operations->writepage[s]() to annotate bios using the
+following two functions.
+
+* wbc_init_bio(@wbc, @bio)
+
+ Should be called for each bio carrying writeback data and associates
+ the bio with the inode's owner cgroup. Can be called anytime
+ between bio allocation and submission.
+
+* wbc_account_io(@wbc, @page, @bytes)
+
+ Should be called for each data segment being written out. While
+ this function doesn't care exactly when it's called during the
+ writeback session, it's easiest and most natural to call it as
+ data segments are added to a bio.
+
+With writeback bios annotated, cgroup support can be enabled per
+super_block by setting MS_CGROUPWB in ->s_flags. This allows for
+selective disabling of cgroup writeback support, which is helpful when
+certain filesystem features, e.g. journaled data mode, are
+incompatible with it.
+
+wbc_init_bio() binds the specified bio to its cgroup. Depending on
+the configuration, the bio may be executed at a lower priority, and if
+the writeback session is holding shared resources, e.g. a journal
+entry, this may lead to priority inversion. There is no single easy
+solution to the problem. Filesystems can try to work around specific
+problem cases by skipping wbc_init_bio() or by using
+bio_associate_blkcg() directly.
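
For a concrete picture of the two calls described above, here is a minimal,
hypothetical sketch of a filesystem ->writepage() implementation that
annotates its bio for cgroup writeback. Error handling, the block mapping and
the end_io path are omitted, and all names other than
wbc_init_bio()/wbc_account_io() are illustrative; the block-layer calls follow
the 4.x-era kernel APIs:

	static int example_writepage(struct page *page,
				     struct writeback_control *wbc)
	{
		struct inode *inode = page->mapping->host;
		struct bio *bio;

		bio = bio_alloc(GFP_NOFS, 1);
		bio->bi_bdev = inode->i_sb->s_bdev;
		/* bi_iter.bi_sector would be set from the block mapping here. */
		bio_add_page(bio, page, PAGE_SIZE, 0);

		/* Associate the bio with the inode's owning cgroup. */
		wbc_init_bio(wbc, bio);
		/* Account the data segment as it is added to the bio. */
		wbc_account_io(wbc, page, PAGE_SIZE);

		set_page_writeback(page);
		unlock_page(page);
		submit_bio(WRITE, bio);
		return 0;
	}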
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index f456b4315e86..ff71e16cc752 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -493,6 +493,7 @@ pgpgin - # of charging events to the memory cgroup. The charging
pgpgout - # of uncharging events to the memory cgroup. The uncharging
event happens each time a page is unaccounted from the cgroup.
swap - # of bytes of swap usage
+dirty - # of bytes that are waiting to get written back to the disk.
writeback - # of bytes of file/anon cache that are queued for syncing to
disk.
inactive_anon - # of bytes of anonymous and swap cache memory on inactive
diff --git a/Documentation/devicetree/bindings/arm/omap/omap.txt b/Documentation/devicetree/bindings/arm/omap/omap.txt
index 4f6a82cef1d1..cbe35b3de9e9 100644
--- a/Documentation/devicetree/bindings/arm/omap/omap.txt
+++ b/Documentation/devicetree/bindings/arm/omap/omap.txt
@@ -23,6 +23,7 @@ Optional properties:
during suspend.
- ti,no-reset-on-init: When present, the module should not be reset at init
- ti,no-idle-on-init: When present, the module should not be idled at init
+- ti,no-idle: When present, the module is never allowed to idle.
Example:
diff --git a/Documentation/devicetree/bindings/arm/psci.txt b/Documentation/devicetree/bindings/arm/psci.txt
index 5aa40ede0e99..a9adab84e2fe 100644
--- a/Documentation/devicetree/bindings/arm/psci.txt
+++ b/Documentation/devicetree/bindings/arm/psci.txt
@@ -31,6 +31,10 @@ Main node required properties:
support, but are permitted to be present for compatibility with
existing software when "arm,psci" is later in the compatible list.
+ * "arm,psci-1.0" : for implementations complying to PSCI 1.0. PSCI 1.0 is
+ backward compatible with PSCI 0.2 with minor specification updates,
+ as defined in the PSCI specification[2].
+
- method : The method of calling the PSCI firmware. Permitted
values are:
@@ -100,3 +104,5 @@ Case 3: PSCI v0.2 and PSCI v0.1.
[1] Kernel documentation - ARM idle states bindings
Documentation/devicetree/bindings/arm/idle-states.txt
+[2] Power State Coordination Interface (PSCI) specification
+ http://infocenter.arm.com/help/topic/com.arm.doc.den0022c/DEN0022C_Power_State_Coordination_Interface.pdf
diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt
new file mode 100644
index 000000000000..601256fe8c0d
--- /dev/null
+++ b/Documentation/devicetree/bindings/opp/opp.txt
@@ -0,0 +1,519 @@
+Generic OPP (Operating Performance Points) Bindings
+----------------------------------------------------
+
+Devices work at voltage-current-frequency combinations, and some
+implementations have the liberty of choosing these. These combinations are
+called Operating Performance Points, aka OPPs. This document defines bindings
+for these OPPs that are applicable across a wide range of devices. For
+illustration purposes, this document uses a CPU as the example device.
+
+This document contains multiple versions of the OPP binding, and only one of
+them should be used per device.
+
+Binding 1: operating-points
+============================
+
+This binding only supports voltage-frequency pairs.
+
+Properties:
+- operating-points: An array of 2-tuple items, each consisting of a
+ frequency and a voltage, like <freq-kHz vol-uV>.
+ freq: clock frequency in kHz
+ vol: voltage in microvolts
+
+Examples:
+
+cpu@0 {
+ compatible = "arm,cortex-a9";
+ reg = <0>;
+ next-level-cache = <&L2>;
+ operating-points = <
+ /* kHz uV */
+ 792000 1100000
+ 396000 950000
+ 198000 850000
+ >;
+};
+
+
+Binding 2: operating-points-v2
+==============================
+
+* Property: operating-points-v2
+
+Devices supporting OPPs must set their "operating-points-v2" property with a
+phandle to an OPP table in their DT node. The OPP core will use this phandle
+to find the operating points for the device.
+
+If required, this can be extended for SoC vendor-specific bindings. Such
+bindings should be documented as
+Documentation/devicetree/bindings/power/<vendor>-opp.txt and should have a
+compatible description like: "operating-points-v2-<vendor>".
+
+* OPP Table Node
+
+This describes the OPPs belonging to a device. This node can have the
+following properties:
+
+Required properties:
+- compatible: Allow OPPs to express their compatibility. It should be:
+ "operating-points-v2".
+
+- OPP nodes: One or more OPP nodes describing voltage-current-frequency
+ combinations. Their name isn't significant but their phandle can be used to
+ reference an OPP.
+
+Optional properties:
+- opp-shared: Indicates that device nodes using this OPP Table Node's phandle
+ switch their DVFS state together, i.e. they share clock/voltage/current
+ lines. A missing property means the devices have independent
+ clock/voltage/current lines but share the OPP table.
+
+- status: Marks the OPP table enabled/disabled.
+
+
+* OPP Node
+
+This defines voltage-current-frequency combinations along with other related
+properties.
+
+Required properties:
+- opp-hz: Frequency in Hz, expressed as a 64-bit big-endian integer.
+
+Optional properties:
+- opp-microvolt: voltage in microvolts.
+
+ A single regulator's voltage is specified with an array of size one or three.
+ Single entry is for target voltage and three entries are for <target min max>
+ voltages.
+
+ Entries for multiple regulators must be present in the same order as the
+ regulators are specified in the device's DT node.
+
+- opp-microvolt-<name>: Named opp-microvolt property. This has the same
+ format as the opp-microvolt property above, but allows multiple voltage
+ ranges to be provided for the same OPP. At runtime, the platform can pick a
+ <name>, and the matching opp-microvolt-<name> property will be enabled for
+ all OPPs. If the platform doesn't pick a specific <name>, or the <name>
+ doesn't match any opp-microvolt-<name> properties, then the opp-microvolt
+ property shall be used, if present.
+
+- opp-microamp: The maximum current drawn by the device in microamperes
+ considering system-specific parameters (such as transients, process, aging,
+ maximum operating temperature range etc.) as necessary. This may be used to
+ set the most efficient regulator operating mode.
+
+ Should only be set if opp-microvolt is set for the OPP.
+
+ Entries for multiple regulators must be present in the same order as the
+ regulators are specified in the device's DT node. If this property isn't
+ required for some regulators, it should be set to zero for them. If it
+ isn't required for any regulator, the property need not be present.
+
+- opp-microamp-<name>: Named opp-microamp property. Similar to
+ opp-microvolt-<name> property, but for microamp instead.
+
+- clock-latency-ns: Specifies the maximum possible transition latency (in
+ nanoseconds) for switching to this OPP from any other OPP.
+
+- turbo-mode: Marks the OPP to be used only for turbo modes. Turbo mode is
+ available on some platforms, where the device can run above its normal
+ operating frequency for a short duration, limited by the device's power,
+ current and thermal limits.
+
+- opp-suspend: Marks the OPP to be used during device suspend. Only one OPP in
+ the table should have this.
+
+- opp-supported-hw: This enables us to select only a subset of OPPs from the
+ larger OPP table, based on what version of the hardware we are running on.
+ We still can't have multiple nodes with the same opp-hz value in an OPP
+ table.
+
+ It's a user-defined array containing the hierarchy of hardware version
+ numbers supported by the OPP. For example, on a platform with a hierarchy of
+ three levels of versions (A, B and C), this field should be <X Y Z>, where X
+ corresponds to version hierarchy A, Y to version hierarchy B and Z to
+ version hierarchy C.
+
+ Each level of the hierarchy is represented by a 32-bit value, so there can
+ be at most 32 supported versions per hierarchy level, i.e. 1 bit per
+ version. A value of 0xFFFFFFFF enables the OPP for all versions of that
+ hierarchy level, while a value of 0x00000000 disables the OPP completely and
+ so should never be used.
+
+ If 32 bits aren't sufficient for a version hierarchy, then that hierarchy
+ can span multiple 32-bit values, i.e. <X Y Z1 Z2> in the above example,
+ where Z1 and Z2 both refer to version hierarchy Z.
+
+- status: Marks the node enabled/disabled.
+
+Example 1: Single-cluster, dual-core ARM Cortex-A9; CPUs switch DVFS states together.
+
+/ {
+ cpus {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ cpu@0 {
+ compatible = "arm,cortex-a9";
+ reg = <0>;
+ next-level-cache = <&L2>;
+ clocks = <&clk_controller 0>;
+ clock-names = "cpu";
+ cpu-supply = <&cpu_supply0>;
+ operating-points-v2 = <&cpu0_opp_table>;
+ };
+
+ cpu@1 {
+ compatible = "arm,cortex-a9";
+ reg = <1>;
+ next-level-cache = <&L2>;
+ clocks = <&clk_controller 0>;
+ clock-names = "cpu";
+ cpu-supply = <&cpu_supply0>;
+ operating-points-v2 = <&cpu0_opp_table>;
+ };
+ };
+
+ cpu0_opp_table: opp_table0 {
+ compatible = "operating-points-v2";
+ opp-shared;
+
+ opp@1000000000 {
+ opp-hz = /bits/ 64 <1000000000>;
+ opp-microvolt = <970000 975000 985000>;
+ opp-microamp = <70000>;
+ clock-latency-ns = <300000>;
+ opp-suspend;
+ };
+ opp@1100000000 {
+ opp-hz = /bits/ 64 <1100000000>;
+ opp-microvolt = <980000 1000000 1010000>;
+ opp-microamp = <80000>;
+ clock-latency-ns = <310000>;
+ };
+ opp@1200000000 {
+ opp-hz = /bits/ 64 <1200000000>;
+ opp-microvolt = <1025000>;
+ clock-latency-ns = <290000>;
+ turbo-mode;
+ };
+ };
+};
+
+Example 2: Single-cluster, quad-core Qualcomm Krait; CPUs switch DVFS states
+independently.
+
+/ {
+ cpus {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ cpu@0 {
+ compatible = "qcom,krait";
+ reg = <0>;
+ next-level-cache = <&L2>;
+ clocks = <&clk_controller 0>;
+ clock-names = "cpu";
+ cpu-supply = <&cpu_supply0>;
+ operating-points-v2 = <&cpu_opp_table>;
+ };
+
+ cpu@1 {
+ compatible = "qcom,krait";
+ reg = <1>;
+ next-level-cache = <&L2>;
+ clocks = <&clk_controller 1>;
+ clock-names = "cpu";
+ cpu-supply = <&cpu_supply1>;
+ operating-points-v2 = <&cpu_opp_table>;
+ };
+
+ cpu@2 {
+ compatible = "qcom,krait";
+ reg = <2>;
+ next-level-cache = <&L2>;
+ clocks = <&clk_controller 2>;
+ clock-names = "cpu";
+ cpu-supply = <&cpu_supply2>;
+ operating-points-v2 = <&cpu_opp_table>;
+ };
+
+ cpu@3 {
+ compatible = "qcom,krait";
+ reg = <3>;
+ next-level-cache = <&L2>;
+ clocks = <&clk_controller 3>;
+ clock-names = "cpu";
+ cpu-supply = <&cpu_supply3>;
+ operating-points-v2 = <&cpu_opp_table>;
+ };
+ };
+
+ cpu_opp_table: opp_table {
+ compatible = "operating-points-v2";
+
+ /*
+ * Missing opp-shared property means CPUs switch DVFS states
+ * independently.
+ */
+
+ opp@1000000000 {
+ opp-hz = /bits/ 64 <1000000000>;
+ opp-microvolt = <970000 975000 985000>;
+ opp-microamp = <70000>;
+ clock-latency-ns = <300000>;
+ opp-suspend;
+ };
+ opp@1100000000 {
+ opp-hz = /bits/ 64 <1100000000>;
+ opp-microvolt = <980000 1000000 1010000>;
+ opp-microamp = <80000>;
+ clock-latency-ns = <310000>;
+ };
+ opp@1200000000 {
+ opp-hz = /bits/ 64 <1200000000>;
+ opp-microvolt = <1025000>;
+ opp-microamp = <90000>;
+ clock-latency-ns = <290000>;
+ turbo-mode;
+ };
+ };
+};
+
+Example 3: Dual-cluster, dual-core per cluster. CPUs within a cluster switch
+DVFS states together.
+
+/ {
+ cpus {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ cpu@0 {
+ compatible = "arm,cortex-a7";
+ reg = <0>;
+ next-level-cache = <&L2>;
+ clocks = <&clk_controller 0>;
+ clock-names = "cpu";
+ cpu-supply = <&cpu_supply0>;
+ operating-points-v2 = <&cluster0_opp>;
+ };
+
+ cpu@1 {
+ compatible = "arm,cortex-a7";
+ reg = <1>;
+ next-level-cache = <&L2>;
+ clocks = <&clk_controller 0>;
+ clock-names = "cpu";
+ cpu-supply = <&cpu_supply0>;
+ operating-points-v2 = <&cluster0_opp>;
+ };
+
+ cpu@100 {
+ compatible = "arm,cortex-a15";
+ reg = <100>;
+ next-level-cache = <&L2>;
+ clocks = <&clk_controller 1>;
+ clock-names = "cpu";
+ cpu-supply = <&cpu_supply1>;
+ operating-points-v2 = <&cluster1_opp>;
+ };
+
+ cpu@101 {
+ compatible = "arm,cortex-a15";
+ reg = <101>;
+ next-level-cache = <&L2>;
+ clocks = <&clk_controller 1>;
+ clock-names = "cpu";
+ cpu-supply = <&cpu_supply1>;
+ operating-points-v2 = <&cluster1_opp>;
+ };
+ };
+
+ cluster0_opp: opp_table0 {
+ compatible = "operating-points-v2";
+ opp-shared;
+
+ opp@1000000000 {
+ opp-hz = /bits/ 64 <1000000000>;
+ opp-microvolt = <970000 975000 985000>;
+ opp-microamp = <70000>;
+ clock-latency-ns = <300000>;
+ opp-suspend;
+ };
+ opp@1100000000 {
+ opp-hz = /bits/ 64 <1100000000>;
+ opp-microvolt = <980000 1000000 1010000>;
+ opp-microamp = <80000>;
+ clock-latency-ns = <310000>;
+ };
+ opp@1200000000 {
+ opp-hz = /bits/ 64 <1200000000>;
+ opp-microvolt = <1025000>;
+ opp-microamp = <90000>;
+ clock-latency-ns = <290000>;
+ turbo-mode;
+ };
+ };
+
+ cluster1_opp: opp_table1 {
+ compatible = "operating-points-v2";
+ opp-shared;
+
+ opp@1300000000 {
+ opp-hz = /bits/ 64 <1300000000>;
+ opp-microvolt = <1045000 1050000 1055000>;
+ opp-microamp = <95000>;
+ clock-latency-ns = <400000>;
+ opp-suspend;
+ };
+ opp@1400000000 {
+ opp-hz = /bits/ 64 <1400000000>;
+ opp-microvolt = <1075000>;
+ opp-microamp = <100000>;
+ clock-latency-ns = <400000>;
+ };
+ opp@1500000000 {
+ opp-hz = /bits/ 64 <1500000000>;
+ opp-microvolt = <1010000 1100000 1110000>;
+ opp-microamp = <95000>;
+ clock-latency-ns = <400000>;
+ turbo-mode;
+ };
+ };
+};
+
+Example 4: Handling multiple regulators
+
+/ {
+ cpus {
+ cpu@0 {
+ compatible = "arm,cortex-a7";
+ ...
+
+ cpu-supply = <&cpu_supply0>, <&cpu_supply1>, <&cpu_supply2>;
+ operating-points-v2 = <&cpu0_opp_table>;
+ };
+ };
+
+ cpu0_opp_table: opp_table0 {
+ compatible = "operating-points-v2";
+ opp-shared;
+
+ opp@1000000000 {
+ opp-hz = /bits/ 64 <1000000000>;
+ opp-microvolt = <970000>, /* Supply 0 */
+ <960000>, /* Supply 1 */
+ <960000>; /* Supply 2 */
+ opp-microamp = <70000>, /* Supply 0 */
+ <70000>, /* Supply 1 */
+ <70000>; /* Supply 2 */
+ clock-latency-ns = <300000>;
+ };
+
+ /* OR */
+
+ opp@1000000000 {
+ opp-hz = /bits/ 64 <1000000000>;
+ opp-microvolt = <970000 975000 985000>, /* Supply 0 */
+ <960000 965000 975000>, /* Supply 1 */
+ <960000 965000 975000>; /* Supply 2 */
+ opp-microamp = <70000>, /* Supply 0 */
+ <70000>, /* Supply 1 */
+ <70000>; /* Supply 2 */
+ clock-latency-ns = <300000>;
+ };
+
+ /* OR */
+
+ opp@1000000000 {
+ opp-hz = /bits/ 64 <1000000000>;
+ opp-microvolt = <970000 975000 985000>, /* Supply 0 */
+ <960000 965000 975000>, /* Supply 1 */
+ <960000 965000 975000>; /* Supply 2 */
+ opp-microamp = <70000>, /* Supply 0 */
+ <0>, /* Supply 1 doesn't need this */
+ <70000>; /* Supply 2 */
+ clock-latency-ns = <300000>;
+ };
+ };
+};
+
+Example 5: opp-supported-hw
+(example: a three-level hierarchy of versions: cuts, substrate and process)
+
+/ {
+ cpus {
+ cpu@0 {
+ compatible = "arm,cortex-a7";
+ ...
+
+ cpu-supply = <&cpu_supply>;
+ operating-points-v2 = <&cpu0_opp_table_slow>;
+ };
+ };
+
+ cpu0_opp_table_slow: opp_table {
+ compatible = "operating-points-v2";
+ status = "okay";
+ opp-shared;
+
+ opp@600000000 {
+ /*
+ * Supports all substrate and process versions for 0xF
+ * cuts, i.e. only the first four cuts.
+ */
+ opp-supported-hw = <0xF 0xFFFFFFFF 0xFFFFFFFF>;
+ opp-hz = /bits/ 64 <600000000>;
+ opp-microvolt = <900000 915000 925000>;
+ ...
+ };
+
+ opp@800000000 {
+ /*
+ * Supports:
+ * - cuts: only one, 6th cut (represented by 6th bit).
+ * - substrate: supports 16 different substrate versions
+ * - process: supports 9 different process versions
+ */
+ opp-supported-hw = <0x20 0xff0000ff 0x0000f4f0>;
+ opp-hz = /bits/ 64 <800000000>;
+ opp-microvolt = <900000 915000 925000>;
+ ...
+ };
+ };
+};
+
+Example 6: opp-microvolt-<name>, opp-microamp-<name>:
+(example: device with two possible microvolt ranges: slow and fast)
+
+/ {
+ cpus {
+ cpu@0 {
+ compatible = "arm,cortex-a7";
+ ...
+
+ operating-points-v2 = <&cpu0_opp_table>;
+ };
+ };
+
+ cpu0_opp_table: opp_table0 {
+ compatible = "operating-points-v2";
+ opp-shared;
+
+ opp@1000000000 {
+ opp-hz = /bits/ 64 <1000000000>;
+ opp-microvolt-slow = <900000 915000 925000>;
+ opp-microvolt-fast = <970000 975000 985000>;
+ opp-microamp-slow = <70000>;
+ opp-microamp-fast = <71000>;
+ };
+
+ opp@1200000000 {
+ opp-hz = /bits/ 64 <1200000000>;
+ opp-microvolt-slow = <900000 915000 925000>, /* Supply vcc0 */
+ <910000 925000 935000>; /* Supply vcc1 */
+ opp-microvolt-fast = <970000 975000 985000>, /* Supply vcc0 */
+ <960000 965000 975000>; /* Supply vcc1 */
+ opp-microamp = <70000>; /* Will be used for both slow/fast */
+ };
+ };
+};
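
On the kernel side, a driver normally consumes one of these tables through the
OPP library rather than parsing the properties itself. Below is a rough
sketch using the dev_pm_opp_* helpers of this kernel generation, which require
RCU protection around OPP lookups; the probe function name is hypothetical:

	#include <linux/device.h>
	#include <linux/err.h>
	#include <linux/pm_opp.h>
	#include <linux/rcupdate.h>

	static int example_opp_probe(struct device *dev)
	{
		struct dev_pm_opp *opp;
		unsigned long freq = 0, volt;
		int ret;

		/* Parse the operating-points-v2 table referenced by the device node. */
		ret = dev_pm_opp_of_add_table(dev);
		if (ret)
			return ret;

		/* Find the slowest OPP: the first one at or above 0 Hz. */
		rcu_read_lock();
		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
		if (IS_ERR(opp)) {
			rcu_read_unlock();
			return PTR_ERR(opp);
		}
		volt = dev_pm_opp_get_voltage(opp);
		rcu_read_unlock();

		dev_info(dev, "lowest OPP: %lu Hz at %lu uV\n", freq, volt);
		return 0;
	}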
diff --git a/Documentation/devicetree/bindings/power/opp.txt b/Documentation/devicetree/bindings/power/opp.txt
deleted file mode 100644
index 74499e5033fc..000000000000
--- a/Documentation/devicetree/bindings/power/opp.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-* Generic OPP Interface
-
-SoCs have a standard set of tuples consisting of frequency and
-voltage pairs that the device will support per voltage domain. These
-are called Operating Performance Points or OPPs.
-
-Properties:
-- operating-points: An array of 2-tuples items, and each item consists
- of frequency and voltage like <freq-kHz vol-uV>.
- freq: clock frequency in kHz
- vol: voltage in microvolt
-
-Examples:
-
-cpu@0 {
- compatible = "arm,cortex-a9";
- reg = <0>;
- next-level-cache = <&L2>;
- operating-points = <
- /* kHz uV */
- 792000 1100000
- 396000 950000
- 198000 850000
- >;
-};
diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
index 88ab81c79109..97dfdfa13631 100644
--- a/Documentation/filesystems/debugfs.txt
+++ b/Documentation/filesystems/debugfs.txt
@@ -94,7 +94,7 @@ a variable of type size_t.
Boolean values can be placed in debugfs with:
struct dentry *debugfs_create_bool(const char *name, umode_t mode,
- struct dentry *parent, u32 *value);
+ struct dentry *parent, bool *value);
A read on the resulting file will yield either Y (for non-zero values) or
N, followed by a newline. If written to, it will accept either upper- or
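
The prototype change above means the backing variable must now be a plain
bool rather than a u32. A tiny sketch of the updated usage, with made-up
directory and file names:

	#include <linux/debugfs.h>

	static bool example_verbose;	/* toggled from user space as Y/N */
	static struct dentry *example_dir;

	static int __init example_debugfs_init(void)
	{
		example_dir = debugfs_create_dir("example", NULL);
		/* The Y/N file is now backed directly by a bool. */
		debugfs_create_bool("verbose", 0644, example_dir, &example_verbose);
		return 0;
	}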
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index cd03a0faca8f..51bbc77d5d16 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3289,6 +3289,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
ro [KNL] Mount root device read-only on boot
+ rodata= [KNL]
+ on Mark read-only kernel memory as read-only (default).
+ off Leave read-only kernel memory writable for debugging.
+
root= [KNL] Root filesystem
See name_to_dev_t comment in init/do_mounts.c.
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index c59bd9bc41ef..4176ab076f1c 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -352,7 +352,8 @@ In the first case there are two additional complications:
- if CR4.SMEP is enabled: since we've turned the page into a kernel page,
the kernel may now execute it. We handle this by also setting spte.nx.
If we get a user fetch or read fault, we'll change spte.u=1 and
- spte.nx=gpte.nx back.
+ spte.nx=gpte.nx back. For this to work, KVM forces EFER.NX to 1 when
+ shadow paging is in use.
- if CR4.SMAP is disabled: since the page has been changed to a kernel
page, it can not be reused when CR4.SMAP is enabled. We set
CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. Note,
diff --git a/Makefile b/Makefile
index 39be1bbd373a..79fab0d55218 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
VERSION = 4
PATCHLEVEL = 1
-SUBLEVEL = 20
+SUBLEVEL = 21
EXTRAVERSION =
NAME = Series 4800
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e16a259177d4..ed16f8cdc4fe 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1467,6 +1467,7 @@ config HOTPLUG_CPU
config ARM_PSCI
bool "Support for the ARM Power State Coordination Interface (PSCI)"
depends on CPU_V7
+ select ARM_PSCI_FW
help
Say Y here if you want Linux to communicate with system firmware
implementing the PSCI specification for CPU-centric power
diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi
index dfcc0dd637e5..bc04b754fe36 100644
--- a/arch/arm/boot/dts/dra7.dtsi
+++ b/arch/arm/boot/dts/dra7.dtsi
@@ -1411,6 +1411,16 @@
0x48485200 0x2E00>;
#address-cells = <1>;
#size-cells = <1>;
+
+ /*
+ * Do not allow gating of cpsw clock as workaround
+ * for errata i877. Keeping internal clock disabled
+ * causes the device switching characteristics
+ * to degrade over time and eventually fail to meet
+ * the data manual delay time/skew specs.
+ */
+ ti,no-idle;
+
/*
* rx_thresh_pend
* rx_pend
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 2d46862e7bef..5797815727fe 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -488,7 +488,6 @@ int set_memory_x(unsigned long addr, int numpages);
int set_memory_nx(unsigned long addr, int numpages);
#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
void set_kernel_text_rw(void);
void set_kernel_text_ro(void);
#else
diff --git a/arch/arm/include/asm/psci.h b/arch/arm/include/asm/psci.h
index c25ef3ec6d1f..68ee3ce17b82 100644
--- a/arch/arm/include/asm/psci.h
+++ b/arch/arm/include/asm/psci.h
@@ -14,34 +14,11 @@
#ifndef __ASM_ARM_PSCI_H
#define __ASM_ARM_PSCI_H
-#define PSCI_POWER_STATE_TYPE_STANDBY 0
-#define PSCI_POWER_STATE_TYPE_POWER_DOWN 1
-
-struct psci_power_state {
- u16 id;
- u8 type;
- u8 affinity_level;
-};
-
-struct psci_operations {
- int (*cpu_suspend)(struct psci_power_state state,
- unsigned long entry_point);
- int (*cpu_off)(struct psci_power_state state);
- int (*cpu_on)(unsigned long cpuid, unsigned long entry_point);
- int (*migrate)(unsigned long cpuid);
- int (*affinity_info)(unsigned long target_affinity,
- unsigned long lowest_affinity_level);
- int (*migrate_info_type)(void);
-};
-
-extern struct psci_operations psci_ops;
extern struct smp_operations psci_smp_ops;
#ifdef CONFIG_ARM_PSCI
-int psci_init(void);
bool psci_smp_available(void);
#else
-static inline int psci_init(void) { return 0; }
static inline bool psci_smp_available(void) { return false; }
#endif
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 752725dcbf42..2c06383f4d47 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -86,7 +86,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_ARM_VIRT_EXT) += hyp-stub.o
ifeq ($(CONFIG_ARM_PSCI),y)
-obj-y += psci.o psci-call.o
+obj-y += psci-call.o
obj-$(CONFIG_SMP) += psci_smp.o
endif
diff --git a/arch/arm/kernel/psci.c b/arch/arm/kernel/psci.c
deleted file mode 100644
index f90fdf4ce7c7..000000000000
--- a/arch/arm/kernel/psci.c
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * Copyright (C) 2012 ARM Limited
- *
- * Author: Will Deacon <will.deacon@arm.com>
- */
-
-#define pr_fmt(fmt) "psci: " fmt
-
-#include <linux/init.h>
-#include <linux/of.h>
-#include <linux/reboot.h>
-#include <linux/pm.h>
-#include <uapi/linux/psci.h>
-
-#include <asm/compiler.h>
-#include <asm/errno.h>
-#include <asm/psci.h>
-#include <asm/system_misc.h>
-
-struct psci_operations psci_ops;
-
-static int (*invoke_psci_fn)(u32, u32, u32, u32);
-typedef int (*psci_initcall_t)(const struct device_node *);
-
-asmlinkage int __invoke_psci_fn_hvc(u32, u32, u32, u32);
-asmlinkage int __invoke_psci_fn_smc(u32, u32, u32, u32);
-
-enum psci_function {
- PSCI_FN_CPU_SUSPEND,
- PSCI_FN_CPU_ON,
- PSCI_FN_CPU_OFF,
- PSCI_FN_MIGRATE,
- PSCI_FN_AFFINITY_INFO,
- PSCI_FN_MIGRATE_INFO_TYPE,
- PSCI_FN_MAX,
-};
-
-static u32 psci_function_id[PSCI_FN_MAX];
-
-static int psci_to_linux_errno(int errno)
-{
- switch (errno) {
- case PSCI_RET_SUCCESS:
- return 0;
- case PSCI_RET_NOT_SUPPORTED:
- return -EOPNOTSUPP;
- case PSCI_RET_INVALID_PARAMS:
- return -EINVAL;
- case PSCI_RET_DENIED:
- return -EPERM;
- };
-
- return -EINVAL;
-}
-
-static u32 psci_power_state_pack(struct psci_power_state state)
-{
- return ((state.id << PSCI_0_2_POWER_STATE_ID_SHIFT)
- & PSCI_0_2_POWER_STATE_ID_MASK) |
- ((state.type << PSCI_0_2_POWER_STATE_TYPE_SHIFT)
- & PSCI_0_2_POWER_STATE_TYPE_MASK) |
- ((state.affinity_level << PSCI_0_2_POWER_STATE_AFFL_SHIFT)
- & PSCI_0_2_POWER_STATE_AFFL_MASK);
-}
-
-static int psci_get_version(void)
-{
- int err;
-
- err = invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0);
- return err;
-}
-
-static int psci_cpu_suspend(struct psci_power_state state,
- unsigned long entry_point)
-{
- int err;
- u32 fn, power_state;
-
- fn = psci_function_id[PSCI_FN_CPU_SUSPEND];
- power_state = psci_power_state_pack(state);
- err = invoke_psci_fn(fn, power_state, entry_point, 0);
- return psci_to_linux_errno(err);
-}
-
-static int psci_cpu_off(struct psci_power_state state)
-{
- int err;
- u32 fn, power_state;
-
- fn = psci_function_id[PSCI_FN_CPU_OFF];
- power_state = psci_power_state_pack(state);
- err = invoke_psci_fn(fn, power_state, 0, 0);
- return psci_to_linux_errno(err);
-}
-
-static int psci_cpu_on(unsigned long cpuid, unsigned long entry_point)
-{
- int err;
- u32 fn;
-
- fn = psci_function_id[PSCI_FN_CPU_ON];
- err = invoke_psci_fn(fn, cpuid, entry_point, 0);
- return psci_to_linux_errno(err);
-}
-
-static int psci_migrate(unsigned long cpuid)
-{
- int err;
- u32 fn;
-
- fn = psci_function_id[PSCI_FN_MIGRATE];
- err = invoke_psci_fn(fn, cpuid, 0, 0);
- return psci_to_linux_errno(err);
-}
-
-static int psci_affinity_info(unsigned long target_affinity,
- unsigned long lowest_affinity_level)
-{
- int err;
- u32 fn;
-
- fn = psci_function_id[PSCI_FN_AFFINITY_INFO];
- err = invoke_psci_fn(fn, target_affinity, lowest_affinity_level, 0);
- return err;
-}
-
-static int psci_migrate_info_type(void)
-{
- int err;
- u32 fn;
-
- fn = psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE];
- err = invoke_psci_fn(fn, 0, 0, 0);
- return err;
-}
-
-static int get_set_conduit_method(struct device_node *np)
-{
- const char *method;
-
- pr_info("probing for conduit method from DT.\n");
-
- if (of_property_read_string(np, "method", &method)) {
- pr_warn("missing \"method\" property\n");
- return -ENXIO;
- }
-
- if (!strcmp("hvc", method)) {
- invoke_psci_fn = __invoke_psci_fn_hvc;
- } else if (!strcmp("smc", method)) {
- invoke_psci_fn = __invoke_psci_fn_smc;
- } else {
- pr_warn("invalid \"method\" property: %s\n", method);
- return -EINVAL;
- }
- return 0;
-}
-
-static void psci_sys_reset(enum reboot_mode reboot_mode, const char *cmd)
-{
- invoke_psci_fn(PSCI_0_2_FN_SYSTEM_RESET, 0, 0, 0);
-}
-
-static void psci_sys_poweroff(void)
-{
- invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0);
-}
-
-/*
- * PSCI Function IDs for v0.2+ are well defined so use
- * standard values.
- */
-static int psci_0_2_init(struct device_node *np)
-{
- int err, ver;
-
- err = get_set_conduit_method(np);
-
- if (err)
- goto out_put_node;
-
- ver = psci_get_version();
-
- if (ver == PSCI_RET_NOT_SUPPORTED) {
- /* PSCI v0.2 mandates implementation of PSCI_ID_VERSION. */
- pr_err("PSCI firmware does not comply with the v0.2 spec.\n");
- err = -EOPNOTSUPP;
- goto out_put_node;
- } else {
- pr_info("PSCIv%d.%d detected in firmware.\n",
- PSCI_VERSION_MAJOR(ver),
- PSCI_VERSION_MINOR(ver));
-
- if (PSCI_VERSION_MAJOR(ver) == 0 &&
- PSCI_VERSION_MINOR(ver) < 2) {
- err = -EINVAL;
- pr_err("Conflicting PSCI version detected.\n");
- goto out_put_node;
- }
- }
-
- pr_info("Using standard PSCI v0.2 function IDs\n");
- psci_function_id[PSCI_FN_CPU_SUSPEND] = PSCI_0_2_FN_CPU_SUSPEND;
- psci_ops.cpu_suspend = psci_cpu_suspend;
-
- psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF;
- psci_ops.cpu_off = psci_cpu_off;
-
- psci_function_id[PSCI_FN_CPU_ON] = PSCI_0_2_FN_CPU_ON;
- psci_ops.cpu_on = psci_cpu_on;
-
- psci_function_id[PSCI_FN_MIGRATE] = PSCI_0_2_FN_MIGRATE;
- psci_ops.migrate = psci_migrate;
-
- psci_function_id[PSCI_FN_AFFINITY_INFO] = PSCI_0_2_FN_AFFINITY_INFO;
- psci_ops.affinity_info = psci_affinity_info;
-
- psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE] =
- PSCI_0_2_FN_MIGRATE_INFO_TYPE;
- psci_ops.migrate_info_type = psci_migrate_info_type;
-
- arm_pm_restart = psci_sys_reset;
-
- pm_power_off = psci_sys_poweroff;
-
-out_put_node:
- of_node_put(np);
- return err;
-}
-
-/*
- * PSCI < v0.2 get PSCI Function IDs via DT.
- */
-static int psci_0_1_init(struct device_node *np)
-{
- u32 id;
- int err;
-
- err = get_set_conduit_method(np);
-
- if (err)
- goto out_put_node;
-
- pr_info("Using PSCI v0.1 Function IDs from DT\n");
-
- if (!of_property_read_u32(np, "cpu_suspend", &id)) {
- psci_function_id[PSCI_FN_CPU_SUSPEND] = id;
- psci_ops.cpu_suspend = psci_cpu_suspend;
- }
-
- if (!of_property_read_u32(np, "cpu_off", &id)) {
- psci_function_id[PSCI_FN_CPU_OFF] = id;
- psci_ops.cpu_off = psci_cpu_off;
- }
-
- if (!of_property_read_u32(np, "cpu_on", &id)) {
- psci_function_id[PSCI_FN_CPU_ON] = id;
- psci_ops.cpu_on = psci_cpu_on;
- }
-
- if (!of_property_read_u32(np, "migrate", &id)) {
- psci_function_id[PSCI_FN_MIGRATE] = id;
- psci_ops.migrate = psci_migrate;
- }
-
-out_put_node:
- of_node_put(np);
- return err;
-}
-
-static const struct of_device_id psci_of_match[] __initconst = {
- { .compatible = "arm,psci", .data = psci_0_1_init},
- { .compatible = "arm,psci-0.2", .data = psci_0_2_init},
- {},
-};
-
-int __init psci_init(void)
-{
- struct device_node *np;
- const struct of_device_id *matched_np;
- psci_initcall_t init_fn;
-
- np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);
- if (!np)
- return -ENODEV;
-
- init_fn = (psci_initcall_t)matched_np->data;
- return init_fn(np);
-}
diff --git a/arch/arm/kernel/psci_smp.c b/arch/arm/kernel/psci_smp.c
index 28a1db4da704..9d479b2ea40d 100644
--- a/arch/arm/kernel/psci_smp.c
+++ b/arch/arm/kernel/psci_smp.c
@@ -17,6 +17,8 @@
#include <linux/smp.h>
#include <linux/of.h>
#include <linux/delay.h>
+#include <linux/psci.h>
+
#include <uapi/linux/psci.h>
#include <asm/psci.h>
@@ -51,25 +53,37 @@ static int psci_boot_secondary(unsigned int cpu, struct task_struct *idle)
{
if (psci_ops.cpu_on)
return psci_ops.cpu_on(cpu_logical_map(cpu),
- __pa(secondary_startup));
+ virt_to_idmap(&secondary_startup));
return -ENODEV;
}
#ifdef CONFIG_HOTPLUG_CPU
-void __ref psci_cpu_die(unsigned int cpu)
+int psci_cpu_disable(unsigned int cpu)
+{
+ /* Fail early if we don't have CPU_OFF support */
+ if (!psci_ops.cpu_off)
+ return -EOPNOTSUPP;
+
+ /* Trusted OS will deny CPU_OFF */
+ if (psci_tos_resident_on(cpu))
+ return -EPERM;
+
+ return 0;
+}
+
+void psci_cpu_die(unsigned int cpu)
{
- const struct psci_power_state ps = {
- .type = PSCI_POWER_STATE_TYPE_POWER_DOWN,
- };
+ u32 state = PSCI_POWER_STATE_TYPE_POWER_DOWN <<
+ PSCI_0_2_POWER_STATE_TYPE_SHIFT;
- if (psci_ops.cpu_off)
- psci_ops.cpu_off(ps);
+ if (psci_ops.cpu_off)
+ psci_ops.cpu_off(state);
- /* We should never return */
- panic("psci: cpu %d failed to shutdown\n", cpu);
+ /* We should never return */
+ panic("psci: cpu %d failed to shutdown\n", cpu);
}
-int __ref psci_cpu_kill(unsigned int cpu)
+int psci_cpu_kill(unsigned int cpu)
{
int err, i;
@@ -109,6 +123,7 @@ bool __init psci_smp_available(void)
struct smp_operations __initdata psci_smp_ops = {
.smp_boot_secondary = psci_boot_secondary,
#ifdef CONFIG_HOTPLUG_CPU
+ .cpu_disable = psci_cpu_disable,
.cpu_die = psci_cpu_die,
.cpu_kill = psci_cpu_kill,
#endif
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 6c777e908a24..3224680e44f4 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -31,6 +31,7 @@
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/sort.h>
+#include <linux/psci.h>
#include <asm/unified.h>
#include <asm/cp15.h>
@@ -950,7 +951,7 @@ void __init setup_arch(char **cmdline_p)
unflatten_device_tree();
arm_dt_init_cpu_maps();
- psci_init();
+ psci_dt_init();
#ifdef CONFIG_SMP
if (is_smp()) {
if (!mdesc->smp_init || !mdesc->smp_init()) {
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index e24f0461ea2d..ddc0defe5548 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -24,6 +24,8 @@
#include <asm/kvm_psci.h>
#include <asm/kvm_host.h>
+#include <uapi/linux/psci.h>
+
/*
* This is an implementation of the Power State Coordination Interface
* as described in ARM document number ARM DEN 0022A.
@@ -124,7 +126,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
{
- int i;
+ int i, matching_cpus = 0;
unsigned long mpidr;
unsigned long target_affinity;
unsigned long target_affinity_mask;
@@ -149,12 +151,16 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
*/
kvm_for_each_vcpu(i, tmp, kvm) {
mpidr = kvm_vcpu_get_mpidr_aff(tmp);
- if (((mpidr & target_affinity_mask) == target_affinity) &&
- !tmp->arch.pause) {
- return PSCI_0_2_AFFINITY_LEVEL_ON;
+ if ((mpidr & target_affinity_mask) == target_affinity) {
+ matching_cpus++;
+ if (!tmp->arch.pause)
+ return PSCI_0_2_AFFINITY_LEVEL_ON;
}
}
+ if (!matching_cpus)
+ return PSCI_RET_INVALID_PARAMS;
+
return PSCI_0_2_AFFINITY_LEVEL_OFF;
}
diff --git a/arch/arm/mach-highbank/highbank.c b/arch/arm/mach-highbank/highbank.c
index 231fba0d03e5..6050a14faee6 100644
--- a/arch/arm/mach-highbank/highbank.c
+++ b/arch/arm/mach-highbank/highbank.c
@@ -28,8 +28,8 @@
#include <linux/reboot.h>
#include <linux/amba/bus.h>
#include <linux/platform_device.h>
+#include <linux/psci.h>
-#include <asm/psci.h>
#include <asm/hardware/cache-l2x0.h>
#include <asm/mach/arch.h>
#include <asm/mach/map.h>
diff --git a/arch/arm/mach-highbank/pm.c b/arch/arm/mach-highbank/pm.c
index 7f2bd85eb935..400311695548 100644
--- a/arch/arm/mach-highbank/pm.c
+++ b/arch/arm/mach-highbank/pm.c
@@ -16,19 +16,21 @@
#include <linux/cpu_pm.h>
#include <linux/init.h>
+#include <linux/psci.h>
#include <linux/suspend.h>
#include <asm/suspend.h>
-#include <asm/psci.h>
+
+#include <uapi/linux/psci.h>
+
+#define HIGHBANK_SUSPEND_PARAM \
+ ((0 << PSCI_0_2_POWER_STATE_ID_SHIFT) | \
+ (1 << PSCI_0_2_POWER_STATE_AFFL_SHIFT) | \
+ (PSCI_POWER_STATE_TYPE_POWER_DOWN << PSCI_0_2_POWER_STATE_TYPE_SHIFT))
static int highbank_suspend_finish(unsigned long val)
{
- const struct psci_power_state ps = {
- .type = PSCI_POWER_STATE_TYPE_POWER_DOWN,
- .affinity_level = 1,
- };
-
- return psci_ops.cpu_suspend(ps, __pa(cpu_resume));
+ return psci_ops.cpu_suspend(HIGHBANK_SUSPEND_PARAM, __pa(cpu_resume));
}
static int highbank_pm_enter(suspend_state_t state)
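
For reference, the packed parameter above works out as follows under the PSCI
0.2 power_state layout (state ID in bits [15:0], state type in bit 16,
affinity level in bits [25:24]):

	/*
	 * HIGHBANK_SUSPEND_PARAM expands to:
	 *   (0 << PSCI_0_2_POWER_STATE_ID_SHIFT)   = 0x00000000  (state ID 0)
	 *   (1 << PSCI_0_2_POWER_STATE_AFFL_SHIFT) = 0x01000000  (affinity level 1)
	 *   (1 << PSCI_0_2_POWER_STATE_TYPE_SHIFT) = 0x00010000  (power-down type)
	 * ORed together:                             0x01010000
	 */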
diff --git a/arch/arm/mach-imx/mach-imx6q.c b/arch/arm/mach-imx/mach-imx6q.c
index 3ab61549ce0f..bb8a1306ac34 100644
--- a/arch/arm/mach-imx/mach-imx6q.c
+++ b/arch/arm/mach-imx/mach-imx6q.c
@@ -350,7 +350,7 @@ static void __init imx6q_opp_init(void)
return;
}
- if (of_init_opp_table(cpu_dev)) {
+ if (dev_pm_opp_of_add_table(cpu_dev)) {
pr_warn("failed to init OPP table\n");
goto put_node;
}
diff --git a/arch/arm/mach-omap2/omap-hotplug.c b/arch/arm/mach-omap2/omap-hotplug.c
index 971791fe9a3f..593fec753b28 100644
--- a/arch/arm/mach-omap2/omap-hotplug.c
+++ b/arch/arm/mach-omap2/omap-hotplug.c
@@ -27,7 +27,7 @@
* platform-specific code to shutdown a CPU
* Called with IRQs disabled
*/
-void __ref omap4_cpu_die(unsigned int cpu)
+void omap4_cpu_die(unsigned int cpu)
{
unsigned int boot_cpu = 0;
void __iomem *base = omap_get_wakeupgen_base();
diff --git a/arch/arm/mach-omap2/omap-wakeupgen.c b/arch/arm/mach-omap2/omap-wakeupgen.c
index 6833df45d7b1..3e059b215a1d 100644
--- a/arch/arm/mach-omap2/omap-wakeupgen.c
+++ b/arch/arm/mach-omap2/omap-wakeupgen.c
@@ -330,7 +330,7 @@ static int irq_cpu_hotplug_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __refdata irq_hotplug_notifier = {
+static struct notifier_block irq_hotplug_notifier = {
.notifier_call = irq_cpu_hotplug_notify,
};
diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c
index 5286e7773ed4..9185bb958503 100644
--- a/arch/arm/mach-omap2/omap_hwmod.c
+++ b/arch/arm/mach-omap2/omap_hwmod.c
@@ -876,6 +876,36 @@ static int _init_opt_clks(struct omap_hwmod *oh)
return ret;
}
+static void _enable_optional_clocks(struct omap_hwmod *oh)
+{
+ struct omap_hwmod_opt_clk *oc;
+ int i;
+
+ pr_debug("omap_hwmod: %s: enabling optional clocks\n", oh->name);
+
+ for (i = oh->opt_clks_cnt, oc = oh->opt_clks; i > 0; i--, oc++)
+ if (oc->_clk) {
+ pr_debug("omap_hwmod: enable %s:%s\n", oc->role,
+ __clk_get_name(oc->_clk));
+ clk_enable(oc->_clk);
+ }
+}
+
+static void _disable_optional_clocks(struct omap_hwmod *oh)
+{
+ struct omap_hwmod_opt_clk *oc;
+ int i;
+
+ pr_debug("omap_hwmod: %s: disabling optional clocks\n", oh->name);
+
+ for (i = oh->opt_clks_cnt, oc = oh->opt_clks; i > 0; i--, oc++)
+ if (oc->_clk) {
+ pr_debug("omap_hwmod: disable %s:%s\n", oc->role,
+ __clk_get_name(oc->_clk));
+ clk_disable(oc->_clk);
+ }
+}
+
/**
* _enable_clocks - enable hwmod main clock and interface clocks
* @oh: struct omap_hwmod *
@@ -903,6 +933,9 @@ static int _enable_clocks(struct omap_hwmod *oh)
clk_enable(os->_clk);
}
+ if (oh->flags & HWMOD_OPT_CLKS_NEEDED)
+ _enable_optional_clocks(oh);
+
/* The opt clocks are controlled by the device driver. */
return 0;
@@ -934,41 +967,14 @@ static int _disable_clocks(struct omap_hwmod *oh)
clk_disable(os->_clk);
}
+ if (oh->flags & HWMOD_OPT_CLKS_NEEDED)
+ _disable_optional_clocks(oh);
+
/* The opt clocks are controlled by the device driver. */
return 0;
}
-static void _enable_optional_clocks(struct omap_hwmod *oh)
-{
- struct omap_hwmod_opt_clk *oc;
- int i;
-
- pr_debug("omap_hwmod: %s: enabling optional clocks\n", oh->name);
-
- for (i = oh->opt_clks_cnt, oc = oh->opt_clks; i > 0; i--, oc++)
- if (oc->_clk) {
- pr_debug("omap_hwmod: enable %s:%s\n", oc->role,
- __clk_get_name(oc->_clk));
- clk_enable(oc->_clk);
- }
-}
-
-static void _disable_optional_clocks(struct omap_hwmod *oh)
-{
- struct omap_hwmod_opt_clk *oc;
- int i;
-
- pr_debug("omap_hwmod: %s: disabling optional clocks\n", oh->name);
-
- for (i = oh->opt_clks_cnt, oc = oh->opt_clks; i > 0; i--, oc++)
- if (oc->_clk) {
- pr_debug("omap_hwmod: disable %s:%s\n", oc->role,
- __clk_get_name(oc->_clk));
- clk_disable(oc->_clk);
- }
-}
-
/**
* _omap4_enable_module - enable CLKCTRL modulemode on OMAP4
* @oh: struct omap_hwmod *
@@ -2180,6 +2186,11 @@ static int _enable(struct omap_hwmod *oh)
*/
static int _idle(struct omap_hwmod *oh)
{
+ if (oh->flags & HWMOD_NO_IDLE) {
+ oh->_int_flags |= _HWMOD_SKIP_ENABLE;
+ return 0;
+ }
+
pr_debug("omap_hwmod: %s: idling\n", oh->name);
if (oh->_state != _HWMOD_STATE_ENABLED) {
@@ -2484,6 +2495,8 @@ static int __init _init(struct omap_hwmod *oh, void *data)
oh->flags |= HWMOD_INIT_NO_RESET;
if (of_find_property(np, "ti,no-idle-on-init", NULL))
oh->flags |= HWMOD_INIT_NO_IDLE;
+ if (of_find_property(np, "ti,no-idle", NULL))
+ oh->flags |= HWMOD_NO_IDLE;
}
oh->_state = _HWMOD_STATE_INITIALIZED;
@@ -2610,7 +2623,7 @@ static void __init _setup_postsetup(struct omap_hwmod *oh)
* XXX HWMOD_INIT_NO_IDLE does not belong in hwmod data -
* it should be set by the core code as a runtime flag during startup
*/
- if ((oh->flags & HWMOD_INIT_NO_IDLE) &&
+ if ((oh->flags & (HWMOD_INIT_NO_IDLE | HWMOD_NO_IDLE)) &&
(postsetup_state == _HWMOD_STATE_IDLE)) {
oh->_int_flags |= _HWMOD_SKIP_ENABLE;
postsetup_state = _HWMOD_STATE_ENABLED;
diff --git a/arch/arm/mach-omap2/omap_hwmod.h b/arch/arm/mach-omap2/omap_hwmod.h
index 9611c91d9b82..ec289c5f099a 100644
--- a/arch/arm/mach-omap2/omap_hwmod.h
+++ b/arch/arm/mach-omap2/omap_hwmod.h
@@ -517,6 +517,10 @@ struct omap_hwmod_omap4_prcm {
* HWMOD_RECONFIG_IO_CHAIN: omap_hwmod code needs to reconfigure wake-up
* events by calling _reconfigure_io_chain() when a device is enabled
* or idled.
+ * HWMOD_OPT_CLKS_NEEDED: The optional clocks are needed for the module to
+ * operate and they need to be handled at the same time as the main_clk.
+ * HWMOD_NO_IDLE: Do not idle the hwmod at all. Useful to handle certain
+ * IPs like CPSW on DRA7, where clocks to this module cannot be disabled.
*/
#define HWMOD_SWSUP_SIDLE (1 << 0)
#define HWMOD_SWSUP_MSTANDBY (1 << 1)
@@ -532,6 +536,8 @@ struct omap_hwmod_omap4_prcm {
#define HWMOD_FORCE_MSTANDBY (1 << 11)
#define HWMOD_SWSUP_SIDLE_ACT (1 << 12)
#define HWMOD_RECONFIG_IO_CHAIN (1 << 13)
+#define HWMOD_OPT_CLKS_NEEDED (1 << 14)
+#define HWMOD_NO_IDLE (1 << 15)
/*
* omap_hwmod._int_flags definitions
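
As a rough illustration of how HWMOD_OPT_CLKS_NEEDED is consumed, here is a
stripped-down, hypothetical hwmod data entry; real entries carry many more
fields, and HWMOD_NO_IDLE, by contrast, is set at runtime from the
"ti,no-idle" DT property rather than in data files:

	static struct omap_hwmod_opt_clk example_gmac_opt_clks[] = {
		{ .role = "rmii_clk", .clk = "rmii_50mhz_clk_mux" },
	};

	static struct omap_hwmod example_gmac_hwmod = {
		.name		= "gmac",
		/* Optional clocks must be handled together with main_clk. */
		.flags		= HWMOD_OPT_CLKS_NEEDED,
		.opt_clks	= example_gmac_opt_clks,
		.opt_clks_cnt	= ARRAY_SIZE(example_gmac_opt_clks),
	};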
diff --git a/arch/arm/mach-prima2/hotplug.c b/arch/arm/mach-prima2/hotplug.c
index 0ab2f8bae28e..a728c78b996f 100644
--- a/arch/arm/mach-prima2/hotplug.c
+++ b/arch/arm/mach-prima2/hotplug.c
@@ -32,7 +32,7 @@ static inline void platform_do_lowpower(unsigned int cpu)
*
* Called with IRQs disabled
*/
-void __ref sirfsoc_cpu_die(unsigned int cpu)
+void sirfsoc_cpu_die(unsigned int cpu)
{
platform_do_lowpower(cpu);
}
diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
index 82c9b9145c3e..0a49fe1bc8cf 100644
--- a/arch/arm/mach-qcom/platsmp.c
+++ b/arch/arm/mach-qcom/platsmp.c
@@ -49,7 +49,7 @@ extern void secondary_startup_arm(void);
static DEFINE_RAW_SPINLOCK(boot_lock);
#ifdef CONFIG_HOTPLUG_CPU
-static void __ref qcom_cpu_die(unsigned int cpu)
+static void qcom_cpu_die(unsigned int cpu)
{
wfi();
}
diff --git a/arch/arm/mach-realview/hotplug.c b/arch/arm/mach-realview/hotplug.c
index ac22dd41b135..968e2d1964f6 100644
--- a/arch/arm/mach-realview/hotplug.c
+++ b/arch/arm/mach-realview/hotplug.c
@@ -90,7 +90,7 @@ static inline void platform_do_lowpower(unsigned int cpu, int *spurious)
*
* Called with IRQs disabled
*/
-void __ref realview_cpu_die(unsigned int cpu)
+void realview_cpu_die(unsigned int cpu)
{
int spurious = 0;
diff --git a/arch/arm/mach-spear/hotplug.c b/arch/arm/mach-spear/hotplug.c
index d97749c642ce..12edd1cf8a12 100644
--- a/arch/arm/mach-spear/hotplug.c
+++ b/arch/arm/mach-spear/hotplug.c
@@ -80,7 +80,7 @@ static inline void spear13xx_do_lowpower(unsigned int cpu, int *spurious)
*
* Called with IRQs disabled
*/
-void __ref spear13xx_cpu_die(unsigned int cpu)
+void spear13xx_cpu_die(unsigned int cpu)
{
int spurious = 0;
diff --git a/arch/arm/mach-tegra/hotplug.c b/arch/arm/mach-tegra/hotplug.c
index 6fc71f1534b0..1b129899a277 100644
--- a/arch/arm/mach-tegra/hotplug.c
+++ b/arch/arm/mach-tegra/hotplug.c
@@ -37,7 +37,7 @@ int tegra_cpu_kill(unsigned cpu)
*
* Called with IRQs disabled
*/
-void __ref tegra_cpu_die(unsigned int cpu)
+void tegra_cpu_die(unsigned int cpu)
{
if (!tegra_hotplug_shutdown) {
WARN(1, "hotplug is not yet initialized\n");
diff --git a/arch/arm/mach-ux500/hotplug.c b/arch/arm/mach-ux500/hotplug.c
index 2bc00b085e38..1cbed0331fd3 100644
--- a/arch/arm/mach-ux500/hotplug.c
+++ b/arch/arm/mach-ux500/hotplug.c
@@ -21,7 +21,7 @@
*
* Called with IRQs disabled
*/
-void __ref ux500_cpu_die(unsigned int cpu)
+void ux500_cpu_die(unsigned int cpu)
{
/* directly enter low power state, skipping secure registers */
for (;;) {
diff --git a/arch/arm/mach-vexpress/hotplug.c b/arch/arm/mach-vexpress/hotplug.c
index f0ce6b8f5e71..f2fafc10a91d 100644
--- a/arch/arm/mach-vexpress/hotplug.c
+++ b/arch/arm/mach-vexpress/hotplug.c
@@ -85,7 +85,7 @@ static inline void platform_do_lowpower(unsigned int cpu, int *spurious)
*
* Called with IRQs disabled
*/
-void __ref vexpress_cpu_die(unsigned int cpu)
+void vexpress_cpu_die(unsigned int cpu)
{
int spurious = 0;
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index be92fa0f2f35..8d32f7e9695f 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -22,6 +22,7 @@
#include <linux/memblock.h>
#include <linux/dma-contiguous.h>
#include <linux/sizes.h>
+#include <linux/stop_machine.h>
#include <asm/cp15.h>
#include <asm/mach-types.h>
@@ -626,12 +627,10 @@ static struct section_perm ro_perms[] = {
* safe to be called with preemption disabled, as under stop_machine().
*/
static inline void section_update(unsigned long addr, pmdval_t mask,
- pmdval_t prot)
+ pmdval_t prot, struct mm_struct *mm)
{
- struct mm_struct *mm;
pmd_t *pmd;
- mm = current->active_mm;
pmd = pmd_offset(pud_offset(pgd_offset(mm, addr), addr), addr);
#ifdef CONFIG_ARM_LPAE
@@ -655,49 +654,82 @@ static inline bool arch_has_strict_perms(void)
return !!(get_cr() & CR_XP);
}
-#define set_section_perms(perms, field) { \
- size_t i; \
- unsigned long addr; \
- \
- if (!arch_has_strict_perms()) \
- return; \
- \
- for (i = 0; i < ARRAY_SIZE(perms); i++) { \
- if (!IS_ALIGNED(perms[i].start, SECTION_SIZE) || \
- !IS_ALIGNED(perms[i].end, SECTION_SIZE)) { \
- pr_err("BUG: section %lx-%lx not aligned to %lx\n", \
- perms[i].start, perms[i].end, \
- SECTION_SIZE); \
- continue; \
- } \
- \
- for (addr = perms[i].start; \
- addr < perms[i].end; \
- addr += SECTION_SIZE) \
- section_update(addr, perms[i].mask, \
- perms[i].field); \
- } \
+void set_section_perms(struct section_perm *perms, int n, bool set,
+ struct mm_struct *mm)
+{
+ size_t i;
+ unsigned long addr;
+
+ if (!arch_has_strict_perms())
+ return;
+
+ for (i = 0; i < n; i++) {
+ if (!IS_ALIGNED(perms[i].start, SECTION_SIZE) ||
+ !IS_ALIGNED(perms[i].end, SECTION_SIZE)) {
+ pr_err("BUG: section %lx-%lx not aligned to %lx\n",
+ perms[i].start, perms[i].end,
+ SECTION_SIZE);
+ continue;
+ }
+
+ for (addr = perms[i].start;
+ addr < perms[i].end;
+ addr += SECTION_SIZE)
+ section_update(addr, perms[i].mask,
+ set ? perms[i].prot : perms[i].clear, mm);
+ }
+
}
-static inline void fix_kernmem_perms(void)
+static void update_sections_early(struct section_perm perms[], int n)
{
- set_section_perms(nx_perms, prot);
+ struct task_struct *t, *s;
+
+ read_lock(&tasklist_lock);
+ for_each_process(t) {
+ if (t->flags & PF_KTHREAD)
+ continue;
+ for_each_thread(t, s)
+ set_section_perms(perms, n, true, s->mm);
+ }
+ read_unlock(&tasklist_lock);
+ set_section_perms(perms, n, true, current->active_mm);
+ set_section_perms(perms, n, true, &init_mm);
+}
+
+int __fix_kernmem_perms(void *unused)
+{
+ update_sections_early(nx_perms, ARRAY_SIZE(nx_perms));
+ return 0;
+}
+
+void fix_kernmem_perms(void)
+{
+ stop_machine(__fix_kernmem_perms, NULL, NULL);
}
#ifdef CONFIG_DEBUG_RODATA
+int __mark_rodata_ro(void *unused)
+{
+ update_sections_early(ro_perms, ARRAY_SIZE(ro_perms));
+ return 0;
+}
+
void mark_rodata_ro(void)
{
- set_section_perms(ro_perms, prot);
+ stop_machine(__mark_rodata_ro, NULL, NULL);
}
void set_kernel_text_rw(void)
{
- set_section_perms(ro_perms, clear);
+ set_section_perms(ro_perms, ARRAY_SIZE(ro_perms), false,
+ current->active_mm);
}
void set_kernel_text_ro(void)
{
- set_section_perms(ro_perms, prot);
+ set_section_perms(ro_perms, ARRAY_SIZE(ro_perms), true,
+ current->active_mm);
}
#endif /* CONFIG_DEBUG_RODATA */
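Note on usage: since set_kernel_text_rw() and set_kernel_text_ro() now
update only current->active_mm, permissions must be toggled and the
patching done from the same context. A hedged sketch (the caller and
the patched word are hypothetical):

	/* Hypothetical caller: drop RO, patch one word, restore RO. */
	static void example_patch_word(unsigned long *addr, unsigned long val)
	{
		set_kernel_text_rw();	/* affects current->active_mm only */
		*addr = val;
		flush_icache_range((unsigned long)addr,
				   (unsigned long)addr + sizeof(val));
		set_kernel_text_ro();
	}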
diff --git a/arch/arm/vdso/vdso.S b/arch/arm/vdso/vdso.S
index b2b97e3e7bab..a62a7b64f49c 100644
--- a/arch/arm/vdso/vdso.S
+++ b/arch/arm/vdso/vdso.S
@@ -23,9 +23,8 @@
#include <linux/const.h>
#include <asm/page.h>
- __PAGE_ALIGNED_DATA
-
.globl vdso_start, vdso_end
+ .section .data..ro_after_init
.balign PAGE_SIZE
vdso_start:
.incbin "arch/arm/vdso/vdso.so"
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index caa0556de363..1f05b354d550 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -19,6 +19,7 @@ config ARM64
select ARM_GIC_V2M if PCI_MSI
select ARM_GIC_V3
select ARM_GIC_V3_ITS if PCI_MSI
+ select ARM_PSCI_FW
select BUILDTIME_EXTABLE_SORT
select CLONE_BACKWARDS
select COMMON_CLK
@@ -26,7 +27,7 @@ config ARM64
select DCACHE_WORD_ACCESS
select GENERIC_ALLOCATOR
select GENERIC_CLOCKEVENTS
- select GENERIC_CLOCKEVENTS_BROADCAST if SMP
+ select GENERIC_CLOCKEVENTS_BROADCAST
select GENERIC_CPU_AUTOPROBE
select GENERIC_EARLY_IOREMAP
select GENERIC_IRQ_PROBE
@@ -44,6 +45,7 @@ config ARM64
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_BITREVERSE
select HAVE_ARCH_JUMP_LABEL
+ select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
select HAVE_ARCH_KGDB
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
@@ -140,6 +142,9 @@ config NEED_DMA_MAP_STATE
config NEED_SG_DMA_LENGTH
def_bool y
+config SMP
+ def_bool y
+
config SWIOTLB
def_bool y
@@ -488,22 +493,8 @@ config CPU_BIG_ENDIAN
help
Say Y if you plan on running a kernel in big-endian mode.
-config SMP
- bool "Symmetric Multi-Processing"
- help
- This enables support for systems with more than one CPU. If
- you say N here, the kernel will run on single and
- multiprocessor machines, but will use only one CPU of a
- multiprocessor machine. If you say Y here, the kernel will run
- on many, but not all, single processor machines. On a single
- processor machine, the kernel will run faster if you say N
- here.
-
- If you don't know what to do here, say N.
-
config SCHED_MC
bool "Multi-core scheduler support"
- depends on SMP
help
Multi-core scheduler support improves the CPU scheduler's decision
making when dealing with multi-core CPU chips at a cost of slightly
@@ -511,7 +502,6 @@ config SCHED_MC
config SCHED_SMT
bool "SMT scheduler support"
- depends on SMP
help
Improves the CPU scheduler's decision making when dealing with
MultiThreading at a cost of slightly increased overhead in some
@@ -520,23 +510,17 @@ config SCHED_SMT
config NR_CPUS
int "Maximum number of CPUs (2-4096)"
range 2 4096
- depends on SMP
# These have to remain sorted largest to smallest
default "64"
config HOTPLUG_CPU
bool "Support for hot-pluggable CPUs"
- depends on SMP
help
Say Y here to experiment with turning CPUs off and on. CPUs
can be controlled through /sys/devices/system/cpu.
source kernel/Kconfig.preempt
-config UP_LATE_INIT
- def_bool y
- depends on !SMP
-
config HZ
int
default 100
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 3258174e6152..a24076567f9e 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -44,6 +44,13 @@ else
TEXT_OFFSET := 0x00080000
endif
+# KASAN_SHADOW_OFFSET = VA_START + (1 << (VA_BITS - 3)) - (1 << 61)
+# in 32-bit arithmetic
+KASAN_SHADOW_OFFSET := $(shell printf "0x%08x00000000\n" $$(( \
+ (0xffffffff & (-1 << ($(CONFIG_ARM64_VA_BITS) - 32))) \
+ + (1 << ($(CONFIG_ARM64_VA_BITS) - 32 - 3)) \
+ - (1 << (64 - 32 - 3)) )) )
+
export TEXT_OFFSET GZFLAGS
core-y += arch/arm64/kernel/ arch/arm64/mm/
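Worked example of the 32-bit arithmetic above, assuming
CONFIG_ARM64_VA_BITS=39 (illustrative; any supported value follows the
same steps):

	upper 32 bits = (0xffffffff & (-1 << 7)) + (1 << 4) - (1 << 29)
	              = 0xffffff80 + 0x10 - 0x20000000
	              = 0xdfffff90

giving KASAN_SHADOW_OFFSET = 0xdfffff9000000000, which matches
VA_START + (1 << 36) - (1 << 61) computed in full 64-bit arithmetic.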
diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index 59c05d8ea4a0..ed7e212c893c 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -12,8 +12,9 @@
#ifndef _ASM_ACPI_H
#define _ASM_ACPI_H
-#include <linux/mm.h>
#include <linux/irqchip/arm-gic-acpi.h>
+#include <linux/mm.h>
+#include <linux/psci.h>
#include <asm/cputype.h>
#include <asm/smp_plat.h>
@@ -39,18 +40,6 @@ extern int acpi_disabled;
extern int acpi_noirq;
extern int acpi_pci_disabled;
-/* 1 to indicate PSCI 0.2+ is implemented */
-static inline bool acpi_psci_present(void)
-{
- return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_COMPLIANT;
-}
-
-/* 1 to indicate HVC must be used instead of SMC as the PSCI conduit */
-static inline bool acpi_psci_use_hvc(void)
-{
- return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_USE_HVC;
-}
-
static inline void disable_acpi(void)
{
acpi_disabled = 1;
@@ -88,9 +77,11 @@ static inline void arch_fix_phys_package_id(int num, u32 slot) { }
void __init acpi_init_cpus(void);
#else
-static inline bool acpi_psci_present(void) { return false; }
-static inline bool acpi_psci_use_hvc(void) { return false; }
static inline void acpi_init_cpus(void) { }
#endif /* CONFIG_ACPI */
+static inline const char *acpi_get_enable_method(int cpu)
+{
+ return acpi_psci_present() ? "psci" : NULL;
+}
#endif /*_ASM_ACPI_H*/
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 144b64ad96c3..09f13a96941b 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -103,9 +103,7 @@
* SMP data memory barrier
*/
.macro smp_dmb, opt
-#ifdef CONFIG_SMP
dmb \opt
-#endif
.endm
#define USER(l, x...) \
@@ -207,4 +205,15 @@ lr .req x30 // link register
str \src, [\tmp, :lo12:\sym]
.endm
+/*
+ * Annotate a function as position independent, i.e., safe to be called before
+ * the kernel virtual mapping is activated.
+ */
+#define ENDPIPROC(x) \
+ .globl __pi_##x; \
+ .type __pi_##x, %function; \
+ .set __pi_##x, x; \
+ .size __pi_##x, . - x; \
+ ENDPROC(x)
+
#endif /* __ASM_ASSEMBLER_H */
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 71f19c4dc0de..2666c9b84ca0 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -35,28 +35,6 @@
#define dma_rmb() dmb(oshld)
#define dma_wmb() dmb(oshst)
-#ifndef CONFIG_SMP
-#define smp_mb() barrier()
-#define smp_rmb() barrier()
-#define smp_wmb() barrier()
-
-#define smp_store_release(p, v) \
-do { \
- compiletime_assert_atomic_type(*p); \
- barrier(); \
- ACCESS_ONCE(*p) = (v); \
-} while (0)
-
-#define smp_load_acquire(p) \
-({ \
- typeof(*p) ___p1 = ACCESS_ONCE(*p); \
- compiletime_assert_atomic_type(*p); \
- barrier(); \
- ___p1; \
-})
-
-#else
-
#define smp_mb() dmb(ish)
#define smp_rmb() dmb(ishld)
#define smp_wmb() dmb(ishst)
@@ -109,8 +87,6 @@ do { \
___p1; \
})
-#endif
-
#define read_barrier_depends() do { } while(0)
#define smp_read_barrier_depends() do { } while(0)
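With the !CONFIG_SMP fallbacks gone, only the DMB-based definitions
remain. A hedged usage sketch of the acquire/release pair (all names
hypothetical), showing the ordering they guarantee:

	static int data;
	static int ready;

	static void producer(void)
	{
		data = 42;
		smp_store_release(&ready, 1);	/* data visible before flag */
	}

	static int consumer(void)
	{
		if (smp_load_acquire(&ready))	/* pairs with release above */
			return data;		/* guaranteed to observe 42 */
		return -1;			/* flag not yet set */
	}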
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 67d309cc3b6b..0cd01d54eb13 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -153,8 +153,4 @@ int set_memory_rw(unsigned long addr, int numpages);
int set_memory_x(unsigned long addr, int numpages);
int set_memory_nx(unsigned long addr, int numpages);
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
-#endif
-
#endif
diff --git a/arch/arm64/include/asm/cpu_ops.h b/arch/arm64/include/asm/cpu_ops.h
index 5a31d6716914..8f03446cf89f 100644
--- a/arch/arm64/include/asm/cpu_ops.h
+++ b/arch/arm64/include/asm/cpu_ops.h
@@ -19,15 +19,15 @@
#include <linux/init.h>
#include <linux/threads.h>
-struct device_node;
-
/**
* struct cpu_operations - Callback operations for hotplugging CPUs.
*
* @name: Name of the property as appears in a devicetree cpu node's
- * enable-method property.
- * @cpu_init: Reads any data necessary for a specific enable-method from the
- * devicetree, for a given cpu node and proposed logical id.
+ * enable-method property. On systems booting with ACPI, @name
+ * identifies the struct cpu_operations entry corresponding to
+ * the boot protocol specified in the ACPI MADT table.
+ * @cpu_init: Reads any data necessary for a specific enable-method for a
+ * proposed logical id.
* @cpu_prepare: Early one-time preparation step for a cpu. If there is a
* mechanism for doing so, tests whether it is possible to boot
* the given CPU.
@@ -40,15 +40,15 @@ struct device_node;
* @cpu_die: Makes a cpu leave the kernel. Must not fail. Called from the
* cpu being killed.
* @cpu_kill: Ensures a cpu has left the kernel. Called from another cpu.
- * @cpu_init_idle: Reads any data necessary to initialize CPU idle states from
- * devicetree, for a given cpu node and proposed logical id.
+ * @cpu_init_idle: Reads any data necessary to initialize CPU idle states for
+ * a proposed logical id.
* @cpu_suspend: Suspends a cpu and saves the required context. May fail owing
* to wrong parameters or error conditions. Called from the
* CPU being suspended. Must be called with IRQs disabled.
*/
struct cpu_operations {
const char *name;
- int (*cpu_init)(struct device_node *, unsigned int);
+ int (*cpu_init)(unsigned int);
int (*cpu_prepare)(unsigned int);
int (*cpu_boot)(unsigned int);
void (*cpu_postboot)(void);
@@ -58,14 +58,17 @@ struct cpu_operations {
int (*cpu_kill)(unsigned int cpu);
#endif
#ifdef CONFIG_CPU_IDLE
- int (*cpu_init_idle)(struct device_node *, unsigned int);
+ int (*cpu_init_idle)(unsigned int);
int (*cpu_suspend)(unsigned long);
#endif
};
extern const struct cpu_operations *cpu_ops[NR_CPUS];
-int __init cpu_read_ops(struct device_node *dn, int cpu);
-void __init cpu_read_bootcpu_ops(void);
-const struct cpu_operations *cpu_get_ops(const char *name);
+int __init cpu_read_ops(int cpu);
+
+static inline void __init cpu_read_bootcpu_ops(void)
+{
+ cpu_read_ops(0);
+}
#endif /* ifndef __ASM_CPU_OPS_H */
diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h
index 6aae421f4d73..2bb7009bdac7 100644
--- a/arch/arm64/include/asm/hardirq.h
+++ b/arch/arm64/include/asm/hardirq.h
@@ -24,9 +24,7 @@
typedef struct {
unsigned int __softirq_pending;
-#ifdef CONFIG_SMP
unsigned int ipi_irqs[NR_IPI];
-#endif
} ____cacheline_aligned irq_cpustat_t;
#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */
@@ -34,10 +32,8 @@ typedef struct {
#define __inc_irq_stat(cpu, member) __IRQ_STAT(cpu, member)++
#define __get_irq_stat(cpu, member) __IRQ_STAT(cpu, member)
-#ifdef CONFIG_SMP
u64 smp_irq_stat_cpu(unsigned int cpu);
#define arch_irq_stat_cpu smp_irq_stat_cpu
-#endif
#define __ARCH_IRQ_EXIT_IRQS_DISABLED 1
diff --git a/arch/arm64/include/asm/irq_work.h b/arch/arm64/include/asm/irq_work.h
index b4f6b19a8a68..8e24ef3f7c82 100644
--- a/arch/arm64/include/asm/irq_work.h
+++ b/arch/arm64/include/asm/irq_work.h
@@ -1,8 +1,6 @@
#ifndef __ASM_IRQ_WORK_H
#define __ASM_IRQ_WORK_H
-#ifdef CONFIG_SMP
-
#include <asm/smp.h>
static inline bool arch_irq_work_has_interrupt(void)
@@ -10,13 +8,4 @@ static inline bool arch_irq_work_has_interrupt(void)
return !!__smp_cross_call;
}
-#else
-
-static inline bool arch_irq_work_has_interrupt(void)
-{
- return false;
-}
-
-#endif
-
#endif /* __ASM_IRQ_WORK_H */
diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h
new file mode 100644
index 000000000000..2774fa384c47
--- /dev/null
+++ b/arch/arm64/include/asm/kasan.h
@@ -0,0 +1,38 @@
+#ifndef __ASM_KASAN_H
+#define __ASM_KASAN_H
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_KASAN
+
+#include <linux/linkage.h>
+#include <asm/memory.h>
+
+/*
+ * KASAN_SHADOW_START: beginning of the kernel virtual address space.
+ * KASAN_SHADOW_END: KASAN_SHADOW_START + 1/8 of the kernel virtual address space.
+ */
+#define KASAN_SHADOW_START (VA_START)
+#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1UL << (VA_BITS - 3)))
+
+/*
+ * This value is used to map an address to the corresponding shadow
+ * address by the following formula:
+ * shadow_addr = (address >> 3) + KASAN_SHADOW_OFFSET;
+ *
+ * The (1 << 61) shadow addresses in [KASAN_SHADOW_OFFSET, KASAN_SHADOW_END]
+ * cover the whole 64-bit virtual address space. So KASAN_SHADOW_OFFSET
+ * should satisfy the following equation:
+ * KASAN_SHADOW_OFFSET = KASAN_SHADOW_END - (1ULL << 61)
+ */
+#define KASAN_SHADOW_OFFSET (KASAN_SHADOW_END - (1ULL << (64 - 3)))
+
+void kasan_init(void);
+asmlinkage void kasan_early_init(void);
+
+#else
+static inline void kasan_init(void) { }
+#endif
+
+#endif
+#endif
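To make the mapping concrete, a hedged sketch of the address-to-shadow
translation the comment describes (the helper name is illustrative; the
generic KASAN code provides the equivalent kasan_mem_to_shadow()):

	/* Each shadow byte tracks 8 bytes of memory. */
	static inline u8 *example_mem_to_shadow(const void *addr)
	{
		return (u8 *)(((unsigned long)addr >> 3)
				+ KASAN_SHADOW_OFFSET);
	}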
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 44a59c20e773..53d3fdca4943 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -42,12 +42,14 @@
* PAGE_OFFSET - the virtual address of the start of the kernel image (top
* (VA_BITS - 1))
* VA_BITS - the maximum number of bits for virtual addresses.
+ * VA_START - the first kernel virtual address.
* TASK_SIZE - the maximum size of a user space task.
* TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area.
* The module space lives between the addresses given by TASK_SIZE
* and PAGE_OFFSET - it must be within 128MB of the kernel text.
*/
#define VA_BITS (CONFIG_ARM64_VA_BITS)
+#define VA_START (UL(0xffffffffffffffff) << VA_BITS)
#define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1))
#define MODULES_END (PAGE_OFFSET)
#define MODULES_VADDR (MODULES_END - SZ_64M)
diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h
index 4fde8c1df97f..0a456bef8c79 100644
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -16,8 +16,6 @@
#ifndef __ASM_PERCPU_H
#define __ASM_PERCPU_H
-#ifdef CONFIG_SMP
-
static inline void set_my_cpu_offset(unsigned long off)
{
asm volatile("msr tpidr_el1, %0" :: "r" (off) : "memory");
@@ -38,12 +36,6 @@ static inline unsigned long __my_cpu_offset(void)
}
#define __my_cpu_offset __my_cpu_offset()
-#else /* !CONFIG_SMP */
-
-#define set_my_cpu_offset(x) do { } while (0)
-
-#endif /* CONFIG_SMP */
-
#define PERCPU_OP(op, asm_op) \
static inline unsigned long __percpu_##op(void *ptr, \
unsigned long val, int size) \
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 76420568d66a..c15053902942 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -27,6 +27,7 @@
#define check_pgt_cache() do { } while (0)
#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
#if CONFIG_PGTABLE_LEVELS > 2
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index cf7319422768..03ac25f4ee5c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -40,7 +40,14 @@
* fixed mappings and modules
*/
#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE)
-#define VMALLOC_START (UL(0xffffffffffffffff) << VA_BITS)
+
+#ifndef CONFIG_KASAN
+#define VMALLOC_START (VA_START)
+#else
+#include <asm/kasan.h>
+#define VMALLOC_START (KASAN_SHADOW_END + SZ_64K)
+#endif
+
#define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
#define vmemmap ((struct page *)(VMALLOC_END + SZ_64K))
@@ -53,13 +60,8 @@ extern void __pmd_error(const char *file, int line, unsigned long val);
extern void __pud_error(const char *file, int line, unsigned long val);
extern void __pgd_error(const char *file, int line, unsigned long val);
-#ifdef CONFIG_SMP
#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
-#else
-#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF)
-#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF)
-#endif
#define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE))
#define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_NC))
diff --git a/arch/arm64/include/asm/psci.h b/arch/arm64/include/asm/psci.h
deleted file mode 100644
index 2454bc59c916..000000000000
--- a/arch/arm64/include/asm/psci.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * Copyright (C) 2013 ARM Limited
- */
-
-#ifndef __ASM_PSCI_H
-#define __ASM_PSCI_H
-
-int psci_dt_init(void);
-int psci_acpi_init(void);
-
-#endif /* __ASM_PSCI_H */
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index d4264bb0a409..e9e5467e0bf4 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -183,11 +183,7 @@ static inline int valid_user_regs(struct user_pt_regs *regs)
#define instruction_pointer(regs) ((unsigned long)(regs)->pc)
-#ifdef CONFIG_SMP
extern unsigned long profile_pc(struct pt_regs *regs);
-#else
-#define profile_pc(regs) instruction_pointer(regs)
-#endif
#endif /* __ASSEMBLY__ */
#endif
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index bf22650b1a78..d9c3d6a6100a 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -20,10 +20,6 @@
#include <linux/cpumask.h>
#include <linux/thread_info.h>
-#ifndef CONFIG_SMP
-# error "<asm/smp.h> included in non-SMP build"
-#endif
-
#define raw_smp_processor_id() (current_thread_info()->cpu)
struct seq_file;
@@ -42,7 +38,7 @@ extern void handle_IPI(int ipinr, struct pt_regs *regs);
* Discover the set of possible CPUs and determine their
* SMP operations.
*/
-extern void of_smp_init_cpus(void);
+extern void smp_init_cpus(void);
/*
* Provide a function to raise an IPI cross call on CPUs in callmap.
diff --git a/arch/arm64/include/asm/smp_plat.h b/arch/arm64/include/asm/smp_plat.h
index 8dcd61e32176..7abf7570c00f 100644
--- a/arch/arm64/include/asm/smp_plat.h
+++ b/arch/arm64/include/asm/smp_plat.h
@@ -19,6 +19,8 @@
#ifndef __ASM_SMP_PLAT_H
#define __ASM_SMP_PLAT_H
+#include <linux/cpumask.h>
+
#include <asm/types.h>
struct mpidr_hash {
@@ -39,6 +41,20 @@ static inline u32 mpidr_hash_size(void)
*/
extern u64 __cpu_logical_map[NR_CPUS];
#define cpu_logical_map(cpu) __cpu_logical_map[cpu]
+/*
+ * Retrieve logical cpu index corresponding to a given MPIDR.Aff*
+ * - mpidr: MPIDR.Aff* bits to be used for the look-up
+ *
+ * Returns the cpu logical index or -EINVAL on look-up error
+ */
+static inline int get_logical_index(u64 mpidr)
+{
+ int cpu;
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+ if (cpu_logical_map(cpu) == mpidr)
+ return cpu;
+ return -EINVAL;
+}
void __init do_post_cpus_up_work(void);
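A hedged usage sketch for get_logical_index() (the wrapper is
hypothetical): firmware interfaces such as PSCI identify CPUs by MPIDR,
and this helper maps such a value back to a Linux logical id:

	/* Hypothetical: logical id of the calling CPU, -EINVAL if unknown. */
	static int example_current_logical_cpu(void)
	{
		u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK;

		return get_logical_index(mpidr);
	}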
diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index 64d2d4884a9d..2eb714c4639f 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -36,17 +36,33 @@ extern __kernel_size_t strnlen(const char *, __kernel_size_t);
#define __HAVE_ARCH_MEMCPY
extern void *memcpy(void *, const void *, __kernel_size_t);
+extern void *__memcpy(void *, const void *, __kernel_size_t);
#define __HAVE_ARCH_MEMMOVE
extern void *memmove(void *, const void *, __kernel_size_t);
+extern void *__memmove(void *, const void *, __kernel_size_t);
#define __HAVE_ARCH_MEMCHR
extern void *memchr(const void *, int, __kernel_size_t);
#define __HAVE_ARCH_MEMSET
extern void *memset(void *, int, __kernel_size_t);
+extern void *__memset(void *, int, __kernel_size_t);
#define __HAVE_ARCH_MEMCMP
extern int memcmp(const void *, const void *, size_t);
+
+#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
+
+/*
+ * For files that are not instrumented (e.g. mm/slub.c) we
+ * should use the uninstrumented versions of the mem* functions.
+ */
+
+#define memcpy(dst, src, len) __memcpy(dst, src, len)
+#define memmove(dst, src, len) __memmove(dst, src, len)
+#define memset(s, c, n) __memset(s, c, n)
+#endif
+
#endif
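A hedged sketch of the effect of the macros above: in a file built
without KASAN instrumentation (for instance via a
KASAN_SANITIZE_foo.o := n make rule, as done for efi-stub.o later in
this series), __SANITIZE_ADDRESS__ is undefined and the call below
compiles to __memcpy(), bypassing the shadow checks:

	/* In an uninstrumented file this becomes __memcpy(dst, src, len). */
	static void example_copy(void *dst, const void *src, size_t len)
	{
		memcpy(dst, src, len);
	}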
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 7ebcd31ce51c..65e9c4615664 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -1,8 +1,6 @@
#ifndef __ASM_TOPOLOGY_H
#define __ASM_TOPOLOGY_H
-#ifdef CONFIG_SMP
-
#include <linux/cpumask.h>
struct cpu_topology {
@@ -24,13 +22,6 @@ void init_cpu_topology(void);
void store_cpu_topology(unsigned int cpuid);
const struct cpumask *cpu_coregroup_mask(int cpu);
-#else
-
-static inline void init_cpu_topology(void) { }
-static inline void store_cpu_topology(unsigned int cpuid) { }
-
-#endif
-
#include <asm-generic/topology.h>
#endif /* _ASM_ARM_TOPOLOGY_H */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 426d0763c81b..1bbcd69cd0e4 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -7,6 +7,8 @@ AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET)
CFLAGS_efi-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET)
CFLAGS_armv8_deprecated.o := -I$(src)
+KASAN_SANITIZE_efi-stub.o := n
+
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_insn.o = -pg
CFLAGS_REMOVE_return_address.o = -pg
@@ -17,7 +19,8 @@ arm64-obj-y := debug-monitors.o entry.o irq.o fpsimd.o \
sys.o stacktrace.o time.o traps.o io.o vdso.o \
hyp-stub.o psci.o psci-call.o cpu_ops.o insn.o \
return_address.o cpuinfo.o cpu_errata.o \
- cpufeature.o alternative.o cacheinfo.o
+ cpufeature.o alternative.o cacheinfo.o \
+ smp.o smp_spin_table.o topology.o
arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \
sys_compat.o entry32.o \
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index 8b839558838e..19de7537e7d3 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -36,12 +36,6 @@ EXPORT_SYMBOL(acpi_disabled);
int acpi_pci_disabled = 1; /* skip ACPI PCI scan and IRQ initialization */
EXPORT_SYMBOL(acpi_pci_disabled);
-/* Processors with enabled flag and sane MPIDR */
-static int enabled_cpus;
-
-/* Boot CPU is valid or not in MADT */
-static bool bootcpu_valid __initdata;
-
static bool param_acpi_off __initdata;
static bool param_acpi_force __initdata;
@@ -95,122 +89,15 @@ void __init __acpi_unmap_table(char *map, unsigned long size)
early_memunmap(map, size);
}
-/**
- * acpi_map_gic_cpu_interface - generates a logical cpu number
- * and map to MPIDR represented by GICC structure
- */
-static void __init
-acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor)
+/* Whether PSCI 0.2+ is implemented, per the FADT ARM boot flags */
+bool __init acpi_psci_present(void)
{
- int i;
- u64 mpidr = processor->arm_mpidr & MPIDR_HWID_BITMASK;
- bool enabled = !!(processor->flags & ACPI_MADT_ENABLED);
-
- if (mpidr == INVALID_HWID) {
- pr_info("Skip MADT cpu entry with invalid MPIDR\n");
- return;
- }
-
- total_cpus++;
- if (!enabled)
- return;
-
- if (enabled_cpus >= NR_CPUS) {
- pr_warn("NR_CPUS limit of %d reached, Processor %d/0x%llx ignored.\n",
- NR_CPUS, total_cpus, mpidr);
- return;
- }
-
- /* Check if GICC structure of boot CPU is available in the MADT */
- if (cpu_logical_map(0) == mpidr) {
- if (bootcpu_valid) {
- pr_err("Firmware bug, duplicate CPU MPIDR: 0x%llx in MADT\n",
- mpidr);
- return;
- }
-
- bootcpu_valid = true;
- }
-
- /*
- * Duplicate MPIDRs are a recipe for disaster. Scan
- * all initialized entries and check for
- * duplicates. If any is found just ignore the CPU.
- */
- for (i = 1; i < enabled_cpus; i++) {
- if (cpu_logical_map(i) == mpidr) {
- pr_err("Firmware bug, duplicate CPU MPIDR: 0x%llx in MADT\n",
- mpidr);
- return;
- }
- }
-
- if (!acpi_psci_present())
- return;
-
- cpu_ops[enabled_cpus] = cpu_get_ops("psci");
- /* CPU 0 was already initialized */
- if (enabled_cpus) {
- if (!cpu_ops[enabled_cpus])
- return;
-
- if (cpu_ops[enabled_cpus]->cpu_init(NULL, enabled_cpus))
- return;
-
- /* map the logical cpu id to cpu MPIDR */
- cpu_logical_map(enabled_cpus) = mpidr;
- }
-
- enabled_cpus++;
+ return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_COMPLIANT;
}
-static int __init
-acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header,
- const unsigned long end)
+/* Whether HVC must be used instead of SMC as the PSCI conduit */
+bool __init acpi_psci_use_hvc(void)
{
- struct acpi_madt_generic_interrupt *processor;
-
- processor = (struct acpi_madt_generic_interrupt *)header;
-
- if (BAD_MADT_ENTRY(processor, end))
- return -EINVAL;
-
- acpi_table_print_madt_entry(header);
- acpi_map_gic_cpu_interface(processor);
- return 0;
-}
-
-/* Parse GIC cpu interface entries in MADT for SMP init */
-void __init acpi_init_cpus(void)
-{
- int count, i;
-
- /*
- * do a partial walk of MADT to determine how many CPUs
- * we have including disabled CPUs, and get information
- * we need for SMP init
- */
- count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
- acpi_parse_gic_cpu_interface, 0);
-
- if (!count) {
- pr_err("No GIC CPU interface entries present\n");
- return;
- } else if (count < 0) {
- pr_err("Error parsing GIC CPU interface entry\n");
- return;
- }
-
- if (!bootcpu_valid) {
- pr_err("MADT missing boot CPU MPIDR, not enabling secondaries\n");
- return;
- }
-
- for (i = 0; i < enabled_cpus; i++)
- set_cpu_possible(i, true);
-
- /* Make boot-up look pretty */
- pr_info("%d CPUs enabled, %d CPUs total\n", enabled_cpus, total_cpus);
+ return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_USE_HVC;
}
/*
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index a85843ddbde8..3b6d8cc9dfe0 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -51,6 +51,9 @@ EXPORT_SYMBOL(strnlen);
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(memmove);
+EXPORT_SYMBOL(__memset);
+EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(__memmove);
EXPORT_SYMBOL(memchr);
EXPORT_SYMBOL(memcmp);
diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c
index fb8ff9ba467a..b6bd7d447768 100644
--- a/arch/arm64/kernel/cpu_ops.c
+++ b/arch/arm64/kernel/cpu_ops.c
@@ -16,11 +16,13 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-#include <asm/cpu_ops.h>
-#include <asm/smp_plat.h>
+#include <linux/acpi.h>
#include <linux/errno.h>
#include <linux/of.h>
#include <linux/string.h>
+#include <asm/acpi.h>
+#include <asm/cpu_ops.h>
+#include <asm/smp_plat.h>
extern const struct cpu_operations smp_spin_table_ops;
extern const struct cpu_operations cpu_psci_ops;
@@ -28,14 +30,12 @@ extern const struct cpu_operations cpu_psci_ops;
const struct cpu_operations *cpu_ops[NR_CPUS];
static const struct cpu_operations *supported_cpu_ops[] __initconst = {
-#ifdef CONFIG_SMP
&smp_spin_table_ops,
-#endif
&cpu_psci_ops,
NULL,
};
-const struct cpu_operations * __init cpu_get_ops(const char *name)
+static const struct cpu_operations * __init cpu_get_ops(const char *name)
{
const struct cpu_operations **ops = supported_cpu_ops;
@@ -49,39 +49,53 @@ const struct cpu_operations * __init cpu_get_ops(const char *name)
return NULL;
}
+static const char *__init cpu_read_enable_method(int cpu)
+{
+ const char *enable_method;
+
+ if (acpi_disabled) {
+ struct device_node *dn = of_get_cpu_node(cpu, NULL);
+
+ if (!dn) {
+ if (!cpu)
+ pr_err("Failed to find device node for boot cpu\n");
+ return NULL;
+ }
+
+ enable_method = of_get_property(dn, "enable-method", NULL);
+ if (!enable_method) {
+ /*
+ * The boot CPU may not have an enable method (e.g.
+ * when spin-table is used for secondaries).
+ * Don't warn spuriously.
+ */
+ if (cpu != 0)
+ pr_err("%s: missing enable-method property\n",
+ dn->full_name);
+ }
+ } else {
+ enable_method = acpi_get_enable_method(cpu);
+ if (!enable_method)
+ pr_err("Unsupported ACPI enable-method\n");
+ }
+
+ return enable_method;
+}
/*
- * Read a cpu's enable method from the device tree and record it in cpu_ops.
+ * Read a cpu's enable method and record it in cpu_ops.
*/
-int __init cpu_read_ops(struct device_node *dn, int cpu)
+int __init cpu_read_ops(int cpu)
{
- const char *enable_method = of_get_property(dn, "enable-method", NULL);
- if (!enable_method) {
- /*
- * The boot CPU may not have an enable method (e.g. when
- * spin-table is used for secondaries). Don't warn spuriously.
- */
- if (cpu != 0)
- pr_err("%s: missing enable-method property\n",
- dn->full_name);
- return -ENOENT;
- }
+ const char *enable_method = cpu_read_enable_method(cpu);
+
+ if (!enable_method)
+ return -ENODEV;
cpu_ops[cpu] = cpu_get_ops(enable_method);
if (!cpu_ops[cpu]) {
- pr_warn("%s: unsupported enable-method property: %s\n",
- dn->full_name, enable_method);
+ pr_warn("Unsupported enable-method: %s\n", enable_method);
return -EOPNOTSUPP;
}
return 0;
}
-
-void __init cpu_read_bootcpu_ops(void)
-{
- struct device_node *dn = of_get_cpu_node(0, NULL);
- if (!dn) {
- pr_err("Failed to find device node for boot cpu\n");
- return;
- }
- cpu_read_ops(dn, 0);
-}
diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c
index 2bbd0fee084f..7ce589ca54a4 100644
--- a/arch/arm64/kernel/cpuidle.c
+++ b/arch/arm64/kernel/cpuidle.c
@@ -18,15 +18,10 @@
int arm_cpuidle_init(unsigned int cpu)
{
int ret = -EOPNOTSUPP;
- struct device_node *cpu_node = of_cpu_device_node_get(cpu);
-
- if (!cpu_node)
- return -ENODEV;
if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_init_idle)
- ret = cpu_ops[cpu]->cpu_init_idle(cpu_node, cpu);
+ ret = cpu_ops[cpu]->cpu_init_idle(cpu);
- of_node_put(cpu_node);
return ret;
}
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index 0d1d675f2cce..ce31f8d3a91b 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -58,7 +58,7 @@ static u32 mdscr_read(void)
* Allow root to disable self-hosted debug from userspace.
* This is useful if you want to connect an external JTAG debugger.
*/
-static u32 debug_enabled = 1;
+static bool debug_enabled = true;
static int create_debug_debugfs_entry(void)
{
@@ -69,7 +69,7 @@ fs_initcall(create_debug_debugfs_entry);
static int __init early_debug_disable(char *buf)
{
- debug_enabled = 0;
+ debug_enabled = false;
return 0;
}
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index cc7435c9676e..10cee4372bd2 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -62,13 +62,8 @@
/*
* Initial memory map attributes.
*/
-#ifndef CONFIG_SMP
-#define PTE_FLAGS PTE_TYPE_PAGE | PTE_AF
-#define PMD_FLAGS PMD_TYPE_SECT | PMD_SECT_AF
-#else
#define PTE_FLAGS PTE_TYPE_PAGE | PTE_AF | PTE_SHARED
#define PMD_FLAGS PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S
-#endif
#ifdef CONFIG_ARM64_64K_PAGES
#define MM_MMUFLAGS PTE_ATTRINDX(MT_NORMAL) | PTE_FLAGS
@@ -382,7 +377,7 @@ __create_page_tables:
* Create the identity mapping.
*/
mov x0, x25 // idmap_pg_dir
- adrp x3, KERNEL_START // __pa(KERNEL_START)
+ adrp x3, __idmap_text_start // __pa(__idmap_text_start)
#ifndef CONFIG_ARM64_VA_BITS_48
#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3)
@@ -405,11 +400,11 @@ __create_page_tables:
/*
* Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
- * entire kernel image can be ID mapped. As T0SZ == (64 - #bits used),
+ * entire ID map region can be mapped. As T0SZ == (64 - #bits used),
* this number conveniently equals the number of leading zeroes in
- * the physical address of KERNEL_END.
+ * the physical address of __idmap_text_end.
*/
- adrp x5, KERNEL_END
+ adrp x5, __idmap_text_end
clz x5, x5
cmp x5, TCR_T0SZ(VA_BITS) // default T0SZ small enough?
b.ge 1f // .. then skip additional level
@@ -424,8 +419,8 @@ __create_page_tables:
#endif
create_pgd_entry x0, x3, x5, x6
- mov x5, x3 // __pa(KERNEL_START)
- adr_l x6, KERNEL_END // __pa(KERNEL_END)
+ mov x5, x3 // __pa(__idmap_text_start)
+ adr_l x6, __idmap_text_end // __pa(__idmap_text_end)
create_block_map x0, x7, x3, x5, x6
/*
@@ -486,6 +481,9 @@ __mmap_switched:
str_l x21, __fdt_pointer, x5 // Save FDT pointer
str_l x24, memstart_addr, x6 // Save PHYS_OFFSET
mov x29, #0
+#ifdef CONFIG_KASAN
+ bl kasan_early_init
+#endif
b start_kernel
ENDPROC(__mmap_switched)
@@ -621,7 +619,6 @@ ENTRY(__boot_cpu_mode)
.long BOOT_CPU_MODE_EL1
.popsection
-#ifdef CONFIG_SMP
/*
* This provides a "holding pen" in which all secondary
* cores are held until we're ready for them to initialise.
@@ -669,7 +666,6 @@ ENTRY(__secondary_switched)
mov x29, #0
b secondary_start_kernel
ENDPROC(__secondary_switched)
-#endif /* CONFIG_SMP */
/*
* Enable the MMU.
@@ -679,6 +675,7 @@ ENDPROC(__secondary_switched)
*
* other registers depend on the function called upon completion
*/
+ .section ".idmap.text", "ax"
__enable_mmu:
ldr x5, =vectors
msr vbar_el1, x5
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
index 240b75c0e94f..de99d4bd31bb 100644
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -33,9 +33,7 @@ unsigned long irq_err_count;
int arch_show_interrupts(struct seq_file *p, int prec)
{
-#ifdef CONFIG_SMP
show_ipi_list(p, prec);
-#endif
seq_printf(p, "%*s: %10lu\n", prec, "Err", irq_err_count);
return 0;
}
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 876eb8df50bf..f4bc779e62e8 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -21,6 +21,7 @@
#include <linux/bitops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
+#include <linux/kasan.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/moduleloader.h>
@@ -34,9 +35,18 @@
void *module_alloc(unsigned long size)
{
- return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
- GFP_KERNEL, PAGE_KERNEL_EXEC, 0,
- NUMA_NO_NODE, __builtin_return_address(0));
+ void *p;
+
+ p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
+ GFP_KERNEL, PAGE_KERNEL_EXEC, 0,
+ NUMA_NO_NODE, __builtin_return_address(0));
+
+ if (p && (kasan_module_alloc(p, size) < 0)) {
+ vfree(p);
+ return NULL;
+ }
+
+ return p;
}
enum aarch64_reloc_op {
diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c
index 24d4733b7e3c..f67f35b6edb1 100644
--- a/arch/arm64/kernel/psci.c
+++ b/arch/arm64/kernel/psci.c
@@ -15,183 +15,32 @@
#define pr_fmt(fmt) "psci: " fmt
-#include <linux/acpi.h>
#include <linux/init.h>
#include <linux/of.h>
#include <linux/smp.h>
-#include <linux/reboot.h>
-#include <linux/pm.h>
#include <linux/delay.h>
+#include <linux/psci.h>
#include <linux/slab.h>
+
#include <uapi/linux/psci.h>
-#include <asm/acpi.h>
#include <asm/compiler.h>
#include <asm/cpu_ops.h>
#include <asm/errno.h>
-#include <asm/psci.h>
#include <asm/smp_plat.h>
#include <asm/suspend.h>
-#include <asm/system_misc.h>
-
-#define PSCI_POWER_STATE_TYPE_STANDBY 0
-#define PSCI_POWER_STATE_TYPE_POWER_DOWN 1
-
-struct psci_power_state {
- u16 id;
- u8 type;
- u8 affinity_level;
-};
-
-struct psci_operations {
- int (*cpu_suspend)(struct psci_power_state state,
- unsigned long entry_point);
- int (*cpu_off)(struct psci_power_state state);
- int (*cpu_on)(unsigned long cpuid, unsigned long entry_point);
- int (*migrate)(unsigned long cpuid);
- int (*affinity_info)(unsigned long target_affinity,
- unsigned long lowest_affinity_level);
- int (*migrate_info_type)(void);
-};
-
-static struct psci_operations psci_ops;
-
-static int (*invoke_psci_fn)(u64, u64, u64, u64);
-typedef int (*psci_initcall_t)(const struct device_node *);
-
-asmlinkage int __invoke_psci_fn_hvc(u64, u64, u64, u64);
-asmlinkage int __invoke_psci_fn_smc(u64, u64, u64, u64);
-
-enum psci_function {
- PSCI_FN_CPU_SUSPEND,
- PSCI_FN_CPU_ON,
- PSCI_FN_CPU_OFF,
- PSCI_FN_MIGRATE,
- PSCI_FN_AFFINITY_INFO,
- PSCI_FN_MIGRATE_INFO_TYPE,
- PSCI_FN_MAX,
-};
-
-static DEFINE_PER_CPU_READ_MOSTLY(struct psci_power_state *, psci_power_state);
-
-static u32 psci_function_id[PSCI_FN_MAX];
-
-static int psci_to_linux_errno(int errno)
-{
- switch (errno) {
- case PSCI_RET_SUCCESS:
- return 0;
- case PSCI_RET_NOT_SUPPORTED:
- return -EOPNOTSUPP;
- case PSCI_RET_INVALID_PARAMS:
- return -EINVAL;
- case PSCI_RET_DENIED:
- return -EPERM;
- };
-
- return -EINVAL;
-}
-
-static u32 psci_power_state_pack(struct psci_power_state state)
-{
- return ((state.id << PSCI_0_2_POWER_STATE_ID_SHIFT)
- & PSCI_0_2_POWER_STATE_ID_MASK) |
- ((state.type << PSCI_0_2_POWER_STATE_TYPE_SHIFT)
- & PSCI_0_2_POWER_STATE_TYPE_MASK) |
- ((state.affinity_level << PSCI_0_2_POWER_STATE_AFFL_SHIFT)
- & PSCI_0_2_POWER_STATE_AFFL_MASK);
-}
-
-static void psci_power_state_unpack(u32 power_state,
- struct psci_power_state *state)
-{
- state->id = (power_state & PSCI_0_2_POWER_STATE_ID_MASK) >>
- PSCI_0_2_POWER_STATE_ID_SHIFT;
- state->type = (power_state & PSCI_0_2_POWER_STATE_TYPE_MASK) >>
- PSCI_0_2_POWER_STATE_TYPE_SHIFT;
- state->affinity_level =
- (power_state & PSCI_0_2_POWER_STATE_AFFL_MASK) >>
- PSCI_0_2_POWER_STATE_AFFL_SHIFT;
-}
-
-static int psci_get_version(void)
-{
- int err;
-
- err = invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0);
- return err;
-}
-
-static int psci_cpu_suspend(struct psci_power_state state,
- unsigned long entry_point)
-{
- int err;
- u32 fn, power_state;
-
- fn = psci_function_id[PSCI_FN_CPU_SUSPEND];
- power_state = psci_power_state_pack(state);
- err = invoke_psci_fn(fn, power_state, entry_point, 0);
- return psci_to_linux_errno(err);
-}
-static int psci_cpu_off(struct psci_power_state state)
-{
- int err;
- u32 fn, power_state;
+static DEFINE_PER_CPU_READ_MOSTLY(u32 *, psci_power_state);
- fn = psci_function_id[PSCI_FN_CPU_OFF];
- power_state = psci_power_state_pack(state);
- err = invoke_psci_fn(fn, power_state, 0, 0);
- return psci_to_linux_errno(err);
-}
-
-static int psci_cpu_on(unsigned long cpuid, unsigned long entry_point)
-{
- int err;
- u32 fn;
-
- fn = psci_function_id[PSCI_FN_CPU_ON];
- err = invoke_psci_fn(fn, cpuid, entry_point, 0);
- return psci_to_linux_errno(err);
-}
-
-static int psci_migrate(unsigned long cpuid)
-{
- int err;
- u32 fn;
-
- fn = psci_function_id[PSCI_FN_MIGRATE];
- err = invoke_psci_fn(fn, cpuid, 0, 0);
- return psci_to_linux_errno(err);
-}
-
-static int psci_affinity_info(unsigned long target_affinity,
- unsigned long lowest_affinity_level)
-{
- int err;
- u32 fn;
-
- fn = psci_function_id[PSCI_FN_AFFINITY_INFO];
- err = invoke_psci_fn(fn, target_affinity, lowest_affinity_level, 0);
- return err;
-}
-
-static int psci_migrate_info_type(void)
-{
- int err;
- u32 fn;
-
- fn = psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE];
- err = invoke_psci_fn(fn, 0, 0, 0);
- return err;
-}
-
-static int __maybe_unused cpu_psci_cpu_init_idle(struct device_node *cpu_node,
- unsigned int cpu)
+static int __maybe_unused cpu_psci_cpu_init_idle(unsigned int cpu)
{
int i, ret, count = 0;
- struct psci_power_state *psci_states;
- struct device_node *state_node;
+ u32 *psci_states;
+ struct device_node *state_node, *cpu_node;
+
+ cpu_node = of_get_cpu_node(cpu, NULL);
+ if (!cpu_node)
+ return -ENODEV;
/*
* If the PSCI cpu_suspend function hook has not been initialized
@@ -215,13 +64,13 @@ static int __maybe_unused cpu_psci_cpu_init_idle(struct device_node *cpu_node,
return -ENOMEM;
for (i = 0; i < count; i++) {
- u32 psci_power_state;
+ u32 state;
state_node = of_parse_phandle(cpu_node, "cpu-idle-states", i);
ret = of_property_read_u32(state_node,
"arm,psci-suspend-param",
- &psci_power_state);
+ &state);
if (ret) {
pr_warn(" * %s missing arm,psci-suspend-param property\n",
state_node->full_name);
@@ -230,9 +79,13 @@ static int __maybe_unused cpu_psci_cpu_init_idle(struct device_node *cpu_node,
}
of_node_put(state_node);
- pr_debug("psci-power-state %#x index %d\n", psci_power_state,
- i);
- psci_power_state_unpack(psci_power_state, &psci_states[i]);
+ pr_debug("psci-power-state %#x index %d\n", state, i);
+ if (!psci_power_state_is_valid(state)) {
+ pr_warn("Invalid PSCI power state %#x\n", state);
+ ret = -EINVAL;
+ goto free_mem;
+ }
+ psci_states[i] = state;
}
/* Idle states parsed correctly, initialize per-cpu pointer */
per_cpu(psci_power_state, cpu) = psci_states;
@@ -243,208 +96,7 @@ free_mem:
return ret;
}
-static int get_set_conduit_method(struct device_node *np)
-{
- const char *method;
-
- pr_info("probing for conduit method from DT.\n");
-
- if (of_property_read_string(np, "method", &method)) {
- pr_warn("missing \"method\" property\n");
- return -ENXIO;
- }
-
- if (!strcmp("hvc", method)) {
- invoke_psci_fn = __invoke_psci_fn_hvc;
- } else if (!strcmp("smc", method)) {
- invoke_psci_fn = __invoke_psci_fn_smc;
- } else {
- pr_warn("invalid \"method\" property: %s\n", method);
- return -EINVAL;
- }
- return 0;
-}
-
-static void psci_sys_reset(enum reboot_mode reboot_mode, const char *cmd)
-{
- invoke_psci_fn(PSCI_0_2_FN_SYSTEM_RESET, 0, 0, 0);
-}
-
-static void psci_sys_poweroff(void)
-{
- invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0);
-}
-
-static void __init psci_0_2_set_functions(void)
-{
- pr_info("Using standard PSCI v0.2 function IDs\n");
- psci_function_id[PSCI_FN_CPU_SUSPEND] = PSCI_0_2_FN64_CPU_SUSPEND;
- psci_ops.cpu_suspend = psci_cpu_suspend;
-
- psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF;
- psci_ops.cpu_off = psci_cpu_off;
-
- psci_function_id[PSCI_FN_CPU_ON] = PSCI_0_2_FN64_CPU_ON;
- psci_ops.cpu_on = psci_cpu_on;
-
- psci_function_id[PSCI_FN_MIGRATE] = PSCI_0_2_FN64_MIGRATE;
- psci_ops.migrate = psci_migrate;
-
- psci_function_id[PSCI_FN_AFFINITY_INFO] = PSCI_0_2_FN64_AFFINITY_INFO;
- psci_ops.affinity_info = psci_affinity_info;
-
- psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE] =
- PSCI_0_2_FN_MIGRATE_INFO_TYPE;
- psci_ops.migrate_info_type = psci_migrate_info_type;
-
- arm_pm_restart = psci_sys_reset;
-
- pm_power_off = psci_sys_poweroff;
-}
-
-/*
- * Probe function for PSCI firmware versions >= 0.2
- */
-static int __init psci_probe(void)
-{
- int ver = psci_get_version();
-
- if (ver == PSCI_RET_NOT_SUPPORTED) {
- /*
- * PSCI versions >=0.2 mandates implementation of
- * PSCI_VERSION.
- */
- pr_err("PSCI firmware does not comply with the v0.2 spec.\n");
- return -EOPNOTSUPP;
- } else {
- pr_info("PSCIv%d.%d detected in firmware.\n",
- PSCI_VERSION_MAJOR(ver),
- PSCI_VERSION_MINOR(ver));
-
- if (PSCI_VERSION_MAJOR(ver) == 0 &&
- PSCI_VERSION_MINOR(ver) < 2) {
- pr_err("Conflicting PSCI version detected.\n");
- return -EINVAL;
- }
- }
-
- psci_0_2_set_functions();
-
- return 0;
-}
-
-/*
- * PSCI init function for PSCI versions >=0.2
- *
- * Probe based on PSCI PSCI_VERSION function
- */
-static int __init psci_0_2_init(struct device_node *np)
-{
- int err;
-
- err = get_set_conduit_method(np);
-
- if (err)
- goto out_put_node;
- /*
- * Starting with v0.2, the PSCI specification introduced a call
- * (PSCI_VERSION) that allows probing the firmware version, so
- * that PSCI function IDs and version specific initialization
- * can be carried out according to the specific version reported
- * by firmware
- */
- err = psci_probe();
-
-out_put_node:
- of_node_put(np);
- return err;
-}
-
-/*
- * PSCI < v0.2 get PSCI Function IDs via DT.
- */
-static int __init psci_0_1_init(struct device_node *np)
-{
- u32 id;
- int err;
-
- err = get_set_conduit_method(np);
-
- if (err)
- goto out_put_node;
-
- pr_info("Using PSCI v0.1 Function IDs from DT\n");
-
- if (!of_property_read_u32(np, "cpu_suspend", &id)) {
- psci_function_id[PSCI_FN_CPU_SUSPEND] = id;
- psci_ops.cpu_suspend = psci_cpu_suspend;
- }
-
- if (!of_property_read_u32(np, "cpu_off", &id)) {
- psci_function_id[PSCI_FN_CPU_OFF] = id;
- psci_ops.cpu_off = psci_cpu_off;
- }
-
- if (!of_property_read_u32(np, "cpu_on", &id)) {
- psci_function_id[PSCI_FN_CPU_ON] = id;
- psci_ops.cpu_on = psci_cpu_on;
- }
-
- if (!of_property_read_u32(np, "migrate", &id)) {
- psci_function_id[PSCI_FN_MIGRATE] = id;
- psci_ops.migrate = psci_migrate;
- }
-
-out_put_node:
- of_node_put(np);
- return err;
-}
-
-static const struct of_device_id psci_of_match[] __initconst = {
- { .compatible = "arm,psci", .data = psci_0_1_init},
- { .compatible = "arm,psci-0.2", .data = psci_0_2_init},
- {},
-};
-
-int __init psci_dt_init(void)
-{
- struct device_node *np;
- const struct of_device_id *matched_np;
- psci_initcall_t init_fn;
-
- np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);
-
- if (!np)
- return -ENODEV;
-
- init_fn = (psci_initcall_t)matched_np->data;
- return init_fn(np);
-}
-
-/*
- * We use PSCI 0.2+ when ACPI is deployed on ARM64 and it's
- * explicitly clarified in SBBR
- */
-int __init psci_acpi_init(void)
-{
- if (!acpi_psci_present()) {
- pr_info("is not implemented in ACPI.\n");
- return -EOPNOTSUPP;
- }
-
- pr_info("probing for conduit method from ACPI.\n");
-
- if (acpi_psci_use_hvc())
- invoke_psci_fn = __invoke_psci_fn_hvc;
- else
- invoke_psci_fn = __invoke_psci_fn_smc;
-
- return psci_probe();
-}
-
-#ifdef CONFIG_SMP
-
-static int __init cpu_psci_cpu_init(struct device_node *dn, unsigned int cpu)
+static int __init cpu_psci_cpu_init(unsigned int cpu)
{
return 0;
}
@@ -474,6 +126,11 @@ static int cpu_psci_cpu_disable(unsigned int cpu)
/* Fail early if we don't have CPU_OFF support */
if (!psci_ops.cpu_off)
return -EOPNOTSUPP;
+
+ /* Trusted OS will deny CPU_OFF */
+ if (psci_tos_resident_on(cpu))
+ return -EPERM;
+
return 0;
}
@@ -484,9 +141,8 @@ static void cpu_psci_cpu_die(unsigned int cpu)
* There are no known implementations of PSCI actually using the
* power state field, so pass a sensible default for now.
*/
- struct psci_power_state state = {
- .type = PSCI_POWER_STATE_TYPE_POWER_DOWN,
- };
+ u32 state = PSCI_POWER_STATE_TYPE_POWER_DOWN <<
+ PSCI_0_2_POWER_STATE_TYPE_SHIFT;
ret = psci_ops.cpu_off(state);
@@ -498,7 +154,7 @@ static int cpu_psci_cpu_kill(unsigned int cpu)
int err, i;
if (!psci_ops.affinity_info)
- return 1;
+ return 0;
/*
* cpu_kill could race with cpu_die and we can
* potentially end up declaring this cpu undead
@@ -509,7 +165,7 @@ static int cpu_psci_cpu_kill(unsigned int cpu)
err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);
if (err == PSCI_0_2_AFFINITY_LEVEL_OFF) {
pr_info("CPU%d killed.\n", cpu);
- return 1;
+ return 0;
}
msleep(10);
@@ -518,15 +174,13 @@ static int cpu_psci_cpu_kill(unsigned int cpu)
pr_warn("CPU%d may not have shut down cleanly (AFFINITY_INFO reports %d)\n",
cpu, err);
- /* Make op_cpu_kill() fail. */
- return 0;
+ return -ETIMEDOUT;
}
#endif
-#endif
static int psci_suspend_finisher(unsigned long index)
{
- struct psci_power_state *state = __this_cpu_read(psci_power_state);
+ u32 *state = __this_cpu_read(psci_power_state);
return psci_ops.cpu_suspend(state[index - 1],
virt_to_phys(cpu_resume));
@@ -535,7 +189,7 @@ static int psci_suspend_finisher(unsigned long index)
static int __maybe_unused cpu_psci_cpu_suspend(unsigned long index)
{
int ret;
- struct psci_power_state *state = __this_cpu_read(psci_power_state);
+ u32 *state = __this_cpu_read(psci_power_state);
/*
* idle state index 0 corresponds to wfi, should never be called
* from the cpu_suspend operations
@@ -543,7 +197,7 @@ static int __maybe_unused cpu_psci_cpu_suspend(unsigned long index)
if (WARN_ON_ONCE(!index))
return -EINVAL;
- if (state[index - 1].type == PSCI_POWER_STATE_TYPE_STANDBY)
+ if (!psci_power_state_loses_context(state[index - 1]))
ret = psci_ops.cpu_suspend(state[index - 1], 0);
else
ret = cpu_suspend(index, psci_suspend_finisher);
@@ -557,7 +211,6 @@ const struct cpu_operations cpu_psci_ops = {
.cpu_init_idle = cpu_psci_cpu_init_idle,
.cpu_suspend = cpu_psci_cpu_suspend,
#endif
-#ifdef CONFIG_SMP
.cpu_init = cpu_psci_cpu_init,
.cpu_prepare = cpu_psci_cpu_prepare,
.cpu_boot = cpu_psci_cpu_boot,
@@ -566,6 +219,5 @@ const struct cpu_operations cpu_psci_ops = {
.cpu_die = cpu_psci_cpu_die,
.cpu_kill = cpu_psci_cpu_kill,
#endif
-#endif
};
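To illustrate the raw u32 encoding that replaces struct
psci_power_state, the default power-down state passed by
cpu_psci_cpu_die() packs as follows (shift values from
uapi/linux/psci.h; state id and affinity level are zero):

	u32 state = PSCI_POWER_STATE_TYPE_POWER_DOWN <<
		    PSCI_0_2_POWER_STATE_TYPE_SHIFT;	/* == 0x00010000 */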
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index bbdb53b87e13..668931813113 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -46,6 +46,7 @@
#include <linux/of_platform.h>
#include <linux/efi.h>
#include <linux/personality.h>
+#include <linux/psci.h>
#include <asm/acpi.h>
#include <asm/fixmap.h>
@@ -54,6 +55,7 @@
#include <asm/elf.h>
#include <asm/cpufeature.h>
#include <asm/cpu_ops.h>
+#include <asm/kasan.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/smp_plat.h>
@@ -61,7 +63,6 @@
#include <asm/tlbflush.h>
#include <asm/traps.h>
#include <asm/memblock.h>
-#include <asm/psci.h>
#include <asm/efi.h>
#include <asm/virt.h>
@@ -142,7 +143,6 @@ bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
}
struct mpidr_hash mpidr_hash;
-#ifdef CONFIG_SMP
/**
* smp_build_mpidr_hash - Pre-compute shifts required at each affinity
* level in order to build a linear index from an
@@ -208,7 +208,6 @@ static void __init smp_build_mpidr_hash(void)
pr_warn("Large number of MPIDR hash buckets detected\n");
__flush_dcache_area(&mpidr_hash, sizeof(struct mpidr_hash));
}
-#endif
static void __init hyp_mode_check(void)
{
@@ -368,6 +367,67 @@ static void __init request_standard_resources(void)
}
}
+#ifdef CONFIG_BLK_DEV_INITRD
+/*
+ * Relocate initrd if it is not completely within the linear mapping.
+ * This would be the case if mem= cuts out all or part of it.
+ */
+static void __init relocate_initrd(void)
+{
+ phys_addr_t orig_start = __virt_to_phys(initrd_start);
+ phys_addr_t orig_end = __virt_to_phys(initrd_end);
+ phys_addr_t ram_end = memblock_end_of_DRAM();
+ phys_addr_t new_start;
+ unsigned long size, to_free = 0;
+ void *dest;
+
+ if (orig_end <= ram_end)
+ return;
+
+ /*
+ * Any part of the original initrd which overlaps the linear map should
+ * be freed after relocating.
+ */
+ if (orig_start < ram_end)
+ to_free = ram_end - orig_start;
+
+ size = orig_end - orig_start;
+
+ /* initrd needs to be relocated completely inside linear mapping */
+ new_start = memblock_find_in_range(0, PFN_PHYS(max_pfn),
+ size, PAGE_SIZE);
+ if (!new_start)
+ panic("Cannot relocate initrd of size %ld\n", size);
+ memblock_reserve(new_start, size);
+
+ initrd_start = __phys_to_virt(new_start);
+ initrd_end = initrd_start + size;
+
+ pr_info("Moving initrd from [%llx-%llx] to [%llx-%llx]\n",
+ orig_start, orig_start + size - 1,
+ new_start, new_start + size - 1);
+
+ dest = (void *)initrd_start;
+
+ if (to_free) {
+ memcpy(dest, (void *)__phys_to_virt(orig_start), to_free);
+ dest += to_free;
+ }
+
+ copy_from_early_mem(dest, orig_start + to_free, size - to_free);
+
+ if (to_free) {
+ pr_info("Freeing original RAMDISK from [%llx-%llx]\n",
+ orig_start, orig_start + to_free - 1);
+ memblock_free(orig_start, to_free);
+ }
+}
+#else
+static inline void __init relocate_initrd(void)
+{
+}
+#endif
+
u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
void __init setup_arch(char **cmdline_p)
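A worked scenario for the relocate_initrd() logic above (all numbers
illustrative): suppose mem= trims RAM so it ends at 0x80000000 and the
bootloader placed a 16 MiB initrd at 0x7ff00000. Then orig_end
(0x80f00000) exceeds ram_end, to_free is 1 MiB, and a fresh 16 MiB
block is reserved inside the linear map. The first 1 MiB, still mapped,
is copied with memcpy(); the remaining 15 MiB arrive via
copy_from_early_mem(); finally the 1 MiB of the original that
overlapped the linear map is returned with memblock_free().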
@@ -401,6 +461,10 @@ void __init setup_arch(char **cmdline_p)
acpi_boot_table_init();
paging_init();
+ relocate_initrd();
+
+ kasan_init();
+
request_standard_resources();
early_ioremap_reset();
@@ -408,18 +472,13 @@ void __init setup_arch(char **cmdline_p)
if (acpi_disabled) {
unflatten_device_tree();
psci_dt_init();
- cpu_read_bootcpu_ops();
-#ifdef CONFIG_SMP
- of_smp_init_cpus();
-#endif
} else {
psci_acpi_init();
- acpi_init_cpus();
}
-#ifdef CONFIG_SMP
+ cpu_read_bootcpu_ops();
+ smp_init_cpus();
smp_build_mpidr_hash();
-#endif
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
@@ -519,9 +578,7 @@ static int c_show(struct seq_file *m, void *v)
* online processors, looking for lines beginning with
* "processor". Give glibc what it expects.
*/
-#ifdef CONFIG_SMP
seq_printf(m, "processor\t: %d\n", i);
-#endif
seq_printf(m, "BogoMIPS\t: %lu.%02lu\n",
loops_per_jiffy / (500000UL/HZ),
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index ede186cdd452..5686a3ae3940 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -82,7 +82,6 @@ ENTRY(__cpu_suspend_enter)
str x2, [x0, #CPU_CTX_SP]
ldr x1, =sleep_save_sp
ldr x1, [x1, #SLEEP_SAVE_SP_VIRT]
-#ifdef CONFIG_SMP
mrs x7, mpidr_el1
ldr x9, =mpidr_hash
ldr x10, [x9, #MPIDR_HASH_MASK]
@@ -94,7 +93,6 @@ ENTRY(__cpu_suspend_enter)
ldp w5, w6, [x9, #(MPIDR_HASH_SHIFTS + 8)]
compute_mpidr_hash x8, x3, x4, x5, x6, x7, x10
add x1, x1, x8, lsl #3
-#endif
bl __cpu_suspend_save
/*
* Grab suspend finisher in x20 and its argument in x19
@@ -130,12 +128,14 @@ ENDPROC(__cpu_suspend_enter)
/*
* x0 must contain the sctlr value retrieved from restored context
*/
+ .pushsection ".idmap.text", "ax"
ENTRY(cpu_resume_mmu)
ldr x3, =cpu_resume_after_mmu
msr sctlr_el1, x0 // restore sctlr_el1
isb
br x3 // global jump to virtual address
ENDPROC(cpu_resume_mmu)
+ .popsection
cpu_resume_after_mmu:
mov x0, #0 // return zero on success
ldp x19, x20, [sp, #16]
@@ -149,7 +149,6 @@ ENDPROC(cpu_resume_after_mmu)
ENTRY(cpu_resume)
bl el2_setup // if in EL2 drop to EL1 cleanly
-#ifdef CONFIG_SMP
mrs x1, mpidr_el1
adrp x8, mpidr_hash
add x8, x8, #:lo12:mpidr_hash // x8 = struct mpidr_hash phys address
@@ -159,18 +158,13 @@ ENTRY(cpu_resume)
ldp w5, w6, [x8, #(MPIDR_HASH_SHIFTS + 8)]
compute_mpidr_hash x7, x3, x4, x5, x6, x1, x2
/* x7 contains hash index, let's use it to grab context pointer */
-#else
- mov x7, xzr
-#endif
- adrp x0, sleep_save_sp
- add x0, x0, #:lo12:sleep_save_sp
- ldr x0, [x0, #SLEEP_SAVE_SP_PHYS]
+ ldr_l x0, sleep_save_sp + SLEEP_SAVE_SP_PHYS
ldr x0, [x0, x7, lsl #3]
/* load sp from context */
ldr x2, [x0, #CPU_CTX_SP]
- adrp x1, sleep_idmap_phys
/* load physical address of identity map page table in x1 */
- ldr x1, [x1, #:lo12:sleep_idmap_phys]
+ adrp x1, idmap_pg_dir
mov sp, x2
/*
* cpu_do_resume expects x0 to contain context physical address
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index d3a202b85ba6..a1e6ed5f0d06 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -17,6 +17,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <linux/acpi.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/spinlock.h>
@@ -248,7 +249,7 @@ static int op_cpu_kill(unsigned int cpu)
* time and hope that it's dead, so let's skip the wait and just hope.
*/
if (!cpu_ops[cpu]->cpu_kill)
- return 1;
+ return 0;
return cpu_ops[cpu]->cpu_kill(cpu);
}
@@ -261,6 +262,8 @@ static DECLARE_COMPLETION(cpu_died);
*/
void __cpu_die(unsigned int cpu)
{
+ int err;
+
if (!wait_for_completion_timeout(&cpu_died, msecs_to_jiffies(5000))) {
pr_crit("CPU%u: cpu didn't die\n", cpu);
return;
@@ -273,8 +276,10 @@ void __cpu_die(unsigned int cpu)
* verify that it has really left the kernel before we consider
* clobbering anything it might still be using.
*/
- if (!op_cpu_kill(cpu))
- pr_warn("CPU%d may not have shut down cleanly\n", cpu);
+ err = op_cpu_kill(cpu);
+ if (err)
+ pr_warn("CPU%d may not have shut down cleanly: %d\n",
+ cpu, err);
}
/*
@@ -318,57 +323,158 @@ void __init smp_prepare_boot_cpu(void)
set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
}
+static u64 __init of_get_cpu_mpidr(struct device_node *dn)
+{
+ const __be32 *cell;
+ u64 hwid;
+
+ /*
+ * A cpu node without a "reg" property is considered
+ * invalid: no cpu_logical_map entry can be built
+ * from it.
+ */
+ cell = of_get_property(dn, "reg", NULL);
+ if (!cell) {
+ pr_err("%s: missing reg property\n", dn->full_name);
+ return INVALID_HWID;
+ }
+
+ hwid = of_read_number(cell, of_n_addr_cells(dn));
+ /*
+ * Non-affinity bits must be set to 0 in the DT.
+ */
+ if (hwid & ~MPIDR_HWID_BITMASK) {
+ pr_err("%s: invalid reg property\n", dn->full_name);
+ return INVALID_HWID;
+ }
+ return hwid;
+}
+
+/*
+ * Duplicate MPIDRs are a recipe for disaster. Scan all initialized
+ * entries and check for duplicates. If one is found, just ignore
+ * that cpu. cpu_logical_map was initialized to INVALID_HWID to
+ * avoid matching valid MPIDR values.
+ */
+static bool __init is_mpidr_duplicate(unsigned int cpu, u64 hwid)
+{
+ unsigned int i;
+
+ for (i = 1; (i < cpu) && (i < NR_CPUS); i++)
+ if (cpu_logical_map(i) == hwid)
+ return true;
+ return false;
+}
+
+/*
+ * Initialize cpu operations for a logical cpu and
+ * set it in the possible mask on success
+ */
+static int __init smp_cpu_setup(int cpu)
+{
+ if (cpu_read_ops(cpu))
+ return -ENODEV;
+
+ if (cpu_ops[cpu]->cpu_init(cpu))
+ return -ENODEV;
+
+ set_cpu_possible(cpu, true);
+
+ return 0;
+}
+
+static bool bootcpu_valid __initdata;
+static unsigned int cpu_count = 1;
+
+#ifdef CONFIG_ACPI
+/*
+ * acpi_map_gic_cpu_interface - parse processor MADT entry
+ *
+ * Carry out sanity checks on MADT processor entry and initialize
+ * cpu_logical_map on success
+ */
+static void __init
+acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor)
+{
+ u64 hwid = processor->arm_mpidr;
+
+ if (hwid & ~MPIDR_HWID_BITMASK || hwid == INVALID_HWID) {
+ pr_err("skipping CPU entry with invalid MPIDR 0x%llx\n", hwid);
+ return;
+ }
+
+ if (!(processor->flags & ACPI_MADT_ENABLED)) {
+ pr_err("skipping disabled CPU entry with 0x%llx MPIDR\n", hwid);
+ return;
+ }
+
+ if (is_mpidr_duplicate(cpu_count, hwid)) {
+ pr_err("duplicate CPU MPIDR 0x%llx in MADT\n", hwid);
+ return;
+ }
+
+ /* Check if GICC structure of boot CPU is available in the MADT */
+ if (cpu_logical_map(0) == hwid) {
+ if (bootcpu_valid) {
+ pr_err("duplicate boot CPU MPIDR: 0x%llx in MADT\n",
+ hwid);
+ return;
+ }
+ bootcpu_valid = true;
+ return;
+ }
+
+ if (cpu_count >= NR_CPUS)
+ return;
+
+ /* map the logical cpu id to cpu MPIDR */
+ cpu_logical_map(cpu_count) = hwid;
+
+ cpu_count++;
+}
+
+static int __init
+acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header,
+ const unsigned long end)
+{
+ struct acpi_madt_generic_interrupt *processor;
+
+ processor = (struct acpi_madt_generic_interrupt *)header;
+ if (BAD_MADT_ENTRY(processor, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ acpi_map_gic_cpu_interface(processor);
+
+ return 0;
+}
+#else
+#define acpi_table_parse_madt(...) do { } while (0)
+#endif
+
/*
* Enumerate the possible CPU set from the device tree and build the
* cpu logical map array containing MPIDR values related to logical
* cpus. Assumes that cpu_logical_map(0) has already been initialized.
*/
-void __init of_smp_init_cpus(void)
+void __init of_parse_and_init_cpus(void)
{
struct device_node *dn = NULL;
- unsigned int i, cpu = 1;
- bool bootcpu_valid = false;
while ((dn = of_find_node_by_type(dn, "cpu"))) {
- const u32 *cell;
- u64 hwid;
+ u64 hwid = of_get_cpu_mpidr(dn);
- /*
- * A cpu node with missing "reg" property is
- * considered invalid to build a cpu_logical_map
- * entry.
- */
- cell = of_get_property(dn, "reg", NULL);
- if (!cell) {
- pr_err("%s: missing reg property\n", dn->full_name);
+ if (hwid == INVALID_HWID)
goto next;
- }
- hwid = of_read_number(cell, of_n_addr_cells(dn));
- /*
- * Non affinity bits must be set to 0 in the DT
- */
- if (hwid & ~MPIDR_HWID_BITMASK) {
- pr_err("%s: invalid reg property\n", dn->full_name);
+ if (is_mpidr_duplicate(cpu_count, hwid)) {
+ pr_err("%s: duplicate cpu reg properties in the DT\n",
+ dn->full_name);
goto next;
}
/*
- * Duplicate MPIDRs are a recipe for disaster. Scan
- * all initialized entries and check for
- * duplicates. If any is found just ignore the cpu.
- * cpu_logical_map was initialized to INVALID_HWID to
- * avoid matching valid MPIDR values.
- */
- for (i = 1; (i < cpu) && (i < NR_CPUS); i++) {
- if (cpu_logical_map(i) == hwid) {
- pr_err("%s: duplicate cpu reg properties in the DT\n",
- dn->full_name);
- goto next;
- }
- }
-
- /*
* The numbering scheme requires that the boot CPU
* must be assigned logical id 0. Record it so that
* the logical map built from DT is validated and can
@@ -392,38 +498,58 @@ void __init of_smp_init_cpus(void)
continue;
}
- if (cpu >= NR_CPUS)
- goto next;
-
- if (cpu_read_ops(dn, cpu) != 0)
- goto next;
-
- if (cpu_ops[cpu]->cpu_init(dn, cpu))
+ if (cpu_count >= NR_CPUS)
goto next;
pr_debug("cpu logical map 0x%llx\n", hwid);
- cpu_logical_map(cpu) = hwid;
+ cpu_logical_map(cpu_count) = hwid;
next:
- cpu++;
+ cpu_count++;
}
+}
+
+/*
+ * Enumerate the possible CPU set from the device tree or ACPI and build the
+ * cpu logical map array containing MPIDR values related to logical
+ * cpus. Assumes that cpu_logical_map(0) has already been initialized.
+ */
+void __init smp_init_cpus(void)
+{
+ int i;
- /* sanity check */
- if (cpu > NR_CPUS)
- pr_warning("no. of cores (%d) greater than configured maximum of %d - clipping\n",
- cpu, NR_CPUS);
+ if (acpi_disabled)
+ of_parse_and_init_cpus();
+ else
+ /*
+ * Walk the MADT to determine how many CPUs we have,
+ * including disabled CPUs, and to gather the
+ * information needed for SMP init.
+ */
+ acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
+ acpi_parse_gic_cpu_interface, 0);
+
+ if (cpu_count > NR_CPUS)
+ pr_warn("no. of cores (%d) greater than configured maximum of %d - clipping\n",
+ cpu_count, NR_CPUS);
if (!bootcpu_valid) {
- pr_err("DT missing boot CPU MPIDR, not enabling secondaries\n");
+ pr_err("missing boot CPU MPIDR, not enabling secondaries\n");
return;
}
/*
- * All the cpus that made it to the cpu_logical_map have been
- * validated so set them as possible cpus.
+ * We need to set the cpu_logical_map entries before enabling
+ * the cpus so that cpu processor description entries (DT cpu nodes
+ * and ACPI MADT entries) can be retrieved by matching the cpu hwid
+ * with entries in cpu_logical_map while initializing the cpus.
+ * If the cpu set-up fails, invalidate the cpu_logical_map entry.
*/
- for (i = 0; i < NR_CPUS; i++)
- if (cpu_logical_map(i) != INVALID_HWID)
- set_cpu_possible(i, true);
+ for (i = 1; i < NR_CPUS; i++) {
+ if (cpu_logical_map(i) != INVALID_HWID) {
+ if (smp_cpu_setup(i))
+ cpu_logical_map(i) = INVALID_HWID;
+ }
+ }
}
void __init smp_prepare_cpus(unsigned int max_cpus)
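Both enumeration paths (DT and ACPI) go through is_mpidr_duplicate() before claiming a cpu_logical_map slot; slot 0 is skipped because the boot CPU legitimately owns it. A small user-space model of the check (NR_CPUS and the hwid values here are made up for illustration):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define NR_CPUS		8
	#define INVALID_HWID	UINT64_MAX

	/* GNU range initializer, as in the kernel's __cpu_logical_map. */
	static uint64_t cpu_logical_map[NR_CPUS] = {
		[0 ... NR_CPUS - 1] = INVALID_HWID,
	};

	static bool is_mpidr_duplicate(unsigned int cpu, uint64_t hwid)
	{
		unsigned int i;

		for (i = 1; i < cpu && i < NR_CPUS; i++)
			if (cpu_logical_map[i] == hwid)
				return true;
		return false;
	}

	int main(void)
	{
		cpu_logical_map[1] = 0x100;
		printf("%d\n", is_mpidr_duplicate(2, 0x100));	/* 1: duplicate */
		printf("%d\n", is_mpidr_duplicate(2, 0x200));	/* 0: fresh */
		return 0;
	}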
diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c
index 14944e5b28da..aef3605a8c47 100644
--- a/arch/arm64/kernel/smp_spin_table.c
+++ b/arch/arm64/kernel/smp_spin_table.c
@@ -49,8 +49,14 @@ static void write_pen_release(u64 val)
}
-static int smp_spin_table_cpu_init(struct device_node *dn, unsigned int cpu)
+static int smp_spin_table_cpu_init(unsigned int cpu)
{
+ struct device_node *dn;
+
+ dn = of_get_cpu_node(cpu, NULL);
+ if (!dn)
+ return -ENODEV;
+
/*
* Determine the address from which the CPU is polling.
*/
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index 357418137db7..dd6ad81d53aa 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -132,7 +132,6 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
}
struct sleep_save_sp sleep_save_sp;
-phys_addr_t sleep_idmap_phys;
static int __init cpu_suspend_init(void)
{
@@ -146,9 +145,7 @@ static int __init cpu_suspend_init(void)
sleep_save_sp.save_ptr_stash = ctx_ptr;
sleep_save_sp.save_ptr_stash_phys = virt_to_phys(ctx_ptr);
- sleep_idmap_phys = virt_to_phys(idmap_pg_dir);
__flush_dcache_area(&sleep_save_sp, sizeof(struct sleep_save_sp));
- __flush_dcache_area(&sleep_idmap_phys, sizeof(sleep_idmap_phys));
return 0;
}
diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c
index 42f9195cf2f8..149151fb42bb 100644
--- a/arch/arm64/kernel/time.c
+++ b/arch/arm64/kernel/time.c
@@ -42,7 +42,6 @@
#include <asm/thread_info.h>
#include <asm/stacktrace.h>
-#ifdef CONFIG_SMP
unsigned long profile_pc(struct pt_regs *regs)
{
struct stackframe frame;
@@ -62,7 +61,6 @@ unsigned long profile_pc(struct pt_regs *regs)
return frame.pc;
}
EXPORT_SYMBOL(profile_pc);
-#endif
void __init time_init(void)
{
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 1ef2940df13c..0a7bc087838e 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -179,11 +179,7 @@ void show_stack(struct task_struct *tsk, unsigned long *sp)
#else
#define S_PREEMPT ""
#endif
-#ifdef CONFIG_SMP
#define S_SMP " SMP"
-#else
-#define S_SMP ""
-#endif
static int __die(const char *str, int err, struct thread_info *thread,
struct pt_regs *regs)
diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso/vdso.S
index 60c1db54b41a..82379a70ef03 100644
--- a/arch/arm64/kernel/vdso/vdso.S
+++ b/arch/arm64/kernel/vdso/vdso.S
@@ -21,9 +21,8 @@
#include <linux/const.h>
#include <asm/page.h>
- __PAGE_ALIGNED_DATA
-
.globl vdso_start, vdso_end
+ .section .rodata
.balign PAGE_SIZE
vdso_start:
.incbin "arch/arm64/kernel/vdso/vdso.so"
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index aff07bcad882..4d77757b5894 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -38,6 +38,12 @@ jiffies = jiffies_64;
*(.hyp.text) \
VMLINUX_SYMBOL(__hyp_text_end) = .;
+#define IDMAP_TEXT \
+ . = ALIGN(SZ_4K); \
+ VMLINUX_SYMBOL(__idmap_text_start) = .; \
+ *(.idmap.text) \
+ VMLINUX_SYMBOL(__idmap_text_end) = .;
+
/*
* The size of the PE/COFF section that covers the kernel image, which
* runs from stext to _edata, must be a round multiple of the PE/COFF
@@ -98,6 +104,7 @@ SECTIONS
SCHED_TEXT
LOCK_TEXT
HYPERVISOR_TEXT
+ IDMAP_TEXT
*(.fixup)
*(.gnu.warning)
. = ALIGN(16);
@@ -170,11 +177,13 @@ SECTIONS
}
/*
- * The HYP init code can't be more than a page long,
+ * The HYP init code and ID map text can't be longer than a page each,
* and should not cross a page boundary.
*/
ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
"HYP init code too big or misaligned")
+ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
+ "ID map text too big or misaligned")
/*
* If padding is applied before .head.text, virt<->phys conversions will fail.
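The two ASSERTs above share one idiom: a region fits in a single 4 KiB page exactly when its end lies within 4 KiB of the page base of its start. A quick user-space check of that predicate (fits_in_one_page() is an invented name):

	#include <assert.h>
	#include <stdint.h>

	#define SZ_4K	0x1000UL

	/* Same expression the linker ASSERTs use: end must be within one
	 * page of the page base of start. */
	static int fits_in_one_page(uintptr_t start, uintptr_t end)
	{
		return end - (start & ~(SZ_4K - 1)) <= SZ_4K;
	}

	int main(void)
	{
		assert(fits_in_one_page(0x1000, 0x2000));	/* exactly one page */
		assert(!fits_in_one_page(0x1800, 0x2801));	/* crosses a boundary */
		return 0;
	}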
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
new file mode 100644
index 000000000000..410fbdb8163f
--- /dev/null
+++ b/arch/arm64/lib/copy_template.S
@@ -0,0 +1,193 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+/*
+ * Copy a buffer from src to dest (alignment handled by the hardware)
+ *
+ * Parameters:
+ * x0 - dest
+ * x1 - src
+ * x2 - n
+ * Returns:
+ * x0 - dest
+ */
+dstin .req x0
+src .req x1
+count .req x2
+tmp1 .req x3
+tmp1w .req w3
+tmp2 .req x4
+tmp2w .req w4
+dst .req x6
+
+A_l .req x7
+A_h .req x8
+B_l .req x9
+B_h .req x10
+C_l .req x11
+C_h .req x12
+D_l .req x13
+D_h .req x14
+
+ mov dst, dstin
+ cmp count, #16
+ /* When the length is less than 16, the accesses are not aligned. */
+ b.lo .Ltiny15
+
+ neg tmp2, src
+ ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
+ b.eq .LSrcAligned
+ sub count, count, tmp2
+ /*
+ * Copy the leading bytes from src to dst in increasing address
+ * order. This eliminates the risk of overwriting the source data
+ * when the distance between src and dst is less than 16. The
+ * memory accesses here are aligned.
+ */
+ tbz tmp2, #0, 1f
+ ldrb1 tmp1w, src, #1
+ strb1 tmp1w, dst, #1
+1:
+ tbz tmp2, #1, 2f
+ ldrh1 tmp1w, src, #2
+ strh1 tmp1w, dst, #2
+2:
+ tbz tmp2, #2, 3f
+ ldr1 tmp1w, src, #4
+ str1 tmp1w, dst, #4
+3:
+ tbz tmp2, #3, .LSrcAligned
+ ldr1 tmp1, src, #8
+ str1 tmp1, dst, #8
+
+.LSrcAligned:
+ cmp count, #64
+ b.ge .Lcpy_over64
+ /*
+ * Deal with small copies quickly by dropping straight into the
+ * exit block.
+ */
+.Ltail63:
+ /*
+ * Copy up to 48 bytes of data. At this point we only need the
+ * bottom 6 bits of count to be accurate.
+ */
+ ands tmp1, count, #0x30
+ b.eq .Ltiny15
+ cmp tmp1w, #0x20
+ b.eq 1f
+ b.lt 2f
+ ldp1 A_l, A_h, src, #16
+ stp1 A_l, A_h, dst, #16
+1:
+ ldp1 A_l, A_h, src, #16
+ stp1 A_l, A_h, dst, #16
+2:
+ ldp1 A_l, A_h, src, #16
+ stp1 A_l, A_h, dst, #16
+.Ltiny15:
+ /*
+ * Prefer to break one ldp/stp into several loads/stores so that
+ * memory is accessed in increasing address order, rather than
+ * loading/storing 16 bytes from (src-16) to (dst-16) and winding
+ * src back to an aligned address, as the original cortex memcpy
+ * does. If that approach were kept, memmove would have to
+ * guarantee that src is at least 16 bytes above dst, otherwise
+ * some source data would be overwritten when memmove calls memcpy
+ * directly. Dropping it keeps memmove simpler and decouples
+ * memcpy from memmove.
+ */
+ tbz count, #3, 1f
+ ldr1 tmp1, src, #8
+ str1 tmp1, dst, #8
+1:
+ tbz count, #2, 2f
+ ldr1 tmp1w, src, #4
+ str1 tmp1w, dst, #4
+2:
+ tbz count, #1, 3f
+ ldrh1 tmp1w, src, #2
+ strh1 tmp1w, dst, #2
+3:
+ tbz count, #0, .Lexitfunc
+ ldrb1 tmp1w, src, #1
+ strb1 tmp1w, dst, #1
+
+ b .Lexitfunc
+
+.Lcpy_over64:
+ subs count, count, #128
+ b.ge .Lcpy_body_large
+ /*
+ * Less than 128 bytes to copy, so handle 64 here and then jump
+ * to the tail.
+ */
+ ldp1 A_l, A_h, src, #16
+ stp1 A_l, A_h, dst, #16
+ ldp1 B_l, B_h, src, #16
+ ldp1 C_l, C_h, src, #16
+ stp1 B_l, B_h, dst, #16
+ stp1 C_l, C_h, dst, #16
+ ldp1 D_l, D_h, src, #16
+ stp1 D_l, D_h, dst, #16
+
+ tst count, #0x3f
+ b.ne .Ltail63
+ b .Lexitfunc
+
+ /*
+ * Critical loop. Start at a new cache line boundary. Assuming
+ * 64 bytes per line this ensures the entire loop is in one line.
+ */
+ .p2align L1_CACHE_SHIFT
+.Lcpy_body_large:
+ /* Pre-load the first 64 bytes of data. */
+ ldp1 A_l, A_h, src, #16
+ ldp1 B_l, B_h, src, #16
+ ldp1 C_l, C_h, src, #16
+ ldp1 D_l, D_h, src, #16
+1:
+ /*
+ * Interleave the load of the next 64-byte block with the store of
+ * the previously loaded 64 bytes.
+ */
+ stp1 A_l, A_h, dst, #16
+ ldp1 A_l, A_h, src, #16
+ stp1 B_l, B_h, dst, #16
+ ldp1 B_l, B_h, src, #16
+ stp1 C_l, C_h, dst, #16
+ ldp1 C_l, C_h, src, #16
+ stp1 D_l, D_h, dst, #16
+ ldp1 D_l, D_h, src, #16
+ subs count, count, #64
+ b.ge 1b
+ stp1 A_l, A_h, dst, #16
+ stp1 B_l, B_h, dst, #16
+ stp1 C_l, C_h, dst, #16
+ stp1 D_l, D_h, dst, #16
+
+ tst count, #0x3f
+ b.ne .Ltail63
+.Lexitfunc:
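At the top of the template, `neg tmp2, src` followed by `ands tmp2, tmp2, #15` computes (-src) mod 16, the number of leading bytes to copy before src becomes 16-byte aligned. A user-space sketch of the same arithmetic (bytes_to_align() is an invented name):

	#include <assert.h>
	#include <stdint.h>

	/* (-src) mod 16: leading bytes to copy before src is 16-byte aligned. */
	static unsigned int bytes_to_align(uintptr_t src)
	{
		return (unsigned int)(-src & 15);
	}

	int main(void)
	{
		assert(bytes_to_align(0x1000) == 0);	/* already aligned */
		assert(bytes_to_align(0x1001) == 15);
		assert(bytes_to_align(0x100f) == 1);
		return 0;
	}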
diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S
index 8636b7549163..4444c1d25f4b 100644
--- a/arch/arm64/lib/memchr.S
+++ b/arch/arm64/lib/memchr.S
@@ -41,4 +41,4 @@ ENTRY(memchr)
ret
2: mov x0, #0
ret
-ENDPROC(memchr)
+ENDPIPROC(memchr)
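ENDPROC becomes ENDPIPROC here and in the other string routines below. The macro (presumably added to asm/assembler.h in this series) is assumed to behave like ENDPROC while also emitting a position-independent __pi_-prefixed alias for callers, such as the EFI stub, that run before relocation. A hypothetical C analog of that aliasing pattern, using the GCC/clang alias attribute with invented names:

	#include <stdio.h>

	/* One implementation, two linker-visible names. */
	static int impl(int x) { return x + 1; }

	int my_func(int x) __attribute__((alias("impl")));
	int __pi_my_func(int x) __attribute__((alias("impl")));

	int main(void)
	{
		printf("%d %d\n", my_func(1), __pi_my_func(1));	/* 2 2 */
		return 0;
	}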
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
index 6ea0776ba6de..ffbdec00327d 100644
--- a/arch/arm64/lib/memcmp.S
+++ b/arch/arm64/lib/memcmp.S
@@ -255,4 +255,4 @@ CPU_LE( rev data2, data2 )
.Lret0:
mov result, #0
ret
-ENDPROC(memcmp)
+ENDPIPROC(memcmp)
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 8a9a96d3ddae..67613937711f 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -36,166 +36,42 @@
* Returns:
* x0 - dest
*/
-dstin .req x0
-src .req x1
-count .req x2
-tmp1 .req x3
-tmp1w .req w3
-tmp2 .req x4
-tmp2w .req w4
-tmp3 .req x5
-tmp3w .req w5
-dst .req x6
+ .macro ldrb1 ptr, regB, val
+ ldrb \ptr, [\regB], \val
+ .endm
-A_l .req x7
-A_h .req x8
-B_l .req x9
-B_h .req x10
-C_l .req x11
-C_h .req x12
-D_l .req x13
-D_h .req x14
+ .macro strb1 ptr, regB, val
+ strb \ptr, [\regB], \val
+ .endm
-ENTRY(memcpy)
- mov dst, dstin
- cmp count, #16
- /*When memory length is less than 16, the accessed are not aligned.*/
- b.lo .Ltiny15
+ .macro ldrh1 ptr, regB, val
+ ldrh \ptr, [\regB], \val
+ .endm
- neg tmp2, src
- ands tmp2, tmp2, #15/* Bytes to reach alignment. */
- b.eq .LSrcAligned
- sub count, count, tmp2
- /*
- * Copy the leading memory data from src to dst in an increasing
- * address order.By this way,the risk of overwritting the source
- * memory data is eliminated when the distance between src and
- * dst is less than 16. The memory accesses here are alignment.
- */
- tbz tmp2, #0, 1f
- ldrb tmp1w, [src], #1
- strb tmp1w, [dst], #1
-1:
- tbz tmp2, #1, 2f
- ldrh tmp1w, [src], #2
- strh tmp1w, [dst], #2
-2:
- tbz tmp2, #2, 3f
- ldr tmp1w, [src], #4
- str tmp1w, [dst], #4
-3:
- tbz tmp2, #3, .LSrcAligned
- ldr tmp1, [src],#8
- str tmp1, [dst],#8
+ .macro strh1 ptr, regB, val
+ strh \ptr, [\regB], \val
+ .endm
-.LSrcAligned:
- cmp count, #64
- b.ge .Lcpy_over64
- /*
- * Deal with small copies quickly by dropping straight into the
- * exit block.
- */
-.Ltail63:
- /*
- * Copy up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate.
- */
- ands tmp1, count, #0x30
- b.eq .Ltiny15
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src], #16
- stp A_l, A_h, [dst], #16
-1:
- ldp A_l, A_h, [src], #16
- stp A_l, A_h, [dst], #16
-2:
- ldp A_l, A_h, [src], #16
- stp A_l, A_h, [dst], #16
-.Ltiny15:
- /*
- * Prefer to break one ldp/stp into several load/store to access
- * memory in an increasing address order,rather than to load/store 16
- * bytes from (src-16) to (dst-16) and to backward the src to aligned
- * address,which way is used in original cortex memcpy. If keeping
- * the original memcpy process here, memmove need to satisfy the
- * precondition that src address is at least 16 bytes bigger than dst
- * address,otherwise some source data will be overwritten when memove
- * call memcpy directly. To make memmove simpler and decouple the
- * memcpy's dependency on memmove, withdrew the original process.
- */
- tbz count, #3, 1f
- ldr tmp1, [src], #8
- str tmp1, [dst], #8
-1:
- tbz count, #2, 2f
- ldr tmp1w, [src], #4
- str tmp1w, [dst], #4
-2:
- tbz count, #1, 3f
- ldrh tmp1w, [src], #2
- strh tmp1w, [dst], #2
-3:
- tbz count, #0, .Lexitfunc
- ldrb tmp1w, [src]
- strb tmp1w, [dst]
+ .macro ldr1 ptr, regB, val
+ ldr \ptr, [\regB], \val
+ .endm
-.Lexitfunc:
- ret
+ .macro str1 ptr, regB, val
+ str \ptr, [\regB], \val
+ .endm
-.Lcpy_over64:
- subs count, count, #128
- b.ge .Lcpy_body_large
- /*
- * Less than 128 bytes to copy, so handle 64 here and then jump
- * to the tail.
- */
- ldp A_l, A_h, [src],#16
- stp A_l, A_h, [dst],#16
- ldp B_l, B_h, [src],#16
- ldp C_l, C_h, [src],#16
- stp B_l, B_h, [dst],#16
- stp C_l, C_h, [dst],#16
- ldp D_l, D_h, [src],#16
- stp D_l, D_h, [dst],#16
+ .macro ldp1 ptr, regB, regC, val
+ ldp \ptr, \regB, [\regC], \val
+ .endm
- tst count, #0x3f
- b.ne .Ltail63
- ret
+ .macro stp1 ptr, regB, regC, val
+ stp \ptr, \regB, [\regC], \val
+ .endm
- /*
- * Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line.
- */
- .p2align L1_CACHE_SHIFT
-.Lcpy_body_large:
- /* pre-get 64 bytes data. */
- ldp A_l, A_h, [src],#16
- ldp B_l, B_h, [src],#16
- ldp C_l, C_h, [src],#16
- ldp D_l, D_h, [src],#16
-1:
- /*
- * interlace the load of next 64 bytes data block with store of the last
- * loaded 64 bytes data.
- */
- stp A_l, A_h, [dst],#16
- ldp A_l, A_h, [src],#16
- stp B_l, B_h, [dst],#16
- ldp B_l, B_h, [src],#16
- stp C_l, C_h, [dst],#16
- ldp C_l, C_h, [src],#16
- stp D_l, D_h, [dst],#16
- ldp D_l, D_h, [src],#16
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst],#16
- stp B_l, B_h, [dst],#16
- stp C_l, C_h, [dst],#16
- stp D_l, D_h, [dst],#16
-
- tst count, #0x3f
- b.ne .Ltail63
+ .weak memcpy
+ENTRY(__memcpy)
+ENTRY(memcpy)
+#include "copy_template.S"
ret
-ENDPROC(memcpy)
+ENDPIPROC(memcpy)
+ENDPROC(__memcpy)
diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S
index 57b19ea2dad4..a5a4459013b1 100644
--- a/arch/arm64/lib/memmove.S
+++ b/arch/arm64/lib/memmove.S
@@ -57,12 +57,14 @@ C_h .req x12
D_l .req x13
D_h .req x14
+ .weak memmove
+ENTRY(__memmove)
ENTRY(memmove)
cmp dstin, src
- b.lo memcpy
+ b.lo __memcpy
add tmp1, src, count
cmp dstin, tmp1
- b.hs memcpy /* No overlap. */
+ b.hs __memcpy /* No overlap. */
add dst, dstin, count
add src, src, count
@@ -194,4 +196,5 @@ ENTRY(memmove)
tst count, #0x3f
b.ne .Ltail63
ret
-ENDPROC(memmove)
+ENDPIPROC(memmove)
+ENDPROC(__memmove)
diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index 7c72dfd36b63..f2670a9f218c 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -54,6 +54,8 @@ dst .req x8
tmp3w .req w9
tmp3 .req x9
+ .weak memset
+ENTRY(__memset)
ENTRY(memset)
mov dst, dstin /* Preserve return value. */
and A_lw, val, #255
@@ -213,4 +215,5 @@ ENTRY(memset)
ands count, count, zva_bits_x
b.ne .Ltail_maybe_long
ret
-ENDPROC(memset)
+ENDPIPROC(memset)
+ENDPROC(__memset)
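The `.weak memcpy`/`.weak memmove`/`.weak memset` plus the paired ENTRY(__name)/ENTRY(name) give later code an interposition point: an instrumented strong definition (KASAN's, in particular) can take over the public name while the double-underscore entry remains callable for the raw copy. A hedged user-space analog using GCC weak symbols (my_memcpy/__my_memcpy are invented names):

	#include <stdio.h>
	#include <string.h>

	/* The "real" copy, always reachable under its double-underscore name. */
	void *__my_memcpy(void *dst, const void *src, size_t n)
	{
		return memcpy(dst, src, n);
	}

	/* Weak default: a sanitized build could provide a strong my_memcpy
	 * that checks its arguments and then calls __my_memcpy. */
	__attribute__((weak))
	void *my_memcpy(void *dst, const void *src, size_t n)
	{
		return __my_memcpy(dst, src, n);
	}

	int main(void)
	{
		char buf[4];

		my_memcpy(buf, "hi", 3);
		printf("%s\n", buf);	/* prints "hi" */
		return 0;
	}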
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 42f828b06c59..471fe61760ef 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -231,4 +231,4 @@ CPU_BE( orr syndrome, diff, has_nul )
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret
-ENDPROC(strcmp)
+ENDPIPROC(strcmp)
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S
index 987b68b9ce44..55ccc8e24c08 100644
--- a/arch/arm64/lib/strlen.S
+++ b/arch/arm64/lib/strlen.S
@@ -123,4 +123,4 @@ CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
csinv data1, data1, xzr, le
csel data2, data2, data2a, le
b .Lrealigned
-ENDPROC(strlen)
+ENDPIPROC(strlen)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index 0224cf5a5533..e267044761c6 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -307,4 +307,4 @@ CPU_BE( orr syndrome, diff, has_nul )
.Lret0:
mov result, #0
ret
-ENDPROC(strncmp)
+ENDPIPROC(strncmp)
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index 773d37a14039..57f57fde5722 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -4,3 +4,6 @@ obj-y := dma-mapping.o extable.o fault.o init.o \
context.o proc.o pageattr.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_ARM64_PTDUMP) += dump.o
+
+obj-$(CONFIG_KASAN) += kasan_init.o
+KASAN_SANITIZE_kasan_init.o := n
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 70a79cb6d504..3207f6e1bd59 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -171,7 +171,7 @@ ENTRY(__flush_dcache_area)
b.lo 1b
dsb sy
ret
-ENDPROC(__flush_dcache_area)
+ENDPIPROC(__flush_dcache_area)
/*
* __inval_cache_range(start, end)
@@ -204,7 +204,7 @@ __dma_inv_range:
b.lo 2b
dsb sy
ret
-ENDPROC(__inval_cache_range)
+ENDPIPROC(__inval_cache_range)
ENDPROC(__dma_inv_range)
/*
@@ -239,7 +239,7 @@ ENTRY(__dma_flush_range)
b.lo 1b
dsb sy
ret
-ENDPROC(__dma_flush_range)
+ENDPIPROC(__dma_flush_range)
/*
* __dma_map_area(start, size, dir)
@@ -252,7 +252,7 @@ ENTRY(__dma_map_area)
cmp w2, #DMA_FROM_DEVICE
b.eq __dma_inv_range
b __dma_clean_range
-ENDPROC(__dma_map_area)
+ENDPIPROC(__dma_map_area)
/*
* __dma_unmap_area(start, size, dir)
@@ -265,4 +265,4 @@ ENTRY(__dma_unmap_area)
cmp w2, #DMA_TO_DEVICE
b.ne __dma_inv_range
ret
-ENDPROC(__dma_unmap_area)
+ENDPIPROC(__dma_unmap_area)
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 76c1e6cd36fc..d70ff14dbdbd 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -53,8 +53,6 @@ static void flush_context(void)
__flush_icache_all();
}
-#ifdef CONFIG_SMP
-
static void set_mm_context(struct mm_struct *mm, unsigned int asid)
{
unsigned long flags;
@@ -110,23 +108,12 @@ static void reset_context(void *info)
cpu_switch_mm(mm->pgd, mm);
}
-#else
-
-static inline void set_mm_context(struct mm_struct *mm, unsigned int asid)
-{
- mm->context.id = asid;
- cpumask_copy(mm_cpumask(mm), cpumask_of(smp_processor_id()));
-}
-
-#endif
-
void __new_context(struct mm_struct *mm)
{
unsigned int asid;
unsigned int bits = asid_bits();
raw_spin_lock(&cpu_asid_lock);
-#ifdef CONFIG_SMP
/*
* Check the ASID again, in case the change was broadcast from another
* CPU before we acquired the lock.
@@ -136,7 +123,6 @@ void __new_context(struct mm_struct *mm)
raw_spin_unlock(&cpu_asid_lock);
return;
}
-#endif
/*
* At this point, it is guaranteed that the current mm (with an old
* ASID) isn't active on any other CPU since the ASIDs are changed
@@ -155,10 +141,8 @@ void __new_context(struct mm_struct *mm)
cpu_last_asid = ASID_FIRST_VERSION;
asid = cpu_last_asid + smp_processor_id();
flush_context();
-#ifdef CONFIG_SMP
smp_wmb();
smp_call_function(reset_context, NULL, 1);
-#endif
cpu_last_asid += NR_CPUS - 1;
}
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index b6f14e8d2121..07844184975b 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -60,14 +60,10 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
unsigned long uaddr, void *dst, const void *src,
unsigned long len)
{
-#ifdef CONFIG_SMP
preempt_disable();
-#endif
memcpy(dst, src, len);
flush_ptrace_access(vma, page, uaddr, dst, len);
-#ifdef CONFIG_SMP
preempt_enable();
-#endif
}
void __sync_icache_dcache(pte_t pte, unsigned long addr)
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index ad87ce826cce..39306924be95 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -298,6 +298,9 @@ void __init mem_init(void)
#define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K)
pr_notice("Virtual kernel memory layout:\n"
+#ifdef CONFIG_KASAN
+ " kasan : 0x%16lx - 0x%16lx (%6ld GB)\n"
+#endif
" vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n"
#ifdef CONFIG_SPARSEMEM_VMEMMAP
" vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n"
@@ -310,6 +313,9 @@ void __init mem_init(void)
" .init : 0x%p" " - 0x%p" " (%6ld KB)\n"
" .text : 0x%p" " - 0x%p" " (%6ld KB)\n"
" .data : 0x%p" " - 0x%p" " (%6ld KB)\n",
+#ifdef CONFIG_KASAN
+ MLG(KASAN_SHADOW_START, KASAN_SHADOW_END),
+#endif
MLG(VMALLOC_START, VMALLOC_END),
#ifdef CONFIG_SPARSEMEM_VMEMMAP
MLG((unsigned long)vmemmap,
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
new file mode 100644
index 000000000000..cf038c7d9fa9
--- /dev/null
+++ b/arch/arm64/mm/kasan_init.c
@@ -0,0 +1,165 @@
+/*
+ * This file contains kasan initialization code for ARM64.
+ *
+ * Copyright (c) 2015 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) "kasan: " fmt
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/memblock.h>
+#include <linux/start_kernel.h>
+
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE);
+
+static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr,
+ unsigned long end)
+{
+ pte_t *pte;
+ unsigned long next;
+
+ if (pmd_none(*pmd))
+ pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
+
+ pte = pte_offset_kernel(pmd, addr);
+ do {
+ next = addr + PAGE_SIZE;
+ set_pte(pte, pfn_pte(virt_to_pfn(kasan_zero_page),
+ PAGE_KERNEL));
+ } while (pte++, addr = next, addr != end && pte_none(*pte));
+}
+
+static void __init kasan_early_pmd_populate(pud_t *pud,
+ unsigned long addr,
+ unsigned long end)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ if (pud_none(*pud))
+ pud_populate(&init_mm, pud, kasan_zero_pmd);
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ kasan_early_pte_populate(pmd, addr, next);
+ } while (pmd++, addr = next, addr != end && pmd_none(*pmd));
+}
+
+static void __init kasan_early_pud_populate(pgd_t *pgd,
+ unsigned long addr,
+ unsigned long end)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ if (pgd_none(*pgd))
+ pgd_populate(&init_mm, pgd, kasan_zero_pud);
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ kasan_early_pmd_populate(pud, addr, next);
+ } while (pud++, addr = next, addr != end && pud_none(*pud));
+}
+
+static void __init kasan_map_early_shadow(void)
+{
+ unsigned long addr = KASAN_SHADOW_START;
+ unsigned long end = KASAN_SHADOW_END;
+ unsigned long next;
+ pgd_t *pgd;
+
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ kasan_early_pud_populate(pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
+}
+
+asmlinkage void __init kasan_early_init(void)
+{
+ BUILD_BUG_ON(KASAN_SHADOW_OFFSET != KASAN_SHADOW_END - (1UL << 61));
+ BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE));
+ BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE));
+ kasan_map_early_shadow();
+}
+
+static void __init clear_pgds(unsigned long start,
+ unsigned long end)
+{
+ /*
+ * Remove references to the kasan page tables from
+ * swapper_pg_dir. pgd_clear() can't be used here
+ * because it's a nop on 2- and 3-level page table setups.
+ */
+ for (; start < end; start += PGDIR_SIZE)
+ set_pgd(pgd_offset_k(start), __pgd(0));
+}
+
+static void __init cpu_set_ttbr1(unsigned long ttbr1)
+{
+ asm(
+ " msr ttbr1_el1, %0\n"
+ " isb"
+ :
+ : "r" (ttbr1));
+}
+
+void __init kasan_init(void)
+{
+ struct memblock_region *reg;
+
+ /*
+ * We are about to set up the shadow memory properly. First the
+ * early shadow must be unmapped (the clear_pgds() call below), but
+ * instrumented code cannot run without shadow memory, so tmp_pg_dir
+ * is used to keep the early shadow mapped until the full shadow
+ * setup is finished.
+ */
+ memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir));
+ cpu_set_ttbr1(__pa(tmp_pg_dir));
+ flush_tlb_all();
+
+ clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+ kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
+ kasan_mem_to_shadow((void *)MODULES_VADDR));
+
+ for_each_memblock(memory, reg) {
+ void *start = (void *)__phys_to_virt(reg->base);
+ void *end = (void *)__phys_to_virt(reg->base + reg->size);
+
+ if (start >= end)
+ break;
+
+ /*
+ * end + 1 here is intentional: several shadow bytes are checked in
+ * advance to slightly speed up the fast path. In rare cases this
+ * can cross the boundary of the mapped shadow, so just map a
+ * little more here.
+ */
+ vmemmap_populate((unsigned long)kasan_mem_to_shadow(start),
+ (unsigned long)kasan_mem_to_shadow(end) + 1,
+ pfn_to_nid(virt_to_pfn(start)));
+ }
+
+ memset(kasan_zero_page, 0, PAGE_SIZE);
+ cpu_set_ttbr1(__pa(swapper_pg_dir));
+ flush_tlb_all();
+
+ /* At this point kasan is fully initialized. Enable error messages */
+ init_task.kasan_depth = 0;
+ pr_info("KernelAddressSanitizer initialized\n");
+}
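kasan_mem_to_shadow() from the generic KASAN code is assumed to implement the usual translation of one shadow byte per eight bytes of address space, shadow = (addr >> 3) + KASAN_SHADOW_OFFSET, which is what makes the BUILD_BUG_ON in kasan_early_init() above hold: the offset must equal KASAN_SHADOW_END - (1UL << 61). A sketch of that mapping with an illustrative offset:

	#include <stdint.h>
	#include <stdio.h>

	#define KASAN_SHADOW_SCALE_SHIFT	3
	#define KASAN_SHADOW_OFFSET	0xdfff200000000000UL	/* illustrative */

	static uint64_t mem_to_shadow(uint64_t addr)
	{
		return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET;
	}

	int main(void)
	{
		uint64_t a = 0xffff000000000000UL;

		/* All eight bytes of a granule share one shadow byte... */
		printf("%d\n", mem_to_shadow(a) == mem_to_shadow(a + 7));
		/* ...and the next granule gets the next shadow byte. */
		printf("%d\n", mem_to_shadow(a + 8) - mem_to_shadow(a) == 1);
		return 0;
	}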
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index 71ca104f97bd..cb3ba1b812e7 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -28,8 +28,6 @@
#include "mm.h"
-#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
-
static struct kmem_cache *pgd_cache;
pgd_t *pgd_alloc(struct mm_struct *mm)
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index d253908a988d..e1d0e0a292bb 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -34,11 +34,7 @@
#define TCR_TG_FLAGS TCR_TG0_4K | TCR_TG1_4K
#endif
-#ifdef CONFIG_SMP
#define TCR_SMP_FLAGS TCR_SHARED
-#else
-#define TCR_SMP_FLAGS 0
-#endif
/* PTWs cacheable, inner/outer WBWA */
#define TCR_CACHE_FLAGS TCR_IRGN_WBWA | TCR_ORGN_WBWA
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 9b95c9b7e097..7e6ab18c488a 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2103,11 +2103,11 @@ config CPU_R4K_CACHE_TLB
config MIPS_MT_SMP
bool "MIPS MT SMP support (1 TC on each available VPE)"
- depends on SYS_SUPPORTS_MULTITHREADING
+ depends on SYS_SUPPORTS_MULTITHREADING && !CPU_MIPSR6
select CPU_MIPSR2_IRQ_VI
select CPU_MIPSR2_IRQ_EI
select SYNC_R4K
- select MIPS_GIC_IPI
+ select MIPS_GIC_IPI if MIPS_GIC
select MIPS_MT
select SMP
select SMP_UP
@@ -2204,8 +2204,8 @@ config MIPS_VPE_APSP_API_MT
config MIPS_CMP
bool "MIPS CMP framework support (DEPRECATED)"
- depends on SYS_SUPPORTS_MIPS_CMP
- select MIPS_GIC_IPI
+ depends on SYS_SUPPORTS_MIPS_CMP && !CPU_MIPSR6
+ select MIPS_GIC_IPI if MIPS_GIC
select SMP
select SYNC_R4K
select SYS_SUPPORTS_SMP
@@ -2221,11 +2221,11 @@ config MIPS_CMP
config MIPS_CPS
bool "MIPS Coherent Processing System support"
- depends on SYS_SUPPORTS_MIPS_CPS && !64BIT
+ depends on SYS_SUPPORTS_MIPS_CPS && !CPU_MIPSR6
select MIPS_CM
select MIPS_CPC
select MIPS_CPS_PM if HOTPLUG_CPU
- select MIPS_GIC_IPI
+ select MIPS_GIC_IPI if MIPS_GIC
select SMP
select SYNC_R4K if (CEVT_R4K || CSRC_R4K)
select SYS_SUPPORTS_HOTPLUG_CPU
@@ -2244,6 +2244,7 @@ config MIPS_CPS_PM
bool
config MIPS_GIC_IPI
+ depends on MIPS_GIC
bool
config MIPS_CM
diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h
index 47f11c707b65..03d4d5afb120 100644
--- a/arch/parisc/include/asm/cache.h
+++ b/arch/parisc/include/asm/cache.h
@@ -30,6 +30,9 @@
#define __read_mostly __attribute__((__section__(".data..read_mostly")))
+/* Read-only memory is marked before mark_rodata_ro() is called. */
+#define __ro_after_init __read_mostly
+
void parisc_cache_init(void); /* initializes cache-flushing */
void disable_sr_hashing_asm(int); /* low level support for above */
void disable_sr_hashing(void); /* turns off space register hashing */
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index ec2df4bab302..326f043ec1f9 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -121,10 +121,6 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma
}
}
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
-#endif
-
#include <asm/kmap_types.h>
#define ARCH_HAS_KMAP
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index f8338e6d3dd7..a34e43eec658 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1273,6 +1273,20 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
std r6, VCPU_ACOP(r9)
stw r7, VCPU_GUEST_PID(r9)
std r8, VCPU_WORT(r9)
+ /*
+ * Restore various registers to 0, where non-zero values
+ * set by the guest could disrupt the host.
+ */
+ li r0, 0
+ mtspr SPRN_IAMR, r0
+ mtspr SPRN_CIABR, r0
+ mtspr SPRN_DAWRX, r0
+ mtspr SPRN_TCSCR, r0
+ mtspr SPRN_WORT, r0
+ /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
+ li r0, 1
+ sldi r0, r0, 31
+ mtspr SPRN_MMCRS, r0
8:
/* Save and reset AMR and UAMOR before turning on the MMU */
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index fb1b93ea3e3f..e485817f7b1a 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -15,17 +15,25 @@
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
+ spin_lock_init(&mm->context.list_lock);
+ INIT_LIST_HEAD(&mm->context.pgtable_list);
+ INIT_LIST_HEAD(&mm->context.gmap_list);
cpumask_clear(&mm->context.cpu_attach_mask);
atomic_set(&mm->context.attach_count, 0);
mm->context.flush_mm = 0;
- mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS;
- mm->context.asce_bits |= _ASCE_TYPE_REGION3;
#ifdef CONFIG_PGSTE
mm->context.alloc_pgste = page_table_allocate_pgste;
mm->context.has_pgste = 0;
mm->context.use_skey = 0;
#endif
- mm->context.asce_limit = STACK_TOP_MAX;
+ if (mm->context.asce_limit == 0) {
+ /* context created by exec, set asce limit to 4TB */
+ mm->context.asce_bits = _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
+ mm->context.asce_limit = STACK_TOP_MAX;
+ } else if (mm->context.asce_limit == (1UL << 31)) {
+ mm_inc_nr_pmds(mm);
+ }
crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
return 0;
}
@@ -111,8 +119,6 @@ static inline void activate_mm(struct mm_struct *prev,
static inline void arch_dup_mmap(struct mm_struct *oldmm,
struct mm_struct *mm)
{
- if (oldmm->context.asce_limit < mm->context.asce_limit)
- crst_table_downgrade(mm, oldmm->context.asce_limit);
}
static inline void arch_exit_mmap(struct mm_struct *mm)
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 7b7858f158b4..d7cc79fb6191 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -100,12 +100,26 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
- spin_lock_init(&mm->context.list_lock);
- INIT_LIST_HEAD(&mm->context.pgtable_list);
- INIT_LIST_HEAD(&mm->context.gmap_list);
- return (pgd_t *) crst_table_alloc(mm);
+ unsigned long *table = crst_table_alloc(mm);
+
+ if (!table)
+ return NULL;
+ if (mm->context.asce_limit == (1UL << 31)) {
+ /* Forking a compat process with 2 page table levels */
+ if (!pgtable_pmd_page_ctor(virt_to_page(table))) {
+ crst_table_free(mm, table);
+ return NULL;
+ }
+ }
+ return (pgd_t *) table;
+}
+
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ if (mm->context.asce_limit == (1UL << 31))
+ pgtable_pmd_page_dtor(virt_to_page(pgd));
+ crst_table_free(mm, (unsigned long *) pgd);
}
-#define pgd_free(mm, pgd) crst_table_free(mm, (unsigned long *) pgd)
static inline void pmd_populate(struct mm_struct *mm,
pmd_t *pmd, pgtable_t pte)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index aac357a4cd5c..d2dd3ecb8337 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -283,6 +283,9 @@ config ARCH_SUPPORTS_UPROBES
config FIX_EARLYCON_MEM
def_bool y
+config DEBUG_RODATA
+ def_bool y
+
config PGTABLE_LEVELS
int
default 4 if X86_64
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 72484a645f05..467ccb4d13ea 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -86,23 +86,12 @@ config EFI_PGT_DUMP
issues with the mapping of the EFI runtime regions into that
table.
-config DEBUG_RODATA
- bool "Write protect kernel read-only data structures"
- default y
- depends on DEBUG_KERNEL
- ---help---
- Mark the kernel read-only data as write-protected in the pagetables,
- in order to catch accidental (and incorrect) writes to such const
- data. This is recommended so that we can catch kernel bugs sooner.
- If in doubt, say "Y".
-
config DEBUG_RODATA_TEST
- bool "Testcase for the DEBUG_RODATA feature"
- depends on DEBUG_RODATA
+ bool "Testcase for the marking rodata read-only"
default y
---help---
- This option enables a testcase for the DEBUG_RODATA
- feature as well as for the change_page_attr() infrastructure.
+ This option enables a testcase for marking rodata read-only,
+ as well as for the change_page_attr() infrastructure.
If in doubt, say "N"
config DEBUG_SET_MODULE_RONX
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 47c8e32f621a..5aa976c6648a 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -84,16 +84,12 @@ int set_pages_rw(struct page *page, int numpages);
void clflush_cache_range(void *addr, unsigned int size);
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
+#define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
+
extern const int rodata_test_data;
extern int kernel_set_to_readonly;
void set_kernel_text_rw(void);
void set_kernel_text_ro(void);
-#else
-static inline void set_kernel_text_rw(void) { }
-static inline void set_kernel_text_ro(void) { }
-#endif
#ifdef CONFIG_DEBUG_RODATA_TEST
int rodata_test(void);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c1adf33fdd0d..bc62e7cbf1b1 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -17,15 +17,8 @@ static inline bool kvm_check_and_clear_guest_paused(void)
}
#endif /* CONFIG_KVM_GUEST */
-#ifdef CONFIG_DEBUG_RODATA
#define KVM_HYPERCALL \
ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL)
-#else
-/* On AMD processors, vmcall will generate a trap that we will
- * then rewrite to the appropriate instruction.
- */
-#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
-#endif
/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
* instruction. The hypervisor may replace it with something else but only the
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 0a5242428659..13b6cdd0af57 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -7,7 +7,7 @@
extern char __brk_base[], __brk_limit[];
extern struct exception_table_entry __stop___ex_table[];
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
extern char __end_rodata_hpage_align[];
#endif
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8b7b0a51e742..f74416dc3fc6 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -81,9 +81,9 @@ within(unsigned long addr, unsigned long start, unsigned long end)
static unsigned long text_ip_addr(unsigned long ip)
{
/*
- * On x86_64, kernel text mappings are mapped read-only with
- * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
- * of the kernel text mapping to modify the kernel text.
+ * On x86_64, kernel text mappings are mapped read-only, so we use
+ * the kernel identity mapping instead of the kernel text mapping
+ * to modify the kernel text.
*
* For 32bit kernels, these mappings are same and we can use
* kernel identity mapping to modify code.
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index d6178d9791db..543dcb88988f 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -745,9 +745,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{
int err;
-#ifdef CONFIG_DEBUG_RODATA
char opc[BREAK_INSTR_SIZE];
-#endif /* CONFIG_DEBUG_RODATA */
bpt->type = BP_BREAKPOINT;
err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
@@ -756,7 +754,6 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
return err;
err = probe_kernel_write((char *)bpt->bpt_addr,
arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
-#ifdef CONFIG_DEBUG_RODATA
if (!err)
return err;
/*
@@ -773,13 +770,12 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
return -EINVAL;
bpt->type = BP_POKE_BREAKPOINT;
-#endif /* CONFIG_DEBUG_RODATA */
+
return err;
}
int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
{
-#ifdef CONFIG_DEBUG_RODATA
int err;
char opc[BREAK_INSTR_SIZE];
@@ -796,8 +792,8 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
goto knl_write;
return err;
+
knl_write:
-#endif /* CONFIG_DEBUG_RODATA */
return probe_kernel_write((char *)bpt->bpt_addr,
(char *)bpt->saved_instr, BREAK_INSTR_SIZE);
}
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
index 3f92ce07e525..27538f183c3b 100644
--- a/arch/x86/kernel/test_nx.c
+++ b/arch/x86/kernel/test_nx.c
@@ -142,7 +142,6 @@ static int test_NX(void)
* by the error message
*/
-#ifdef CONFIG_DEBUG_RODATA
/* Test 3: Check if the .rodata section is executable */
if (rodata_test_data != 0xC3) {
printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
@@ -151,7 +150,6 @@ static int test_NX(void)
printk(KERN_ERR "test_nx: .rodata section is executable\n");
ret = -ENODEV;
}
-#endif
#if 0
/* Test 4: Check if the .data section of a module is executable */
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index 5ecbfe5099da..cb4a01b41e27 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -76,5 +76,5 @@ int rodata_test(void)
}
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure");
+MODULE_DESCRIPTION("Testcase for marking rodata as read-only");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 00bf300fd846..0384a734b417 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,29 +41,28 @@ ENTRY(phys_startup_64)
jiffies_64 = jiffies;
#endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
/*
- * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
- * we retain large page mappings for boundaries spanning kernel text, rodata
- * and data sections.
+ * On 64-bit, align RODATA to 2MB so we retain large page mappings for
+ * boundaries spanning kernel text, rodata and data sections.
*
* However, kernel identity mappings will have different RWX permissions
* to the pages mapping to text and to the pages padding (which are freed) the
* text section. Hence kernel identity mappings will be broken to smaller
* pages. For 64-bit, kernel text and kernel identity mappings are different,
- * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
- * as well as retain 2MB large page mappings for kernel text.
+ * so we can enable protection checks as well as retain 2MB large page
+ * mappings for kernel text.
*/
-#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
+#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
-#define X64_ALIGN_DEBUG_RODATA_END \
+#define X64_ALIGN_RODATA_END \
. = ALIGN(HPAGE_SIZE); \
__end_rodata_hpage_align = .;
#else
-#define X64_ALIGN_DEBUG_RODATA_BEGIN
-#define X64_ALIGN_DEBUG_RODATA_END
+#define X64_ALIGN_RODATA_BEGIN
+#define X64_ALIGN_RODATA_END
#endif
@@ -112,13 +111,11 @@ SECTIONS
EXCEPTION_TABLE(16) :text = 0x9090
-#if defined(CONFIG_DEBUG_RODATA)
/* .text should occupy whole number of pages */
. = ALIGN(PAGE_SIZE);
-#endif
- X64_ALIGN_DEBUG_RODATA_BEGIN
+ X64_ALIGN_RODATA_BEGIN
RO_DATA(PAGE_SIZE)
- X64_ALIGN_DEBUG_RODATA_END
+ X64_ALIGN_RODATA_END
/* Data */
.data : AT(ADDR(.data) - LOAD_OFFSET) {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 945f9e13f1aa..917148620f49 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1674,6 +1674,13 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
return;
}
break;
+ case MSR_IA32_PEBS_ENABLE:
+ /* PEBS needs a quiescent period after being disabled (to write
+ * a record). Disabling PEBS through VMX MSR swapping doesn't
+ * provide that period, so a CPU could write the host's record
+ * into guest memory.
+ */
+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
for (i = 0; i < m->nr; ++i)
@@ -1711,26 +1718,31 @@ static void reload_tss(void)
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
- u64 guest_efer;
- u64 ignore_bits;
+ u64 guest_efer = vmx->vcpu.arch.efer;
+ u64 ignore_bits = 0;
- guest_efer = vmx->vcpu.arch.efer;
+ if (!enable_ept) {
+ /*
+ * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
+ * host CPUID is more efficient than testing guest CPUID
+ * or CR4. Host SMEP is anyway a requirement for guest SMEP.
+ */
+ if (boot_cpu_has(X86_FEATURE_SMEP))
+ guest_efer |= EFER_NX;
+ else if (!(guest_efer & EFER_NX))
+ ignore_bits |= EFER_NX;
+ }
/*
- * NX is emulated; LMA and LME handled by hardware; SCE meaningless
- * outside long mode
+ * LMA and LME handled by hardware; SCE meaningless outside long mode.
*/
- ignore_bits = EFER_NX | EFER_SCE;
+ ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
ignore_bits |= EFER_LMA | EFER_LME;
/* SCE is meaningful only in long mode on Intel */
if (guest_efer & EFER_LMA)
ignore_bits &= ~(u64)EFER_SCE;
#endif
- guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
- vmx->guest_msrs[efer_offset].data = guest_efer;
- vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
clear_atomic_switch_msr(vmx, MSR_EFER);
@@ -1741,16 +1753,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
*/
if (cpu_has_load_ia32_efer ||
(enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
- guest_efer = vmx->vcpu.arch.efer;
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
if (guest_efer != host_efer)
add_atomic_switch_msr(vmx, MSR_EFER,
guest_efer, host_efer);
return false;
- }
+ } else {
+ guest_efer &= ~ignore_bits;
+ guest_efer |= host_efer & ignore_bits;
- return true;
+ vmx->guest_msrs[efer_offset].data = guest_efer;
+ vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+
+ return true;
+ }
}
static unsigned long segment_base(u16 selector)
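The rewritten update_transition_efer() only falls back to the user-return MSR path (the new else branch) when hardware can't switch EFER itself; there the guest value is merged with the host bits that hardware handles anyway. A user-space model of that merge (bit positions follow the architectural EFER layout; the scenario values are made up):

	#include <assert.h>
	#include <stdint.h>

	/* Architectural EFER bit positions. */
	#define EFER_SCE	(1ULL << 0)
	#define EFER_LME	(1ULL << 8)
	#define EFER_LMA	(1ULL << 10)
	#define EFER_NX		(1ULL << 11)

	/* Bits in ignore are taken from the host value; the rest come
	 * from the guest. */
	static uint64_t merge_efer(uint64_t guest, uint64_t host, uint64_t ignore)
	{
		return (guest & ~ignore) | (host & ignore);
	}

	int main(void)
	{
		uint64_t guest = EFER_LMA | EFER_LME;	/* guest: no SCE, no NX */
		uint64_t host = EFER_LMA | EFER_LME | EFER_SCE | EFER_NX;
		uint64_t ignore = EFER_SCE | EFER_LMA | EFER_LME;

		/* SCE/LMA/LME follow the host; NX stays under guest control. */
		assert(merge_efer(guest, host, ignore) ==
		       (EFER_LMA | EFER_LME | EFER_SCE));
		return 0;
	}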
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c23ab1ee3a9a..0f98553f995b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -871,7 +871,6 @@ static noinline int do_test_wp_bit(void)
return flag;
}
-#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
@@ -958,5 +957,3 @@ void mark_rodata_ro(void)
#endif
mark_nxdata_nx();
}
-#endif
-
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f9977a7a9444..e10635a19727 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1062,7 +1062,6 @@ void __init mem_init(void)
mem_init_print_info(NULL);
}
-#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
@@ -1152,8 +1151,6 @@ void mark_rodata_ro(void)
(unsigned long) __va(__pa_symbol(_sdata)));
}
-#endif
-
int kern_addr_valid(unsigned long addr)
{
unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 2dd9b3ad3bb5..ccb4b86eb1cc 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -279,7 +279,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
__pa_symbol(__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
/*
* Once the kernel maps the text as RO (kernel_set_to_readonly is set),
* kernel text mappings for the large page aligned text, rodata sections
diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h
index 0224987556ce..3f69326ed545 100644
--- a/arch/x86/vdso/vdso2c.h
+++ b/arch/x86/vdso/vdso2c.h
@@ -140,7 +140,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
fprintf(outfile, "#include <asm/vdso.h>\n");
fprintf(outfile, "\n");
fprintf(outfile,
- "static unsigned char raw_data[%lu] __page_aligned_data = {",
+ "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {",
mapping_size);
for (j = 0; j < stripped_len; j++) {
if (j % 10 == 0)
diff --git a/block/bio.c b/block/bio.c
index cbce3e2208f4..167e5ec31717 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1983,6 +1983,29 @@ struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_
EXPORT_SYMBOL(bioset_create_nobvec);
#ifdef CONFIG_BLK_CGROUP
+
+/**
+ * bio_associate_blkcg - associate a bio with the specified blkcg
+ * @bio: target bio
+ * @blkcg_css: css of the blkcg to associate
+ *
+ * Associate @bio with the blkcg specified by @blkcg_css. Block layer will
+ * treat @bio as if it were issued by a task which belongs to the blkcg.
+ *
+ * This function takes an extra reference of @blkcg_css which will be put
+ * when @bio is released. The caller must own @bio and is responsible for
+ * synchronizing calls to this function.
+ */
+int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
+{
+ if (unlikely(bio->bi_css))
+ return -EBUSY;
+ css_get(blkcg_css);
+ bio->bi_css = blkcg_css;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bio_associate_blkcg);
+
/**
* bio_associate_current - associate a bio with %current
* @bio: target bio
@@ -1999,28 +2022,20 @@ EXPORT_SYMBOL(bioset_create_nobvec);
int bio_associate_current(struct bio *bio)
{
struct io_context *ioc;
- struct cgroup_subsys_state *css;
- if (bio->bi_ioc)
+ if (bio->bi_css)
return -EBUSY;
ioc = current->io_context;
if (!ioc)
return -ENOENT;
- /* acquire active ref on @ioc and associate */
get_io_context_active(ioc);
bio->bi_ioc = ioc;
-
- /* associate blkcg if exists */
- rcu_read_lock();
- css = task_css(current, blkio_cgrp_id);
- if (css && css_tryget_online(css))
- bio->bi_css = css;
- rcu_read_unlock();
-
+ bio->bi_css = task_get_css(current, blkio_cgrp_id);
return 0;
}
+EXPORT_SYMBOL_GPL(bio_associate_current);
/**
* bio_disassociate_task - undo bio_associate_current()
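The new bio_associate_blkcg() is first-wins: it takes a css reference, and any later association attempt fails with -EBUSY (bio_associate_current() now goes through the same bi_css field). A user-space model of that contract with toy struct definitions standing in for the kernel types:

	#include <assert.h>
	#include <errno.h>
	#include <stddef.h>

	struct css { int refcnt; };
	struct bio { struct css *bi_css; };

	/* First association wins and takes a reference (models css_get());
	 * any later attempt fails with -EBUSY. */
	static int bio_associate(struct bio *bio, struct css *css)
	{
		if (bio->bi_css)
			return -EBUSY;
		css->refcnt++;
		bio->bi_css = css;
		return 0;
	}

	int main(void)
	{
		struct css css = { .refcnt = 1 };
		struct bio bio = { .bi_css = NULL };

		assert(bio_associate(&bio, &css) == 0);
		assert(css.refcnt == 2);
		assert(bio_associate(&bio, &css) == -EBUSY);
		return 0;
	}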
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 6817e28960b7..7040eddc4bbf 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -15,11 +15,12 @@
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
-#include "blk-cgroup.h"
+#include <linux/blk-cgroup.h>
#include "blk.h"
#define MAX_KEY_LEN 100
@@ -30,6 +31,8 @@ struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
.cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
EXPORT_SYMBOL_GPL(blkcg_root);
+struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static bool blkcg_policy_enabled(struct request_queue *q,
@@ -179,6 +182,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
struct blkcg_gq *new_blkg)
{
struct blkcg_gq *blkg;
+ struct bdi_writeback_congested *wb_congested;
int i, ret;
WARN_ON_ONCE(!rcu_read_lock_held());
@@ -190,22 +194,30 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
goto err_free_blkg;
}
+ wb_congested = wb_congested_get_create(&q->backing_dev_info,
+ blkcg->css.id, GFP_ATOMIC);
+ if (!wb_congested) {
+ ret = -ENOMEM;
+ goto err_put_css;
+ }
+
/* allocate */
if (!new_blkg) {
new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
- goto err_put_css;
+ goto err_put_congested;
}
}
blkg = new_blkg;
+ blkg->wb_congested = wb_congested;
/* link parent */
if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
if (WARN_ON_ONCE(!blkg->parent)) {
ret = -EINVAL;
- goto err_put_css;
+ goto err_put_congested;
}
blkg_get(blkg->parent);
}
@@ -235,18 +247,15 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
blkg->online = true;
spin_unlock(&blkcg->lock);
- if (!ret) {
- if (blkcg == &blkcg_root) {
- q->root_blkg = blkg;
- q->root_rl.blkg = blkg;
- }
+ if (!ret)
return blkg;
- }
/* @blkg failed full initialization, use the usual release path */
blkg_put(blkg);
return ERR_PTR(ret);
+err_put_congested:
+ wb_congested_put(wb_congested);
err_put_css:
css_put(&blkcg->css);
err_free_blkg:
@@ -340,15 +349,6 @@ static void blkg_destroy(struct blkcg_gq *blkg)
rcu_assign_pointer(blkcg->blkg_hint, NULL);
/*
- * If root blkg is destroyed. Just clear the pointer since root_rl
- * does not take reference on root blkg.
- */
- if (blkcg == &blkcg_root) {
- blkg->q->root_blkg = NULL;
- blkg->q->root_rl.blkg = NULL;
- }
-
- /*
* Put the reference taken at the time of creation so that when all
* queues are gone, group can be destroyed.
*/
@@ -402,6 +402,8 @@ void __blkg_release_rcu(struct rcu_head *rcu_head)
if (blkg->parent)
blkg_put(blkg->parent);
+ wb_congested_put(blkg->wb_congested);
+
blkg_free(blkg);
}
EXPORT_SYMBOL_GPL(__blkg_release_rcu);
@@ -813,6 +815,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock_irq(&blkcg->lock);
+
+ wb_blkcg_offline(blkcg);
}
static void blkcg_css_free(struct cgroup_subsys_state *css)
@@ -843,7 +847,9 @@ done:
spin_lock_init(&blkcg->lock);
INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&blkcg->blkg_list);
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+ INIT_LIST_HEAD(&blkcg->cgwb_list);
+#endif
return &blkcg->css;
}
@@ -859,9 +865,45 @@ done:
*/
int blkcg_init_queue(struct request_queue *q)
{
- might_sleep();
+ struct blkcg_gq *new_blkg, *blkg;
+ bool preloaded;
+ int ret;
+
+ new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
+ if (!new_blkg)
+ return -ENOMEM;
+
+ preloaded = !radix_tree_preload(GFP_KERNEL);
+
+ /*
+ * Make sure the root blkg exists and count the existing blkgs. As
+ * @q is bypassing at this point, blkg_lookup_create() can't be
+ * used. Open code insertion.
+ */
+ rcu_read_lock();
+ spin_lock_irq(q->queue_lock);
+ blkg = blkg_create(&blkcg_root, q, new_blkg);
+ spin_unlock_irq(q->queue_lock);
+ rcu_read_unlock();
+
+ if (preloaded)
+ radix_tree_preload_end();
+
+ if (IS_ERR(blkg)) {
+ kfree(new_blkg);
+ return PTR_ERR(blkg);
+ }
- return blk_throtl_init(q);
+ q->root_blkg = blkg;
+ q->root_rl.blkg = blkg;
+
+ ret = blk_throtl_init(q);
+ if (ret) {
+ spin_lock_irq(q->queue_lock);
+ blkg_destroy_all(q);
+ spin_unlock_irq(q->queue_lock);
+ }
+ return ret;
}
/**
@@ -962,52 +1004,20 @@ int blkcg_activate_policy(struct request_queue *q,
const struct blkcg_policy *pol)
{
LIST_HEAD(pds);
- struct blkcg_gq *blkg, *new_blkg;
+ struct blkcg_gq *blkg;
struct blkg_policy_data *pd, *n;
int cnt = 0, ret;
- bool preloaded;
if (blkcg_policy_enabled(q, pol))
return 0;
- /* preallocations for root blkg */
- new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
- if (!new_blkg)
- return -ENOMEM;
-
+ /* count and allocate policy_data for all existing blkgs */
blk_queue_bypass_start(q);
-
- preloaded = !radix_tree_preload(GFP_KERNEL);
-
- /*
- * Make sure the root blkg exists and count the existing blkgs. As
- * @q is bypassing at this point, blkg_lookup_create() can't be
- * used. Open code it.
- */
spin_lock_irq(q->queue_lock);
-
- rcu_read_lock();
- blkg = __blkg_lookup(&blkcg_root, q, false);
- if (blkg)
- blkg_free(new_blkg);
- else
- blkg = blkg_create(&blkcg_root, q, new_blkg);
- rcu_read_unlock();
-
- if (preloaded)
- radix_tree_preload_end();
-
- if (IS_ERR(blkg)) {
- ret = PTR_ERR(blkg);
- goto out_unlock;
- }
-
list_for_each_entry(blkg, &q->blkg_list, q_node)
cnt++;
-
spin_unlock_irq(q->queue_lock);
- /* allocate policy_data for all existing blkgs */
while (cnt--) {
pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
if (!pd) {
@@ -1076,10 +1086,6 @@ void blkcg_deactivate_policy(struct request_queue *q,
__clear_bit(pol->plid, q->blkcg_pols);
- /* if no policy is left, no need for blkgs - shoot them down */
- if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
- blkg_destroy_all(q);
-
list_for_each_entry(blkg, &q->blkg_list, q_node) {
/* grab blkcg lock too while removing @pd from @blkg */
spin_lock(&blkg->blkcg->lock);
diff --git a/block/blk-core.c b/block/blk-core.c
index 4e7dded643a2..148e12d3f2fa 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -32,12 +32,12 @@
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
+#include <linux/blk-cgroup.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
#include "blk.h"
-#include "blk-cgroup.h"
#include "blk-mq.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -63,6 +63,31 @@ struct kmem_cache *blk_requestq_cachep;
*/
static struct workqueue_struct *kblockd_workqueue;
+static void blk_clear_congested(struct request_list *rl, int sync)
+{
+#ifdef CONFIG_CGROUP_WRITEBACK
+ clear_wb_congested(rl->blkg->wb_congested, sync);
+#else
+ /*
+ * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
+ * flip its congestion state for events on other blkcgs.
+ */
+ if (rl == &rl->q->root_rl)
+ clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+#endif
+}
+
+static void blk_set_congested(struct request_list *rl, int sync)
+{
+#ifdef CONFIG_CGROUP_WRITEBACK
+ set_wb_congested(rl->blkg->wb_congested, sync);
+#else
+ /* see blk_clear_congested() */
+ if (rl == &rl->q->root_rl)
+ set_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+#endif
+}
+
void blk_queue_congestion_threshold(struct request_queue *q)
{
int nr;
@@ -624,8 +649,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
- q->backing_dev_info.state = 0;
- q->backing_dev_info.capabilities = 0;
+ q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
q->backing_dev_info.name = "block";
q->node = node_id;
@@ -848,13 +872,8 @@ static void __freed_request(struct request_list *rl, int sync)
{
struct request_queue *q = rl->q;
- /*
- * bdi isn't aware of blkcg yet. As all async IOs end up root
- * blkcg anyway, just use root blkcg state.
- */
- if (rl == &q->root_rl &&
- rl->count[sync] < queue_congestion_off_threshold(q))
- blk_clear_queue_congested(q, sync);
+ if (rl->count[sync] < queue_congestion_off_threshold(q))
+ blk_clear_congested(rl, sync);
if (rl->count[sync] + 1 <= q->nr_requests) {
if (waitqueue_active(&rl->wait[sync]))
@@ -887,25 +906,25 @@ static void freed_request(struct request_list *rl, unsigned int flags)
int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
{
struct request_list *rl;
+ int on_thresh, off_thresh;
spin_lock_irq(q->queue_lock);
q->nr_requests = nr;
blk_queue_congestion_threshold(q);
+ on_thresh = queue_congestion_on_threshold(q);
+ off_thresh = queue_congestion_off_threshold(q);
- /* congestion isn't cgroup aware and follows root blkcg for now */
- rl = &q->root_rl;
-
- if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
- blk_set_queue_congested(q, BLK_RW_SYNC);
- else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
- blk_clear_queue_congested(q, BLK_RW_SYNC);
+ blk_queue_for_each_rl(rl, q) {
+ if (rl->count[BLK_RW_SYNC] >= on_thresh)
+ blk_set_congested(rl, BLK_RW_SYNC);
+ else if (rl->count[BLK_RW_SYNC] < off_thresh)
+ blk_clear_congested(rl, BLK_RW_SYNC);
- if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
- blk_set_queue_congested(q, BLK_RW_ASYNC);
- else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
- blk_clear_queue_congested(q, BLK_RW_ASYNC);
+ if (rl->count[BLK_RW_ASYNC] >= on_thresh)
+ blk_set_congested(rl, BLK_RW_ASYNC);
+ else if (rl->count[BLK_RW_ASYNC] < off_thresh)
+ blk_clear_congested(rl, BLK_RW_ASYNC);
- blk_queue_for_each_rl(rl, q) {
if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
blk_set_rl_full(rl, BLK_RW_SYNC);
} else {
@@ -1015,12 +1034,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
}
}
}
- /*
- * bdi isn't aware of blkcg yet. As all async IOs end up
- * root blkcg anyway, just use root blkcg state.
- */
- if (rl == &q->root_rl)
- blk_set_queue_congested(q, is_sync);
+ blk_set_congested(rl, is_sync);
}
/*
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 79ffb4855af0..f548b64be092 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -21,6 +21,7 @@
*/
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/mempool.h>
#include <linux/bio.h>
#include <linux/scatterlist.h>
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 2b8fd302f677..6264b382d4d1 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -6,11 +6,12 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/blktrace_api.h>
#include <linux/blk-mq.h>
+#include <linux/blk-cgroup.h>
#include "blk.h"
-#include "blk-cgroup.h"
#include "blk-mq.h"
struct queue_sysfs_entry {
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5b9c6d5c3636..b23193518ac7 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -9,7 +9,7 @@
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
-#include "blk-cgroup.h"
+#include <linux/blk-cgroup.h>
#include "blk.h"
/* Max dispatch from a group in 1 round */
diff --git a/block/bounce.c b/block/bounce.c
index 39d123e0a989..c0f94419a9fe 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -13,6 +13,7 @@
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
@@ -128,9 +129,6 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
struct bio_vec *bvec, *org_vec;
int i;
- if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
- set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
-
/*
* free up bounce indirect pages used
*/
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5da8e6e9ab4b..bc8f42930773 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -14,8 +14,8 @@
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
+#include <linux/blk-cgroup.h>
#include "blk.h"
-#include "blk-cgroup.h"
/*
* tunables
diff --git a/block/elevator.c b/block/elevator.c
index 8985038f398c..e321bd55ddfe 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -35,11 +35,11 @@
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
+#include <linux/blk-cgroup.h>
#include <trace/events/block.h>
#include "blk.h"
-#include "blk-cgroup.h"
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
diff --git a/block/genhd.c b/block/genhd.c
index ea982eadaf63..59a1395eedac 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -8,6 +8,7 @@
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
diff --git a/drivers/acpi/ec_sys.c b/drivers/acpi/ec_sys.c
index b4c216bab22b..bea8e425a8de 100644
--- a/drivers/acpi/ec_sys.c
+++ b/drivers/acpi/ec_sys.c
@@ -128,7 +128,7 @@ static int acpi_ec_add_debugfs(struct acpi_ec *ec, unsigned int ec_device_count)
if (!debugfs_create_x32("gpe", 0444, dev_dir, (u32 *)&first_ec->gpe))
goto error;
if (!debugfs_create_bool("use_global_lock", 0444, dev_dir,
- (u32 *)&first_ec->global_lock))
+ &first_ec->global_lock))
goto error;
if (write_support)
diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index ba4a61e964be..5819838e4b85 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h
@@ -129,7 +129,7 @@ struct acpi_ec {
unsigned long gpe;
unsigned long command_addr;
unsigned long data_addr;
- unsigned long global_lock;
+ bool global_lock;
unsigned long flags;
unsigned long reference_count;
struct mutex mutex;
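/*
 * Both ACPI hunks above track the debugfs API change: debugfs_create_bool()
 * now takes a bool * rather than a cast u32 *. A minimal sketch against that
 * prototype; the "ec_debug" names are hypothetical.
 */
#include <linux/debugfs.h>
#include <linux/init.h>

static bool ec_debug_enabled;

static int __init ec_debug_init(void)
{
	struct dentry *dir = debugfs_create_dir("ec_debug", NULL);

	if (!dir)
		return -ENOMEM;
	/* At this kernel vintage the helper still returns a dentry pointer. */
	if (!debugfs_create_bool("enabled", 0644, dir, &ec_debug_enabled))
		return -ENOMEM;
	return 0;
}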
diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile
index 1cb8544598d5..49fddd1964cc 100644
--- a/drivers/base/power/Makefile
+++ b/drivers/base/power/Makefile
@@ -1,7 +1,7 @@
obj-$(CONFIG_PM) += sysfs.o generic_ops.o common.o qos.o runtime.o
obj-$(CONFIG_PM_SLEEP) += main.o wakeup.o
obj-$(CONFIG_PM_TRACE_RTC) += trace.o
-obj-$(CONFIG_PM_OPP) += opp.o
+obj-$(CONFIG_PM_OPP) += opp/
obj-$(CONFIG_PM_GENERIC_DOMAINS) += domain.o domain_governor.o
obj-$(CONFIG_HAVE_CLK) += clock_ops.o
diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c
deleted file mode 100644
index 677fb2843553..000000000000
--- a/drivers/base/power/opp.c
+++ /dev/null
@@ -1,927 +0,0 @@
-/*
- * Generic OPP Interface
- *
- * Copyright (C) 2009-2010 Texas Instruments Incorporated.
- * Nishanth Menon
- * Romit Dasgupta
- * Kevin Hilman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/device.h>
-#include <linux/list.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/pm_opp.h>
-#include <linux/of.h>
-#include <linux/export.h>
-
-/*
- * Internal data structure organization with the OPP layer library is as
- * follows:
- * dev_opp_list (root)
- * |- device 1 (represents voltage domain 1)
- * | |- opp 1 (availability, freq, voltage)
- * | |- opp 2 ..
- * ... ...
- * | `- opp n ..
- * |- device 2 (represents the next voltage domain)
- * ...
- * `- device m (represents mth voltage domain)
- * device 1, 2.. are represented by dev_opp structure while each opp
- * is represented by the opp structure.
- */
-
-/**
- * struct dev_pm_opp - Generic OPP description structure
- * @node: opp list node. The nodes are maintained throughout the lifetime
- * of boot. It is expected that only an optimal set of OPPs is
- * added to the library by the SoC framework.
- * RCU usage: the opp list is traversed with RCU locks. Node
- * modification is possible at runtime, hence the modifications
- * are protected by the dev_opp_list_lock for integrity.
- * IMPORTANT: the opp nodes should be maintained in increasing
- * order.
- * @dynamic: true if the OPP is not created from static DT entries.
- * @available: true/false - marks whether this OPP is available or not
- * @rate: Frequency in hertz
- * @u_volt: Nominal voltage in microvolts corresponding to this OPP
- * @dev_opp: points back to the device_opp struct this opp belongs to
- * @rcu_head: RCU callback head used for deferred freeing
- *
- * This structure stores the OPP information for a given device.
- */
-struct dev_pm_opp {
- struct list_head node;
-
- bool available;
- bool dynamic;
- unsigned long rate;
- unsigned long u_volt;
-
- struct device_opp *dev_opp;
- struct rcu_head rcu_head;
-};
-
-/**
- * struct device_opp - Device opp structure
- * @node: list node - contains the devices with OPPs that
- * have been registered. Nodes once added are not modified in this
- * list.
- * RCU usage: nodes are not modified in the list of device_opp,
- * however addition is possible and is secured by dev_opp_list_lock
- * @dev: device pointer
- * @srcu_head: notifier head to notify the OPP availability changes.
- * @rcu_head: RCU callback head used for deferred freeing
- * @opp_list: list of opps
- *
- * This is an internal data structure maintaining the link to opps attached to
- * a device. This structure is not meant to be shared with users as it is
- * meant for bookkeeping and is private to the OPP library.
- *
- * Because the opp structures can be used from both rcu and srcu readers, we
- * need to wait for the grace period of both of them before freeing any
- * resources. And so we have used kfree_rcu() from within call_srcu() handlers.
- */
-struct device_opp {
- struct list_head node;
-
- struct device *dev;
- struct srcu_notifier_head srcu_head;
- struct rcu_head rcu_head;
- struct list_head opp_list;
-};
-
-/*
- * The root of the list of all devices. All device_opp structures branch off
- * from here, with each device_opp containing the list of opp it supports in
- * various states of availability.
- */
-static LIST_HEAD(dev_opp_list);
-/* Lock to allow exclusive modification to the device and opp lists */
-static DEFINE_MUTEX(dev_opp_list_lock);
-
-#define opp_rcu_lockdep_assert() \
-do { \
- rcu_lockdep_assert(rcu_read_lock_held() || \
- lockdep_is_held(&dev_opp_list_lock), \
- "Missing rcu_read_lock() or " \
- "dev_opp_list_lock protection"); \
-} while (0)
-
-/**
- * _find_device_opp() - find device_opp struct using device pointer
- * @dev: device pointer used to lookup device OPPs
- *
- * Search the list of device OPPs for one containing the matching device. Does
- * an RCU reader operation to grab the pointer needed.
- *
- * Return: pointer to 'struct device_opp' if found, otherwise -ENODEV or
- * -EINVAL based on type of error.
- *
- * Locking: This function must be called under rcu_read_lock(). device_opp
- * is a RCU protected pointer. This means that device_opp is valid as long
- * as we are under RCU lock.
- */
-static struct device_opp *_find_device_opp(struct device *dev)
-{
- struct device_opp *tmp_dev_opp, *dev_opp = ERR_PTR(-ENODEV);
-
- if (unlikely(IS_ERR_OR_NULL(dev))) {
- pr_err("%s: Invalid parameters\n", __func__);
- return ERR_PTR(-EINVAL);
- }
-
- list_for_each_entry_rcu(tmp_dev_opp, &dev_opp_list, node) {
- if (tmp_dev_opp->dev == dev) {
- dev_opp = tmp_dev_opp;
- break;
- }
- }
-
- return dev_opp;
-}
-
-/**
- * dev_pm_opp_get_voltage() - Gets the voltage corresponding to an available opp
- * @opp: opp for which the voltage has to be returned
- *
- * Return: voltage in micro volt corresponding to the opp, else
- * return 0
- *
- * Locking: This function must be called under rcu_read_lock(). opp is a rcu
- * protected pointer. This means that opp which could have been fetched by
- * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are
- * under RCU lock. The pointer returned by the opp_find_freq family must be
- * used in the same section as the usage of this function with the pointer
- * prior to unlocking with rcu_read_unlock() to maintain the integrity of the
- * pointer.
- */
-unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
-{
- struct dev_pm_opp *tmp_opp;
- unsigned long v = 0;
-
- opp_rcu_lockdep_assert();
-
- tmp_opp = rcu_dereference(opp);
- if (unlikely(IS_ERR_OR_NULL(tmp_opp)) || !tmp_opp->available)
- pr_err("%s: Invalid parameters\n", __func__);
- else
- v = tmp_opp->u_volt;
-
- return v;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_voltage);
-
-/**
- * dev_pm_opp_get_freq() - Gets the frequency corresponding to an available opp
- * @opp: opp for which the frequency has to be returned
- *
- * Return: frequency in hertz corresponding to the opp, else
- * return 0
- *
- * Locking: This function must be called under rcu_read_lock(). opp is a rcu
- * protected pointer. This means that opp which could have been fetched by
- * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are
- * under RCU lock. The pointer returned by the opp_find_freq family must be
- * used in the same section as the usage of this function with the pointer
- * prior to unlocking with rcu_read_unlock() to maintain the integrity of the
- * pointer.
- */
-unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
-{
- struct dev_pm_opp *tmp_opp;
- unsigned long f = 0;
-
- opp_rcu_lockdep_assert();
-
- tmp_opp = rcu_dereference(opp);
- if (unlikely(IS_ERR_OR_NULL(tmp_opp)) || !tmp_opp->available)
- pr_err("%s: Invalid parameters\n", __func__);
- else
- f = tmp_opp->rate;
-
- return f;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_freq);
-
-/**
- * dev_pm_opp_get_opp_count() - Get number of opps available in the opp list
- * @dev: device for which we do this operation
- *
- * Return: This function returns the number of available opps if there are any,
- * else returns 0 if there are none, or the corresponding error value.
- *
- * Locking: This function takes rcu_read_lock().
- */
-int dev_pm_opp_get_opp_count(struct device *dev)
-{
- struct device_opp *dev_opp;
- struct dev_pm_opp *temp_opp;
- int count = 0;
-
- rcu_read_lock();
-
- dev_opp = _find_device_opp(dev);
- if (IS_ERR(dev_opp)) {
- count = PTR_ERR(dev_opp);
- dev_err(dev, "%s: device OPP not found (%d)\n",
- __func__, count);
- goto out_unlock;
- }
-
- list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) {
- if (temp_opp->available)
- count++;
- }
-
-out_unlock:
- rcu_read_unlock();
- return count;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_count);
-
-/**
- * dev_pm_opp_find_freq_exact() - search for an exact frequency
- * @dev: device for which we do this operation
- * @freq: frequency to search for
- * @available: true/false - match for available opp
- *
- * Return: Searches for exact match in the opp list and returns pointer to the
- * matching opp if found, else returns ERR_PTR in case of error and should
- * be handled using IS_ERR. Error return values can be:
- * EINVAL: for bad pointer
- * ERANGE: no match found for search
- * ENODEV: if device not found in list of registered devices
- *
- * Note: available is a modifier for the search. If available=true, then the
- * match is for an exact matching frequency which is available in the stored OPP
- * table. If false, the match is for an exact frequency which is not available.
- *
- * This provides a mechanism to enable an opp which is not available currently
- * or the opposite as well.
- *
- * Locking: This function must be called under rcu_read_lock(). opp is a rcu
- * protected pointer. The reason for the same is that the opp pointer which is
- * returned will remain valid for use with opp_get_{voltage, freq} only while
- * under the locked area. The pointer returned must be used prior to unlocking
- * with rcu_read_unlock() to maintain the integrity of the pointer.
- */
-struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
- unsigned long freq,
- bool available)
-{
- struct device_opp *dev_opp;
- struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
-
- opp_rcu_lockdep_assert();
-
- dev_opp = _find_device_opp(dev);
- if (IS_ERR(dev_opp)) {
- int r = PTR_ERR(dev_opp);
- dev_err(dev, "%s: device OPP not found (%d)\n", __func__, r);
- return ERR_PTR(r);
- }
-
- list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) {
- if (temp_opp->available == available &&
- temp_opp->rate == freq) {
- opp = temp_opp;
- break;
- }
- }
-
- return opp;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact);
-
-/**
- * dev_pm_opp_find_freq_ceil() - Search for a rounded ceil freq
- * @dev: device for which we do this operation
- * @freq: Start frequency
- *
- * Search for the matching ceil *available* OPP from a starting freq
- * for a device.
- *
- * Return: matching *opp and refreshes *freq accordingly, else returns
- * ERR_PTR in case of error and should be handled using IS_ERR. Error return
- * values can be:
- * EINVAL: for bad pointer
- * ERANGE: no match found for search
- * ENODEV: if device not found in list of registered devices
- *
- * Locking: This function must be called under rcu_read_lock(). opp is a rcu
- * protected pointer. The reason for the same is that the opp pointer which is
- * returned will remain valid for use with opp_get_{voltage, freq} only while
- * under the locked area. The pointer returned must be used prior to unlocking
- * with rcu_read_unlock() to maintain the integrity of the pointer.
- */
-struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
- unsigned long *freq)
-{
- struct device_opp *dev_opp;
- struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
-
- opp_rcu_lockdep_assert();
-
- if (!dev || !freq) {
- dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
- return ERR_PTR(-EINVAL);
- }
-
- dev_opp = _find_device_opp(dev);
- if (IS_ERR(dev_opp))
- return ERR_CAST(dev_opp);
-
- list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) {
- if (temp_opp->available && temp_opp->rate >= *freq) {
- opp = temp_opp;
- *freq = opp->rate;
- break;
- }
- }
-
- return opp;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_ceil);
-
-/**
- * dev_pm_opp_find_freq_floor() - Search for a rounded floor freq
- * @dev: device for which we do this operation
- * @freq: Start frequency
- *
- * Search for the matching floor *available* OPP from a starting freq
- * for a device.
- *
- * Return: matching *opp and refreshes *freq accordingly, else returns
- * ERR_PTR in case of error and should be handled using IS_ERR. Error return
- * values can be:
- * EINVAL: for bad pointer
- * ERANGE: no match found for search
- * ENODEV: if device not found in list of registered devices
- *
- * Locking: This function must be called under rcu_read_lock(). opp is a rcu
- * protected pointer. The reason for the same is that the opp pointer which is
- * returned will remain valid for use with opp_get_{voltage, freq} only while
- * under the locked area. The pointer returned must be used prior to unlocking
- * with rcu_read_unlock() to maintain the integrity of the pointer.
- */
-struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
- unsigned long *freq)
-{
- struct device_opp *dev_opp;
- struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
-
- opp_rcu_lockdep_assert();
-
- if (!dev || !freq) {
- dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
- return ERR_PTR(-EINVAL);
- }
-
- dev_opp = _find_device_opp(dev);
- if (IS_ERR(dev_opp))
- return ERR_CAST(dev_opp);
-
- list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) {
- if (temp_opp->available) {
- /* go to the next node, before choosing prev */
- if (temp_opp->rate > *freq)
- break;
- else
- opp = temp_opp;
- }
- }
- if (!IS_ERR(opp))
- *freq = opp->rate;
-
- return opp;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor);
-
-/**
- * _add_device_opp() - Allocate a new device OPP table
- * @dev: device for which we do this operation
- *
- * Allocates the OPP table for a new device node which uses OPPs - used when
- * multiple devices with OPP tables are maintained.
- *
- * Return: valid device_opp pointer if success, else NULL.
- */
-static struct device_opp *_add_device_opp(struct device *dev)
-{
- struct device_opp *dev_opp;
-
- /*
- * Allocate a new device OPP table. In the infrequent case where a new
- * device needs to be added, we pay this penalty.
- */
- dev_opp = kzalloc(sizeof(*dev_opp), GFP_KERNEL);
- if (!dev_opp)
- return NULL;
-
- dev_opp->dev = dev;
- srcu_init_notifier_head(&dev_opp->srcu_head);
- INIT_LIST_HEAD(&dev_opp->opp_list);
-
- /* Secure the device list modification */
- list_add_rcu(&dev_opp->node, &dev_opp_list);
- return dev_opp;
-}
-
-/**
- * _opp_add_dynamic() - Allocate a dynamic OPP.
- * @dev: device for which we do this operation
- * @freq: Frequency in Hz for this OPP
- * @u_volt: Voltage in uVolts for this OPP
- * @dynamic: Dynamically added OPPs.
- *
- * This function adds an opp definition to the opp list and returns status.
- * The opp is made available by default and it can be controlled using
- * dev_pm_opp_enable/disable functions and may be removed by dev_pm_opp_remove.
- *
- * NOTE: "dynamic" parameter impacts OPPs added by the of_init_opp_table and
- * freed by of_free_opp_table.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
- *
- * Return:
- * 0 On success OR
- * Duplicate OPPs (both freq and volt are same) and opp->available
- * -EEXIST Freq is the same and volt is different OR
- * Duplicate OPPs (both freq and volt are same) and !opp->available
- * -ENOMEM Memory allocation failure
- */
-static int _opp_add_dynamic(struct device *dev, unsigned long freq,
- long u_volt, bool dynamic)
-{
- struct device_opp *dev_opp = NULL;
- struct dev_pm_opp *opp, *new_opp;
- struct list_head *head;
- int ret;
-
- /* allocate new OPP node */
- new_opp = kzalloc(sizeof(*new_opp), GFP_KERNEL);
- if (!new_opp)
- return -ENOMEM;
-
- /* Hold our list modification lock here */
- mutex_lock(&dev_opp_list_lock);
-
- /* populate the opp table */
- new_opp->rate = freq;
- new_opp->u_volt = u_volt;
- new_opp->available = true;
- new_opp->dynamic = dynamic;
-
- /* Check for existing list for 'dev' */
- dev_opp = _find_device_opp(dev);
- if (IS_ERR(dev_opp)) {
- dev_opp = _add_device_opp(dev);
- if (!dev_opp) {
- ret = -ENOMEM;
- goto free_opp;
- }
-
- head = &dev_opp->opp_list;
- goto list_add;
- }
-
- /*
- * Insert new OPP in order of increasing frequency
- * and discard if already present
- */
- head = &dev_opp->opp_list;
- list_for_each_entry_rcu(opp, &dev_opp->opp_list, node) {
- if (new_opp->rate <= opp->rate)
- break;
- else
- head = &opp->node;
- }
-
- /* Duplicate OPPs ? */
- if (new_opp->rate == opp->rate) {
- ret = opp->available && new_opp->u_volt == opp->u_volt ?
- 0 : -EEXIST;
-
- dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n",
- __func__, opp->rate, opp->u_volt, opp->available,
- new_opp->rate, new_opp->u_volt, new_opp->available);
- goto free_opp;
- }
-
-list_add:
- new_opp->dev_opp = dev_opp;
- list_add_rcu(&new_opp->node, head);
- mutex_unlock(&dev_opp_list_lock);
-
- /*
- * Notify the changes in the availability of the operable
- * frequency/voltage list.
- */
- srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ADD, new_opp);
- return 0;
-
-free_opp:
- mutex_unlock(&dev_opp_list_lock);
- kfree(new_opp);
- return ret;
-}
-
-/**
- * dev_pm_opp_add() - Add an OPP table from a table definitions
- * @dev: device for which we do this operation
- * @freq: Frequency in Hz for this OPP
- * @u_volt: Voltage in uVolts for this OPP
- *
- * This function adds an opp definition to the opp list and returns status.
- * The opp is made available by default and it can be controlled using
- * dev_pm_opp_enable/disable functions.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
- *
- * Return:
- * 0 On success OR
- * Duplicate OPPs (both freq and volt are same) and opp->available
- * -EEXIST Freq is the same and volt is different OR
- * Duplicate OPPs (both freq and volt are same) and !opp->available
- * -ENOMEM Memory allocation failure
- */
-int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt)
-{
- return _opp_add_dynamic(dev, freq, u_volt, true);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_add);
-
-/**
- * _kfree_opp_rcu() - Free OPP RCU handler
- * @head: RCU head
- */
-static void _kfree_opp_rcu(struct rcu_head *head)
-{
- struct dev_pm_opp *opp = container_of(head, struct dev_pm_opp, rcu_head);
-
- kfree_rcu(opp, rcu_head);
-}
-
-/**
- * _kfree_device_rcu() - Free device_opp RCU handler
- * @head: RCU head
- */
-static void _kfree_device_rcu(struct rcu_head *head)
-{
- struct device_opp *device_opp = container_of(head, struct device_opp, rcu_head);
-
- kfree_rcu(device_opp, rcu_head);
-}
-
-/**
- * _opp_remove() - Remove an OPP from a table definition
- * @dev_opp: points back to the device_opp struct this opp belongs to
- * @opp: pointer to the OPP to remove
- *
- * This function removes an opp definition from the opp list.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * It is assumed that the caller holds required mutex for an RCU updater
- * strategy.
- */
-static void _opp_remove(struct device_opp *dev_opp,
- struct dev_pm_opp *opp)
-{
- /*
- * Notify the changes in the availability of the operable
- * frequency/voltage list.
- */
- srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_REMOVE, opp);
- list_del_rcu(&opp->node);
- call_srcu(&dev_opp->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu);
-
- if (list_empty(&dev_opp->opp_list)) {
- list_del_rcu(&dev_opp->node);
- call_srcu(&dev_opp->srcu_head.srcu, &dev_opp->rcu_head,
- _kfree_device_rcu);
- }
-}
-
-/**
- * dev_pm_opp_remove() - Remove an OPP from OPP list
- * @dev: device for which we do this operation
- * @freq: OPP to remove with matching 'freq'
- *
- * This function removes an opp from the opp list.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
- */
-void dev_pm_opp_remove(struct device *dev, unsigned long freq)
-{
- struct dev_pm_opp *opp;
- struct device_opp *dev_opp;
- bool found = false;
-
- /* Hold our list modification lock here */
- mutex_lock(&dev_opp_list_lock);
-
- dev_opp = _find_device_opp(dev);
- if (IS_ERR(dev_opp))
- goto unlock;
-
- list_for_each_entry(opp, &dev_opp->opp_list, node) {
- if (opp->rate == freq) {
- found = true;
- break;
- }
- }
-
- if (!found) {
- dev_warn(dev, "%s: Couldn't find OPP with freq: %lu\n",
- __func__, freq);
- goto unlock;
- }
-
- _opp_remove(dev_opp, opp);
-unlock:
- mutex_unlock(&dev_opp_list_lock);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_remove);
-
-/**
- * _opp_set_availability() - helper to set the availability of an opp
- * @dev: device for which we do this operation
- * @freq: OPP frequency to modify availability
- * @availability_req: availability status requested for this opp
- *
- * Set the availability of an OPP with an RCU operation, opp_{enable,disable}
- * share a common logic which is isolated here.
- *
- * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modification was done OR modification was
- * successful.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function internally uses RCU updater strategy with mutex locks to
- * keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex locking or synchronize_rcu() blocking calls cannot be used.
- */
-static int _opp_set_availability(struct device *dev, unsigned long freq,
- bool availability_req)
-{
- struct device_opp *dev_opp;
- struct dev_pm_opp *new_opp, *tmp_opp, *opp = ERR_PTR(-ENODEV);
- int r = 0;
-
- /* keep the node allocated */
- new_opp = kmalloc(sizeof(*new_opp), GFP_KERNEL);
- if (!new_opp)
- return -ENOMEM;
-
- mutex_lock(&dev_opp_list_lock);
-
- /* Find the device_opp */
- dev_opp = _find_device_opp(dev);
- if (IS_ERR(dev_opp)) {
- r = PTR_ERR(dev_opp);
- dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r);
- goto unlock;
- }
-
- /* Do we have the frequency? */
- list_for_each_entry(tmp_opp, &dev_opp->opp_list, node) {
- if (tmp_opp->rate == freq) {
- opp = tmp_opp;
- break;
- }
- }
- if (IS_ERR(opp)) {
- r = PTR_ERR(opp);
- goto unlock;
- }
-
- /* Is update really needed? */
- if (opp->available == availability_req)
- goto unlock;
- /* copy the old data over */
- *new_opp = *opp;
-
- /* plug in new node */
- new_opp->available = availability_req;
-
- list_replace_rcu(&opp->node, &new_opp->node);
- mutex_unlock(&dev_opp_list_lock);
- call_srcu(&dev_opp->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu);
-
- /* Notify the change of the OPP availability */
- if (availability_req)
- srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ENABLE,
- new_opp);
- else
- srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_DISABLE,
- new_opp);
-
- return 0;
-
-unlock:
- mutex_unlock(&dev_opp_list_lock);
- kfree(new_opp);
- return r;
-}
-
-/**
- * dev_pm_opp_enable() - Enable a specific OPP
- * @dev: device for which we do this operation
- * @freq: OPP frequency to enable
- *
- * Enables a provided opp. If the operation is valid, this returns 0, else the
- * corresponding error value. It is meant to be used by users to make an OPP
- * available again after it was temporarily made unavailable with dev_pm_opp_disable.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function indirectly uses RCU and mutex locks to keep the
- * integrity of the internal data structures. Callers should ensure that
- * this function is *NOT* called under RCU protection or in contexts where
- * mutex locking or synchronize_rcu() blocking calls cannot be used.
- *
- * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modification was done OR modification was
- * successful.
- */
-int dev_pm_opp_enable(struct device *dev, unsigned long freq)
-{
- return _opp_set_availability(dev, freq, true);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_enable);
-
-/**
- * dev_pm_opp_disable() - Disable a specific OPP
- * @dev: device for which we do this operation
- * @freq: OPP frequency to disable
- *
- * Disables a provided opp. If the operation is valid, this returns
- * 0, else the corresponding error value. It is meant to be a temporary
- * control for users to make this OPP unavailable until the circumstances are
- * right to make it available again (with a call to dev_pm_opp_enable).
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function indirectly uses RCU and mutex locks to keep the
- * integrity of the internal data structures. Callers should ensure that
- * this function is *NOT* called under RCU protection or in contexts where
- * mutex locking or synchronize_rcu() blocking calls cannot be used.
- *
- * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modification was done OR modification was
- * successful.
- */
-int dev_pm_opp_disable(struct device *dev, unsigned long freq)
-{
- return _opp_set_availability(dev, freq, false);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_disable);
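/*
 * A hedged sketch pairing the enable/disable entry points documented above;
 * the device, hook, and 1.2 GHz top frequency are hypothetical.
 */
static void soc_throttle_top_opp(struct device *dev, bool too_hot)
{
	const unsigned long top_freq = 1200000000UL;	/* assumed top OPP */

	if (too_hot)
		dev_pm_opp_disable(dev, top_freq);
	else
		dev_pm_opp_enable(dev, top_freq);
}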
-
-/**
- * dev_pm_opp_get_notifier() - find notifier_head of the device with opp
- * @dev: device pointer used to lookup device OPPs.
- *
- * Return: pointer to notifier head if found, otherwise -ENODEV or
- * -EINVAL based on type of error, cast as a pointer. The value must be checked
- * with IS_ERR to determine whether it is a valid pointer or an error result.
- *
- * Locking: This function must be called under rcu_read_lock(). dev_opp is a RCU
- * protected pointer. The reason for the same is that the opp pointer which is
- * returned will remain valid for use with opp_get_{voltage, freq} only while
- * under the locked area. The pointer returned must be used prior to unlocking
- * with rcu_read_unlock() to maintain the integrity of the pointer.
- */
-struct srcu_notifier_head *dev_pm_opp_get_notifier(struct device *dev)
-{
- struct device_opp *dev_opp = _find_device_opp(dev);
-
- if (IS_ERR(dev_opp))
- return ERR_CAST(dev_opp); /* matching type */
-
- return &dev_opp->srcu_head;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_notifier);
-
-#ifdef CONFIG_OF
-/**
- * of_init_opp_table() - Initialize opp table from device tree
- * @dev: device pointer used to lookup device OPPs.
- *
- * Register the initial OPP table with the OPP library for the given device.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function indirectly uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
- *
- * Return:
- * 0 On success OR
- * Duplicate OPPs (both freq and volt are same) and opp->available
- * -EEXIST Freq is the same and volt is different OR
- * Duplicate OPPs (both freq and volt are same) and !opp->available
- * -ENOMEM Memory allocation failure
- * -ENODEV when 'operating-points' property is not found or contains invalid
- * data in the device node.
- * -ENODATA when empty 'operating-points' property is found
- */
-int of_init_opp_table(struct device *dev)
-{
- const struct property *prop;
- const __be32 *val;
- int nr;
-
- prop = of_find_property(dev->of_node, "operating-points", NULL);
- if (!prop)
- return -ENODEV;
- if (!prop->value)
- return -ENODATA;
-
- /*
- * Each OPP is a set of tuples consisting of frequency and
- * voltage like <freq-kHz vol-uV>.
- */
- nr = prop->length / sizeof(u32);
- if (nr % 2) {
- dev_err(dev, "%s: Invalid OPP list\n", __func__);
- return -EINVAL;
- }
-
- val = prop->value;
- while (nr) {
- unsigned long freq = be32_to_cpup(val++) * 1000;
- unsigned long volt = be32_to_cpup(val++);
-
- if (_opp_add_dynamic(dev, freq, volt, false))
- dev_warn(dev, "%s: Failed to add OPP %ld\n",
- __func__, freq);
- nr -= 2;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(of_init_opp_table);
-
-/**
- * of_free_opp_table() - Free OPP table entries created from static DT entries
- * @dev: device pointer used to lookup device OPPs.
- *
- * Free OPPs created using static entries present in DT.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Hence this function indirectly uses RCU updater strategy with mutex locks
- * to keep the integrity of the internal data structures. Callers should ensure
- * that this function is *NOT* called under RCU protection or in contexts where
- * mutex cannot be locked.
- */
-void of_free_opp_table(struct device *dev)
-{
- struct device_opp *dev_opp;
- struct dev_pm_opp *opp, *tmp;
-
- /* Check for existing list for 'dev' */
- dev_opp = _find_device_opp(dev);
- if (IS_ERR(dev_opp)) {
- int error = PTR_ERR(dev_opp);
- if (error != -ENODEV)
- WARN(1, "%s: dev_opp: %d\n",
- IS_ERR_OR_NULL(dev) ?
- "Invalid device" : dev_name(dev),
- error);
- return;
- }
-
- /* Hold our list modification lock here */
- mutex_lock(&dev_opp_list_lock);
-
- /* Free static OPPs */
- list_for_each_entry_safe(opp, tmp, &dev_opp->opp_list, node) {
- if (!opp->dynamic)
- _opp_remove(dev_opp, opp);
- }
-
- mutex_unlock(&dev_opp_list_lock);
-}
-EXPORT_SYMBOL_GPL(of_free_opp_table);
-#endif
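/*
 * The OPP API removed above is carried over into the new
 * drivers/base/power/opp/ directory below. A hedged probe-time sketch of the
 * same entry points; foo_probe() and the 800 MHz / 1.1 V fallback OPP are
 * hypothetical.
 */
#include <linux/pm_opp.h>
#include <linux/platform_device.h>

static int foo_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;

	/* Prefer the DT "operating-points" table; fall back to one static OPP. */
	if (of_init_opp_table(dev))
		return dev_pm_opp_add(dev, 800000000, 1100000);

	return 0;
}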
diff --git a/drivers/base/power/opp/Makefile b/drivers/base/power/opp/Makefile
new file mode 100644
index 000000000000..19837ef04d8e
--- /dev/null
+++ b/drivers/base/power/opp/Makefile
@@ -0,0 +1,3 @@
+ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG
+obj-y += core.o cpu.o
+obj-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
new file mode 100644
index 000000000000..333161fd623c
--- /dev/null
+++ b/drivers/base/power/opp/core.c
@@ -0,0 +1,2053 @@
+/*
+ * Generic OPP Interface
+ *
+ * Copyright (C) 2009-2010 Texas Instruments Incorporated.
+ * Nishanth Menon
+ * Romit Dasgupta
+ * Kevin Hilman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/clk.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/of.h>
+#include <linux/export.h>
+#include <linux/regulator/consumer.h>
+
+#include "opp.h"
+
+/*
+ * The root of the list of all opp-tables. All opp_table structures branch off
+ * from here, with each opp_table containing the list of opps it supports in
+ * various states of availability.
+ */
+static LIST_HEAD(opp_tables);
+/* Lock to allow exclusive modification to the device and opp lists */
+DEFINE_MUTEX(opp_table_lock);
+
+#define opp_rcu_lockdep_assert() \
+do { \
+ rcu_lockdep_assert(rcu_read_lock_held() || \
+ lockdep_is_held(&opp_table_lock), \
+ "Missing rcu_read_lock() or " \
+ "opp_table_lock protection"); \
+} while (0)
+
+static struct opp_device *_find_opp_dev(const struct device *dev,
+ struct opp_table *opp_table)
+{
+ struct opp_device *opp_dev;
+
+ list_for_each_entry(opp_dev, &opp_table->dev_list, node)
+ if (opp_dev->dev == dev)
+ return opp_dev;
+
+ return NULL;
+}
+
+static struct opp_table *_managed_opp(const struct device_node *np)
+{
+ struct opp_table *opp_table;
+
+ list_for_each_entry_rcu(opp_table, &opp_tables, node) {
+ if (opp_table->np == np) {
+ /*
+ * Multiple devices can point to the same OPP table and
+ * so will have the same node pointer, np.
+ *
+ * But the OPPs will be considered shared only if the
+ * OPP table contains an "opp-shared" property.
+ */
+ return opp_table->shared_opp ? opp_table : NULL;
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * _find_opp_table() - find opp_table struct using device pointer
+ * @dev: device pointer used to lookup OPP table
+ *
+ * Search the OPP tables for one containing the matching device. Does an RCU
+ * reader operation to grab the pointer needed.
+ *
+ * Return: pointer to 'struct opp_table' if found, otherwise -ENODEV or
+ * -EINVAL based on type of error.
+ *
+ * Locking: For readers, this function must be called under rcu_read_lock().
+ * opp_table is a RCU protected pointer, which means that opp_table is valid
+ * as long as we are under RCU lock.
+ *
+ * For Writers, this function must be called with opp_table_lock held.
+ */
+struct opp_table *_find_opp_table(struct device *dev)
+{
+ struct opp_table *opp_table;
+
+ opp_rcu_lockdep_assert();
+
+ if (IS_ERR_OR_NULL(dev)) {
+ pr_err("%s: Invalid parameters\n", __func__);
+ return ERR_PTR(-EINVAL);
+ }
+
+ list_for_each_entry_rcu(opp_table, &opp_tables, node)
+ if (_find_opp_dev(dev, opp_table))
+ return opp_table;
+
+ return ERR_PTR(-ENODEV);
+}
+
+/**
+ * dev_pm_opp_get_voltage() - Gets the voltage corresponding to an opp
+ * @opp: opp for which the voltage has to be returned
+ *
+ * Return: voltage in micro volt corresponding to the opp, else
+ * return 0
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. This means that opp which could have been fetched by
+ * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are
+ * under RCU lock. The pointer returned by the opp_find_freq family must be
+ * used in the same section as the usage of this function with the pointer
+ * prior to unlocking with rcu_read_unlock() to maintain the integrity of the
+ * pointer.
+ */
+unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
+{
+ struct dev_pm_opp *tmp_opp;
+ unsigned long v = 0;
+
+ opp_rcu_lockdep_assert();
+
+ tmp_opp = rcu_dereference(opp);
+ if (IS_ERR_OR_NULL(tmp_opp))
+ pr_err("%s: Invalid parameters\n", __func__);
+ else
+ v = tmp_opp->u_volt;
+
+ return v;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_voltage);
+
+/**
+ * dev_pm_opp_get_freq() - Gets the frequency corresponding to an available opp
+ * @opp: opp for which the frequency has to be returned
+ *
+ * Return: frequency in hertz corresponding to the opp, else
+ * return 0
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. This means that opp which could have been fetched by
+ * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are
+ * under RCU lock. The pointer returned by the opp_find_freq family must be
+ * used in the same section as the usage of this function with the pointer
+ * prior to unlocking with rcu_read_unlock() to maintain the integrity of the
+ * pointer.
+ */
+unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
+{
+ struct dev_pm_opp *tmp_opp;
+ unsigned long f = 0;
+
+ opp_rcu_lockdep_assert();
+
+ tmp_opp = rcu_dereference(opp);
+ if (IS_ERR_OR_NULL(tmp_opp) || !tmp_opp->available)
+ pr_err("%s: Invalid parameters\n", __func__);
+ else
+ f = tmp_opp->rate;
+
+ return f;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_freq);
+
+/**
+ * dev_pm_opp_is_turbo() - Returns whether opp is a turbo OPP or not
+ * @opp: opp for which turbo mode is being verified
+ *
+ * Turbo OPPs are not for normal use, and can be enabled (under certain
+ * conditions) for short durations to finish high throughput work
+ * quickly. Running on them for longer times may overheat the chip.
+ *
+ * Return: true if opp is turbo opp, else false.
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. This means that opp which could have been fetched by
+ * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are
+ * under RCU lock. The pointer returned by the opp_find_freq family must be
+ * used in the same section as the usage of this function with the pointer
+ * prior to unlocking with rcu_read_unlock() to maintain the integrity of the
+ * pointer.
+ */
+bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp)
+{
+ struct dev_pm_opp *tmp_opp;
+
+ opp_rcu_lockdep_assert();
+
+ tmp_opp = rcu_dereference(opp);
+ if (IS_ERR_OR_NULL(tmp_opp) || !tmp_opp->available) {
+ pr_err("%s: Invalid parameters\n", __func__);
+ return false;
+ }
+
+ return tmp_opp->turbo;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_is_turbo);
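/*
 * A hedged consumer sketch for the turbo check above; foo_opp_sustainable()
 * is hypothetical. Both calls must sit inside one RCU read-side section,
 * per the locking rules in the kerneldoc.
 */
static bool foo_opp_sustainable(struct device *dev, unsigned long *freq)
{
	struct dev_pm_opp *opp;
	bool ok = false;

	rcu_read_lock();
	opp = dev_pm_opp_find_freq_ceil(dev, freq);
	if (!IS_ERR(opp))
		ok = !dev_pm_opp_is_turbo(opp);
	rcu_read_unlock();

	return ok;
}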
+
+/**
+ * dev_pm_opp_get_max_clock_latency() - Get max clock latency in nanoseconds
+ * @dev: device for which we do this operation
+ *
+ * Return: This function returns the max clock latency in nanoseconds.
+ *
+ * Locking: This function takes rcu_read_lock().
+ */
+unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
+{
+ struct opp_table *opp_table;
+ unsigned long clock_latency_ns;
+
+ rcu_read_lock();
+
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table))
+ clock_latency_ns = 0;
+ else
+ clock_latency_ns = opp_table->clock_latency_ns_max;
+
+ rcu_read_unlock();
+ return clock_latency_ns;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
+
+/**
+ * dev_pm_opp_get_max_volt_latency() - Get max voltage latency in nanoseconds
+ * @dev: device for which we do this operation
+ *
+ * Return: This function returns the max voltage latency in nanoseconds.
+ *
+ * Locking: This function takes rcu_read_lock().
+ */
+unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
+{
+ struct opp_table *opp_table;
+ struct dev_pm_opp *opp;
+ struct regulator *reg;
+ unsigned long latency_ns = 0;
+ unsigned long min_uV = ~0, max_uV = 0;
+ int ret;
+
+ rcu_read_lock();
+
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table)) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ reg = opp_table->regulator;
+ if (IS_ERR(reg)) {
+ /* Regulator may not be required for device */
+ if (reg)
+ dev_err(dev, "%s: Invalid regulator (%ld)\n", __func__,
+ PTR_ERR(reg));
+ rcu_read_unlock();
+ return 0;
+ }
+
+ list_for_each_entry_rcu(opp, &opp_table->opp_list, node) {
+ if (!opp->available)
+ continue;
+
+ if (opp->u_volt_min < min_uV)
+ min_uV = opp->u_volt_min;
+ if (opp->u_volt_max > max_uV)
+ max_uV = opp->u_volt_max;
+ }
+
+ rcu_read_unlock();
+
+ /*
+ * The caller needs to ensure that opp_table (and hence the regulator)
+ * isn't freed while we are executing this routine.
+ */
+ ret = regulator_set_voltage_time(reg, min_uV, max_uV);
+ if (ret > 0)
+ latency_ns = ret * 1000;
+
+ return latency_ns;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_volt_latency);
+
+/**
+ * dev_pm_opp_get_max_transition_latency() - Get max transition latency in
+ * nanoseconds
+ * @dev: device for which we do this operation
+ *
+ * Return: This function returns the max transition latency, in nanoseconds, to
+ * switch from one OPP to another.
+ *
+ * Locking: This function takes rcu_read_lock().
+ */
+unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev)
+{
+ return dev_pm_opp_get_max_volt_latency(dev) +
+ dev_pm_opp_get_max_clock_latency(dev);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_transition_latency);
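/*
 * A hedged sketch of a governor-style consumer of the combined latency;
 * the 100x policy and foo_polling_interval_us() are invented for
 * illustration.
 */
static unsigned int foo_polling_interval_us(struct device *dev)
{
	unsigned long lat_ns = dev_pm_opp_get_max_transition_latency(dev);

	/* Poll at ~100x the worst-case switch time (ns -> us), min 1 ms. */
	return max(lat_ns / 10, 1000UL);
}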
+
+/**
+ * dev_pm_opp_get_suspend_opp() - Get suspend opp
+ * @dev: device for which we do this operation
+ *
+ * Return: This function returns pointer to the suspend opp if it is
+ * defined and available, otherwise it returns NULL.
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. The reason for the same is that the opp pointer which is
+ * returned will remain valid for use with opp_get_{voltage, freq} only while
+ * under the locked area. The pointer returned must be used prior to unlocking
+ * with rcu_read_unlock() to maintain the integrity of the pointer.
+ */
+struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
+{
+ struct opp_table *opp_table;
+
+ opp_rcu_lockdep_assert();
+
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table) || !opp_table->suspend_opp ||
+ !opp_table->suspend_opp->available)
+ return NULL;
+
+ return opp_table->suspend_opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp);
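/*
 * A hedged sketch: latch the suspend frequency, if the platform defined one;
 * foo_suspend_freq() is hypothetical. The lookup and the freq read share one
 * RCU read-side section, as the kerneldoc above requires.
 */
static unsigned long foo_suspend_freq(struct device *dev)
{
	struct dev_pm_opp *opp;
	unsigned long freq = 0;

	rcu_read_lock();
	opp = dev_pm_opp_get_suspend_opp(dev);
	if (opp)
		freq = dev_pm_opp_get_freq(opp);
	rcu_read_unlock();

	return freq;
}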
+
+/**
+ * dev_pm_opp_get_opp_count() - Get number of opps available in the opp table
+ * @dev: device for which we do this operation
+ *
+ * Return: This function returns the number of available opps if there are any,
+ * else returns 0 if there are none, or the corresponding error value.
+ *
+ * Locking: This function takes rcu_read_lock().
+ */
+int dev_pm_opp_get_opp_count(struct device *dev)
+{
+ struct opp_table *opp_table;
+ struct dev_pm_opp *temp_opp;
+ int count = 0;
+
+ rcu_read_lock();
+
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table)) {
+ count = PTR_ERR(opp_table);
+ dev_err(dev, "%s: OPP table not found (%d)\n",
+ __func__, count);
+ goto out_unlock;
+ }
+
+ list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) {
+ if (temp_opp->available)
+ count++;
+ }
+
+out_unlock:
+ rcu_read_unlock();
+ return count;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_count);
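/*
 * A hedged sketch: size a frequency table from the available OPP count;
 * foo_alloc_freq_table() is hypothetical. Note the count can change after
 * this returns, so a real caller revalidates while filling the table.
 */
#include <linux/slab.h>

static int foo_alloc_freq_table(struct device *dev, unsigned long **table)
{
	int count = dev_pm_opp_get_opp_count(dev);

	if (count <= 0)
		return count ? count : -ENODEV;

	*table = kcalloc(count, sizeof(**table), GFP_KERNEL);
	return *table ? count : -ENOMEM;
}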
+
+/**
+ * dev_pm_opp_find_freq_exact() - search for an exact frequency
+ * @dev: device for which we do this operation
+ * @freq: frequency to search for
+ * @available: true/false - match for available opp
+ *
+ * Return: Searches for exact match in the opp table and returns pointer to the
+ * matching opp if found, else returns ERR_PTR in case of error and should
+ * be handled using IS_ERR. Error return values can be:
+ * EINVAL: for bad pointer
+ * ERANGE: no match found for search
+ * ENODEV: if device not found in list of registered devices
+ *
+ * Note: available is a modifier for the search. If available=true, then the
+ * match is for an exact matching frequency which is available in the stored OPP
+ * table. If false, the match is for an exact frequency which is not available.
+ *
+ * This provides a mechanism to enable an opp which is not available currently
+ * or the opposite as well.
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. The reason for the same is that the opp pointer which is
+ * returned will remain valid for use with opp_get_{voltage, freq} only while
+ * under the locked area. The pointer returned must be used prior to unlocking
+ * with rcu_read_unlock() to maintain the integrity of the pointer.
+ */
+struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
+ unsigned long freq,
+ bool available)
+{
+ struct opp_table *opp_table;
+ struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
+
+ opp_rcu_lockdep_assert();
+
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table)) {
+ int r = PTR_ERR(opp_table);
+
+ dev_err(dev, "%s: OPP table not found (%d)\n", __func__, r);
+ return ERR_PTR(r);
+ }
+
+ list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) {
+ if (temp_opp->available == available &&
+ temp_opp->rate == freq) {
+ opp = temp_opp;
+ break;
+ }
+ }
+
+ return opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact);
+
+/**
+ * dev_pm_opp_find_freq_ceil() - Search for a rounded ceil freq
+ * @dev: device for which we do this operation
+ * @freq: Start frequency
+ *
+ * Search for the matching ceil *available* OPP from a starting freq
+ * for a device.
+ *
+ * Return: matching *opp and refreshes *freq accordingly, else returns
+ * ERR_PTR in case of error and should be handled using IS_ERR. Error return
+ * values can be:
+ * EINVAL: for bad pointer
+ * ERANGE: no match found for search
+ * ENODEV: if device not found in list of registered devices
+ *
+ * Locking: This function must be called under rcu_read_lock(). The returned
+ * opp is an RCU-protected pointer: it remains valid for use with
+ * opp_get_{voltage, freq} only while the read-side lock is held, so it must
+ * be used prior to unlocking with rcu_read_unlock() to maintain its
+ * integrity.
+ */
+struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
+ unsigned long *freq)
+{
+ struct opp_table *opp_table;
+ struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
+
+ opp_rcu_lockdep_assert();
+
+ if (!dev || !freq) {
+ dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
+ return ERR_PTR(-EINVAL);
+ }
+
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table))
+ return ERR_CAST(opp_table);
+
+ list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) {
+ if (temp_opp->available && temp_opp->rate >= *freq) {
+ opp = temp_opp;
+ *freq = opp->rate;
+ break;
+ }
+ }
+
+ return opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_ceil);
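+
+/*
+ * Illustrative sketch of a hypothetical caller: because
+ * dev_pm_opp_find_freq_ceil() refreshes *freq to the matched rate, bumping
+ * the frequency by one after each hit walks all available OPPs in ascending
+ * order.
+ */
+static void foo_list_opps(struct device *dev)
+{
+	struct dev_pm_opp *opp;
+	unsigned long freq = 0;
+
+	rcu_read_lock();
+	while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) {
+		dev_dbg(dev, "available OPP: %lu Hz\n", freq);
+		freq++;
+	}
+	rcu_read_unlock();
+}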
+
+/**
+ * dev_pm_opp_find_freq_floor() - Search for a rounded floor freq
+ * @dev: device for which we do this operation
+ * @freq: Start frequency
+ *
+ * Search for the matching floor *available* OPP from a starting freq
+ * for a device.
+ *
+ * Return: matching *opp and updates *freq to the matched frequency, else
+ * returns an ERR_PTR() which must be checked with IS_ERR(). Possible error
+ * values are:
+ * EINVAL: for bad pointer
+ * ERANGE: no match found for search
+ * ENODEV: if device not found in list of registered devices
+ *
+ * Locking: This function must be called under rcu_read_lock(). The returned
+ * opp is an RCU-protected pointer: it remains valid for use with
+ * opp_get_{voltage, freq} only while the read-side lock is held, so it must
+ * be used prior to unlocking with rcu_read_unlock() to maintain its
+ * integrity.
+ */
+struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
+ unsigned long *freq)
+{
+ struct opp_table *opp_table;
+ struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
+
+ opp_rcu_lockdep_assert();
+
+ if (!dev || !freq) {
+ dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
+ return ERR_PTR(-EINVAL);
+ }
+
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table))
+ return ERR_CAST(opp_table);
+
+ list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) {
+ if (temp_opp->available) {
+ /* go to the next node, before choosing prev */
+ if (temp_opp->rate > *freq)
+ break;
+ else
+ opp = temp_opp;
+ }
+ }
+ if (!IS_ERR(opp))
+ *freq = opp->rate;
+
+ return opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor);
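+
+/*
+ * Illustrative sketch of a hypothetical caller (assumes <linux/kernel.h> for
+ * ULONG_MAX): starting the floor search from ULONG_MAX yields the highest
+ * available OPP.
+ */
+static unsigned long foo_get_max_freq(struct device *dev)
+{
+	struct dev_pm_opp *opp;
+	unsigned long freq = ULONG_MAX;
+
+	rcu_read_lock();
+	opp = dev_pm_opp_find_freq_floor(dev, &freq);
+	rcu_read_unlock();
+
+	return IS_ERR(opp) ? 0 : freq;
+}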
+
+/*
+ * The caller needs to ensure that opp_table (and hence the clk) isn't freed
+ * while the clk returned here is in use.
+ */
+static struct clk *_get_opp_clk(struct device *dev)
+{
+ struct opp_table *opp_table;
+ struct clk *clk;
+
+ rcu_read_lock();
+
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table)) {
+ dev_err(dev, "%s: device opp doesn't exist\n", __func__);
+ clk = ERR_CAST(opp_table);
+ goto unlock;
+ }
+
+ clk = opp_table->clk;
+ if (IS_ERR(clk))
+ dev_err(dev, "%s: No clock available for the device\n",
+ __func__);
+
+unlock:
+ rcu_read_unlock();
+ return clk;
+}
+
+static int _set_opp_voltage(struct device *dev, struct regulator *reg,
+ unsigned long u_volt, unsigned long u_volt_min,
+ unsigned long u_volt_max)
+{
+ int ret;
+
+ /* Regulator not available for device */
+ if (IS_ERR(reg)) {
+ dev_dbg(dev, "%s: regulator not available: %ld\n", __func__,
+ PTR_ERR(reg));
+ return 0;
+ }
+
+ dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__, u_volt_min,
+ u_volt, u_volt_max);
+
+ ret = regulator_set_voltage_triplet(reg, u_volt_min, u_volt,
+ u_volt_max);
+ if (ret)
+ dev_err(dev, "%s: failed to set voltage (%lu %lu %lu mV): %d\n",
+ __func__, u_volt_min, u_volt, u_volt_max, ret);
+
+ return ret;
+}
+
+/**
+ * dev_pm_opp_set_rate() - Configure new OPP based on frequency
+ * @dev: device for which we do this operation
+ * @target_freq: frequency to achieve
+ *
+ * This configures the power-supplies and clock source to the levels specified
+ * by the OPP corresponding to the target_freq.
+ *
+ * Locking: This function takes rcu_read_lock().
+ */
+int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
+{
+ struct opp_table *opp_table;
+ struct dev_pm_opp *old_opp, *opp;
+ struct regulator *reg;
+ struct clk *clk;
+ unsigned long freq, old_freq;
+ unsigned long u_volt, u_volt_min, u_volt_max;
+ unsigned long ou_volt, ou_volt_min, ou_volt_max;
+ int ret;
+
+ if (unlikely(!target_freq)) {
+ dev_err(dev, "%s: Invalid target frequency %lu\n", __func__,
+ target_freq);
+ return -EINVAL;
+ }
+
+ clk = _get_opp_clk(dev);
+ if (IS_ERR(clk))
+ return PTR_ERR(clk);
+
+ freq = clk_round_rate(clk, target_freq);
+ if ((long)freq <= 0)
+ freq = target_freq;
+
+ old_freq = clk_get_rate(clk);
+
+ /* Return early if nothing to do */
+ if (old_freq == freq) {
+ dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n",
+ __func__, freq);
+ return 0;
+ }
+
+ rcu_read_lock();
+
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table)) {
+ dev_err(dev, "%s: device opp doesn't exist\n", __func__);
+ rcu_read_unlock();
+ return PTR_ERR(opp_table);
+ }
+
+ old_opp = dev_pm_opp_find_freq_ceil(dev, &old_freq);
+ if (!IS_ERR(old_opp)) {
+ ou_volt = old_opp->u_volt;
+ ou_volt_min = old_opp->u_volt_min;
+ ou_volt_max = old_opp->u_volt_max;
+ } else {
+ dev_err(dev, "%s: failed to find current OPP for freq %lu (%ld)\n",
+ __func__, old_freq, PTR_ERR(old_opp));
+ }
+
+ opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+ if (IS_ERR(opp)) {
+ ret = PTR_ERR(opp);
+ dev_err(dev, "%s: failed to find OPP for freq %lu (%d)\n",
+ __func__, freq, ret);
+ rcu_read_unlock();
+ return ret;
+ }
+
+ u_volt = opp->u_volt;
+ u_volt_min = opp->u_volt_min;
+ u_volt_max = opp->u_volt_max;
+
+ reg = opp_table->regulator;
+
+ rcu_read_unlock();
+
+ /* Scaling up? Scale voltage before frequency */
+ if (freq > old_freq) {
+ ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min,
+ u_volt_max);
+ if (ret)
+ goto restore_voltage;
+ }
+
+ /* Change frequency */
+
+ dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n",
+ __func__, old_freq, freq);
+
+ ret = clk_set_rate(clk, freq);
+ if (ret) {
+ dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
+ ret);
+ goto restore_voltage;
+ }
+
+ /* Scaling down? Scale voltage after frequency */
+ if (freq < old_freq) {
+ ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min,
+ u_volt_max);
+ if (ret)
+ goto restore_freq;
+ }
+
+ return 0;
+
+restore_freq:
+ if (clk_set_rate(clk, old_freq))
+ dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
+ __func__, old_freq);
+restore_voltage:
+ /* This shouldn't harm even if the voltages weren't updated earlier */
+ if (!IS_ERR(old_opp))
+ _set_opp_voltage(dev, reg, ou_volt, ou_volt_min, ou_volt_max);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate);
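+
+/*
+ * Illustrative sketch of a hypothetical caller: since the regulator/clock
+ * sequencing is handled internally, a governor-driven frequency change
+ * reduces to a single call.
+ */
+static int foo_set_target(struct device *dev, unsigned long target_freq)
+{
+	int ret = dev_pm_opp_set_rate(dev, target_freq);
+
+	if (ret)
+		dev_err(dev, "failed to switch OPP: %d\n", ret);
+
+	return ret;
+}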
+
+/* OPP-dev Helpers */
+static void _kfree_opp_dev_rcu(struct rcu_head *head)
+{
+ struct opp_device *opp_dev;
+
+ opp_dev = container_of(head, struct opp_device, rcu_head);
+ kfree_rcu(opp_dev, rcu_head);
+}
+
+static void _remove_opp_dev(struct opp_device *opp_dev,
+ struct opp_table *opp_table)
+{
+ opp_debug_unregister(opp_dev, opp_table);
+ list_del(&opp_dev->node);
+ call_srcu(&opp_table->srcu_head.srcu, &opp_dev->rcu_head,
+ _kfree_opp_dev_rcu);
+}
+
+struct opp_device *_add_opp_dev(const struct device *dev,
+ struct opp_table *opp_table)
+{
+ struct opp_device *opp_dev;
+ int ret;
+
+ opp_dev = kzalloc(sizeof(*opp_dev), GFP_KERNEL);
+ if (!opp_dev)
+ return NULL;
+
+ /* Initialize opp-dev */
+ opp_dev->dev = dev;
+ list_add_rcu(&opp_dev->node, &opp_table->dev_list);
+
+ /* Create debugfs entries for the opp_table */
+ ret = opp_debug_register(opp_dev, opp_table);
+ if (ret)
+ dev_err(dev, "%s: Failed to register opp debugfs (%d)\n",
+ __func__, ret);
+
+ return opp_dev;
+}
+
+/**
+ * _add_opp_table() - Find OPP table or allocate a new one
+ * @dev: device for which we do this operation
+ *
+ * It tries to find an existing table first; if it cannot find one, it
+ * allocates a new OPP table and returns that.
+ *
+ * Return: valid opp_table pointer on success, else NULL.
+ */
+static struct opp_table *_add_opp_table(struct device *dev)
+{
+ struct opp_table *opp_table;
+ struct opp_device *opp_dev;
+ struct device_node *np;
+ int ret;
+
+ /* Check for existing table for 'dev' first */
+ opp_table = _find_opp_table(dev);
+ if (!IS_ERR(opp_table))
+ return opp_table;
+
+ /*
+	 * Allocate a new OPP table. In the infrequent case where a new
+	 * device needs to be added, we pay this penalty.
+ */
+ opp_table = kzalloc(sizeof(*opp_table), GFP_KERNEL);
+ if (!opp_table)
+ return NULL;
+
+ INIT_LIST_HEAD(&opp_table->dev_list);
+
+ opp_dev = _add_opp_dev(dev, opp_table);
+ if (!opp_dev) {
+ kfree(opp_table);
+ return NULL;
+ }
+
+ /*
+ * Only required for backward compatibility with v1 bindings, but isn't
+	 * harmful for other cases, so we do it unconditionally.
+ */
+ np = of_node_get(dev->of_node);
+ if (np) {
+ u32 val;
+
+ if (!of_property_read_u32(np, "clock-latency", &val))
+ opp_table->clock_latency_ns_max = val;
+ of_property_read_u32(np, "voltage-tolerance",
+ &opp_table->voltage_tolerance_v1);
+ of_node_put(np);
+ }
+
+ /* Set regulator to a non-NULL error value */
+ opp_table->regulator = ERR_PTR(-ENXIO);
+
+ /* Find clk for the device */
+ opp_table->clk = clk_get(dev, NULL);
+ if (IS_ERR(opp_table->clk)) {
+ ret = PTR_ERR(opp_table->clk);
+ if (ret != -EPROBE_DEFER)
+ dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__,
+ ret);
+ }
+
+ srcu_init_notifier_head(&opp_table->srcu_head);
+ INIT_LIST_HEAD(&opp_table->opp_list);
+
+ /* Secure the device table modification */
+ list_add_rcu(&opp_table->node, &opp_tables);
+ return opp_table;
+}
+
+/**
+ * _kfree_device_rcu() - Free opp_table RCU handler
+ * @head: RCU head
+ */
+static void _kfree_device_rcu(struct rcu_head *head)
+{
+ struct opp_table *opp_table = container_of(head, struct opp_table,
+ rcu_head);
+
+ kfree_rcu(opp_table, rcu_head);
+}
+
+/**
+ * _remove_opp_table() - Removes an OPP table
+ * @opp_table: OPP table to be removed.
+ *
+ * Removes/frees OPP table if it doesn't contain any OPPs.
+ */
+static void _remove_opp_table(struct opp_table *opp_table)
+{
+ struct opp_device *opp_dev;
+
+ if (!list_empty(&opp_table->opp_list))
+ return;
+
+ if (opp_table->supported_hw)
+ return;
+
+ if (opp_table->prop_name)
+ return;
+
+ if (!IS_ERR(opp_table->regulator))
+ return;
+
+ /* Release clk */
+ if (!IS_ERR(opp_table->clk))
+ clk_put(opp_table->clk);
+
+ opp_dev = list_first_entry(&opp_table->dev_list, struct opp_device,
+ node);
+
+ _remove_opp_dev(opp_dev, opp_table);
+
+ /* dev_list must be empty now */
+ WARN_ON(!list_empty(&opp_table->dev_list));
+
+ list_del_rcu(&opp_table->node);
+ call_srcu(&opp_table->srcu_head.srcu, &opp_table->rcu_head,
+ _kfree_device_rcu);
+}
+
+/**
+ * _kfree_opp_rcu() - Free OPP RCU handler
+ * @head: RCU head
+ */
+static void _kfree_opp_rcu(struct rcu_head *head)
+{
+ struct dev_pm_opp *opp = container_of(head, struct dev_pm_opp, rcu_head);
+
+ kfree_rcu(opp, rcu_head);
+}
+
+/**
+ * _opp_remove() - Remove an OPP from a table definition
+ * @opp_table: points back to the opp_table struct this opp belongs to
+ * @opp: pointer to the OPP to remove
+ * @notify:	whether an OPP_EVENT_REMOVE notification should be sent
+ *
+ * This function removes an opp definition from the opp table.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * It is assumed that the caller holds required mutex for an RCU updater
+ * strategy.
+ */
+static void _opp_remove(struct opp_table *opp_table,
+ struct dev_pm_opp *opp, bool notify)
+{
+ /*
+ * Notify the changes in the availability of the operable
+ * frequency/voltage list.
+ */
+ if (notify)
+ srcu_notifier_call_chain(&opp_table->srcu_head,
+ OPP_EVENT_REMOVE, opp);
+ opp_debug_remove_one(opp);
+ list_del_rcu(&opp->node);
+ call_srcu(&opp_table->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu);
+
+ _remove_opp_table(opp_table);
+}
+
+/**
+ * dev_pm_opp_remove() - Remove an OPP from OPP table
+ * @dev: device for which we do this operation
+ * @freq: OPP to remove with matching 'freq'
+ *
+ * This function removes an opp from the opp table.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_remove(struct device *dev, unsigned long freq)
+{
+ struct dev_pm_opp *opp;
+ struct opp_table *opp_table;
+ bool found = false;
+
+ /* Hold our table modification lock here */
+ mutex_lock(&opp_table_lock);
+
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table))
+ goto unlock;
+
+ list_for_each_entry(opp, &opp_table->opp_list, node) {
+ if (opp->rate == freq) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ dev_warn(dev, "%s: Couldn't find OPP with freq: %lu\n",
+ __func__, freq);
+ goto unlock;
+ }
+
+ _opp_remove(opp_table, opp, true);
+unlock:
+ mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_remove);
+
+static struct dev_pm_opp *_allocate_opp(struct device *dev,
+ struct opp_table **opp_table)
+{
+ struct dev_pm_opp *opp;
+
+ /* allocate new OPP node */
+ opp = kzalloc(sizeof(*opp), GFP_KERNEL);
+ if (!opp)
+ return NULL;
+
+ INIT_LIST_HEAD(&opp->node);
+
+ *opp_table = _add_opp_table(dev);
+ if (!*opp_table) {
+ kfree(opp);
+ return NULL;
+ }
+
+ return opp;
+}
+
+static bool _opp_supported_by_regulators(struct dev_pm_opp *opp,
+ struct opp_table *opp_table)
+{
+ struct regulator *reg = opp_table->regulator;
+
+ if (!IS_ERR(reg) &&
+ !regulator_is_supported_voltage(reg, opp->u_volt_min,
+ opp->u_volt_max)) {
+ pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n",
+ __func__, opp->u_volt_min, opp->u_volt_max);
+ return false;
+ }
+
+ return true;
+}
+
+static int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
+ struct opp_table *opp_table)
+{
+ struct dev_pm_opp *opp;
+ struct list_head *head = &opp_table->opp_list;
+ int ret;
+
+ /*
+ * Insert new OPP in order of increasing frequency and discard if
+ * already present.
+ *
+	 * Need to use &opp_table->opp_list in the condition part of the 'for'
+	 * loop; don't replace it with head, otherwise it will become an
+	 * infinite loop.
+ */
+ list_for_each_entry_rcu(opp, &opp_table->opp_list, node) {
+ if (new_opp->rate > opp->rate) {
+ head = &opp->node;
+ continue;
+ }
+
+ if (new_opp->rate < opp->rate)
+ break;
+
+ /* Duplicate OPPs */
+ dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n",
+ __func__, opp->rate, opp->u_volt, opp->available,
+ new_opp->rate, new_opp->u_volt, new_opp->available);
+
+ return opp->available && new_opp->u_volt == opp->u_volt ?
+ 0 : -EEXIST;
+ }
+
+ new_opp->opp_table = opp_table;
+ list_add_rcu(&new_opp->node, head);
+
+ ret = opp_debug_create_one(new_opp, opp_table);
+ if (ret)
+ dev_err(dev, "%s: Failed to register opp to debugfs (%d)\n",
+ __func__, ret);
+
+ if (!_opp_supported_by_regulators(new_opp, opp_table)) {
+ new_opp->available = false;
+ dev_warn(dev, "%s: OPP not supported by regulators (%lu)\n",
+ __func__, new_opp->rate);
+ }
+
+ return 0;
+}
+
+/**
+ * _opp_add_v1() - Allocate an OPP based on v1 bindings.
+ * @dev: device for which we do this operation
+ * @freq: Frequency in Hz for this OPP
+ * @u_volt: Voltage in uVolts for this OPP
+ * @dynamic: Dynamically added OPPs.
+ *
+ * This function adds an opp definition to the opp table and returns status.
+ * The opp is made available by default and it can be controlled using
+ * dev_pm_opp_enable/disable functions and may be removed by dev_pm_opp_remove.
+ *
+ * NOTE: "dynamic" parameter impacts OPPs added by the dev_pm_opp_of_add_table
+ * and freed by dev_pm_opp_of_remove_table.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ *
+ * Return:
+ * 0		On success, OR
+ *		duplicate OPPs (both freq and volt are the same) and opp->available
+ * -EEXIST	Freq is the same but volt differs, OR
+ *		duplicate OPPs (both freq and volt are the same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ */
+static int _opp_add_v1(struct device *dev, unsigned long freq, long u_volt,
+ bool dynamic)
+{
+ struct opp_table *opp_table;
+ struct dev_pm_opp *new_opp;
+ unsigned long tol;
+ int ret;
+
+ /* Hold our table modification lock here */
+ mutex_lock(&opp_table_lock);
+
+ new_opp = _allocate_opp(dev, &opp_table);
+ if (!new_opp) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ /* populate the opp table */
+ new_opp->rate = freq;
+ tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
+ new_opp->u_volt = u_volt;
+ new_opp->u_volt_min = u_volt - tol;
+ new_opp->u_volt_max = u_volt + tol;
+ new_opp->available = true;
+ new_opp->dynamic = dynamic;
+
+ ret = _opp_add(dev, new_opp, opp_table);
+ if (ret)
+ goto free_opp;
+
+ mutex_unlock(&opp_table_lock);
+
+ /*
+ * Notify the changes in the availability of the operable
+ * frequency/voltage list.
+ */
+ srcu_notifier_call_chain(&opp_table->srcu_head, OPP_EVENT_ADD, new_opp);
+ return 0;
+
+free_opp:
+ _opp_remove(opp_table, new_opp, false);
+unlock:
+ mutex_unlock(&opp_table_lock);
+ return ret;
+}
+
+/* TODO: Support multiple regulators */
+static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
+ struct opp_table *opp_table)
+{
+ u32 microvolt[3] = {0};
+ u32 val;
+ int count, ret;
+ struct property *prop = NULL;
+ char name[NAME_MAX];
+
+ /* Search for "opp-microvolt-<name>" */
+ if (opp_table->prop_name) {
+ snprintf(name, sizeof(name), "opp-microvolt-%s",
+ opp_table->prop_name);
+ prop = of_find_property(opp->np, name, NULL);
+ }
+
+ if (!prop) {
+ /* Search for "opp-microvolt" */
+ sprintf(name, "opp-microvolt");
+ prop = of_find_property(opp->np, name, NULL);
+
+ /* Missing property isn't a problem, but an invalid entry is */
+ if (!prop)
+ return 0;
+ }
+
+ count = of_property_count_u32_elems(opp->np, name);
+ if (count < 0) {
+ dev_err(dev, "%s: Invalid %s property (%d)\n",
+ __func__, name, count);
+ return count;
+ }
+
+ /* There can be one or three elements here */
+ if (count != 1 && count != 3) {
+ dev_err(dev, "%s: Invalid number of elements in %s property (%d)\n",
+ __func__, name, count);
+ return -EINVAL;
+ }
+
+ ret = of_property_read_u32_array(opp->np, name, microvolt, count);
+ if (ret) {
+ dev_err(dev, "%s: error parsing %s: %d\n", __func__, name, ret);
+ return -EINVAL;
+ }
+
+ opp->u_volt = microvolt[0];
+
+ if (count == 1) {
+ opp->u_volt_min = opp->u_volt;
+ opp->u_volt_max = opp->u_volt;
+ } else {
+ opp->u_volt_min = microvolt[1];
+ opp->u_volt_max = microvolt[2];
+ }
+
+ /* Search for "opp-microamp-<name>" */
+ prop = NULL;
+ if (opp_table->prop_name) {
+ snprintf(name, sizeof(name), "opp-microamp-%s",
+ opp_table->prop_name);
+ prop = of_find_property(opp->np, name, NULL);
+ }
+
+ if (!prop) {
+ /* Search for "opp-microamp" */
+ sprintf(name, "opp-microamp");
+ prop = of_find_property(opp->np, name, NULL);
+ }
+
+ if (prop && !of_property_read_u32(opp->np, name, &val))
+ opp->u_amp = val;
+
+ return 0;
+}
+
+/**
+ * dev_pm_opp_set_supported_hw() - Set supported platforms
+ * @dev: Device for which supported-hw has to be set.
+ * @versions: Array of hierarchy of versions to match.
+ * @count: Number of elements in the array.
+ *
+ * This is required only for the V2 bindings, and it enables a platform to
+ * specify the hierarchy of versions it supports. The OPP layer will then
+ * enable only those OPPs which are available for these versions, based on
+ * each OPP's 'opp-supported-hw' property.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
+ unsigned int count)
+{
+ struct opp_table *opp_table;
+ int ret = 0;
+
+ /* Hold our table modification lock here */
+ mutex_lock(&opp_table_lock);
+
+ opp_table = _add_opp_table(dev);
+ if (!opp_table) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ /* Make sure there are no concurrent readers while updating opp_table */
+ WARN_ON(!list_empty(&opp_table->opp_list));
+
+ /* Do we already have a version hierarchy associated with opp_table? */
+ if (opp_table->supported_hw) {
+ dev_err(dev, "%s: Already have supported hardware list\n",
+ __func__);
+ ret = -EBUSY;
+ goto err;
+ }
+
+ opp_table->supported_hw = kmemdup(versions, count * sizeof(*versions),
+ GFP_KERNEL);
+ if (!opp_table->supported_hw) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ opp_table->supported_hw_count = count;
+ mutex_unlock(&opp_table_lock);
+ return 0;
+
+err:
+ _remove_opp_table(opp_table);
+unlock:
+ mutex_unlock(&opp_table_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_supported_hw);
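+
+/*
+ * Illustrative sketch of hypothetical platform code (assumes the usual
+ * kernel headers for BIT() and ARRAY_SIZE()): a two-level version hierarchy
+ * (say, chip cut and substrate revision) expressed as bitmasks, which each
+ * OPP's "opp-supported-hw" values are matched against.
+ */
+static int foo_init_supported_hw(struct device *dev)
+{
+	const u32 versions[] = { BIT(2), BIT(0) | BIT(1) };
+
+	return dev_pm_opp_set_supported_hw(dev, versions,
+					   ARRAY_SIZE(versions));
+}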
+
+/**
+ * dev_pm_opp_put_supported_hw() - Releases resources blocked for supported hw
+ * @dev: Device for which supported-hw has to be put.
+ *
+ * This is required only for the V2 bindings, and must be called to balance a
+ * matching dev_pm_opp_set_supported_hw() call. Until this is called, the
+ * opp_table structure will not be freed.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_put_supported_hw(struct device *dev)
+{
+ struct opp_table *opp_table;
+
+ /* Hold our table modification lock here */
+ mutex_lock(&opp_table_lock);
+
+ /* Check for existing table for 'dev' first */
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table)) {
+ dev_err(dev, "Failed to find opp_table: %ld\n",
+ PTR_ERR(opp_table));
+ goto unlock;
+ }
+
+ /* Make sure there are no concurrent readers while updating opp_table */
+ WARN_ON(!list_empty(&opp_table->opp_list));
+
+ if (!opp_table->supported_hw) {
+ dev_err(dev, "%s: Doesn't have supported hardware list\n",
+ __func__);
+ goto unlock;
+ }
+
+ kfree(opp_table->supported_hw);
+ opp_table->supported_hw = NULL;
+ opp_table->supported_hw_count = 0;
+
+ /* Try freeing opp_table if this was the last blocking resource */
+ _remove_opp_table(opp_table);
+
+unlock:
+ mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
+
+/**
+ * dev_pm_opp_set_prop_name() - Set prop-extn name
+ * @dev: Device for which the prop-name has to be set.
+ * @name: name to postfix to properties.
+ *
+ * This is required only for the V2 bindings, and it enables a platform to
+ * specify the extension to be used for certain property names. The properties
+ * to which the extension applies are opp-microvolt and opp-microamp. The OPP
+ * core will postfix the property name with -<name> while looking for them.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+{
+ struct opp_table *opp_table;
+ int ret = 0;
+
+ /* Hold our table modification lock here */
+ mutex_lock(&opp_table_lock);
+
+ opp_table = _add_opp_table(dev);
+ if (!opp_table) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ /* Make sure there are no concurrent readers while updating opp_table */
+ WARN_ON(!list_empty(&opp_table->opp_list));
+
+ /* Do we already have a prop-name associated with opp_table? */
+ if (opp_table->prop_name) {
+ dev_err(dev, "%s: Already have prop-name %s\n", __func__,
+ opp_table->prop_name);
+ ret = -EBUSY;
+ goto err;
+ }
+
+ opp_table->prop_name = kstrdup(name, GFP_KERNEL);
+ if (!opp_table->prop_name) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ mutex_unlock(&opp_table_lock);
+ return 0;
+
+err:
+ _remove_opp_table(opp_table);
+unlock:
+ mutex_unlock(&opp_table_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_prop_name);
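+
+/*
+ * Illustrative sketch of hypothetical platform code: after this call the OPP
+ * core looks for "opp-microvolt-slow" and "opp-microamp-slow" before falling
+ * back to the plain property names. The "slow" name is an assumed example.
+ */
+static int foo_init_prop_name(struct device *dev)
+{
+	return dev_pm_opp_set_prop_name(dev, "slow");
+}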
+
+/**
+ * dev_pm_opp_put_prop_name() - Releases resources blocked for prop-name
+ * @dev: Device for which the prop-name has to be put.
+ *
+ * This is required only for the V2 bindings, and must be called to balance a
+ * matching dev_pm_opp_set_prop_name() call. Until this is called, the
+ * opp_table structure will not be freed.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_put_prop_name(struct device *dev)
+{
+ struct opp_table *opp_table;
+
+ /* Hold our table modification lock here */
+ mutex_lock(&opp_table_lock);
+
+ /* Check for existing table for 'dev' first */
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table)) {
+ dev_err(dev, "Failed to find opp_table: %ld\n",
+ PTR_ERR(opp_table));
+ goto unlock;
+ }
+
+ /* Make sure there are no concurrent readers while updating opp_table */
+ WARN_ON(!list_empty(&opp_table->opp_list));
+
+ if (!opp_table->prop_name) {
+ dev_err(dev, "%s: Doesn't have a prop-name\n", __func__);
+ goto unlock;
+ }
+
+ kfree(opp_table->prop_name);
+ opp_table->prop_name = NULL;
+
+ /* Try freeing opp_table if this was the last blocking resource */
+ _remove_opp_table(opp_table);
+
+unlock:
+ mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
+
+/**
+ * dev_pm_opp_set_regulator() - Set regulator name for the device
+ * @dev: Device for which regulator name is being set.
+ * @name: Name of the regulator.
+ *
+ * In order to support OPP switching, the OPP layer needs to know the name of
+ * the device's regulator, as the core is required to switch voltages as well.
+ *
+ * This must be called before any OPPs are initialized for the device.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+int dev_pm_opp_set_regulator(struct device *dev, const char *name)
+{
+ struct opp_table *opp_table;
+ struct regulator *reg;
+ int ret;
+
+ mutex_lock(&opp_table_lock);
+
+ opp_table = _add_opp_table(dev);
+ if (!opp_table) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ /* This should be called before OPPs are initialized */
+ if (WARN_ON(!list_empty(&opp_table->opp_list))) {
+ ret = -EBUSY;
+ goto err;
+ }
+
+ /* Already have a regulator set */
+ if (WARN_ON(!IS_ERR(opp_table->regulator))) {
+ ret = -EBUSY;
+ goto err;
+ }
+ /* Allocate the regulator */
+ reg = regulator_get_optional(dev, name);
+ if (IS_ERR(reg)) {
+ ret = PTR_ERR(reg);
+ if (ret != -EPROBE_DEFER)
+ dev_err(dev, "%s: no regulator (%s) found: %d\n",
+ __func__, name, ret);
+ goto err;
+ }
+
+ opp_table->regulator = reg;
+
+ mutex_unlock(&opp_table_lock);
+ return 0;
+
+err:
+ _remove_opp_table(opp_table);
+unlock:
+ mutex_unlock(&opp_table_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator);
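+
+/*
+ * Illustrative sketch of a hypothetical driver probe path (assumes a DT
+ * OPP table and a supply named "vdd"): the regulator must be registered with
+ * the OPP core before any OPPs are added, so it is set up before the table
+ * and released again on failure.
+ */
+static int foo_opp_init(struct device *dev)
+{
+	int ret;
+
+	ret = dev_pm_opp_set_regulator(dev, "vdd");
+	if (ret)
+		return ret;
+
+	ret = dev_pm_opp_of_add_table(dev);
+	if (ret)
+		dev_pm_opp_put_regulator(dev);
+
+	return ret;
+}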
+
+/**
+ * dev_pm_opp_put_regulator() - Releases resources blocked for regulator
+ * @dev: Device for which regulator was set.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_put_regulator(struct device *dev)
+{
+ struct opp_table *opp_table;
+
+ mutex_lock(&opp_table_lock);
+
+ /* Check for existing table for 'dev' first */
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table)) {
+ dev_err(dev, "Failed to find opp_table: %ld\n",
+ PTR_ERR(opp_table));
+ goto unlock;
+ }
+
+ if (IS_ERR(opp_table->regulator)) {
+ dev_err(dev, "%s: Doesn't have regulator set\n", __func__);
+ goto unlock;
+ }
+
+ /* Make sure there are no concurrent readers while updating opp_table */
+ WARN_ON(!list_empty(&opp_table->opp_list));
+
+ regulator_put(opp_table->regulator);
+ opp_table->regulator = ERR_PTR(-ENXIO);
+
+ /* Try freeing opp_table if this was the last blocking resource */
+ _remove_opp_table(opp_table);
+
+unlock:
+ mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulator);
+
+static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table,
+ struct device_node *np)
+{
+ unsigned int count = opp_table->supported_hw_count;
+ u32 version;
+ int ret;
+
+ if (!opp_table->supported_hw)
+ return true;
+
+ while (count--) {
+ ret = of_property_read_u32_index(np, "opp-supported-hw", count,
+ &version);
+ if (ret) {
+ dev_warn(dev, "%s: failed to read opp-supported-hw property at index %d: %d\n",
+ __func__, count, ret);
+ return false;
+ }
+
+ /* Both of these are bitwise masks of the versions */
+ if (!(version & opp_table->supported_hw[count]))
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * _opp_add_static_v2() - Allocate static OPPs (As per 'v2' DT bindings)
+ * @dev: device for which we do this operation
+ * @np: device node
+ *
+ * This function adds an opp definition to the opp table and returns status. The
+ * opp can be controlled using dev_pm_opp_enable/disable functions and may be
+ * removed by dev_pm_opp_remove.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ *
+ * Return:
+ * 0		On success, OR
+ *		duplicate OPPs (both freq and volt are the same) and opp->available
+ * -EEXIST	Freq is the same but volt differs, OR
+ *		duplicate OPPs (both freq and volt are the same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ * -EINVAL	Failed parsing the OPP node
+ */
+static int _opp_add_static_v2(struct device *dev, struct device_node *np)
+{
+ struct opp_table *opp_table;
+ struct dev_pm_opp *new_opp;
+ u64 rate;
+ u32 val;
+ int ret;
+
+ /* Hold our table modification lock here */
+ mutex_lock(&opp_table_lock);
+
+ new_opp = _allocate_opp(dev, &opp_table);
+ if (!new_opp) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ ret = of_property_read_u64(np, "opp-hz", &rate);
+ if (ret < 0) {
+ dev_err(dev, "%s: opp-hz not found\n", __func__);
+ goto free_opp;
+ }
+
+ /* Check if the OPP supports hardware's hierarchy of versions or not */
+ if (!_opp_is_supported(dev, opp_table, np)) {
+ dev_dbg(dev, "OPP not supported by hardware: %llu\n", rate);
+ goto free_opp;
+ }
+
+ /*
+	 * Rate is defined as an unsigned long in the clk API, so cast
+	 * explicitly to that type. This must be fixed once rates are
+	 * guaranteed to be 64 bit in the clk API.
+ */
+ new_opp->rate = (unsigned long)rate;
+ new_opp->turbo = of_property_read_bool(np, "turbo-mode");
+
+ new_opp->np = np;
+ new_opp->dynamic = false;
+ new_opp->available = true;
+
+ if (!of_property_read_u32(np, "clock-latency-ns", &val))
+ new_opp->clock_latency_ns = val;
+
+ ret = opp_parse_supplies(new_opp, dev, opp_table);
+ if (ret)
+ goto free_opp;
+
+ ret = _opp_add(dev, new_opp, opp_table);
+ if (ret)
+ goto free_opp;
+
+ /* OPP to select on device suspend */
+ if (of_property_read_bool(np, "opp-suspend")) {
+ if (opp_table->suspend_opp) {
+ dev_warn(dev, "%s: Multiple suspend OPPs found (%lu %lu)\n",
+ __func__, opp_table->suspend_opp->rate,
+ new_opp->rate);
+ } else {
+ new_opp->suspend = true;
+ opp_table->suspend_opp = new_opp;
+ }
+ }
+
+ if (new_opp->clock_latency_ns > opp_table->clock_latency_ns_max)
+ opp_table->clock_latency_ns_max = new_opp->clock_latency_ns;
+
+ mutex_unlock(&opp_table_lock);
+
+ pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n",
+ __func__, new_opp->turbo, new_opp->rate, new_opp->u_volt,
+ new_opp->u_volt_min, new_opp->u_volt_max,
+ new_opp->clock_latency_ns);
+
+ /*
+ * Notify the changes in the availability of the operable
+ * frequency/voltage list.
+ */
+ srcu_notifier_call_chain(&opp_table->srcu_head, OPP_EVENT_ADD, new_opp);
+ return 0;
+
+free_opp:
+ _opp_remove(opp_table, new_opp, false);
+unlock:
+ mutex_unlock(&opp_table_lock);
+ return ret;
+}
+
+/**
+ * dev_pm_opp_add() - Add a new OPP to the device's OPP table
+ * @dev: device for which we do this operation
+ * @freq: Frequency in Hz for this OPP
+ * @u_volt: Voltage in uVolts for this OPP
+ *
+ * This function adds an opp definition to the opp table and returns status.
+ * The opp is made available by default and it can be controlled using
+ * dev_pm_opp_enable/disable functions.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ *
+ * Return:
+ * 0		On success, OR
+ *		duplicate OPPs (both freq and volt are the same) and opp->available
+ * -EEXIST	Freq is the same but volt differs, OR
+ *		duplicate OPPs (both freq and volt are the same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ */
+int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt)
+{
+ return _opp_add_v1(dev, freq, u_volt, true);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_add);
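+
+/*
+ * Illustrative sketch of hypothetical driver code: registering two dynamic
+ * OPPs by hand, with frequencies in Hz and voltages in uV. A duplicate
+ * frequency with a different voltage would fail with -EEXIST, as documented
+ * above.
+ */
+static int foo_register_opps(struct device *dev)
+{
+	int ret;
+
+	ret = dev_pm_opp_add(dev, 500000000, 900000);
+	if (ret)
+		return ret;
+
+	return dev_pm_opp_add(dev, 1000000000, 1100000);
+}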
+
+/**
+ * _opp_set_availability() - helper to set the availability of an opp
+ * @dev: device for which we do this operation
+ * @freq: OPP frequency to modify availability
+ * @availability_req: availability status requested for this opp
+ *
+ * Set the availability of an OPP with an RCU operation; opp_{enable,disable}
+ * share this common logic, which is isolated here.
+ *
+ * Return: -EINVAL for bad pointers, -ENOMEM if no memory is available for the
+ * copy operation, and 0 if no modification was needed or the modification was
+ * successful.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks to
+ * keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex locking or synchronize_rcu() blocking calls cannot be used.
+ */
+static int _opp_set_availability(struct device *dev, unsigned long freq,
+ bool availability_req)
+{
+ struct opp_table *opp_table;
+ struct dev_pm_opp *new_opp, *tmp_opp, *opp = ERR_PTR(-ENODEV);
+ int r = 0;
+
+ /* keep the node allocated */
+ new_opp = kmalloc(sizeof(*new_opp), GFP_KERNEL);
+ if (!new_opp)
+ return -ENOMEM;
+
+ mutex_lock(&opp_table_lock);
+
+ /* Find the opp_table */
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table)) {
+ r = PTR_ERR(opp_table);
+ dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r);
+ goto unlock;
+ }
+
+ /* Do we have the frequency? */
+ list_for_each_entry(tmp_opp, &opp_table->opp_list, node) {
+ if (tmp_opp->rate == freq) {
+ opp = tmp_opp;
+ break;
+ }
+ }
+ if (IS_ERR(opp)) {
+ r = PTR_ERR(opp);
+ goto unlock;
+ }
+
+ /* Is update really needed? */
+ if (opp->available == availability_req)
+ goto unlock;
+ /* copy the old data over */
+ *new_opp = *opp;
+
+ /* plug in new node */
+ new_opp->available = availability_req;
+
+ list_replace_rcu(&opp->node, &new_opp->node);
+ mutex_unlock(&opp_table_lock);
+ call_srcu(&opp_table->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu);
+
+ /* Notify the change of the OPP availability */
+ if (availability_req)
+ srcu_notifier_call_chain(&opp_table->srcu_head,
+ OPP_EVENT_ENABLE, new_opp);
+ else
+ srcu_notifier_call_chain(&opp_table->srcu_head,
+ OPP_EVENT_DISABLE, new_opp);
+
+ return 0;
+
+unlock:
+ mutex_unlock(&opp_table_lock);
+ kfree(new_opp);
+ return r;
+}
+
+/**
+ * dev_pm_opp_enable() - Enable a specific OPP
+ * @dev: device for which we do this operation
+ * @freq: OPP frequency to enable
+ *
+ * Enables a provided opp. If the operation is valid, this returns 0, else the
+ * corresponding error value. It is meant to let users make an OPP available
+ * again after it was temporarily made unavailable with dev_pm_opp_disable.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function indirectly uses RCU and mutex locks to keep the
+ * integrity of the internal data structures. Callers should ensure that
+ * this function is *NOT* called under RCU protection or in contexts where
+ * mutex locking or synchronize_rcu() blocking calls cannot be used.
+ *
+ * Return: -EINVAL for bad pointers, -ENOMEM if no memory is available for the
+ * copy operation, and 0 if no modification was needed or the modification was
+ * successful.
+ */
+int dev_pm_opp_enable(struct device *dev, unsigned long freq)
+{
+ return _opp_set_availability(dev, freq, true);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_enable);
+
+/**
+ * dev_pm_opp_disable() - Disable a specific OPP
+ * @dev: device for which we do this operation
+ * @freq: OPP frequency to disable
+ *
+ * Disables a provided opp. If the operation is valid, this returns 0, else
+ * the corresponding error value. It is meant as a temporary control for
+ * users to make an OPP unavailable until the circumstances are right to make
+ * it available again (with a call to dev_pm_opp_enable).
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function indirectly uses RCU and mutex locks to keep the
+ * integrity of the internal data structures. Callers should ensure that
+ * this function is *NOT* called under RCU protection or in contexts where
+ * mutex locking or synchronize_rcu() blocking calls cannot be used.
+ *
+ * Return: -EINVAL for bad pointers, -ENOMEM if no memory is available for the
+ * copy operation, and 0 if no modification was needed or the modification was
+ * successful.
+ */
+int dev_pm_opp_disable(struct device *dev, unsigned long freq)
+{
+ return _opp_set_availability(dev, freq, false);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_disable);
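+
+/*
+ * Illustrative sketch of a hypothetical thermal hook: temporarily masking
+ * the fastest OPP when hot and restoring it once conditions allow. The
+ * 1 GHz boost frequency is an assumed example.
+ */
+static void foo_thermal_limit(struct device *dev, bool hot)
+{
+	unsigned long boost_freq = 1000000000; /* assumed fastest OPP, in Hz */
+
+	if (hot)
+		dev_pm_opp_disable(dev, boost_freq);
+	else
+		dev_pm_opp_enable(dev, boost_freq);
+}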
+
+/**
+ * dev_pm_opp_get_notifier() - find notifier_head of the device with opp
+ * @dev: device pointer used to lookup OPP table.
+ *
+ * Return: pointer to notifier head if found, otherwise -ENODEV or -EINVAL
+ * cast as a pointer, depending on the type of error. The value must be
+ * checked with IS_ERR() to determine whether it is a valid pointer or an
+ * error.
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp_table is
+ * an RCU-protected pointer: the value returned remains valid only while the
+ * read-side lock is held, so it must be used prior to unlocking with
+ * rcu_read_unlock() to maintain its integrity.
+ */
+struct srcu_notifier_head *dev_pm_opp_get_notifier(struct device *dev)
+{
+ struct opp_table *opp_table = _find_opp_table(dev);
+
+ if (IS_ERR(opp_table))
+ return ERR_CAST(opp_table); /* matching type */
+
+ return &opp_table->srcu_head;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_notifier);
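+
+/*
+ * Illustrative sketch of a hypothetical client: looking up the notifier head
+ * under rcu_read_lock() and then registering a callback for OPP
+ * add/remove/enable/disable events. "nb" is an assumed, caller-initialized
+ * notifier_block.
+ */
+static int foo_register_opp_notifier(struct device *dev,
+				     struct notifier_block *nb)
+{
+	struct srcu_notifier_head *nh;
+
+	rcu_read_lock();
+	nh = dev_pm_opp_get_notifier(dev);
+	rcu_read_unlock();
+
+	if (IS_ERR(nh))
+		return PTR_ERR(nh);
+
+	return srcu_notifier_chain_register(nh, nb);
+}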
+
+#ifdef CONFIG_OF
+/**
+ * dev_pm_opp_of_remove_table() - Free OPP table entries created from static DT
+ * entries
+ * @dev: device pointer used to lookup OPP table.
+ *
+ * Free OPPs created using static entries present in DT.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function indirectly uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_of_remove_table(struct device *dev)
+{
+ struct opp_table *opp_table;
+ struct dev_pm_opp *opp, *tmp;
+
+ /* Hold our table modification lock here */
+ mutex_lock(&opp_table_lock);
+
+ /* Check for existing table for 'dev' */
+ opp_table = _find_opp_table(dev);
+ if (IS_ERR(opp_table)) {
+ int error = PTR_ERR(opp_table);
+
+ if (error != -ENODEV)
+ WARN(1, "%s: opp_table: %d\n",
+ IS_ERR_OR_NULL(dev) ?
+ "Invalid device" : dev_name(dev),
+ error);
+ goto unlock;
+ }
+
+ /* Find if opp_table manages a single device */
+ if (list_is_singular(&opp_table->dev_list)) {
+ /* Free static OPPs */
+ list_for_each_entry_safe(opp, tmp, &opp_table->opp_list, node) {
+ if (!opp->dynamic)
+ _opp_remove(opp_table, opp, true);
+ }
+ } else {
+ _remove_opp_dev(_find_opp_dev(dev, opp_table), opp_table);
+ }
+
+unlock:
+ mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table);
+
+/* Returns opp descriptor node for a device, caller must do of_node_put() */
+struct device_node *_of_get_opp_desc_node(struct device *dev)
+{
+ /*
+ * TODO: Support for multiple OPP tables.
+ *
+ * There should be only ONE phandle present in "operating-points-v2"
+ * property.
+ */
+
+ return of_parse_phandle(dev->of_node, "operating-points-v2", 0);
+}
+
+/* Initializes OPP tables based on new bindings */
+static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np)
+{
+ struct device_node *np;
+ struct opp_table *opp_table;
+ int ret = 0, count = 0;
+
+ mutex_lock(&opp_table_lock);
+
+ opp_table = _managed_opp(opp_np);
+ if (opp_table) {
+ /* OPPs are already managed */
+ if (!_add_opp_dev(dev, opp_table))
+ ret = -ENOMEM;
+ mutex_unlock(&opp_table_lock);
+ return ret;
+ }
+ mutex_unlock(&opp_table_lock);
+
+ /* We have opp-table node now, iterate over it and add OPPs */
+ for_each_available_child_of_node(opp_np, np) {
+ count++;
+
+ ret = _opp_add_static_v2(dev, np);
+ if (ret) {
+ dev_err(dev, "%s: Failed to add OPP, %d\n", __func__,
+ ret);
+ goto free_table;
+ }
+ }
+
+	/* There should be one or more OPPs defined */
+ if (WARN_ON(!count))
+ return -ENOENT;
+
+ mutex_lock(&opp_table_lock);
+
+ opp_table = _find_opp_table(dev);
+ if (WARN_ON(IS_ERR(opp_table))) {
+ ret = PTR_ERR(opp_table);
+ mutex_unlock(&opp_table_lock);
+ goto free_table;
+ }
+
+ opp_table->np = opp_np;
+ opp_table->shared_opp = of_property_read_bool(opp_np, "opp-shared");
+
+ mutex_unlock(&opp_table_lock);
+
+ return 0;
+
+free_table:
+ dev_pm_opp_of_remove_table(dev);
+
+ return ret;
+}
+
+/* Initializes OPP tables based on old-deprecated bindings */
+static int _of_add_opp_table_v1(struct device *dev)
+{
+ const struct property *prop;
+ const __be32 *val;
+ int nr;
+
+ prop = of_find_property(dev->of_node, "operating-points", NULL);
+ if (!prop)
+ return -ENODEV;
+ if (!prop->value)
+ return -ENODATA;
+
+ /*
+	 * Each OPP is a tuple consisting of a frequency and a voltage,
+	 * like <freq-kHz volt-uV>.
+ */
+ nr = prop->length / sizeof(u32);
+ if (nr % 2) {
+ dev_err(dev, "%s: Invalid OPP table\n", __func__);
+ return -EINVAL;
+ }
+
+ val = prop->value;
+ while (nr) {
+ unsigned long freq = be32_to_cpup(val++) * 1000;
+ unsigned long volt = be32_to_cpup(val++);
+
+ if (_opp_add_v1(dev, freq, volt, false))
+ dev_warn(dev, "%s: Failed to add OPP %ld\n",
+ __func__, freq);
+ nr -= 2;
+ }
+
+ return 0;
+}
+
+/**
+ * dev_pm_opp_of_add_table() - Initialize opp table from device tree
+ * @dev: device pointer used to lookup OPP table.
+ *
+ * Register the initial OPP table with the OPP library for given device.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function indirectly uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ *
+ * Return:
+ * 0		On success, OR
+ *		duplicate OPPs (both freq and volt are the same) and opp->available
+ * -EEXIST	Freq is the same but volt differs, OR
+ *		duplicate OPPs (both freq and volt are the same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ * -ENODEV	when the 'operating-points' property is not found, or contains
+ *		invalid data, in the device node
+ * -ENODATA	when an empty 'operating-points' property is found
+ * -EINVAL	when invalid entries are found in the opp-v2 table
+ */
+int dev_pm_opp_of_add_table(struct device *dev)
+{
+ struct device_node *opp_np;
+ int ret;
+
+ /*
+	 * OPPs have two versions of bindings now. The older one is
+	 * deprecated, so try the new binding first.
+ */
+ opp_np = _of_get_opp_desc_node(dev);
+ if (!opp_np) {
+ /*
+ * Try old-deprecated bindings for backward compatibility with
+ * older dtbs.
+ */
+ return _of_add_opp_table_v1(dev);
+ }
+
+ ret = _of_add_opp_table_v2(dev, opp_np);
+ of_node_put(opp_np);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table);
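+
+/*
+ * Illustrative sketch of a hypothetical driver: the DT table is added at
+ * probe time and must be balanced with dev_pm_opp_of_remove_table() on the
+ * error path and in the driver's remove callback.
+ */
+static int foo_probe_opps(struct device *dev)
+{
+	int ret = dev_pm_opp_of_add_table(dev);
+
+	if (ret) {
+		dev_err(dev, "failed to init OPP table: %d\n", ret);
+		return ret;
+	}
+
+	/* ... remaining probe steps ... */
+	return 0;
+}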
+#endif
diff --git a/drivers/base/power/opp/cpu.c b/drivers/base/power/opp/cpu.c
new file mode 100644
index 000000000000..ba2bdbd932ef
--- /dev/null
+++ b/drivers/base/power/opp/cpu.c
@@ -0,0 +1,271 @@
+/*
+ * Generic OPP helper interface for CPU device
+ *
+ * Copyright (C) 2009-2014 Texas Instruments Incorporated.
+ * Nishanth Menon
+ * Romit Dasgupta
+ * Kevin Hilman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/cpufreq.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+
+#include "opp.h"
+
+#ifdef CONFIG_CPU_FREQ
+
+/**
+ * dev_pm_opp_init_cpufreq_table() - create a cpufreq table for a device
+ * @dev: device for which we do this operation
+ * @table: Cpufreq table returned back to caller
+ *
+ * Generate a cpufreq table for a provided device - this assumes that the
+ * opp table is already initialized and ready for usage.
+ *
+ * This function allocates required memory for the cpufreq table. It is
+ * expected that the caller does the required maintenance such as freeing
+ * the table as required.
+ *
+ * Returns -EINVAL for bad pointers, -ENODEV if the device is not found,
+ * -ENOMEM if no memory is available for the operation (the table is not
+ * populated), and 0 if successful, in which case the table is populated.
+ *
+ * WARNING: It is important for the callers to ensure they refresh their copy
+ * of the table if OPPs are added, removed, enabled or disabled in the interim.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Since we just use the regular accessor functions to access the internal
+ * data structures, we use the RCU read lock inside this function. As a
+ * result, users of this function DO NOT need to take explicit locks when
+ * invoking it.
+ */
+int dev_pm_opp_init_cpufreq_table(struct device *dev,
+ struct cpufreq_frequency_table **table)
+{
+ struct dev_pm_opp *opp;
+ struct cpufreq_frequency_table *freq_table = NULL;
+ int i, max_opps, ret = 0;
+ unsigned long rate;
+
+ rcu_read_lock();
+
+ max_opps = dev_pm_opp_get_opp_count(dev);
+ if (max_opps <= 0) {
+ ret = max_opps ? max_opps : -ENODATA;
+ goto out;
+ }
+
+ freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_ATOMIC);
+ if (!freq_table) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0, rate = 0; i < max_opps; i++, rate++) {
+ /* find next rate */
+ opp = dev_pm_opp_find_freq_ceil(dev, &rate);
+ if (IS_ERR(opp)) {
+ ret = PTR_ERR(opp);
+ goto out;
+ }
+ freq_table[i].driver_data = i;
+ freq_table[i].frequency = rate / 1000;
+
+		/* Is this a boost/turbo OPP? */
+ if (dev_pm_opp_is_turbo(opp))
+ freq_table[i].flags = CPUFREQ_BOOST_FREQ;
+ }
+
+ freq_table[i].driver_data = i;
+ freq_table[i].frequency = CPUFREQ_TABLE_END;
+
+ *table = &freq_table[0];
+
+out:
+ rcu_read_unlock();
+ if (ret)
+ kfree(freq_table);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_init_cpufreq_table);
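+
+/*
+ * Illustrative sketch of a hypothetical cpufreq driver's ->init() helper
+ * (assumes the cpufreq-table API of this era): the generated table is handed
+ * to the cpufreq core and must be freed with dev_pm_opp_free_cpufreq_table()
+ * when the policy goes away.
+ */
+static int foo_cpufreq_init(struct cpufreq_policy *policy,
+			    struct device *cpu_dev)
+{
+	struct cpufreq_frequency_table *freq_table;
+	int ret;
+
+	ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
+	if (ret)
+		return ret;
+
+	ret = cpufreq_table_validate_and_show(policy, freq_table);
+	if (ret)
+		dev_pm_opp_free_cpufreq_table(cpu_dev, &freq_table);
+
+	return ret;
+}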
+
+/**
+ * dev_pm_opp_free_cpufreq_table() - free the cpufreq table
+ * @dev: device for which we do this operation
+ * @table: table to free
+ *
+ * Free up the table allocated by dev_pm_opp_init_cpufreq_table
+ */
+void dev_pm_opp_free_cpufreq_table(struct device *dev,
+ struct cpufreq_frequency_table **table)
+{
+ if (!table)
+ return;
+
+ kfree(*table);
+ *table = NULL;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_free_cpufreq_table);
+#endif /* CONFIG_CPU_FREQ */
+
+/* Required only for V1 bindings, as v2 can manage it from DT itself */
+int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, cpumask_var_t cpumask)
+{
+ struct opp_device *opp_dev;
+ struct opp_table *opp_table;
+ struct device *dev;
+ int cpu, ret = 0;
+
+ mutex_lock(&opp_table_lock);
+
+ opp_table = _find_opp_table(cpu_dev);
+ if (IS_ERR(opp_table)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ for_each_cpu(cpu, cpumask) {
+ if (cpu == cpu_dev->id)
+ continue;
+
+ dev = get_cpu_device(cpu);
+ if (!dev) {
+ dev_err(cpu_dev, "%s: failed to get cpu%d device\n",
+ __func__, cpu);
+ continue;
+ }
+
+ opp_dev = _add_opp_dev(dev, opp_table);
+ if (!opp_dev) {
+ dev_err(dev, "%s: failed to add opp-dev for cpu%d device\n",
+ __func__, cpu);
+ continue;
+ }
+ }
+unlock:
+ mutex_unlock(&opp_table_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_sharing_cpus);
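+
+/*
+ * Illustrative sketch of a hypothetical cpufreq driver (v1 bindings): after
+ * populating the policy CPU's OPP table, mark the other CPUs in the policy
+ * as sharing it. "cpus" is an assumed, caller-populated mask.
+ */
+static int foo_share_opps(struct device *cpu_dev, struct cpumask *cpus)
+{
+	int ret = dev_pm_opp_set_sharing_cpus(cpu_dev, cpus);
+
+	if (ret)
+		dev_err(cpu_dev, "failed to mark OPPs shared: %d\n", ret);
+
+	return ret;
+}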
+
+#ifdef CONFIG_OF
+void dev_pm_opp_of_cpumask_remove_table(cpumask_var_t cpumask)
+{
+ struct device *cpu_dev;
+ int cpu;
+
+ WARN_ON(cpumask_empty(cpumask));
+
+ for_each_cpu(cpu, cpumask) {
+ cpu_dev = get_cpu_device(cpu);
+ if (!cpu_dev) {
+ pr_err("%s: failed to get cpu%d device\n", __func__,
+ cpu);
+ continue;
+ }
+
+ dev_pm_opp_of_remove_table(cpu_dev);
+ }
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_remove_table);
+
+int dev_pm_opp_of_cpumask_add_table(cpumask_var_t cpumask)
+{
+ struct device *cpu_dev;
+ int cpu, ret = 0;
+
+ WARN_ON(cpumask_empty(cpumask));
+
+ for_each_cpu(cpu, cpumask) {
+ cpu_dev = get_cpu_device(cpu);
+ if (!cpu_dev) {
+ pr_err("%s: failed to get cpu%d device\n", __func__,
+ cpu);
+ continue;
+ }
+
+ ret = dev_pm_opp_of_add_table(cpu_dev);
+ if (ret) {
+ pr_err("%s: couldn't find opp table for cpu:%d, %d\n",
+ __func__, cpu, ret);
+
+ /* Free all other OPPs */
+ dev_pm_opp_of_cpumask_remove_table(cpumask);
+ break;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_add_table);
+
+/*
+ * Works only for OPP v2 bindings.
+ *
+ * Returns -ENOENT if operating-points-v2 bindings aren't supported.
+ */
+int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, cpumask_var_t cpumask)
+{
+ struct device_node *np, *tmp_np;
+ struct device *tcpu_dev;
+ int cpu, ret = 0;
+
+ /* Get OPP descriptor node */
+ np = _of_get_opp_desc_node(cpu_dev);
+ if (!np) {
+ dev_dbg(cpu_dev, "%s: Couldn't find cpu_dev node.\n", __func__);
+ return -ENOENT;
+ }
+
+ cpumask_set_cpu(cpu_dev->id, cpumask);
+
+	/* Are the OPPs shared? */
+ if (!of_property_read_bool(np, "opp-shared"))
+ goto put_cpu_node;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu == cpu_dev->id)
+ continue;
+
+ tcpu_dev = get_cpu_device(cpu);
+ if (!tcpu_dev) {
+ dev_err(cpu_dev, "%s: failed to get cpu%d device\n",
+ __func__, cpu);
+ ret = -ENODEV;
+ goto put_cpu_node;
+ }
+
+ /* Get OPP descriptor node */
+ tmp_np = _of_get_opp_desc_node(tcpu_dev);
+ if (!tmp_np) {
+ dev_err(tcpu_dev, "%s: Couldn't find tcpu_dev node.\n",
+ __func__);
+ ret = -ENOENT;
+ goto put_cpu_node;
+ }
+
+ /* CPUs are sharing opp node */
+ if (np == tmp_np)
+ cpumask_set_cpu(cpu, cpumask);
+
+ of_node_put(tmp_np);
+ }
+
+put_cpu_node:
+ of_node_put(np);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_sharing_cpus);
+#endif
diff --git a/drivers/base/power/opp/debugfs.c b/drivers/base/power/opp/debugfs.c
new file mode 100644
index 000000000000..ef1ae6b52042
--- /dev/null
+++ b/drivers/base/power/opp/debugfs.c
@@ -0,0 +1,218 @@
+/*
+ * Generic OPP debugfs interface
+ *
+ * Copyright (C) 2015-2016 Viresh Kumar <viresh.kumar@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/limits.h>
+
+#include "opp.h"
+
+static struct dentry *rootdir;
+
+static void opp_set_dev_name(const struct device *dev, char *name)
+{
+ if (dev->parent)
+ snprintf(name, NAME_MAX, "%s-%s", dev_name(dev->parent),
+ dev_name(dev));
+ else
+ snprintf(name, NAME_MAX, "%s", dev_name(dev));
+}
+
+void opp_debug_remove_one(struct dev_pm_opp *opp)
+{
+ debugfs_remove_recursive(opp->dentry);
+}
+
+int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
+{
+ struct dentry *pdentry = opp_table->dentry;
+ struct dentry *d;
+ char name[25]; /* 20 chars for 64 bit value + 5 (opp:\0) */
+
+	/* Rate is unique to each OPP, use it to name the per-opp directory */
+ snprintf(name, sizeof(name), "opp:%lu", opp->rate);
+
+ /* Create per-opp directory */
+ d = debugfs_create_dir(name, pdentry);
+ if (!d)
+ return -ENOMEM;
+
+ if (!debugfs_create_bool("available", S_IRUGO, d, &opp->available))
+ return -ENOMEM;
+
+ if (!debugfs_create_bool("dynamic", S_IRUGO, d, &opp->dynamic))
+ return -ENOMEM;
+
+ if (!debugfs_create_bool("turbo", S_IRUGO, d, &opp->turbo))
+ return -ENOMEM;
+
+ if (!debugfs_create_bool("suspend", S_IRUGO, d, &opp->suspend))
+ return -ENOMEM;
+
+ if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate))
+ return -ENOMEM;
+
+ if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d, &opp->u_volt))
+ return -ENOMEM;
+
+ if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d, &opp->u_volt_min))
+ return -ENOMEM;
+
+ if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d, &opp->u_volt_max))
+ return -ENOMEM;
+
+ if (!debugfs_create_ulong("u_amp", S_IRUGO, d, &opp->u_amp))
+ return -ENOMEM;
+
+ if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
+ &opp->clock_latency_ns))
+ return -ENOMEM;
+
+ opp->dentry = d;
+ return 0;
+}
+
+static int opp_list_debug_create_dir(struct opp_device *opp_dev,
+ struct opp_table *opp_table)
+{
+ const struct device *dev = opp_dev->dev;
+ struct dentry *d;
+
+ opp_set_dev_name(dev, opp_table->dentry_name);
+
+ /* Create device specific directory */
+ d = debugfs_create_dir(opp_table->dentry_name, rootdir);
+ if (!d) {
+ dev_err(dev, "%s: Failed to create debugfs dir\n", __func__);
+ return -ENOMEM;
+ }
+
+ opp_dev->dentry = d;
+ opp_table->dentry = d;
+
+ return 0;
+}
+
+static int opp_list_debug_create_link(struct opp_device *opp_dev,
+ struct opp_table *opp_table)
+{
+ const struct device *dev = opp_dev->dev;
+ char name[NAME_MAX];
+ struct dentry *d;
+
+ opp_set_dev_name(opp_dev->dev, name);
+
+ /* Create device specific directory link */
+ d = debugfs_create_symlink(name, rootdir, opp_table->dentry_name);
+ if (!d) {
+ dev_err(dev, "%s: Failed to create link\n", __func__);
+ return -ENOMEM;
+ }
+
+ opp_dev->dentry = d;
+
+ return 0;
+}
+
+/**
+ * opp_debug_register - add a device opp node to the debugfs 'opp' directory
+ * @opp_dev: opp-dev pointer for device
+ * @opp_table: the device-opp being added
+ *
+ * Dynamically adds a device-specific directory to the debugfs 'opp'
+ * directory. If the device-opp is shared with other devices, then links will
+ * be created for all devices except the first.
+ *
+ * Return: 0 on success, otherwise negative error.
+ */
+int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table)
+{
+ if (!rootdir) {
+ pr_debug("%s: Uninitialized rootdir\n", __func__);
+ return -EINVAL;
+ }
+
+ if (opp_table->dentry)
+ return opp_list_debug_create_link(opp_dev, opp_table);
+
+ return opp_list_debug_create_dir(opp_dev, opp_table);
+}
+
+static void opp_migrate_dentry(struct opp_device *opp_dev,
+ struct opp_table *opp_table)
+{
+ struct opp_device *new_dev;
+ const struct device *dev;
+ struct dentry *dentry;
+
+ /* Look for next opp-dev */
+ list_for_each_entry(new_dev, &opp_table->dev_list, node)
+ if (new_dev != opp_dev)
+ break;
+
+ /* new_dev is guaranteed to be valid here */
+ dev = new_dev->dev;
+ debugfs_remove_recursive(new_dev->dentry);
+
+ opp_set_dev_name(dev, opp_table->dentry_name);
+
+ dentry = debugfs_rename(rootdir, opp_dev->dentry, rootdir,
+ opp_table->dentry_name);
+ if (!dentry) {
+ dev_err(dev, "%s: Failed to rename link from: %s to %s\n",
+ __func__, dev_name(opp_dev->dev), dev_name(dev));
+ return;
+ }
+
+ new_dev->dentry = dentry;
+ opp_table->dentry = dentry;
+}
+
+/**
+ * opp_debug_unregister - remove a device opp node from debugfs opp directory
+ * @opp_dev: opp-dev pointer for device
+ * @opp_table: the device-opp being removed
+ *
+ * Dynamically removes the device-specific directory from the debugfs 'opp' directory.
+ */
+void opp_debug_unregister(struct opp_device *opp_dev,
+ struct opp_table *opp_table)
+{
+ if (opp_dev->dentry == opp_table->dentry) {
+ /* Move the real dentry object under another device */
+ if (!list_is_singular(&opp_table->dev_list)) {
+ opp_migrate_dentry(opp_dev, opp_table);
+ goto out;
+ }
+ opp_table->dentry = NULL;
+ }
+
+ debugfs_remove_recursive(opp_dev->dentry);
+
+out:
+ opp_dev->dentry = NULL;
+}
+
+static int __init opp_debug_init(void)
+{
+ /* Create /sys/kernel/debug/opp directory */
+ rootdir = debugfs_create_dir("opp", NULL);
+ if (!rootdir) {
+ pr_err("%s: Failed to create root directory\n", __func__);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+core_initcall(opp_debug_init);
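
For reference, the hierarchy created above can be read back from userspace once
debugfs is mounted. The following is a minimal illustrative sketch, not part of
the patch; it assumes debugfs is mounted at /sys/kernel/debug and uses only
standard POSIX calls:

#include <dirent.h>
#include <stdio.h>
#include <string.h>

/* Dump "rate_hz" for every "opp:<rate>" directory under one device dir */
static void dump_opps(const char *dev_dir)
{
	char path[512], fname[1024];
	struct dirent *de;
	DIR *dir;

	snprintf(path, sizeof(path), "/sys/kernel/debug/opp/%s", dev_dir);
	dir = opendir(path);
	if (!dir)
		return;

	while ((de = readdir(dir)) != NULL) {
		unsigned long rate;
		FILE *f;

		if (strncmp(de->d_name, "opp:", 4))
			continue;
		snprintf(fname, sizeof(fname), "%s/%s/rate_hz",
			 path, de->d_name);
		f = fopen(fname, "r");
		if (f) {
			if (fscanf(f, "%lu", &rate) == 1)
				printf("  %s: %lu Hz\n", de->d_name, rate);
			fclose(f);
		}
	}
	closedir(dir);
}

int main(void)
{
	DIR *dir = opendir("/sys/kernel/debug/opp");
	struct dirent *de;

	if (!dir) {
		perror("opendir /sys/kernel/debug/opp");
		return 1;
	}
	while ((de = readdir(dir)) != NULL) {
		if (de->d_name[0] == '.')
			continue;
		printf("%s\n", de->d_name);
		dump_opps(de->d_name);
	}
	closedir(dir);
	return 0;
}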
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
new file mode 100644
index 000000000000..f67f806fcf3a
--- /dev/null
+++ b/drivers/base/power/opp/opp.h
@@ -0,0 +1,207 @@
+/*
+ * Generic OPP Interface
+ *
+ * Copyright (C) 2009-2010 Texas Instruments Incorporated.
+ * Nishanth Menon
+ * Romit Dasgupta
+ * Kevin Hilman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __DRIVER_OPP_H__
+#define __DRIVER_OPP_H__
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/limits.h>
+#include <linux/pm_opp.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+
+struct clk;
+struct regulator;
+
+/* Lock to allow exclusive modification to the device and opp lists */
+extern struct mutex opp_table_lock;
+
+/*
+ * Internal data structure organization with the OPP layer library is as
+ * follows:
+ * opp_tables (root)
+ * |- device 1 (represents voltage domain 1)
+ * | |- opp 1 (availability, freq, voltage)
+ * | |- opp 2 ..
+ * ... ...
+ * | `- opp n ..
+ * |- device 2 (represents the next voltage domain)
+ * ...
+ * `- device m (represents mth voltage domain)
+ * device 1, 2.. are represented by opp_table structure while each opp
+ * is represented by the opp structure.
+ */
+
+/**
+ * struct dev_pm_opp - Generic OPP description structure
+ * @node: opp table node. The nodes are maintained throughout the lifetime
+ * of boot. It is expected that only an optimal set of OPPs is
+ * added to the library by the SoC framework.
+ * RCU usage: the opp table is traversed with RCU locks. Node
+ * modification is possible at runtime, hence the modifications
+ * are protected by opp_table_lock for integrity.
+ * IMPORTANT: the opp nodes should be maintained in increasing
+ * order.
+ * @available: true/false - marks whether this OPP is available or not
+ * @dynamic: true if the OPP was added dynamically rather than created
+ * from static DT entries.
+ * @turbo: true if turbo (boost) OPP
+ * @suspend: true if suspend OPP
+ * @rate: Frequency in hertz
+ * @u_volt: Target voltage in microvolts corresponding to this OPP
+ * @u_volt_min: Minimum voltage in microvolts corresponding to this OPP
+ * @u_volt_max: Maximum voltage in microvolts corresponding to this OPP
+ * @u_amp: Maximum current drawn by the device in microamperes
+ * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's
+ * frequency from any other OPP's frequency.
+ * @opp_table: points back to the opp_table struct this opp belongs to
+ * @rcu_head: RCU callback head used for deferred freeing
+ * @np: OPP's device node.
+ * @dentry: debugfs dentry pointer (per opp)
+ *
+ * This structure stores the OPP information for a given device.
+ */
+struct dev_pm_opp {
+ struct list_head node;
+
+ bool available;
+ bool dynamic;
+ bool turbo;
+ bool suspend;
+ unsigned long rate;
+
+ unsigned long u_volt;
+ unsigned long u_volt_min;
+ unsigned long u_volt_max;
+ unsigned long u_amp;
+ unsigned long clock_latency_ns;
+
+ struct opp_table *opp_table;
+ struct rcu_head rcu_head;
+
+ struct device_node *np;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *dentry;
+#endif
+};
+
+/**
+ * struct opp_device - devices managed by 'struct opp_table'
+ * @node: list node
+ * @dev: device to which the struct object belongs
+ * @rcu_head: RCU callback head used for deferred freeing
+ * @dentry: debugfs dentry pointer (per device)
+ *
+ * This is an internal data structure maintaining the devices that are managed
+ * by 'struct opp_table'.
+ */
+struct opp_device {
+ struct list_head node;
+ const struct device *dev;
+ struct rcu_head rcu_head;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *dentry;
+#endif
+};
+
+/**
+ * struct opp_table - Device opp structure
+ * @node: table node - contains the devices with OPPs that
+ * have been registered. Once added, nodes in this table are
+ * not modified.
+ * RCU usage: nodes are not modified in the opp_table list;
+ * however, addition is possible and is secured by opp_table_lock
+ * @srcu_head: notifier head to notify the OPP availability changes.
+ * @rcu_head: RCU callback head used for deferred freeing
+ * @dev_list: list of devices that share these OPPs
+ * @opp_list: table of opps
+ * @np: struct device_node pointer for opp's DT node.
+ * @clock_latency_ns_max: Max clock latency in nanoseconds.
+ * @shared_opp: OPP is shared between multiple devices.
+ * @suspend_opp: Pointer to OPP to be used during device suspend.
+ * @supported_hw: Array of hardware version numbers to support.
+ * @supported_hw_count: Number of elements in the supported_hw array.
+ * @prop_name: A name suffixed to many DT property names while parsing them.
+ * @clk: Device's clock handle
+ * @regulator: Supply regulator
+ * @dentry: debugfs dentry pointer of the real device directory (not links).
+ * @dentry_name: Name of the real dentry.
+ *
+ * @voltage_tolerance_v1: In percentage, for v1 bindings only.
+ *
+ * This is an internal data structure maintaining the link to opps attached to
+ * a device. This structure is not meant to be shared with users as it is
+ * private to the OPP library and used only for bookkeeping.
+ *
+ * Because the opp structures can be used from both rcu and srcu readers, we
+ * need to wait for the grace period of both of them before freeing any
+ * resources. And so we have used kfree_rcu() from within call_srcu() handlers.
+ */
+struct opp_table {
+ struct list_head node;
+
+ struct srcu_notifier_head srcu_head;
+ struct rcu_head rcu_head;
+ struct list_head dev_list;
+ struct list_head opp_list;
+
+ struct device_node *np;
+ unsigned long clock_latency_ns_max;
+
+ /* For backward compatibility with v1 bindings */
+ unsigned int voltage_tolerance_v1;
+
+ bool shared_opp;
+ struct dev_pm_opp *suspend_opp;
+
+ unsigned int *supported_hw;
+ unsigned int supported_hw_count;
+ const char *prop_name;
+ struct clk *clk;
+ struct regulator *regulator;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *dentry;
+ char dentry_name[NAME_MAX];
+#endif
+};
+
+/* Routines internal to opp core */
+struct opp_table *_find_opp_table(struct device *dev);
+struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table);
+struct device_node *_of_get_opp_desc_node(struct device *dev);
+
+#ifdef CONFIG_DEBUG_FS
+void opp_debug_remove_one(struct dev_pm_opp *opp);
+int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table);
+int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table);
+void opp_debug_unregister(struct opp_device *opp_dev, struct opp_table *opp_table);
+#else
+static inline void opp_debug_remove_one(struct dev_pm_opp *opp) {}
+
+static inline int opp_debug_create_one(struct dev_pm_opp *opp,
+ struct opp_table *opp_table)
+{ return 0; }
+static inline int opp_debug_register(struct opp_device *opp_dev,
+ struct opp_table *opp_table)
+{ return 0; }
+
+static inline void opp_debug_unregister(struct opp_device *opp_dev,
+ struct opp_table *opp_table)
+{ }
+#endif /* DEBUG_FS */
+
+#endif /* __DRIVER_OPP_H__ */
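
These structures are private to the OPP core; consumers go through the public
dev_pm_opp_* accessors under rcu_read_lock(), exactly as the cpufreq-dt rework
later in this series does. A minimal sketch of that access pattern (the helper
name is hypothetical, error paths kept short):

#include <linux/err.h>
#include <linux/pm_opp.h>
#include <linux/rcupdate.h>

/* Hypothetical consumer: look up the voltage needed for a target rate.
 * The opp pointer is only valid inside the RCU read-side section, so
 * everything we need is copied out before unlocking. */
static int example_volt_for_rate(struct device *dev, unsigned long *rate,
				 unsigned long *volt)
{
	struct dev_pm_opp *opp;

	rcu_read_lock();
	opp = dev_pm_opp_find_freq_ceil(dev, rate);
	if (IS_ERR(opp)) {
		rcu_read_unlock();
		return PTR_ERR(opp);
	}
	*volt = dev_pm_opp_get_voltage(opp);
	rcu_read_unlock();

	return 0;
}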
diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index a13587b5c2be..03558c1d9914 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -122,16 +122,16 @@ struct regmap {
unsigned int num_reg_defaults_raw;
/* if set, only the cache is modified not the HW */
- u32 cache_only;
+ bool cache_only;
/* if set, only the HW is modified not the cache */
- u32 cache_bypass;
+ bool cache_bypass;
/* if set, remember to free reg_defaults_raw */
bool cache_free;
struct reg_default *reg_defaults;
const void *reg_defaults_raw;
void *cache;
- u32 cache_dirty;
+ bool cache_dirty;
struct reg_default *patch;
int patch_regs;
diff --git a/drivers/base/regmap/regcache-lzo.c b/drivers/base/regmap/regcache-lzo.c
index 2d53f6f138e1..736e0d378567 100644
--- a/drivers/base/regmap/regcache-lzo.c
+++ b/drivers/base/regmap/regcache-lzo.c
@@ -355,9 +355,9 @@ static int regcache_lzo_sync(struct regmap *map, unsigned int min,
if (ret > 0 && val == map->reg_defaults[ret].def)
continue;
- map->cache_bypass = 1;
+ map->cache_bypass = true;
ret = _regmap_write(map, i, val);
- map->cache_bypass = 0;
+ map->cache_bypass = false;
if (ret)
return ret;
dev_dbg(map->dev, "Synced register %#x, value %#x\n",
diff --git a/drivers/base/regmap/regcache.c b/drivers/base/regmap/regcache.c
index 7eb7b3b98794..d58d74dc372c 100644
--- a/drivers/base/regmap/regcache.c
+++ b/drivers/base/regmap/regcache.c
@@ -54,11 +54,11 @@ static int regcache_hw_init(struct regmap *map)
return -ENOMEM;
if (!map->reg_defaults_raw) {
- u32 cache_bypass = map->cache_bypass;
+ bool cache_bypass = map->cache_bypass;
dev_warn(map->dev, "No cache defaults, reading back from HW\n");
/* Bypass the cache access till data read from HW*/
- map->cache_bypass = 1;
+ map->cache_bypass = true;
tmp_buf = kmalloc(map->cache_size_raw, GFP_KERNEL);
if (!tmp_buf) {
ret = -ENOMEM;
@@ -271,9 +271,9 @@ static int regcache_default_sync(struct regmap *map, unsigned int min,
if (ret >= 0 && val == map->reg_defaults[ret].def)
continue;
- map->cache_bypass = 1;
+ map->cache_bypass = true;
ret = _regmap_write(map, reg, val);
- map->cache_bypass = 0;
+ map->cache_bypass = false;
if (ret) {
dev_err(map->dev, "Unable to sync register %#x. %d\n",
reg, ret);
@@ -301,7 +301,7 @@ int regcache_sync(struct regmap *map)
int ret = 0;
unsigned int i;
const char *name;
- unsigned int bypass;
+ bool bypass;
BUG_ON(!map->cache_ops);
@@ -319,7 +319,7 @@ int regcache_sync(struct regmap *map)
map->async = true;
/* Apply any patch first */
- map->cache_bypass = 1;
+ map->cache_bypass = true;
for (i = 0; i < map->patch_regs; i++) {
ret = _regmap_write(map, map->patch[i].reg, map->patch[i].def);
if (ret != 0) {
@@ -328,7 +328,7 @@ int regcache_sync(struct regmap *map)
goto out;
}
}
- map->cache_bypass = 0;
+ map->cache_bypass = false;
if (map->cache_ops->sync)
ret = map->cache_ops->sync(map, 0, map->max_register);
@@ -369,7 +369,7 @@ int regcache_sync_region(struct regmap *map, unsigned int min,
{
int ret = 0;
const char *name;
- unsigned int bypass;
+ bool bypass;
BUG_ON(!map->cache_ops);
@@ -619,11 +619,11 @@ static int regcache_sync_block_single(struct regmap *map, void *block,
if (ret >= 0 && val == map->reg_defaults[ret].def)
continue;
- map->cache_bypass = 1;
+ map->cache_bypass = true;
ret = _regmap_write(map, regtmp, val);
- map->cache_bypass = 0;
+ map->cache_bypass = false;
if (ret != 0) {
dev_err(map->dev, "Unable to sync register %#x. %d\n",
regtmp, ret);
@@ -650,14 +650,14 @@ static int regcache_sync_block_raw_flush(struct regmap *map, const void **data,
dev_dbg(map->dev, "Writing %zu bytes for %d registers from 0x%x-0x%x\n",
count * val_bytes, count, base, cur - map->reg_stride);
- map->cache_bypass = 1;
+ map->cache_bypass = true;
ret = _regmap_raw_write(map, base, *data, count * val_bytes);
if (ret)
dev_err(map->dev, "Unable to sync registers %#x-%#x. %d\n",
base, cur - map->reg_stride, ret);
- map->cache_bypass = 0;
+ map->cache_bypass = false;
*data = NULL;
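
The u32-to-bool conversion above is purely cosmetic; the underlying idiom is
unchanged: bypass the cache around a raw hardware write so that syncing a
register does not re-dirty the cache. Condensed, the pattern repeated
throughout regcache.c looks like this (a sketch using regmap-internal symbols,
not a public API):

/* Sketch of the sync idiom from regcache.c: route exactly one write to
 * the hardware while leaving the cache untouched. */
static int example_sync_one(struct regmap *map, unsigned int reg,
			    unsigned int val)
{
	int ret;

	map->cache_bypass = true;	/* write goes to HW only */
	ret = _regmap_write(map, reg, val);
	map->cache_bypass = false;	/* restore normal caching */

	return ret;
}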
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b905e9888b88..efd19c2da9c2 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -38,6 +38,7 @@
#include <linux/mutex.h>
#include <linux/major.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <net/tcp.h>
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 81fde9ef7f8e..a1518539b858 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2359,7 +2359,7 @@ static void drbd_cleanup(void)
* @congested_data: User data
* @bdi_bits: Bits the BDI flusher thread is currently interested in
*
- * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
+ * Returns 1<<WB_async_congested and/or 1<<WB_sync_congested if we are congested.
*/
static int drbd_congested(void *congested_data, int bdi_bits)
{
@@ -2376,14 +2376,14 @@ static int drbd_congested(void *congested_data, int bdi_bits)
}
if (test_bit(CALLBACK_PENDING, &first_peer_device(device)->connection->flags)) {
- r |= (1 << BDI_async_congested);
+ r |= (1 << WB_async_congested);
/* Without good local data, we would need to read from remote,
* and that would need the worker thread as well, which is
* currently blocked waiting for that usermode helper to
* finish.
*/
if (!get_ldev_if_state(device, D_UP_TO_DATE))
- r |= (1 << BDI_sync_congested);
+ r |= (1 << WB_sync_congested);
else
put_ldev(device);
r &= bdi_bits;
@@ -2399,9 +2399,9 @@ static int drbd_congested(void *congested_data, int bdi_bits)
reason = 'b';
}
- if (bdi_bits & (1 << BDI_async_congested) &&
+ if (bdi_bits & (1 << WB_async_congested) &&
test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) {
- r |= (1 << BDI_async_congested);
+ r |= (1 << WB_async_congested);
reason = reason == 'b' ? 'a' : 'n';
}
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 09e628dafd9d..4c20c228184c 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -61,6 +61,7 @@
#include <linux/freezer.h>
#include <linux/mutex.h>
#include <linux/slab.h>
+#include <linux/backing-dev.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/scsi.h>
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index 5fc291c6157e..60316fbaf295 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -12,6 +12,7 @@
#include <linux/fs.h>
#include <linux/major.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/raw.h>
#include <linux/capability.h>
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index cdce92ae2e8b..476079fccf33 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -1,6 +1,5 @@
# CPUfreq core
obj-$(CONFIG_CPU_FREQ) += cpufreq.o freq_table.o
-obj-$(CONFIG_PM_OPP) += cpufreq_opp.o
# CPUfreq stats
obj-$(CONFIG_CPU_FREQ_STAT) += cpufreq_stats.o
diff --git a/drivers/cpufreq/arm_big_little.h b/drivers/cpufreq/arm_big_little.h
index a211f7db9d32..b88889d9387e 100644
--- a/drivers/cpufreq/arm_big_little.h
+++ b/drivers/cpufreq/arm_big_little.h
@@ -28,7 +28,7 @@ struct cpufreq_arm_bL_ops {
/*
* This must set opp table for cpu_dev in a similar way as done by
- * of_init_opp_table().
+ * dev_pm_opp_of_add_table().
*/
int (*init_opp_table)(struct device *cpu_dev);
diff --git a/drivers/cpufreq/arm_big_little_dt.c b/drivers/cpufreq/arm_big_little_dt.c
index 36d91dba2965..16ddeefe9443 100644
--- a/drivers/cpufreq/arm_big_little_dt.c
+++ b/drivers/cpufreq/arm_big_little_dt.c
@@ -54,7 +54,7 @@ static int dt_init_opp_table(struct device *cpu_dev)
return -ENOENT;
}
- ret = of_init_opp_table(cpu_dev);
+ ret = dev_pm_opp_of_add_table(cpu_dev);
of_node_put(np);
return ret;
@@ -82,7 +82,7 @@ static struct cpufreq_arm_bL_ops dt_bL_ops = {
.name = "dt-bl",
.get_transition_latency = dt_get_transition_latency,
.init_opp_table = dt_init_opp_table,
- .free_opp_table = of_free_opp_table,
+ .free_opp_table = dev_pm_opp_of_remove_table,
};
static int generic_bL_probe(struct platform_device *pdev)
diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 663045ce6fac..64ccbc4216be 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -31,185 +31,180 @@
struct private_data {
struct device *cpu_dev;
- struct regulator *cpu_reg;
struct thermal_cooling_device *cdev;
- unsigned int voltage_tolerance; /* in percentage */
+ const char *reg_name;
+};
+
+static struct freq_attr *cpufreq_dt_attr[] = {
+ &cpufreq_freq_attr_scaling_available_freqs,
+ NULL, /* Extra space for boost-attr if required */
+ NULL,
};
static int set_target(struct cpufreq_policy *policy, unsigned int index)
{
- struct dev_pm_opp *opp;
- struct cpufreq_frequency_table *freq_table = policy->freq_table;
- struct clk *cpu_clk = policy->clk;
struct private_data *priv = policy->driver_data;
- struct device *cpu_dev = priv->cpu_dev;
- struct regulator *cpu_reg = priv->cpu_reg;
- unsigned long volt = 0, volt_old = 0, tol = 0;
- unsigned int old_freq, new_freq;
- long freq_Hz, freq_exact;
- int ret;
- freq_Hz = clk_round_rate(cpu_clk, freq_table[index].frequency * 1000);
- if (freq_Hz <= 0)
- freq_Hz = freq_table[index].frequency * 1000;
-
- freq_exact = freq_Hz;
- new_freq = freq_Hz / 1000;
- old_freq = clk_get_rate(cpu_clk) / 1000;
+ return dev_pm_opp_set_rate(priv->cpu_dev,
+ policy->freq_table[index].frequency * 1000);
+}
- if (!IS_ERR(cpu_reg)) {
- unsigned long opp_freq;
+/*
+ * An earlier version of the opp-v1 bindings named the regulator
+ * "cpu0-supply"; we still need to handle that for backward compatibility.
+ */
+static const char *find_supply_name(struct device *dev)
+{
+ struct device_node *np;
+ struct property *pp;
+ int cpu = dev->id;
+ const char *name = NULL;
- rcu_read_lock();
- opp = dev_pm_opp_find_freq_ceil(cpu_dev, &freq_Hz);
- if (IS_ERR(opp)) {
- rcu_read_unlock();
- dev_err(cpu_dev, "failed to find OPP for %ld\n",
- freq_Hz);
- return PTR_ERR(opp);
- }
- volt = dev_pm_opp_get_voltage(opp);
- opp_freq = dev_pm_opp_get_freq(opp);
- rcu_read_unlock();
- tol = volt * priv->voltage_tolerance / 100;
- volt_old = regulator_get_voltage(cpu_reg);
- dev_dbg(cpu_dev, "Found OPP: %ld kHz, %ld uV\n",
- opp_freq / 1000, volt);
- }
+ np = of_node_get(dev->of_node);
- dev_dbg(cpu_dev, "%u MHz, %ld mV --> %u MHz, %ld mV\n",
- old_freq / 1000, (volt_old > 0) ? volt_old / 1000 : -1,
- new_freq / 1000, volt ? volt / 1000 : -1);
+ /* The CPU device's of_node must be valid here */
+ if (WARN_ON(!np))
+ return NULL;
- /* scaling up? scale voltage before frequency */
- if (!IS_ERR(cpu_reg) && new_freq > old_freq) {
- ret = regulator_set_voltage_tol(cpu_reg, volt, tol);
- if (ret) {
- dev_err(cpu_dev, "failed to scale voltage up: %d\n",
- ret);
- return ret;
+ /* Try "cpu0" for older DTs */
+ if (!cpu) {
+ pp = of_find_property(np, "cpu0-supply", NULL);
+ if (pp) {
+ name = "cpu0";
+ goto node_put;
}
}
- ret = clk_set_rate(cpu_clk, freq_exact);
- if (ret) {
- dev_err(cpu_dev, "failed to set clock rate: %d\n", ret);
- if (!IS_ERR(cpu_reg) && volt_old > 0)
- regulator_set_voltage_tol(cpu_reg, volt_old, tol);
- return ret;
+ pp = of_find_property(np, "cpu-supply", NULL);
+ if (pp) {
+ name = "cpu";
+ goto node_put;
}
- /* scaling down? scale voltage after frequency */
- if (!IS_ERR(cpu_reg) && new_freq < old_freq) {
- ret = regulator_set_voltage_tol(cpu_reg, volt, tol);
- if (ret) {
- dev_err(cpu_dev, "failed to scale voltage down: %d\n",
- ret);
- clk_set_rate(cpu_clk, old_freq * 1000);
- }
- }
-
- return ret;
+ dev_dbg(dev, "no regulator for cpu%d\n", cpu);
+node_put:
+ of_node_put(np);
+ return name;
}
-static int allocate_resources(int cpu, struct device **cdev,
- struct regulator **creg, struct clk **cclk)
+static int resources_available(void)
{
struct device *cpu_dev;
struct regulator *cpu_reg;
struct clk *cpu_clk;
int ret = 0;
- char *reg_cpu0 = "cpu0", *reg_cpu = "cpu", *reg;
+ const char *name;
- cpu_dev = get_cpu_device(cpu);
+ cpu_dev = get_cpu_device(0);
if (!cpu_dev) {
- pr_err("failed to get cpu%d device\n", cpu);
+ pr_err("failed to get cpu0 device\n");
return -ENODEV;
}
- /* Try "cpu0" for older DTs */
- if (!cpu)
- reg = reg_cpu0;
- else
- reg = reg_cpu;
-
-try_again:
- cpu_reg = regulator_get_optional(cpu_dev, reg);
- if (IS_ERR(cpu_reg)) {
+ cpu_clk = clk_get(cpu_dev, NULL);
+ ret = PTR_ERR_OR_ZERO(cpu_clk);
+ if (ret) {
/*
- * If cpu's regulator supply node is present, but regulator is
- * not yet registered, we should try defering probe.
+ * If the cpu's clk node is present, but the clock is not yet
+ * registered, we should try deferring probe.
*/
- if (PTR_ERR(cpu_reg) == -EPROBE_DEFER) {
- dev_dbg(cpu_dev, "cpu%d regulator not ready, retry\n",
- cpu);
- return -EPROBE_DEFER;
- }
-
- /* Try with "cpu-supply" */
- if (reg == reg_cpu0) {
- reg = reg_cpu;
- goto try_again;
- }
+ if (ret == -EPROBE_DEFER)
+ dev_dbg(cpu_dev, "clock not ready, retry\n");
+ else
+ dev_err(cpu_dev, "failed to get clock: %d\n", ret);
- dev_dbg(cpu_dev, "no regulator for cpu%d: %ld\n",
- cpu, PTR_ERR(cpu_reg));
+ return ret;
}
- cpu_clk = clk_get(cpu_dev, NULL);
- if (IS_ERR(cpu_clk)) {
- /* put regulator */
- if (!IS_ERR(cpu_reg))
- regulator_put(cpu_reg);
+ clk_put(cpu_clk);
- ret = PTR_ERR(cpu_clk);
+ name = find_supply_name(cpu_dev);
+ /* Platform doesn't require regulator */
+ if (!name)
+ return 0;
+ cpu_reg = regulator_get_optional(cpu_dev, name);
+ ret = PTR_ERR_OR_ZERO(cpu_reg);
+ if (ret) {
/*
- * If cpu's clk node is present, but clock is not yet
- * registered, we should try defering probe.
+ * If the cpu's regulator supply node is present, but the regulator
+ * is not yet registered, we should try deferring probe.
*/
if (ret == -EPROBE_DEFER)
- dev_dbg(cpu_dev, "cpu%d clock not ready, retry\n", cpu);
+ dev_dbg(cpu_dev, "cpu0 regulator not ready, retry\n");
else
- dev_err(cpu_dev, "failed to get cpu%d clock: %d\n", cpu,
- ret);
- } else {
- *cdev = cpu_dev;
- *creg = cpu_reg;
- *cclk = cpu_clk;
+ dev_dbg(cpu_dev, "no regulator for cpu0: %d\n", ret);
+
+ return ret;
}
- return ret;
+ regulator_put(cpu_reg);
+ return 0;
}
static int cpufreq_init(struct cpufreq_policy *policy)
{
- struct cpufreq_dt_platform_data *pd;
struct cpufreq_frequency_table *freq_table;
- struct device_node *np;
struct private_data *priv;
struct device *cpu_dev;
- struct regulator *cpu_reg;
struct clk *cpu_clk;
- unsigned long min_uV = ~0, max_uV = 0;
+ struct dev_pm_opp *suspend_opp;
unsigned int transition_latency;
+ bool opp_v1 = false;
+ const char *name;
int ret;
- ret = allocate_resources(policy->cpu, &cpu_dev, &cpu_reg, &cpu_clk);
- if (ret) {
- pr_err("%s: Failed to allocate resources: %d\n", __func__, ret);
+ cpu_dev = get_cpu_device(policy->cpu);
+ if (!cpu_dev) {
+ pr_err("failed to get cpu%d device\n", policy->cpu);
+ return -ENODEV;
+ }
+
+ cpu_clk = clk_get(cpu_dev, NULL);
+ if (IS_ERR(cpu_clk)) {
+ ret = PTR_ERR(cpu_clk);
+ dev_err(cpu_dev, "%s: failed to get clk: %d\n", __func__, ret);
return ret;
}
- np = of_node_get(cpu_dev->of_node);
- if (!np) {
- dev_err(cpu_dev, "failed to find cpu%d node\n", policy->cpu);
- ret = -ENOENT;
- goto out_put_reg_clk;
+ /* Get OPP-sharing information from "operating-points-v2" bindings */
+ ret = dev_pm_opp_of_get_sharing_cpus(cpu_dev, policy->cpus);
+ if (ret) {
+ /*
+ * operating-points-v2 not supported, fallback to old method of
+ * finding shared-OPPs for backward compatibility.
+ */
+ if (ret == -ENOENT)
+ opp_v1 = true;
+ else
+ goto out_put_clk;
+ }
+
+ /*
+ * OPP layer will be taking care of regulators now, but it needs to know
+ * the name of the regulator first.
+ */
+ name = find_supply_name(cpu_dev);
+ if (name) {
+ ret = dev_pm_opp_set_regulator(cpu_dev, name);
+ if (ret) {
+ dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n",
+ policy->cpu, ret);
+ goto out_put_clk;
+ }
}
- /* OPPs might be populated at runtime, don't check for error here */
- of_init_opp_table(cpu_dev);
+ /*
+ * Initialize OPP tables for all policy->cpus. They will be shared by
+ * all CPUs that the OPP bindings mark as sharing OPPs.
+ *
+ * For platforms not using operating-points-v2 bindings, we do this
+ * before updating policy->cpus. Otherwise, we would end up creating
+ * duplicate OPPs for policy->cpus.
+ *
+ * OPPs might be populated at runtime, so don't check for errors here.
+ */
+ dev_pm_opp_of_cpumask_add_table(policy->cpus);
/*
* But we need OPP table to function so if it is not there let's
@@ -217,74 +212,51 @@ static int cpufreq_init(struct cpufreq_policy *policy)
*/
ret = dev_pm_opp_get_opp_count(cpu_dev);
if (ret <= 0) {
- pr_debug("OPP table is not ready, deferring probe\n");
+ dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
ret = -EPROBE_DEFER;
goto out_free_opp;
}
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
- if (!priv) {
- ret = -ENOMEM;
- goto out_free_opp;
- }
+ if (opp_v1) {
+ struct cpufreq_dt_platform_data *pd = cpufreq_get_driver_data();
- of_property_read_u32(np, "voltage-tolerance", &priv->voltage_tolerance);
-
- if (of_property_read_u32(np, "clock-latency", &transition_latency))
- transition_latency = CPUFREQ_ETERNAL;
-
- if (!IS_ERR(cpu_reg)) {
- unsigned long opp_freq = 0;
+ if (!pd || !pd->independent_clocks)
+ cpumask_setall(policy->cpus);
/*
- * Disable any OPPs where the connected regulator isn't able to
- * provide the specified voltage and record minimum and maximum
- * voltage levels.
+ * OPP tables are initialized only for policy->cpu, do it for
+ * others as well.
*/
- while (1) {
- struct dev_pm_opp *opp;
- unsigned long opp_uV, tol_uV;
-
- rcu_read_lock();
- opp = dev_pm_opp_find_freq_ceil(cpu_dev, &opp_freq);
- if (IS_ERR(opp)) {
- rcu_read_unlock();
- break;
- }
- opp_uV = dev_pm_opp_get_voltage(opp);
- rcu_read_unlock();
-
- tol_uV = opp_uV * priv->voltage_tolerance / 100;
- if (regulator_is_supported_voltage(cpu_reg,
- opp_uV - tol_uV,
- opp_uV + tol_uV)) {
- if (opp_uV < min_uV)
- min_uV = opp_uV;
- if (opp_uV > max_uV)
- max_uV = opp_uV;
- } else {
- dev_pm_opp_disable(cpu_dev, opp_freq);
- }
-
- opp_freq++;
- }
+ ret = dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus);
+ if (ret)
+ dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
+ __func__, ret);
+ }
- ret = regulator_set_voltage_time(cpu_reg, min_uV, max_uV);
- if (ret > 0)
- transition_latency += ret * 1000;
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv) {
+ ret = -ENOMEM;
+ goto out_free_opp;
}
+ priv->reg_name = name;
+
ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
if (ret) {
- pr_err("failed to init cpufreq table: %d\n", ret);
+ dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret);
goto out_free_priv;
}
priv->cpu_dev = cpu_dev;
- priv->cpu_reg = cpu_reg;
policy->driver_data = priv;
-
policy->clk = cpu_clk;
+
+ rcu_read_lock();
+ suspend_opp = dev_pm_opp_get_suspend_opp(cpu_dev);
+ if (suspend_opp)
+ policy->suspend_freq = dev_pm_opp_get_freq(suspend_opp) / 1000;
+ rcu_read_unlock();
+
ret = cpufreq_table_validate_and_show(policy, freq_table);
if (ret) {
dev_err(cpu_dev, "%s: invalid frequency table: %d\n", __func__,
@@ -292,13 +264,20 @@ static int cpufreq_init(struct cpufreq_policy *policy)
goto out_free_cpufreq_table;
}
- policy->cpuinfo.transition_latency = transition_latency;
+ /* Support turbo/boost mode */
+ if (policy_has_boost_freq(policy)) {
+ /* This gets disabled by core on driver unregister */
+ ret = cpufreq_enable_boost_support();
+ if (ret)
+ goto out_free_cpufreq_table;
+ cpufreq_dt_attr[1] = &cpufreq_freq_attr_scaling_boost_freqs;
+ }
- pd = cpufreq_get_driver_data();
- if (!pd || !pd->independent_clocks)
- cpumask_setall(policy->cpus);
+ transition_latency = dev_pm_opp_get_max_transition_latency(cpu_dev);
+ if (!transition_latency)
+ transition_latency = CPUFREQ_ETERNAL;
- of_node_put(np);
+ policy->cpuinfo.transition_latency = transition_latency;
return 0;
@@ -307,12 +286,11 @@ out_free_cpufreq_table:
out_free_priv:
kfree(priv);
out_free_opp:
- of_free_opp_table(cpu_dev);
- of_node_put(np);
-out_put_reg_clk:
+ dev_pm_opp_of_cpumask_remove_table(policy->cpus);
+ if (name)
+ dev_pm_opp_put_regulator(cpu_dev);
+out_put_clk:
clk_put(cpu_clk);
- if (!IS_ERR(cpu_reg))
- regulator_put(cpu_reg);
return ret;
}
@@ -323,10 +301,11 @@ static int cpufreq_exit(struct cpufreq_policy *policy)
cpufreq_cooling_unregister(priv->cdev);
dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table);
- of_free_opp_table(priv->cpu_dev);
+ dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
+ if (priv->reg_name)
+ dev_pm_opp_put_regulator(priv->cpu_dev);
+
clk_put(policy->clk);
- if (!IS_ERR(priv->cpu_reg))
- regulator_put(priv->cpu_reg);
kfree(priv);
return 0;
@@ -368,14 +347,12 @@ static struct cpufreq_driver dt_cpufreq_driver = {
.exit = cpufreq_exit,
.ready = cpufreq_ready,
.name = "cpufreq-dt",
- .attr = cpufreq_generic_attr,
+ .attr = cpufreq_dt_attr,
+ .suspend = cpufreq_generic_suspend,
};
static int dt_cpufreq_probe(struct platform_device *pdev)
{
- struct device *cpu_dev;
- struct regulator *cpu_reg;
- struct clk *cpu_clk;
int ret;
/*
@@ -385,19 +362,15 @@ static int dt_cpufreq_probe(struct platform_device *pdev)
*
* FIXME: Is checking this only for CPU0 sufficient ?
*/
- ret = allocate_resources(0, &cpu_dev, &cpu_reg, &cpu_clk);
+ ret = resources_available();
if (ret)
return ret;
- clk_put(cpu_clk);
- if (!IS_ERR(cpu_reg))
- regulator_put(cpu_reg);
-
dt_cpufreq_driver.driver_data = dev_get_platdata(&pdev->dev);
ret = cpufreq_register_driver(&dt_cpufreq_driver);
if (ret)
- dev_err(cpu_dev, "failed register driver: %d\n", ret);
+ dev_err(&pdev->dev, "failed register driver: %d\n", ret);
return ret;
}
@@ -417,6 +390,7 @@ static struct platform_driver dt_cpufreq_platdrv = {
};
module_platform_driver(dt_cpufreq_platdrv);
+MODULE_ALIAS("platform:cpufreq-dt");
MODULE_AUTHOR("Viresh Kumar <viresh.kumar@linaro.org>");
MODULE_AUTHOR("Shawn Guo <shawn.guo@linaro.org>");
MODULE_DESCRIPTION("Generic cpufreq driver");
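
Taken together, the rework reduces the driver's DVFS path to a handful of
OPP-core calls: the core now owns the regulator and performs the
voltage-then-clock (or clock-then-voltage) ordering and rollback that
set_target() used to hand-roll. A condensed sketch of the resulting flow,
mirroring cpufreq_init() and set_target() above (illustrative only, error
handling elided):

/* Condensed sketch of the new cpufreq-dt flow (illustrative only). */
static int example_cpu_dvfs(struct device *cpu_dev,
			    struct cpufreq_policy *policy,
			    unsigned long target_hz)
{
	const char *name = find_supply_name(cpu_dev);

	/* Hand the supply to the OPP core before populating tables */
	if (name)
		dev_pm_opp_set_regulator(cpu_dev, name);

	/* Create OPP tables for every CPU sharing this policy */
	dev_pm_opp_of_cpumask_add_table(policy->cpus);

	/* One call handles clk + regulator ordering and rollback */
	return dev_pm_opp_set_rate(cpu_dev, target_hz);
}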
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index ce1d93e93d1a..74844b15a123 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2358,6 +2358,49 @@ int cpufreq_boost_supported(void)
}
EXPORT_SYMBOL_GPL(cpufreq_boost_supported);
+static int create_boost_sysfs_file(void)
+{
+ int ret;
+
+ if (!cpufreq_boost_supported())
+ return 0;
+
+ /*
+ * Check if driver provides function to enable boost -
+ * if not, use cpufreq_boost_set_sw as default
+ */
+ if (!cpufreq_driver->set_boost)
+ cpufreq_driver->set_boost = cpufreq_boost_set_sw;
+
+ ret = cpufreq_sysfs_create_file(&boost.attr);
+ if (ret)
+ pr_err("%s: cannot register global BOOST sysfs file\n",
+ __func__);
+
+ return ret;
+}
+
+static void remove_boost_sysfs_file(void)
+{
+ if (cpufreq_boost_supported())
+ cpufreq_sysfs_remove_file(&boost.attr);
+}
+
+int cpufreq_enable_boost_support(void)
+{
+ if (!cpufreq_driver)
+ return -EINVAL;
+
+ if (cpufreq_boost_supported())
+ return 0;
+
+ cpufreq_driver->boost_supported = true;
+
+ /* This will get removed on driver unregister */
+ return create_boost_sysfs_file();
+}
+EXPORT_SYMBOL_GPL(cpufreq_enable_boost_support);
+
int cpufreq_boost_enabled(void)
{
return cpufreq_driver->boost_enabled;
@@ -2407,21 +2450,9 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
if (driver_data->setpolicy)
driver_data->flags |= CPUFREQ_CONST_LOOPS;
- if (cpufreq_boost_supported()) {
- /*
- * Check if driver provides function to enable boost -
- * if not, use cpufreq_boost_set_sw as default
- */
- if (!cpufreq_driver->set_boost)
- cpufreq_driver->set_boost = cpufreq_boost_set_sw;
-
- ret = cpufreq_sysfs_create_file(&boost.attr);
- if (ret) {
- pr_err("%s: cannot register global BOOST sysfs file\n",
- __func__);
- goto err_null_driver;
- }
- }
+ ret = create_boost_sysfs_file();
+ if (ret)
+ goto err_null_driver;
ret = subsys_interface_register(&cpufreq_interface);
if (ret)
@@ -2442,8 +2473,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
err_if_unreg:
subsys_interface_unregister(&cpufreq_interface);
err_boost_unreg:
- if (cpufreq_boost_supported())
- cpufreq_sysfs_remove_file(&boost.attr);
+ remove_boost_sysfs_file();
err_null_driver:
write_lock_irqsave(&cpufreq_driver_lock, flags);
cpufreq_driver = NULL;
@@ -2472,9 +2502,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
/* Protect against concurrent cpu hotplug */
get_online_cpus();
subsys_interface_unregister(&cpufreq_interface);
- if (cpufreq_boost_supported())
- cpufreq_sysfs_remove_file(&boost.attr);
-
+ remove_boost_sysfs_file();
unregister_hotcpu_notifier(&cpufreq_cpu_notifier);
write_lock_irqsave(&cpufreq_driver_lock, flags);
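
With the boost sysfs handling factored out, a driver that only discovers its
boost frequencies at init time can opt in with a single call. A sketch of the
intended use, matching what cpufreq-dt does in this series:

/* Sketch: enable boost when the validated frequency table contains
 * boost OPPs. The global "boost" sysfs file created here is removed
 * automatically by the core on driver unregister. */
static int example_setup_boost(struct cpufreq_policy *policy)
{
	if (!policy_has_boost_freq(policy))
		return 0;

	return cpufreq_enable_boost_support();
}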
diff --git a/drivers/cpufreq/cpufreq_opp.c b/drivers/cpufreq/cpufreq_opp.c
deleted file mode 100644
index 773bcde893c0..000000000000
--- a/drivers/cpufreq/cpufreq_opp.c
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Generic OPP helper interface for CPUFreq drivers
- *
- * Copyright (C) 2009-2014 Texas Instruments Incorporated.
- * Nishanth Menon
- * Romit Dasgupta
- * Kevin Hilman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/cpufreq.h>
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/pm_opp.h>
-#include <linux/rcupdate.h>
-#include <linux/slab.h>
-
-/**
- * dev_pm_opp_init_cpufreq_table() - create a cpufreq table for a device
- * @dev: device for which we do this operation
- * @table: Cpufreq table returned back to caller
- *
- * Generate a cpufreq table for a provided device- this assumes that the
- * opp list is already initialized and ready for usage.
- *
- * This function allocates required memory for the cpufreq table. It is
- * expected that the caller does the required maintenance such as freeing
- * the table as required.
- *
- * Returns -EINVAL for bad pointers, -ENODEV if the device is not found, -ENOMEM
- * if no memory available for the operation (table is not populated), returns 0
- * if successful and table is populated.
- *
- * WARNING: It is important for the callers to ensure refreshing their copy of
- * the table if any of the mentioned functions have been invoked in the interim.
- *
- * Locking: The internal device_opp and opp structures are RCU protected.
- * Since we just use the regular accessor functions to access the internal data
- * structures, we use RCU read lock inside this function. As a result, users of
- * this function DONOT need to use explicit locks for invoking.
- */
-int dev_pm_opp_init_cpufreq_table(struct device *dev,
- struct cpufreq_frequency_table **table)
-{
- struct dev_pm_opp *opp;
- struct cpufreq_frequency_table *freq_table = NULL;
- int i, max_opps, ret = 0;
- unsigned long rate;
-
- rcu_read_lock();
-
- max_opps = dev_pm_opp_get_opp_count(dev);
- if (max_opps <= 0) {
- ret = max_opps ? max_opps : -ENODATA;
- goto out;
- }
-
- freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_ATOMIC);
- if (!freq_table) {
- ret = -ENOMEM;
- goto out;
- }
-
- for (i = 0, rate = 0; i < max_opps; i++, rate++) {
- /* find next rate */
- opp = dev_pm_opp_find_freq_ceil(dev, &rate);
- if (IS_ERR(opp)) {
- ret = PTR_ERR(opp);
- goto out;
- }
- freq_table[i].driver_data = i;
- freq_table[i].frequency = rate / 1000;
- }
-
- freq_table[i].driver_data = i;
- freq_table[i].frequency = CPUFREQ_TABLE_END;
-
- *table = &freq_table[0];
-
-out:
- rcu_read_unlock();
- if (ret)
- kfree(freq_table);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_init_cpufreq_table);
-
-/**
- * dev_pm_opp_free_cpufreq_table() - free the cpufreq table
- * @dev: device for which we do this operation
- * @table: table to free
- *
- * Free up the table allocated by dev_pm_opp_init_cpufreq_table
- */
-void dev_pm_opp_free_cpufreq_table(struct device *dev,
- struct cpufreq_frequency_table **table)
-{
- if (!table)
- return;
-
- kfree(*table);
- *table = NULL;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_free_cpufreq_table);
diff --git a/drivers/cpufreq/exynos5440-cpufreq.c b/drivers/cpufreq/exynos5440-cpufreq.c
index 21a90ed7f3d8..c0f3373706f4 100644
--- a/drivers/cpufreq/exynos5440-cpufreq.c
+++ b/drivers/cpufreq/exynos5440-cpufreq.c
@@ -360,7 +360,7 @@ static int exynos_cpufreq_probe(struct platform_device *pdev)
goto err_put_node;
}
- ret = of_init_opp_table(dvfs_info->dev);
+ ret = dev_pm_opp_of_add_table(dvfs_info->dev);
if (ret) {
dev_err(dvfs_info->dev, "failed to init OPP table: %d\n", ret);
goto err_put_node;
@@ -424,7 +424,7 @@ static int exynos_cpufreq_probe(struct platform_device *pdev)
err_free_table:
dev_pm_opp_free_cpufreq_table(dvfs_info->dev, &dvfs_info->freq_table);
err_free_opp:
- of_free_opp_table(dvfs_info->dev);
+ dev_pm_opp_of_remove_table(dvfs_info->dev);
err_put_node:
of_node_put(np);
dev_err(&pdev->dev, "%s: failed initialization\n", __func__);
@@ -435,7 +435,7 @@ static int exynos_cpufreq_remove(struct platform_device *pdev)
{
cpufreq_unregister_driver(&exynos_driver);
dev_pm_opp_free_cpufreq_table(dvfs_info->dev, &dvfs_info->freq_table);
- of_free_opp_table(dvfs_info->dev);
+ dev_pm_opp_of_remove_table(dvfs_info->dev);
return 0;
}
diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c
index df14766a8e06..d1a0cbfdba63 100644
--- a/drivers/cpufreq/freq_table.c
+++ b/drivers/cpufreq/freq_table.c
@@ -18,6 +18,21 @@
* FREQUENCY TABLE HELPERS *
*********************************************************************/
+bool policy_has_boost_freq(struct cpufreq_policy *policy)
+{
+ struct cpufreq_frequency_table *pos, *table = policy->freq_table;
+
+ if (!table)
+ return false;
+
+ cpufreq_for_each_valid_entry(pos, table)
+ if (pos->flags & CPUFREQ_BOOST_FREQ)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(policy_has_boost_freq);
+
int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
struct cpufreq_frequency_table *table)
{
diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index 380a90d3c57e..84fbc8e8fc56 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -202,7 +202,7 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev)
*/
num = dev_pm_opp_get_opp_count(cpu_dev);
if (num < 0) {
- ret = of_init_opp_table(cpu_dev);
+ ret = dev_pm_opp_of_add_table(cpu_dev);
if (ret < 0) {
dev_err(cpu_dev, "failed to init OPP table: %d\n", ret);
goto put_reg;
@@ -312,7 +312,7 @@ free_freq_table:
dev_pm_opp_free_cpufreq_table(cpu_dev, &freq_table);
out_free_opp:
if (free_opp)
- of_free_opp_table(cpu_dev);
+ dev_pm_opp_of_remove_table(cpu_dev);
put_reg:
if (!IS_ERR(arm_reg))
regulator_put(arm_reg);
@@ -340,7 +340,7 @@ static int imx6q_cpufreq_remove(struct platform_device *pdev)
cpufreq_unregister_driver(&imx6q_cpufreq_driver);
dev_pm_opp_free_cpufreq_table(cpu_dev, &freq_table);
if (free_opp)
- of_free_opp_table(cpu_dev);
+ dev_pm_opp_of_remove_table(cpu_dev);
regulator_put(arm_reg);
if (!IS_ERR(pu_reg))
regulator_put(pu_reg);
diff --git a/drivers/cpuidle/cpuidle-calxeda.c b/drivers/cpuidle/cpuidle-calxeda.c
index 9445e6cc02be..1c3242c3e3a7 100644
--- a/drivers/cpuidle/cpuidle-calxeda.c
+++ b/drivers/cpuidle/cpuidle-calxeda.c
@@ -25,16 +25,21 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/platform_device.h>
+#include <linux/psci.h>
+
#include <asm/cpuidle.h>
#include <asm/suspend.h>
-#include <asm/psci.h>
+
+#include <uapi/linux/psci.h>
+
+#define CALXEDA_IDLE_PARAM \
+ ((0 << PSCI_0_2_POWER_STATE_ID_SHIFT) | \
+ (0 << PSCI_0_2_POWER_STATE_AFFL_SHIFT) | \
+ (PSCI_POWER_STATE_TYPE_POWER_DOWN << PSCI_0_2_POWER_STATE_TYPE_SHIFT))
static int calxeda_idle_finish(unsigned long val)
{
- const struct psci_power_state ps = {
- .type = PSCI_POWER_STATE_TYPE_POWER_DOWN,
- };
- return psci_ops.cpu_suspend(ps, __pa(cpu_resume));
+ return psci_ops.cpu_suspend(CALXEDA_IDLE_PARAM, __pa(cpu_resume));
}
static int calxeda_pwrdown_idle(struct cpuidle_device *dev,
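
The new macro packs the idle request into the PSCI 0.2 power_state bitfield
directly, using the shift constants from uapi/linux/psci.h. A sketch of the
equivalent generic packing helper (the helper name is hypothetical):

#include <linux/types.h>
#include <uapi/linux/psci.h>

/* Hypothetical helper: compose a PSCI 0.2 power_state value from its
 * state-ID, affinity-level and state-type fields. */
static u32 example_psci_power_state(u32 id, u32 affinity_level, u32 type)
{
	return (id << PSCI_0_2_POWER_STATE_ID_SHIFT) |
	       (affinity_level << PSCI_0_2_POWER_STATE_AFFL_SHIFT) |
	       (type << PSCI_0_2_POWER_STATE_TYPE_SHIFT);
}

/* CALXEDA_IDLE_PARAM is then simply:
 * example_psci_power_state(0, 0, PSCI_POWER_STATE_TYPE_POWER_DOWN) */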
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 8d8c35623f2a..ffa809f30b19 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -176,6 +176,7 @@
#define AT_XDMAC_MAX_CHAN 0x20
#define AT_XDMAC_MAX_CSIZE 16 /* 16 data */
#define AT_XDMAC_MAX_DWIDTH 8 /* 64 bits */
+#define AT_XDMAC_RESIDUE_MAX_RETRIES 5
#define AT_XDMAC_DMA_BUSWIDTHS\
(BIT(DMA_SLAVE_BUSWIDTH_UNDEFINED) |\
@@ -925,8 +926,8 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
struct at_xdmac_desc *desc, *_desc;
struct list_head *descs_list;
enum dma_status ret;
- int residue;
- u32 cur_nda, mask, value;
+ int residue, retry;
+ u32 cur_nda, check_nda, cur_ubc, mask, value;
u8 dwidth = 0;
unsigned long flags;
@@ -963,7 +964,42 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
cpu_relax();
}
+ /*
+ * When processing the residue, we need to read two registers, but we
+ * can't do it atomically. AT_XDMAC_CNDA is used to find where we
+ * stand in the descriptor list and AT_XDMAC_CUBC is used to know how
+ * much data remains for the current descriptor.
+ * Since the DMA channel is not paused (so as not to lose data), the
+ * descriptor may change between the AT_XDMAC_CNDA and AT_XDMAC_CUBC
+ * reads.
+ * For that reason, after reading AT_XDMAC_CUBC, we check whether we
+ * are still on the same descriptor by reading AT_XDMAC_CNDA a second
+ * time. If AT_XDMAC_CNDA has changed, we must read AT_XDMAC_CUBC
+ * again.
+ * Memory barriers are used to enforce the read order of the registers.
+ * A maximum number of retries is set because, however unlikely, the
+ * loop could otherwise never end when transferring a lot of data with
+ * small buffers.
+ */
cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+ rmb();
+ cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
+ for (retry = 0; retry < AT_XDMAC_RESIDUE_MAX_RETRIES; retry++) {
+ rmb();
+ check_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+
+ if (likely(cur_nda == check_nda))
+ break;
+
+ cur_nda = check_nda;
+ rmb();
+ cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
+ }
+
+ if (unlikely(retry >= AT_XDMAC_RESIDUE_MAX_RETRIES)) {
+ ret = DMA_ERROR;
+ goto spin_unlock;
+ }
+
/*
* Remove size of all microblocks already transferred and the current
* one. Then add the remaining size to transfer of the current
@@ -976,7 +1012,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda)
break;
}
- residue += at_xdmac_chan_read(atchan, AT_XDMAC_CUBC) << dwidth;
+ residue += cur_ubc << dwidth;
dma_set_residue(txstate, residue);
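
The retry loop is a general recipe for snapshotting two registers that cannot
be read atomically: read position, read count, then re-read position and
accept the pair only if it did not move, with a bounded number of retries. A
generic sketch of the same pattern (the register arguments are placeholders):

#include <linux/errno.h>
#include <linux/io.h>

/* Generic sketch of the bounded two-register snapshot used above. */
static int example_snapshot(void __iomem *pos_reg, void __iomem *cnt_reg,
			    u32 *pos, u32 *cnt, int max_retries)
{
	int retry;
	u32 check;

	*pos = readl(pos_reg);
	rmb();			/* order the pos read before the cnt read */
	*cnt = readl(cnt_reg);

	for (retry = 0; retry < max_retries; retry++) {
		rmb();
		check = readl(pos_reg);
		if (check == *pos)
			return 0;	/* stable pair */

		*pos = check;
		rmb();
		*cnt = readl(cnt_reg);
	}

	return -EIO;		/* never stabilized */
}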
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 6517132e5d8b..24be7a34b84a 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -5,6 +5,9 @@
menu "Firmware Drivers"
+config ARM_PSCI_FW
+ bool
+
config EDD
tristate "BIOS Enhanced Disk Drive calls determine boot disk"
depends on X86
diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile
index 3fdd3912709a..6b051293e689 100644
--- a/drivers/firmware/Makefile
+++ b/drivers/firmware/Makefile
@@ -1,6 +1,7 @@
#
# Makefile for the linux kernel.
#
+obj-$(CONFIG_ARM_PSCI_FW) += psci.o
obj-$(CONFIG_DMI) += dmi_scan.o
obj-$(CONFIG_DMI_SYSFS) += dmi-sysfs.o
obj-$(CONFIG_EDD) += edd.o
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index d8be608a9f3b..513ae7ce1449 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -1,6 +1,14 @@
#
# Makefile for linux kernel
#
+
+#
+# ARM64 maps efi runtime services in userspace addresses
+# which don't have KASAN shadow. So dereference of these addresses
+# in efi_call_virt() will cause crash if this code instrumented.
+#
+KASAN_SANITIZE_runtime-wrappers.o := n
+
obj-$(CONFIG_EFI) += efi.o vars.o reboot.o
obj-$(CONFIG_EFI_VARS) += efivars.o
obj-$(CONFIG_EFI_VARS_PSTORE) += efi-pstore.o
diff --git a/drivers/firmware/psci.c b/drivers/firmware/psci.c
new file mode 100644
index 000000000000..ae70d2485ca1
--- /dev/null
+++ b/drivers/firmware/psci.c
@@ -0,0 +1,470 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 ARM Limited
+ */
+
+#define pr_fmt(fmt) "psci: " fmt
+
+#include <linux/errno.h>
+#include <linux/linkage.h>
+#include <linux/of.h>
+#include <linux/pm.h>
+#include <linux/printk.h>
+#include <linux/psci.h>
+#include <linux/reboot.h>
+#include <linux/suspend.h>
+
+#include <uapi/linux/psci.h>
+
+#include <asm/cputype.h>
+#include <asm/system_misc.h>
+#include <asm/smp_plat.h>
+#include <asm/suspend.h>
+
+/*
+ * While a 64-bit OS can make calls with SMC32 calling conventions, for some
+ * calls it is necessary to use SMC64 to pass or return 64-bit values.
+ * For such calls PSCI_FN_NATIVE(version, name) will choose the appropriate
+ * (native-width) function ID.
+ */
+#ifdef CONFIG_64BIT
+#define PSCI_FN_NATIVE(version, name) PSCI_##version##_FN64_##name
+#else
+#define PSCI_FN_NATIVE(version, name) PSCI_##version##_FN_##name
+#endif
+
+/*
+ * The CPU any Trusted OS is resident on. The trusted OS may reject CPU_OFF
+ * calls to its resident CPU, so we must avoid issuing those. We never migrate
+ * a Trusted OS even if it claims to be capable of migration -- doing so will
+ * require cooperation with a Trusted OS driver.
+ */
+static int resident_cpu = -1;
+
+bool psci_tos_resident_on(int cpu)
+{
+ return cpu == resident_cpu;
+}
+
+struct psci_operations psci_ops;
+
+typedef unsigned long (psci_fn)(unsigned long, unsigned long,
+ unsigned long, unsigned long);
+asmlinkage psci_fn __invoke_psci_fn_hvc;
+asmlinkage psci_fn __invoke_psci_fn_smc;
+static psci_fn *invoke_psci_fn;
+
+enum psci_function {
+ PSCI_FN_CPU_SUSPEND,
+ PSCI_FN_CPU_ON,
+ PSCI_FN_CPU_OFF,
+ PSCI_FN_MIGRATE,
+ PSCI_FN_MAX,
+};
+
+static u32 psci_function_id[PSCI_FN_MAX];
+
+#define PSCI_0_2_POWER_STATE_MASK \
+ (PSCI_0_2_POWER_STATE_ID_MASK | \
+ PSCI_0_2_POWER_STATE_TYPE_MASK | \
+ PSCI_0_2_POWER_STATE_AFFL_MASK)
+
+#define PSCI_1_0_EXT_POWER_STATE_MASK \
+ (PSCI_1_0_EXT_POWER_STATE_ID_MASK | \
+ PSCI_1_0_EXT_POWER_STATE_TYPE_MASK)
+
+static u32 psci_cpu_suspend_feature;
+
+static inline bool psci_has_ext_power_state(void)
+{
+ return psci_cpu_suspend_feature &
+ PSCI_1_0_FEATURES_CPU_SUSPEND_PF_MASK;
+}
+
+bool psci_power_state_loses_context(u32 state)
+{
+ const u32 mask = psci_has_ext_power_state() ?
+ PSCI_1_0_EXT_POWER_STATE_TYPE_MASK :
+ PSCI_0_2_POWER_STATE_TYPE_MASK;
+
+ return state & mask;
+}
+
+bool psci_power_state_is_valid(u32 state)
+{
+ const u32 valid_mask = psci_has_ext_power_state() ?
+ PSCI_1_0_EXT_POWER_STATE_MASK :
+ PSCI_0_2_POWER_STATE_MASK;
+
+ return !(state & ~valid_mask);
+}
+
+static int psci_to_linux_errno(int errno)
+{
+ switch (errno) {
+ case PSCI_RET_SUCCESS:
+ return 0;
+ case PSCI_RET_NOT_SUPPORTED:
+ return -EOPNOTSUPP;
+ case PSCI_RET_INVALID_PARAMS:
+ case PSCI_RET_INVALID_ADDRESS:
+ return -EINVAL;
+ case PSCI_RET_DENIED:
+ return -EPERM;
+ };
+
+ return -EINVAL;
+}
+
+static u32 psci_get_version(void)
+{
+ return invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0);
+}
+
+static int psci_cpu_suspend(u32 state, unsigned long entry_point)
+{
+ int err;
+ u32 fn;
+
+ fn = psci_function_id[PSCI_FN_CPU_SUSPEND];
+ err = invoke_psci_fn(fn, state, entry_point, 0);
+ return psci_to_linux_errno(err);
+}
+
+static int psci_cpu_off(u32 state)
+{
+ int err;
+ u32 fn;
+
+ fn = psci_function_id[PSCI_FN_CPU_OFF];
+ err = invoke_psci_fn(fn, state, 0, 0);
+ return psci_to_linux_errno(err);
+}
+
+static int psci_cpu_on(unsigned long cpuid, unsigned long entry_point)
+{
+ int err;
+ u32 fn;
+
+ fn = psci_function_id[PSCI_FN_CPU_ON];
+ err = invoke_psci_fn(fn, cpuid, entry_point, 0);
+ return psci_to_linux_errno(err);
+}
+
+static int psci_migrate(unsigned long cpuid)
+{
+ int err;
+ u32 fn;
+
+ fn = psci_function_id[PSCI_FN_MIGRATE];
+ err = invoke_psci_fn(fn, cpuid, 0, 0);
+ return psci_to_linux_errno(err);
+}
+
+static int psci_affinity_info(unsigned long target_affinity,
+ unsigned long lowest_affinity_level)
+{
+ return invoke_psci_fn(PSCI_FN_NATIVE(0_2, AFFINITY_INFO),
+ target_affinity, lowest_affinity_level, 0);
+}
+
+static int psci_migrate_info_type(void)
+{
+ return invoke_psci_fn(PSCI_0_2_FN_MIGRATE_INFO_TYPE, 0, 0, 0);
+}
+
+static unsigned long psci_migrate_info_up_cpu(void)
+{
+ return invoke_psci_fn(PSCI_FN_NATIVE(0_2, MIGRATE_INFO_UP_CPU),
+ 0, 0, 0);
+}
+
+static int get_set_conduit_method(struct device_node *np)
+{
+ const char *method;
+
+ pr_info("probing for conduit method from DT.\n");
+
+ if (of_property_read_string(np, "method", &method)) {
+ pr_warn("missing \"method\" property\n");
+ return -ENXIO;
+ }
+
+ if (!strcmp("hvc", method)) {
+ invoke_psci_fn = __invoke_psci_fn_hvc;
+ } else if (!strcmp("smc", method)) {
+ invoke_psci_fn = __invoke_psci_fn_smc;
+ } else {
+ pr_warn("invalid \"method\" property: %s\n", method);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static void psci_sys_reset(enum reboot_mode reboot_mode, const char *cmd)
+{
+ invoke_psci_fn(PSCI_0_2_FN_SYSTEM_RESET, 0, 0, 0);
+}
+
+static void psci_sys_poweroff(void)
+{
+ invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0);
+}
+
+static int __init psci_features(u32 psci_func_id)
+{
+ return invoke_psci_fn(PSCI_1_0_FN_PSCI_FEATURES,
+ psci_func_id, 0, 0);
+}
+
+static int psci_system_suspend(unsigned long unused)
+{
+ return invoke_psci_fn(PSCI_FN_NATIVE(1_0, SYSTEM_SUSPEND),
+ virt_to_phys(cpu_resume), 0, 0);
+}
+
+static int psci_system_suspend_enter(suspend_state_t state)
+{
+ return cpu_suspend(0, psci_system_suspend);
+}
+
+static const struct platform_suspend_ops psci_suspend_ops = {
+ .valid = suspend_valid_only_mem,
+ .enter = psci_system_suspend_enter,
+};
+
+static void __init psci_init_system_suspend(void)
+{
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_SUSPEND))
+ return;
+
+ ret = psci_features(PSCI_FN_NATIVE(1_0, SYSTEM_SUSPEND));
+
+ if (ret != PSCI_RET_NOT_SUPPORTED)
+ suspend_set_ops(&psci_suspend_ops);
+}
+
+static void __init psci_init_cpu_suspend(void)
+{
+ int feature = psci_features(psci_function_id[PSCI_FN_CPU_SUSPEND]);
+
+ if (feature != PSCI_RET_NOT_SUPPORTED)
+ psci_cpu_suspend_feature = feature;
+}
+
+/*
+ * Detect the presence of a resident Trusted OS which may cause CPU_OFF to
+ * return DENIED (which would be fatal).
+ */
+static void __init psci_init_migrate(void)
+{
+ unsigned long cpuid;
+ int type, cpu = -1;
+
+ type = psci_ops.migrate_info_type();
+
+ if (type == PSCI_0_2_TOS_MP) {
+ pr_info("Trusted OS migration not required\n");
+ return;
+ }
+
+ if (type == PSCI_RET_NOT_SUPPORTED) {
+ pr_info("MIGRATE_INFO_TYPE not supported.\n");
+ return;
+ }
+
+ if (type != PSCI_0_2_TOS_UP_MIGRATE &&
+ type != PSCI_0_2_TOS_UP_NO_MIGRATE) {
+ pr_err("MIGRATE_INFO_TYPE returned unknown type (%d)\n", type);
+ return;
+ }
+
+ cpuid = psci_migrate_info_up_cpu();
+ if (cpuid & ~MPIDR_HWID_BITMASK) {
+ pr_warn("MIGRATE_INFO_UP_CPU reported invalid physical ID (0x%lx)\n",
+ cpuid);
+ return;
+ }
+
+ cpu = get_logical_index(cpuid);
+ resident_cpu = cpu >= 0 ? cpu : -1;
+
+ pr_info("Trusted OS resident on physical CPU 0x%lx\n", cpuid);
+}
+
+static void __init psci_0_2_set_functions(void)
+{
+ pr_info("Using standard PSCI v0.2 function IDs\n");
+ psci_function_id[PSCI_FN_CPU_SUSPEND] =
+ PSCI_FN_NATIVE(0_2, CPU_SUSPEND);
+ psci_ops.cpu_suspend = psci_cpu_suspend;
+
+ psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF;
+ psci_ops.cpu_off = psci_cpu_off;
+
+ psci_function_id[PSCI_FN_CPU_ON] = PSCI_FN_NATIVE(0_2, CPU_ON);
+ psci_ops.cpu_on = psci_cpu_on;
+
+ psci_function_id[PSCI_FN_MIGRATE] = PSCI_FN_NATIVE(0_2, MIGRATE);
+ psci_ops.migrate = psci_migrate;
+
+ psci_ops.affinity_info = psci_affinity_info;
+
+ psci_ops.migrate_info_type = psci_migrate_info_type;
+
+ arm_pm_restart = psci_sys_reset;
+
+ pm_power_off = psci_sys_poweroff;
+}
+
+/*
+ * Probe function for PSCI firmware versions >= 0.2
+ */
+static int __init psci_probe(void)
+{
+ u32 ver = psci_get_version();
+
+ pr_info("PSCIv%d.%d detected in firmware.\n",
+ PSCI_VERSION_MAJOR(ver),
+ PSCI_VERSION_MINOR(ver));
+
+ if (PSCI_VERSION_MAJOR(ver) == 0 && PSCI_VERSION_MINOR(ver) < 2) {
+ pr_err("Conflicting PSCI version detected.\n");
+ return -EINVAL;
+ }
+
+ psci_0_2_set_functions();
+
+ psci_init_migrate();
+
+ if (PSCI_VERSION_MAJOR(ver) >= 1) {
+ psci_init_cpu_suspend();
+ psci_init_system_suspend();
+ }
+
+ return 0;
+}
+
+typedef int (*psci_initcall_t)(const struct device_node *);
+
+/*
+ * PSCI init function for PSCI versions >=0.2
+ *
+ * Probe based on the PSCI_VERSION function
+ */
+static int __init psci_0_2_init(struct device_node *np)
+{
+ int err;
+
+ err = get_set_conduit_method(np);
+
+ if (err)
+ goto out_put_node;
+ /*
+ * Starting with v0.2, the PSCI specification introduced a call
+ * (PSCI_VERSION) that allows probing the firmware version, so
+ * that PSCI function IDs and version-specific initialization
+ * can be carried out according to the specific version reported
+ * by the firmware.
+ */
+ err = psci_probe();
+
+out_put_node:
+ of_node_put(np);
+ return err;
+}
+
+/*
+ * For PSCI < v0.2, PSCI Function IDs are retrieved via DT.
+ */
+static int __init psci_0_1_init(struct device_node *np)
+{
+ u32 id;
+ int err;
+
+ err = get_set_conduit_method(np);
+
+ if (err)
+ goto out_put_node;
+
+ pr_info("Using PSCI v0.1 Function IDs from DT\n");
+
+ if (!of_property_read_u32(np, "cpu_suspend", &id)) {
+ psci_function_id[PSCI_FN_CPU_SUSPEND] = id;
+ psci_ops.cpu_suspend = psci_cpu_suspend;
+ }
+
+ if (!of_property_read_u32(np, "cpu_off", &id)) {
+ psci_function_id[PSCI_FN_CPU_OFF] = id;
+ psci_ops.cpu_off = psci_cpu_off;
+ }
+
+ if (!of_property_read_u32(np, "cpu_on", &id)) {
+ psci_function_id[PSCI_FN_CPU_ON] = id;
+ psci_ops.cpu_on = psci_cpu_on;
+ }
+
+ if (!of_property_read_u32(np, "migrate", &id)) {
+ psci_function_id[PSCI_FN_MIGRATE] = id;
+ psci_ops.migrate = psci_migrate;
+ }
+
+out_put_node:
+ of_node_put(np);
+ return err;
+}
+
+static const struct of_device_id psci_of_match[] __initconst = {
+ { .compatible = "arm,psci", .data = psci_0_1_init},
+ { .compatible = "arm,psci-0.2", .data = psci_0_2_init},
+ { .compatible = "arm,psci-1.0", .data = psci_0_2_init},
+ {},
+};
+
+int __init psci_dt_init(void)
+{
+ struct device_node *np;
+ const struct of_device_id *matched_np;
+ psci_initcall_t init_fn;
+
+ np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);
+
+ if (!np)
+ return -ENODEV;
+
+ init_fn = (psci_initcall_t)matched_np->data;
+ return init_fn(np);
+}
+
+#ifdef CONFIG_ACPI
+/*
+ * PSCI 0.2+ is used when ACPI is deployed on ARM64, as explicitly
+ * clarified in the SBBR.
+ */
+int __init psci_acpi_init(void)
+{
+ if (!acpi_psci_present()) {
+ pr_info("is not implemented in ACPI.\n");
+ return -EOPNOTSUPP;
+ }
+
+ pr_info("probing for conduit method from ACPI.\n");
+
+ if (acpi_psci_use_hvc())
+ invoke_psci_fn = __invoke_psci_fn_hvc;
+ else
+ invoke_psci_fn = __invoke_psci_fn_smc;
+
+ return psci_probe();
+}
+#endif
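
For the rest of the kernel, the firmware interface now reduces to the exported
psci_ops table plus a few predicates such as psci_tos_resident_on(). A minimal
sketch of a caller, assuming psci_dt_init() or psci_acpi_init() has already
selected the hvc/smc conduit:

#include <linux/errno.h>
#include <linux/psci.h>

/* Hypothetical caller: start a secondary CPU through whichever
 * conduit the PSCI firmware driver installed. */
static int example_boot_cpu(unsigned long hwid, unsigned long entry_pa)
{
	if (!psci_ops.cpu_on)
		return -EOPNOTSUPP;

	return psci_ops.cpu_on(hwid, entry_pa);
}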
diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c
index 447dbfa6c793..7ac42d063574 100644
--- a/drivers/gpu/drm/radeon/atombios_dp.c
+++ b/drivers/gpu/drm/radeon/atombios_dp.c
@@ -254,7 +254,7 @@ void radeon_dp_aux_init(struct radeon_connector *radeon_connector)
#define DP_VOLTAGE_MAX DP_TRAIN_VOLTAGE_SWING_LEVEL_3
#define DP_PRE_EMPHASIS_MAX DP_TRAIN_PRE_EMPH_LEVEL_3
-static void dp_get_adjust_train(u8 link_status[DP_LINK_STATUS_SIZE],
+static void dp_get_adjust_train(const u8 link_status[DP_LINK_STATUS_SIZE],
int lane_count,
u8 train_set[4])
{
@@ -301,77 +301,43 @@ static int convert_bpc_to_bpp(int bpc)
return bpc * 3;
}
-/* get the max pix clock supported by the link rate and lane num */
-static int dp_get_max_dp_pix_clock(int link_rate,
- int lane_num,
- int bpp)
-{
- return (link_rate * lane_num * 8) / bpp;
-}
-
/***** radeon specific DP functions *****/
-int radeon_dp_get_max_link_rate(struct drm_connector *connector,
- u8 dpcd[DP_DPCD_SIZE])
-{
- int max_link_rate;
-
- if (radeon_connector_is_dp12_capable(connector))
- max_link_rate = min(drm_dp_max_link_rate(dpcd), 540000);
- else
- max_link_rate = min(drm_dp_max_link_rate(dpcd), 270000);
-
- return max_link_rate;
-}
-
-/* First get the min lane# when low rate is used according to pixel clock
- * (prefer low rate), second check max lane# supported by DP panel,
- * if the max lane# < low rate lane# then use max lane# instead.
- */
-static int radeon_dp_get_dp_lane_number(struct drm_connector *connector,
- u8 dpcd[DP_DPCD_SIZE],
- int pix_clock)
+int radeon_dp_get_dp_link_config(struct drm_connector *connector,
+ const u8 dpcd[DP_DPCD_SIZE],
+ unsigned pix_clock,
+ unsigned *dp_lanes, unsigned *dp_rate)
{
int bpp = convert_bpc_to_bpp(radeon_get_monitor_bpc(connector));
- int max_link_rate = radeon_dp_get_max_link_rate(connector, dpcd);
- int max_lane_num = drm_dp_max_lane_count(dpcd);
- int lane_num;
- int max_dp_pix_clock;
-
- for (lane_num = 1; lane_num < max_lane_num; lane_num <<= 1) {
- max_dp_pix_clock = dp_get_max_dp_pix_clock(max_link_rate, lane_num, bpp);
- if (pix_clock <= max_dp_pix_clock)
- break;
- }
-
- return lane_num;
-}
-
-static int radeon_dp_get_dp_link_clock(struct drm_connector *connector,
- u8 dpcd[DP_DPCD_SIZE],
- int pix_clock)
-{
- int bpp = convert_bpc_to_bpp(radeon_get_monitor_bpc(connector));
- int lane_num, max_pix_clock;
+ static const unsigned link_rates[3] = { 162000, 270000, 540000 };
+ unsigned max_link_rate = drm_dp_max_link_rate(dpcd);
+ unsigned max_lane_num = drm_dp_max_lane_count(dpcd);
+ unsigned lane_num, i, max_pix_clock;
if (radeon_connector_encoder_get_dp_bridge_encoder_id(connector) ==
- ENCODER_OBJECT_ID_NUTMEG)
- return 270000;
-
- lane_num = radeon_dp_get_dp_lane_number(connector, dpcd, pix_clock);
- max_pix_clock = dp_get_max_dp_pix_clock(162000, lane_num, bpp);
- if (pix_clock <= max_pix_clock)
- return 162000;
- max_pix_clock = dp_get_max_dp_pix_clock(270000, lane_num, bpp);
- if (pix_clock <= max_pix_clock)
- return 270000;
- if (radeon_connector_is_dp12_capable(connector)) {
- max_pix_clock = dp_get_max_dp_pix_clock(540000, lane_num, bpp);
- if (pix_clock <= max_pix_clock)
- return 540000;
+ ENCODER_OBJECT_ID_NUTMEG) {
+ for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
+ max_pix_clock = (lane_num * 270000 * 8) / bpp;
+ if (max_pix_clock >= pix_clock) {
+ *dp_lanes = lane_num;
+ *dp_rate = 270000;
+ return 0;
+ }
+ }
+ } else {
+ for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
+ for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
+ max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
+ if (max_pix_clock >= pix_clock) {
+ *dp_lanes = lane_num;
+ *dp_rate = link_rates[i];
+ return 0;
+ }
+ }
+ }
}
- return radeon_dp_get_max_link_rate(connector, dpcd);
+ return -EINVAL;
}
static u8 radeon_dp_encoder_service(struct radeon_device *rdev,
@@ -490,6 +456,7 @@ void radeon_dp_set_link_config(struct drm_connector *connector,
{
struct radeon_connector *radeon_connector = to_radeon_connector(connector);
struct radeon_connector_atom_dig *dig_connector;
+ int ret;
if (!radeon_connector->con_priv)
return;
@@ -497,10 +464,14 @@ void radeon_dp_set_link_config(struct drm_connector *connector,
if ((dig_connector->dp_sink_type == CONNECTOR_OBJECT_ID_DISPLAYPORT) ||
(dig_connector->dp_sink_type == CONNECTOR_OBJECT_ID_eDP)) {
- dig_connector->dp_clock =
- radeon_dp_get_dp_link_clock(connector, dig_connector->dpcd, mode->clock);
- dig_connector->dp_lane_count =
- radeon_dp_get_dp_lane_number(connector, dig_connector->dpcd, mode->clock);
+ ret = radeon_dp_get_dp_link_config(connector, dig_connector->dpcd,
+ mode->clock,
+ &dig_connector->dp_lane_count,
+ &dig_connector->dp_clock);
+ if (ret) {
+ dig_connector->dp_clock = 0;
+ dig_connector->dp_lane_count = 0;
+ }
}
}
@@ -509,7 +480,8 @@ int radeon_dp_mode_valid_helper(struct drm_connector *connector,
{
struct radeon_connector *radeon_connector = to_radeon_connector(connector);
struct radeon_connector_atom_dig *dig_connector;
- int dp_clock;
+ unsigned dp_clock, dp_lanes;
+ int ret;
if ((mode->clock > 340000) &&
(!radeon_connector_is_dp12_capable(connector)))
@@ -519,8 +491,12 @@ int radeon_dp_mode_valid_helper(struct drm_connector *connector,
return MODE_CLOCK_HIGH;
dig_connector = radeon_connector->con_priv;
- dp_clock =
- radeon_dp_get_dp_link_clock(connector, dig_connector->dpcd, mode->clock);
+ ret = radeon_dp_get_dp_link_config(connector, dig_connector->dpcd,
+ mode->clock,
+ &dp_lanes,
+ &dp_clock);
+ if (ret)
+ return MODE_CLOCK_HIGH;
if ((dp_clock == 540000) &&
(!radeon_connector_is_dp12_capable(connector)))
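
The rewritten radeon_dp_get_dp_link_config() walks lane counts in the outer loop and link rates in the inner one, so it settles on the smallest lane count (and then the lowest rate) whose bandwidth covers the requested mode; per configuration the limit is lanes * link_rate * 8 / bpp, in the same kHz units as mode->clock. Worked through at 24 bpp: a 1080p mode (148500 kHz) fits on a single HBR2 lane, since (1 * 540000 * 8) / 24 = 180000 >= 148500, while a sink capped at HBR (270000) falls through to two lanes, (2 * 270000 * 8) / 24 = 180000. A standalone sketch of the same search, with hypothetical names, not the driver function itself:

    #include <linux/kernel.h>

    /* Sketch only: smallest DP configuration whose data bandwidth covers
     * the pixel clock.  Rates are in the driver's units (162000 is a
     * 162 MHz symbol clock, i.e. a 1.62 Gbps line rate); after 8b/10b
     * coding each symbol carries 8 data bits per lane.
     */
    static int pick_dp_config(unsigned int pix_clock, unsigned int bpp,
                              unsigned int max_lanes, unsigned int max_rate,
                              unsigned int *lanes, unsigned int *rate)
    {
            static const unsigned int rates[] = { 162000, 270000, 540000 };
            unsigned int l, i;

            for (l = 1; l <= max_lanes; l <<= 1) {
                    for (i = 0; i < ARRAY_SIZE(rates) && rates[i] <= max_rate; i++) {
                            if ((l * rates[i] * 8) / bpp >= pix_clock) {
                                    *lanes = l;
                                    *rate = rates[i];
                                    return 0;
                            }
                    }
            }
            return -EINVAL; /* the mode needs more bandwidth than the link has */
    }
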
diff --git a/drivers/gpu/drm/radeon/radeon_dp_mst.c b/drivers/gpu/drm/radeon/radeon_dp_mst.c
index c9ff4cf4c4e7..c4b4c0233937 100644
--- a/drivers/gpu/drm/radeon/radeon_dp_mst.c
+++ b/drivers/gpu/drm/radeon/radeon_dp_mst.c
@@ -520,11 +520,17 @@ static bool radeon_mst_mode_fixup(struct drm_encoder *encoder,
drm_mode_set_crtcinfo(adjusted_mode, 0);
{
struct radeon_connector_atom_dig *dig_connector;
+ int ret;
dig_connector = mst_enc->connector->con_priv;
- dig_connector->dp_lane_count = drm_dp_max_lane_count(dig_connector->dpcd);
- dig_connector->dp_clock = radeon_dp_get_max_link_rate(&mst_enc->connector->base,
- dig_connector->dpcd);
+ ret = radeon_dp_get_dp_link_config(&mst_enc->connector->base,
+ dig_connector->dpcd, adjusted_mode->clock,
+ &dig_connector->dp_lane_count,
+ &dig_connector->dp_clock);
+ if (ret) {
+ dig_connector->dp_lane_count = 0;
+ dig_connector->dp_clock = 0;
+ }
DRM_DEBUG_KMS("dig clock %p %d %d\n", dig_connector,
dig_connector->dp_lane_count, dig_connector->dp_clock);
}
diff --git a/drivers/gpu/drm/radeon/radeon_mode.h b/drivers/gpu/drm/radeon/radeon_mode.h
index 9af2d8398e90..43ba333949c7 100644
--- a/drivers/gpu/drm/radeon/radeon_mode.h
+++ b/drivers/gpu/drm/radeon/radeon_mode.h
@@ -752,8 +752,10 @@ extern u8 radeon_dp_getsinktype(struct radeon_connector *radeon_connector);
extern bool radeon_dp_getdpcd(struct radeon_connector *radeon_connector);
extern int radeon_dp_get_panel_mode(struct drm_encoder *encoder,
struct drm_connector *connector);
-int radeon_dp_get_max_link_rate(struct drm_connector *connector,
- u8 *dpcd);
+extern int radeon_dp_get_dp_link_config(struct drm_connector *connector,
+ const u8 *dpcd,
+ unsigned pix_clock,
+ unsigned *dp_lanes, unsigned *dp_rate);
extern void radeon_dp_set_rx_power_state(struct drm_connector *connector,
u8 power_state);
extern void radeon_dp_aux_init(struct radeon_connector *radeon_connector);
diff --git a/drivers/gpu/drm/radeon/radeon_pm.c b/drivers/gpu/drm/radeon/radeon_pm.c
index a56eab7f0ab1..8319eed613b0 100644
--- a/drivers/gpu/drm/radeon/radeon_pm.c
+++ b/drivers/gpu/drm/radeon/radeon_pm.c
@@ -1079,6 +1079,8 @@ force:
/* update display watermarks based on new power state */
radeon_bandwidth_update(rdev);
+ /* update displays */
+ radeon_dpm_display_configuration_changed(rdev);
/* wait for the rings to drain */
for (i = 0; i < RADEON_NUM_RINGS; i++) {
@@ -1095,9 +1097,6 @@ force:
radeon_dpm_post_set_power_state(rdev);
- /* update displays */
- radeon_dpm_display_configuration_changed(rdev);
-
rdev->pm.dpm.current_active_crtcs = rdev->pm.dpm.new_active_crtcs;
rdev->pm.dpm.current_active_crtc_count = rdev->pm.dpm.new_active_crtc_count;
rdev->pm.dpm.single_display = single_display;
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 1750db0ef61c..2669b21bdfee 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -138,7 +138,7 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
to handle */
LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
-u32 amd_iommu_unmap_flush; /* if true, flush on every unmap */
+bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
system */
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index cbfd0f4c4608..d86bc243a57e 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -681,7 +681,7 @@ extern unsigned long *amd_iommu_pd_alloc_bitmap;
* If true, the addresses will be flushed on unmap time, not when
* they are reused
*/
-extern u32 amd_iommu_unmap_flush;
+extern bool amd_iommu_unmap_flush;
/* Smallest max PASID supported by any IOMMU in the system */
extern u32 amd_iommu_max_pasid;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ab43faddb447..9c083b9d02d3 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/hash.h>
#include <linux/random.h>
+#include <linux/backing-dev.h>
#include <trace/events/bcache.h>
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 85cef98423b2..7835da039268 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2159,7 +2159,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
* the query about congestion status of request_queue
*/
if (dm_request_based(md))
- r = md->queue->backing_dev_info.state &
+ r = md->queue->backing_dev_info.wb.state &
bdi_bits;
else
r = dm_table_any_congested(map, bdi_bits);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 6123c2bf9150..4e984993d40a 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -14,6 +14,7 @@
#include <linux/device-mapper.h>
#include <linux/list.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/hdreg.h>
#include <linux/completion.h>
#include <linux/kobject.h>
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 4046a6c6f223..7da6e9c3cb53 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -16,6 +16,7 @@
#define _MD_MD_H
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mm.h>
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index bff6c1c7fecb..1342cd2ccb24 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -745,7 +745,7 @@ static int raid1_congested(struct mddev *mddev, int bits)
struct r1conf *conf = mddev->private;
int i, ret = 0;
- if ((bits & (1 << BDI_async_congested)) &&
+ if ((bits & (1 << WB_async_congested)) &&
conf->pending_count >= max_queued_requests)
return 1;
@@ -760,7 +760,7 @@ static int raid1_congested(struct mddev *mddev, int bits)
/* Note the '|| 1' - when read_balance prefers
* non-congested targets, it can be removed
*/
- if ((bits & (1<<BDI_async_congested)) || 1)
+ if ((bits & (1 << WB_async_congested)) || 1)
ret |= bdi_congested(&q->backing_dev_info, bits);
else
ret &= bdi_congested(&q->backing_dev_info, bits);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index adfc83a0f023..646074d33ceb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -914,7 +914,7 @@ static int raid10_congested(struct mddev *mddev, int bits)
struct r10conf *conf = mddev->private;
int i, ret = 0;
- if ((bits & (1 << BDI_async_congested)) &&
+ if ((bits & (1 << WB_async_congested)) &&
conf->pending_count >= max_queued_requests)
return 1;
diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c
index b5abe34120b8..da9fb01cd441 100644
--- a/drivers/misc/lkdtm.c
+++ b/drivers/misc/lkdtm.c
@@ -103,6 +103,7 @@ enum ctype {
CT_EXEC_USERSPACE,
CT_ACCESS_USERSPACE,
CT_WRITE_RO,
+ CT_WRITE_RO_AFTER_INIT,
CT_WRITE_KERN,
};
@@ -140,6 +141,7 @@ static char* cp_type[] = {
"EXEC_USERSPACE",
"ACCESS_USERSPACE",
"WRITE_RO",
+ "WRITE_RO_AFTER_INIT",
"WRITE_KERN",
};
@@ -162,6 +164,7 @@ static DEFINE_SPINLOCK(lock_me_up);
static u8 data_area[EXEC_SIZE];
static const unsigned long rodata = 0xAA55AA55;
+static unsigned long ro_after_init __ro_after_init = 0x55AA5500;
module_param(recur_count, int, 0644);
MODULE_PARM_DESC(recur_count, " Recursion level for the stack overflow test");
@@ -497,11 +500,28 @@ static void lkdtm_do_action(enum ctype which)
break;
}
case CT_WRITE_RO: {
- unsigned long *ptr;
+ /* Explicitly cast away "const" for the test. */
+ unsigned long *ptr = (unsigned long *)&rodata;
- ptr = (unsigned long *)&rodata;
+ pr_info("attempting bad rodata write at %p\n", ptr);
+ *ptr ^= 0xabcd1234;
- pr_info("attempting bad write at %p\n", ptr);
+ break;
+ }
+ case CT_WRITE_RO_AFTER_INIT: {
+ unsigned long *ptr = &ro_after_init;
+
+ /*
+ * Verify the value was written during init. Since an Oops
+ * is considered a "success", on failure we just skip the
+ * real test.
+ */
+ if ((*ptr & 0xAA) != 0xAA) {
+ pr_info("%p was NOT written during init!?\n", ptr);
+ break;
+ }
+
+ pr_info("attempting bad ro_after_init write at %p\n", ptr);
*ptr ^= 0xabcd1234;
break;
@@ -811,6 +831,9 @@ static int __init lkdtm_module_init(void)
int n_debugfs_entries = 1; /* Assume only the direct entry */
int i;
+ /* Make sure we can write to __ro_after_init values during __init */
+ ro_after_init |= 0xAA;
+
/* Register debugfs interface */
lkdtm_debugfs_root = debugfs_create_dir("provoke-crash", NULL);
if (!lkdtm_debugfs_root) {
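
The new WRITE_RO_AFTER_INIT case exercises the __ro_after_init annotation: such variables stay writable through __init code and are write-protected once init finishes, which is why lkdtm_module_init() above must poke ro_after_init while it still can. A minimal usage sketch, with hypothetical names, assuming __ro_after_init from <linux/cache.h>:

    #include <linux/cache.h>
    #include <linux/init.h>

    /* Written exactly once, during boot; read-only afterwards on
     * architectures that enforce the section protection. */
    static unsigned long feature_mask __ro_after_init;

    static int __init feature_setup(void)
    {
            feature_mask = 0x3;     /* still legal: init has not finished */
            return 0;
    }
    early_initcall(feature_setup);
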
diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index b16f3cda97ff..e2c0057737e6 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -20,6 +20,7 @@
#include <linux/delay.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/list.h>
diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index 8b4d3e6875eb..21924f52863f 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -826,9 +826,8 @@ static struct gs_can *gs_make_candev(unsigned int channel, struct usb_interface
static void gs_destroy_candev(struct gs_can *dev)
{
unregister_candev(dev->netdev);
- free_candev(dev->netdev);
usb_kill_anchored_urbs(&dev->tx_submitted);
- kfree(dev);
+ free_candev(dev->netdev);
}
static int gs_usb_probe(struct usb_interface *intf, const struct usb_device_id *id)
@@ -913,12 +912,15 @@ static int gs_usb_probe(struct usb_interface *intf, const struct usb_device_id *
for (i = 0; i < icount; i++) {
dev->canch[i] = gs_make_candev(i, intf);
if (IS_ERR_OR_NULL(dev->canch[i])) {
+ /* save error code to return later */
+ rc = PTR_ERR(dev->canch[i]);
+
/* on failure destroy previously created candevs */
icount = i;
- for (i = 0; i < icount; i++) {
+ for (i = 0; i < icount; i++)
gs_destroy_candev(dev->canch[i]);
- dev->canch[i] = NULL;
- }
+
+ usb_kill_anchored_urbs(&dev->rx_submitted);
kfree(dev);
return rc;
}
@@ -939,16 +941,12 @@ static void gs_usb_disconnect(struct usb_interface *intf)
return;
}
- for (i = 0; i < GS_MAX_INTF; i++) {
- struct gs_can *can = dev->canch[i];
-
- if (!can)
- continue;
-
- gs_destroy_candev(can);
- }
+ for (i = 0; i < GS_MAX_INTF; i++)
+ if (dev->canch[i])
+ gs_destroy_candev(dev->canch[i]);
usb_kill_anchored_urbs(&dev->rx_submitted);
+ kfree(dev);
}
static const struct usb_device_id gs_usb_table[] = {
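
The gs_usb changes encode a general teardown rule for USB network drivers: stop the upper layer first, cancel any in-flight URBs that may still reference the private data, and free memory only after both; the old order freed the candev while anchored tx URBs could still complete into it. A condensed sketch of the safe ordering, with hypothetical example_* names:

    #include <linux/can/dev.h>
    #include <linux/usb.h>

    struct example_can {                    /* hypothetical per-channel state */
            struct net_device *netdev;
            struct usb_anchor tx_submitted;
    };

    static void example_destroy(struct example_can *dev)
    {
            unregister_candev(dev->netdev);              /* no new tx after this */
            usb_kill_anchored_urbs(&dev->tx_submitted);  /* wait out in-flight URBs */
            free_candev(dev->netdev);                    /* nothing references it now */
    }
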
diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h
index f65310c3ba5f..9134e79fa8db 100644
--- a/drivers/net/wireless/ath/ath10k/core.h
+++ b/drivers/net/wireless/ath/ath10k/core.h
@@ -598,7 +598,7 @@ struct ath10k {
bool monitor_started;
unsigned int filter_flags;
unsigned long dev_flags;
- u32 dfs_block_radar_events;
+ bool dfs_block_radar_events;
/* protected by conf_mutex */
bool radar_enabled;
diff --git a/drivers/net/wireless/ath/ath5k/ath5k.h b/drivers/net/wireless/ath/ath5k/ath5k.h
index 7ca0d6f930fd..cce76b2dabce 100644
--- a/drivers/net/wireless/ath/ath5k/ath5k.h
+++ b/drivers/net/wireless/ath/ath5k/ath5k.h
@@ -1366,7 +1366,7 @@ struct ath5k_hw {
u8 ah_retry_long;
u8 ah_retry_short;
- u32 ah_use_32khz_clock;
+ bool ah_use_32khz_clock;
u8 ah_coverage_class;
bool ah_ack_bitrate_high;
diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c
index 5e15e8e10ed3..1ad05024d8a3 100644
--- a/drivers/net/wireless/ath/ath9k/hw.c
+++ b/drivers/net/wireless/ath/ath9k/hw.c
@@ -384,7 +384,7 @@ static void ath9k_hw_init_config(struct ath_hw *ah)
ah->config.dma_beacon_response_time = 1;
ah->config.sw_beacon_response_time = 6;
- ah->config.cwm_ignore_extcca = 0;
+ ah->config.cwm_ignore_extcca = false;
ah->config.analog_shiftreg = 1;
ah->config.rx_intr_mitigation = true;
diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h
index c1d2d0340feb..6368c664483a 100644
--- a/drivers/net/wireless/ath/ath9k/hw.h
+++ b/drivers/net/wireless/ath/ath9k/hw.h
@@ -332,14 +332,14 @@ enum ath9k_hw_hang_checks {
struct ath9k_ops_config {
int dma_beacon_response_time;
int sw_beacon_response_time;
- u32 cwm_ignore_extcca;
+ bool cwm_ignore_extcca;
u32 pcie_waen;
u8 analog_shiftreg;
u32 ofdm_trig_low;
u32 ofdm_trig_high;
u32 cck_trig_high;
u32 cck_trig_low;
- u32 enable_paprd;
+ bool enable_paprd;
int serialize_regmode;
bool rx_intr_mitigation;
bool tx_intr_mitigation;
diff --git a/drivers/net/wireless/b43/debugfs.c b/drivers/net/wireless/b43/debugfs.c
index e807bd930647..b4bcd94aff6c 100644
--- a/drivers/net/wireless/b43/debugfs.c
+++ b/drivers/net/wireless/b43/debugfs.c
@@ -676,15 +676,15 @@ static void b43_add_dynamic_debug(struct b43_wldev *dev)
e->dyn_debug_dentries[id] = d; \
} while (0)
- add_dyn_dbg("debug_xmitpower", B43_DBG_XMITPOWER, 0);
- add_dyn_dbg("debug_dmaoverflow", B43_DBG_DMAOVERFLOW, 0);
- add_dyn_dbg("debug_dmaverbose", B43_DBG_DMAVERBOSE, 0);
- add_dyn_dbg("debug_pwork_fast", B43_DBG_PWORK_FAST, 0);
- add_dyn_dbg("debug_pwork_stop", B43_DBG_PWORK_STOP, 0);
- add_dyn_dbg("debug_lo", B43_DBG_LO, 0);
- add_dyn_dbg("debug_firmware", B43_DBG_FIRMWARE, 0);
- add_dyn_dbg("debug_keys", B43_DBG_KEYS, 0);
- add_dyn_dbg("debug_verbose_stats", B43_DBG_VERBOSESTATS, 0);
+ add_dyn_dbg("debug_xmitpower", B43_DBG_XMITPOWER, false);
+ add_dyn_dbg("debug_dmaoverflow", B43_DBG_DMAOVERFLOW, false);
+ add_dyn_dbg("debug_dmaverbose", B43_DBG_DMAVERBOSE, false);
+ add_dyn_dbg("debug_pwork_fast", B43_DBG_PWORK_FAST, false);
+ add_dyn_dbg("debug_pwork_stop", B43_DBG_PWORK_STOP, false);
+ add_dyn_dbg("debug_lo", B43_DBG_LO, false);
+ add_dyn_dbg("debug_firmware", B43_DBG_FIRMWARE, false);
+ add_dyn_dbg("debug_keys", B43_DBG_KEYS, false);
+ add_dyn_dbg("debug_verbose_stats", B43_DBG_VERBOSESTATS, false);
#undef add_dyn_dbg
}
diff --git a/drivers/net/wireless/b43/debugfs.h b/drivers/net/wireless/b43/debugfs.h
index 50517b801cb4..d05377745011 100644
--- a/drivers/net/wireless/b43/debugfs.h
+++ b/drivers/net/wireless/b43/debugfs.h
@@ -68,7 +68,7 @@ struct b43_dfsentry {
u32 shm32read_addr_next;
/* Enabled/Disabled list for the dynamic debugging features. */
- u32 dyn_debug[__B43_NR_DYNDBG];
+ bool dyn_debug[__B43_NR_DYNDBG];
/* Dentries for the dynamic debugging entries. */
struct dentry *dyn_debug_dentries[__B43_NR_DYNDBG];
};
diff --git a/drivers/net/wireless/b43legacy/debugfs.c b/drivers/net/wireless/b43legacy/debugfs.c
index 1965edb765a2..090910ea259e 100644
--- a/drivers/net/wireless/b43legacy/debugfs.c
+++ b/drivers/net/wireless/b43legacy/debugfs.c
@@ -369,11 +369,11 @@ static void b43legacy_add_dynamic_debug(struct b43legacy_wldev *dev)
e->dyn_debug_dentries[id] = d; \
} while (0)
- add_dyn_dbg("debug_xmitpower", B43legacy_DBG_XMITPOWER, 0);
- add_dyn_dbg("debug_dmaoverflow", B43legacy_DBG_DMAOVERFLOW, 0);
- add_dyn_dbg("debug_dmaverbose", B43legacy_DBG_DMAVERBOSE, 0);
- add_dyn_dbg("debug_pwork_fast", B43legacy_DBG_PWORK_FAST, 0);
- add_dyn_dbg("debug_pwork_stop", B43legacy_DBG_PWORK_STOP, 0);
+ add_dyn_dbg("debug_xmitpower", B43legacy_DBG_XMITPOWER, false);
+ add_dyn_dbg("debug_dmaoverflow", B43legacy_DBG_DMAOVERFLOW, false);
+ add_dyn_dbg("debug_dmaverbose", B43legacy_DBG_DMAVERBOSE, false);
+ add_dyn_dbg("debug_pwork_fast", B43legacy_DBG_PWORK_FAST, false);
+ add_dyn_dbg("debug_pwork_stop", B43legacy_DBG_PWORK_STOP, false);
#undef add_dyn_dbg
}
diff --git a/drivers/net/wireless/b43legacy/debugfs.h b/drivers/net/wireless/b43legacy/debugfs.h
index ae3b0d0fa849..9ee32158b947 100644
--- a/drivers/net/wireless/b43legacy/debugfs.h
+++ b/drivers/net/wireless/b43legacy/debugfs.h
@@ -47,7 +47,7 @@ struct b43legacy_dfsentry {
struct b43legacy_txstatus_log txstatlog;
/* Enabled/Disabled list for the dynamic debugging features. */
- u32 dyn_debug[__B43legacy_NR_DYNDBG];
+ bool dyn_debug[__B43legacy_NR_DYNDBG];
/* Dentries for the dynamic debugging entries. */
struct dentry *dyn_debug_dentries[__B43legacy_NR_DYNDBG];
};
diff --git a/drivers/net/wireless/iwlegacy/common.h b/drivers/net/wireless/iwlegacy/common.h
index 5b972798bdff..ce52cf114fde 100644
--- a/drivers/net/wireless/iwlegacy/common.h
+++ b/drivers/net/wireless/iwlegacy/common.h
@@ -1425,9 +1425,9 @@ struct il_priv {
#endif /* CONFIG_IWLEGACY_DEBUGFS */
struct work_struct txpower_work;
- u32 disable_sens_cal;
- u32 disable_chain_noise_cal;
- u32 disable_tx_power_cal;
+ bool disable_sens_cal;
+ bool disable_chain_noise_cal;
+ bool disable_tx_power_cal;
struct work_struct run_time_calib_work;
struct timer_list stats_periodic;
struct timer_list watchdog;
diff --git a/drivers/net/wireless/iwlwifi/mvm/mvm.h b/drivers/net/wireless/iwlwifi/mvm/mvm.h
index 83273adfabdd..34769e3f5f52 100644
--- a/drivers/net/wireless/iwlwifi/mvm/mvm.h
+++ b/drivers/net/wireless/iwlwifi/mvm/mvm.h
@@ -664,7 +664,7 @@ struct iwl_mvm {
const struct iwl_fw_bcast_filter *bcast_filters;
#ifdef CONFIG_IWLWIFI_DEBUGFS
struct {
- u32 override; /* u32 for debugfs_create_bool */
+ bool override;
struct iwl_bcast_filter_cmd cmd;
} dbgfs_bcast_filtering;
#endif
@@ -688,7 +688,7 @@ struct iwl_mvm {
bool disable_power_off;
bool disable_power_off_d3;
- u32 scan_iter_notif_enabled; /* must be u32 for debugfs_create_bool */
+ bool scan_iter_notif_enabled;
struct debugfs_blob_wrapper nvm_hw_blob;
struct debugfs_blob_wrapper nvm_sw_blob;
@@ -742,7 +742,7 @@ struct iwl_mvm {
int n_nd_channels;
bool net_detect;
#ifdef CONFIG_IWLWIFI_DEBUGFS
- u32 d3_wake_sysassert; /* must be u32 for debugfs_create_bool */
+ bool d3_wake_sysassert;
bool d3_test_active;
bool store_d3_resume_sram;
void *d3_resume_sram;
diff --git a/drivers/net/wireless/iwlwifi/mvm/tx.c b/drivers/net/wireless/iwlwifi/mvm/tx.c
index 281451c274ca..771097f2162d 100644
--- a/drivers/net/wireless/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/iwlwifi/mvm/tx.c
@@ -370,6 +370,15 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb)
return -1;
}
+ /*
+ * Increase the pending frames counter, so that later, when a reply
+ * comes in and the counter is decreased, we don't end up with
+ * negative values.
+ * Note that we don't need to make sure it isn't agg'd, since we're
+ * TXing non-sta
+ */
+ atomic_inc(&mvm->pending_frames[sta_id]);
+
return 0;
}
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
index d72605864b0a..14562788e4e0 100644
--- a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
@@ -55,9 +55,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
if (PagePrivate(page))
page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
- if (TestClearPageDirty(page))
- account_page_cleaned(page, mapping);
-
+ cancel_dirty_page(page);
ClearPageMappedToDisk(page);
ll_delete_from_page_cache(page);
}
diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c
index ad48837ead42..eed7c5a31b15 100644
--- a/drivers/target/target_core_tmr.c
+++ b/drivers/target/target_core_tmr.c
@@ -181,7 +181,6 @@ void core_tmr_abort_task(
if (!__target_check_io_state(se_cmd, se_sess, 0)) {
spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags);
- target_put_sess_cmd(se_cmd);
goto out;
}
diff --git a/drivers/uwb/uwb-debug.c b/drivers/uwb/uwb-debug.c
index 0b1e5a9449b5..991374b13571 100644
--- a/drivers/uwb/uwb-debug.c
+++ b/drivers/uwb/uwb-debug.c
@@ -55,7 +55,7 @@
struct uwb_dbg {
struct uwb_pal pal;
- u32 accept;
+ bool accept;
struct list_head rsvs;
struct dentry *root_d;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ccfd31f1df3a..b1aa9877acba 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -14,6 +14,7 @@
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
@@ -546,7 +547,8 @@ static struct file_system_type bd_type = {
.kill_sb = kill_anon_super,
};
-static struct super_block *blockdev_superblock __read_mostly;
+struct super_block *blockdev_superblock __read_mostly;
+EXPORT_SYMBOL_GPL(blockdev_superblock);
void __init bdev_cache_init(void)
{
@@ -687,11 +689,6 @@ static struct block_device *bd_acquire(struct inode *inode)
return bdev;
}
-int sb_is_blkdev_sb(struct super_block *sb)
-{
- return sb == blockdev_superblock;
-}
-
/* Call when you free inode */
void bd_forget(struct inode *inode)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 99e8f60c7962..ff10d6e093f5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3272,11 +3272,8 @@ static int write_dev_supers(struct btrfs_device *device,
*/
static void btrfs_end_empty_barrier(struct bio *bio, int err)
{
- if (err) {
- if (err == -EOPNOTSUPP)
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+ if (err)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
- }
if (bio->bi_private)
complete(bio->bi_private);
bio_put(bio);
@@ -3304,11 +3301,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
wait_for_completion(&device->flush_wait);
- if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
- printk_in_rcu("BTRFS: disabling barriers on dev %s\n",
- rcu_str_deref(device->name));
- device->nobarriers = 1;
- } else if (!bio_flagged(bio, BIO_UPTODATE)) {
+ if (!bio_flagged(bio, BIO_UPTODATE)) {
ret = -EIO;
btrfs_dev_stat_inc_and_print(device,
BTRFS_DEV_STAT_FLUSH_ERRS);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 885f533a34d9..32e0df2d0bd6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2767,8 +2767,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
else
btrfsic_submit_bio(rw, bio);
- if (bio_flagged(bio, BIO_EOPNOTSUPP))
- ret = -EOPNOTSUPP;
bio_put(bio);
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 2907544c3a1d..8ba5bf01d341 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -30,6 +30,7 @@
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/export.h>
+#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
@@ -44,6 +45,9 @@
#include <trace/events/block.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+ unsigned long bio_flags,
+ struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -615,21 +619,22 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
*
* If warn is true, then emit a warning if the page is not uptodate and has
* not been truncated.
+ *
+ * The caller must hold the mem_cgroup_begin_page_stat() lock.
*/
-static void __set_page_dirty(struct page *page,
- struct address_space *mapping, int warn)
+static void __set_page_dirty(struct page *page, struct address_space *mapping,
+ struct mem_cgroup *memcg, int warn)
{
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
- account_page_dirtied(page, mapping);
+ account_page_dirtied(page, mapping, memcg);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
/*
@@ -660,6 +665,7 @@ static void __set_page_dirty(struct page *page,
int __set_page_dirty_buffers(struct page *page)
{
int newly_dirty;
+ struct mem_cgroup *memcg;
struct address_space *mapping = page_mapping(page);
if (unlikely(!mapping))
@@ -675,11 +681,22 @@ int __set_page_dirty_buffers(struct page *page)
bh = bh->b_this_page;
} while (bh != head);
}
+ /*
+ * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized with
+ * per-memcg dirty page counters.
+ */
+ memcg = mem_cgroup_begin_page_stat(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
if (newly_dirty)
- __set_page_dirty(page, mapping, 1);
+ __set_page_dirty(page, mapping, memcg, 1);
+
+ mem_cgroup_end_page_stat(memcg);
+
+ if (newly_dirty)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
return newly_dirty;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);
@@ -1150,11 +1167,18 @@ void mark_buffer_dirty(struct buffer_head *bh)
if (!test_set_buffer_dirty(bh)) {
struct page *page = bh->b_page;
+ struct address_space *mapping = NULL;
+ struct mem_cgroup *memcg;
+
+ memcg = mem_cgroup_begin_page_stat(page);
if (!TestSetPageDirty(page)) {
- struct address_space *mapping = page_mapping(page);
+ mapping = page_mapping(page);
if (mapping)
- __set_page_dirty(page, mapping, 0);
+ __set_page_dirty(page, mapping, memcg, 0);
}
+ mem_cgroup_end_page_stat(memcg);
+ if (mapping)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
}
EXPORT_SYMBOL(mark_buffer_dirty);
@@ -1676,8 +1700,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
struct buffer_head *bh, *head;
unsigned int blocksize, bbits;
int nr_underway = 0;
- int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
- WRITE_SYNC : WRITE);
+ int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
head = create_page_buffers(page, inode,
(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1766,7 +1789,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh(write_op, bh);
+ submit_bh_wbc(write_op, bh, 0, wbc);
nr_underway++;
}
bh = next;
@@ -1820,7 +1843,7 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh(write_op, bh);
+ submit_bh_wbc(write_op, bh, 0, wbc);
nr_underway++;
}
bh = next;
@@ -2930,10 +2953,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
{
struct buffer_head *bh = bio->bi_private;
- if (err == -EOPNOTSUPP) {
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
- }
-
if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
set_bit(BH_Quiet, &bh->b_state);
@@ -2989,7 +3008,8 @@ void guard_bio_eod(int rw, struct bio *bio)
}
}
-int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+ unsigned long bio_flags, struct writeback_control *wbc)
{
struct bio *bio;
int ret = 0;
@@ -3012,6 +3032,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
*/
bio = bio_alloc(GFP_NOIO, 1);
+ if (wbc) {
+ wbc_init_bio(wbc, bio);
+ wbc_account_io(wbc, bh->b_page, bh->b_size);
+ }
+
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_io_vec[0].bv_page = bh->b_page;
@@ -3033,20 +3058,19 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
if (buffer_prio(bh))
rw |= REQ_PRIO;
- bio_get(bio);
submit_bio(rw, bio);
-
- if (bio_flagged(bio, BIO_EOPNOTSUPP))
- ret = -EOPNOTSUPP;
-
- bio_put(bio);
return ret;
}
+
+int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+{
+ return submit_bh_wbc(rw, bh, bio_flags, NULL);
+}
EXPORT_SYMBOL_GPL(_submit_bh);
int submit_bh(int rw, struct buffer_head *bh)
{
- return _submit_bh(rw, bh, 0);
+ return submit_bh_wbc(rw, bh, 0, NULL);
}
EXPORT_SYMBOL(submit_bh);
@@ -3235,8 +3259,8 @@ int try_to_free_buffers(struct page *page)
* to synchronise against __set_page_dirty_buffers and prevent the
* dirty bit from being lost.
*/
- if (ret && TestClearPageDirty(page))
- account_page_cleaned(page, mapping);
+ if (ret)
+ cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
out:
if (buffers_to_free) {
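
The buffer.c rework above follows a common refactoring shape: the whole submission path moves into one static core, submit_bh_wbc(), that takes an optional struct writeback_control, and the exported entry points _submit_bh() and submit_bh() become thin wrappers that pass NULL, so only writeback-initiated I/O pays for cgroup attribution. A minimal sketch of that shape, assuming hypothetical example_* names around the real wbc_init_bio()/wbc_account_io() helpers:

    #include <linux/bio.h>
    #include <linux/buffer_head.h>
    #include <linux/writeback.h>

    static int example_submit(int rw, struct buffer_head *bh,
                              struct writeback_control *wbc)
    {
            struct bio *bio = bio_alloc(GFP_NOIO, 1);

            if (wbc) {
                    wbc_init_bio(wbc, bio); /* tag bio with the dirtying cgroup */
                    wbc_account_io(wbc, bh->b_page, bh->b_size);
            }
            bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
            bio->bi_bdev = bh->b_bdev;
            /* ... fill in the page vector and completion handler ... */
            submit_bio(rw, bio);
            return 0;
    }

    /* Legacy entry point: same path, no writeback attribution. */
    static int example_submit_plain(int rw, struct buffer_head *bh)
    {
            return example_submit(rw, bh, NULL);
    }
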
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 830a7e76f5c6..39916043da10 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -54,6 +54,22 @@ const struct inode_operations debugfs_link_operations = {
.follow_link = debugfs_follow_link,
};
+static struct dentry *debugfs_create_mode(const char *name, umode_t mode,
+ struct dentry *parent, void *value,
+ const struct file_operations *fops,
+ const struct file_operations *fops_ro,
+ const struct file_operations *fops_wo)
+{
+ /* if there are no write bits set, make read only */
+ if (!(mode & S_IWUGO))
+ return debugfs_create_file(name, mode, parent, value, fops_ro);
+ /* if there are no read bits set, make write only */
+ if (!(mode & S_IRUGO))
+ return debugfs_create_file(name, mode, parent, value, fops_wo);
+
+ return debugfs_create_file(name, mode, parent, value, fops);
+}
+
static int debugfs_u8_set(void *data, u64 val)
{
*(u8 *)data = val;
@@ -95,14 +111,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
struct dentry *debugfs_create_u8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u8_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u8_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u8);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u8,
+ &fops_u8_ro, &fops_u8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u8);
@@ -147,14 +157,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
struct dentry *debugfs_create_u16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u16_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u16_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u16);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u16,
+ &fops_u16_ro, &fops_u16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u16);
@@ -199,14 +203,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
struct dentry *debugfs_create_u32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u32_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u32_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u32);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u32,
+ &fops_u32_ro, &fops_u32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u32);
@@ -252,17 +250,59 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
struct dentry *debugfs_create_u64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u64_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u64_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u64);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u64,
+ &fops_u64_ro, &fops_u64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u64);
+static int debugfs_ulong_set(void *data, u64 val)
+{
+ *(unsigned long *)data = val;
+ return 0;
+}
+
+static int debugfs_ulong_get(void *data, u64 *val)
+{
+ *val = *(unsigned long *)data;
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
+
+/**
+ * debugfs_create_ulong - create a debugfs file that is used to read and write
+ * an unsigned long value.
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read from and
+ * write to.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value. If the @mode variable is so
+ * set, it can be read from and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned. It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead so as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_ulong(const char *name, umode_t mode,
+ struct dentry *parent, unsigned long *value)
+{
+ return debugfs_create_mode(name, mode, parent, value, &fops_ulong,
+ &fops_ulong_ro, &fops_ulong_wo);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_ulong);
+
DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
@@ -276,6 +316,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");
/*
* debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
@@ -298,14 +340,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n
struct dentry *debugfs_create_x8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x8_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x8_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x8);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x8,
+ &fops_x8_ro, &fops_x8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x8);
@@ -322,14 +358,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
struct dentry *debugfs_create_x16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x16_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x16_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x16);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x16,
+ &fops_x16_ro, &fops_x16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x16);
@@ -346,14 +376,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
struct dentry *debugfs_create_x32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x32_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x32_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x32);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x32,
+ &fops_x32_ro, &fops_x32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x32);
@@ -370,7 +394,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32);
struct dentry *debugfs_create_x64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_x64);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x64,
+ &fops_x64_ro, &fops_x64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x64);
@@ -387,6 +412,8 @@ static int debugfs_size_t_get(void *data, u64 *val)
}
DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
"%llu\n"); /* %llu and %zu are more or less the same */
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");
/**
* debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value
@@ -401,7 +428,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
struct dentry *parent, size_t *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_size_t);
+ return debugfs_create_mode(name, mode, parent, value, &fops_size_t,
+ &fops_size_t_ro, &fops_size_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_size_t);
@@ -434,24 +462,16 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
struct dentry *parent, atomic_t *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value,
- &fops_atomic_t_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value,
- &fops_atomic_t_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
+ return debugfs_create_mode(name, mode, parent, value, &fops_atomic_t,
+ &fops_atomic_t_ro, &fops_atomic_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
-static ssize_t read_file_bool(struct file *file, char __user *user_buf,
- size_t count, loff_t *ppos)
+ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
{
char buf[3];
- u32 *val = file->private_data;
+ bool *val = file->private_data;
if (*val)
buf[0] = 'Y';
@@ -461,14 +481,15 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf,
buf[2] = 0x00;
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
}
+EXPORT_SYMBOL_GPL(debugfs_read_file_bool);
-static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
- size_t count, loff_t *ppos)
+ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
{
char buf[32];
size_t buf_size;
bool bv;
- u32 *val = file->private_data;
+ bool *val = file->private_data;
buf_size = min(count, (sizeof(buf)-1));
if (copy_from_user(buf, user_buf, buf_size))
@@ -480,10 +501,23 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
return count;
}
+EXPORT_SYMBOL_GPL(debugfs_write_file_bool);
static const struct file_operations fops_bool = {
- .read = read_file_bool,
- .write = write_file_bool,
+ .read = debugfs_read_file_bool,
+ .write = debugfs_write_file_bool,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations fops_bool_ro = {
+ .read = debugfs_read_file_bool,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations fops_bool_wo = {
+ .write = debugfs_write_file_bool,
.open = simple_open,
.llseek = default_llseek,
};
@@ -513,9 +547,10 @@ static const struct file_operations fops_bool = {
* code.
*/
struct dentry *debugfs_create_bool(const char *name, umode_t mode,
- struct dentry *parent, u32 *value)
+ struct dentry *parent, bool *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_bool);
+ return debugfs_create_mode(name, mode, parent, value, &fops_bool,
+ &fops_bool_ro, &fops_bool_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_bool);
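
With the bool conversion, debugfs_create_bool() takes a bool * instead of the old u32 * stand-in, and the new debugfs_create_mode() helper selects the read-only or write-only fops from the mode bits automatically. A typical caller under the new signatures, with illustrative names, might look like:

    #include <linux/debugfs.h>
    #include <linux/init.h>

    static bool example_verbose;
    static unsigned long example_hits;

    static int __init example_debugfs_init(void)
    {
            struct dentry *dir = debugfs_create_dir("example", NULL);

            if (!dir)
                    return -ENOMEM;
            /* 0644 keeps both fops; 0444 would select the _ro variant. */
            debugfs_create_bool("verbose", 0644, dir, &example_verbose);
            debugfs_create_ulong("hits", 0444, dir, &example_hits);
            return 0;
    }
    late_initcall(example_debugfs_init);
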
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d0e746e96511..900e19cf9ef6 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -882,6 +882,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
MS_POSIXACL : 0);
+ sb->s_iflags |= SB_I_CGROUPWB;
if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
(EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9a83f149ac85..8b033f24a6b0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -191,7 +191,7 @@ typedef struct ext4_io_end {
} ext4_io_end_t;
struct ext4_io_submit {
- int io_op;
+ struct writeback_control *io_wbc;
struct bio *io_bio;
ext4_io_end_t *io_end;
sector_t io_next_block;
@@ -873,6 +873,15 @@ struct ext4_inode_info {
* by other means, so we have i_data_sem.
*/
struct rw_semaphore i_data_sem;
+ /*
+ * i_mmap_sem is for serializing page faults with truncate / punch hole
+ * operations. We have to make sure that a new page cannot be faulted
+ * into a section of the inode that is being punched. We cannot easily use
+ * i_data_sem for this since we need protection for the whole punch
+ * operation and i_data_sem ranks below transaction start so we have
+ * to occasionally drop it.
+ */
+ struct rw_semaphore i_mmap_sem;
struct inode vfs_inode;
struct jbd2_inode *jinode;
@@ -2287,6 +2296,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
@@ -2632,6 +2642,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
return changed;
}
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+ loff_t len);
+
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 87ba10d1d3bc..677955986ad7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -39,6 +39,7 @@
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
+#include <linux/backing-dev.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"
@@ -4741,7 +4742,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
int partial_begin, partial_end;
loff_t start, end;
ext4_lblk_t lblk;
- struct address_space *mapping = inode->i_mapping;
unsigned int blkbits = inode->i_blkbits;
trace_ext4_zero_range(inode, offset, len, mode);
@@ -4757,17 +4757,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
/*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- ret = filemap_write_and_wait_range(mapping, offset,
- offset + len - 1);
- if (ret)
- return ret;
- }
-
- /*
	 * Round up offset. This is not fallocate, we need to zero out
* blocks, so convert interior block aligned part of the range to
* unwritten and possibly manually zero out unaligned parts of the
@@ -4810,6 +4799,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
/* Preallocate the range including the unaligned edges */
if (partial_begin || partial_end) {
ret = ext4_alloc_file_blocks(file,
@@ -4818,7 +4811,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
round_down(offset, 1 << blkbits)) >> blkbits,
new_size, flags, mode);
if (ret)
- goto out_mutex;
+ goto out_dio;
}
@@ -4827,16 +4820,23 @@ static long ext4_zero_range(struct file *file, loff_t offset,
flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
EXT4_EX_NOCACHE);
- /* Now release the pages and zero block aligned part of pages*/
+ /*
+ * Prevent page faults from reinstantiating pages we have
+ * released from page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ ret = ext4_update_disksize_before_punch(inode, offset, len);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_mmap_sem);
+ goto out_dio;
+ }
+ /* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
-
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
if (ret)
goto out_dio;
}
@@ -4964,8 +4964,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto out;
}
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
+ ext4_inode_resume_unlocked_dio(inode);
if (ret)
goto out;
@@ -5424,21 +5429,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
- /*
- * Need to round down offset to be aligned with page size boundary
- * for page size > block size.
- */
- ioffset = round_down(offset, PAGE_SIZE);
-
- /* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- LLONG_MAX);
- if (ret)
- return ret;
-
- /* Take mutex lock */
mutex_lock(&inode->i_mutex);
-
/*
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
@@ -5454,17 +5445,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
goto out_mutex;
}
- truncate_pagecache(inode, ioffset);
-
/* Wait for existing dio to complete */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
+ /*
+ * Prevent page faults from reinstantiating pages we have released from
+ * page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ /*
+ * Need to round down offset to be aligned with page size boundary
+ * for page size > block size.
+ */
+ ioffset = round_down(offset, PAGE_SIZE);
+ /*
+ * Write tail of the last page before removed range since it will get
+ * removed from the page cache below.
+ */
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
+ if (ret)
+ goto out_mmap;
+ /*
+ * Write the data that will be shifted to preserve it when discarding
+ * page cache below. We are also protected from pages becoming dirty
+ * by i_mmap_sem.
+ */
+ ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
+ LLONG_MAX);
+ if (ret)
+ goto out_mmap;
+ truncate_pagecache(inode, ioffset);
+
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_dio;
+ goto out_mmap;
}
down_write(&EXT4_I(inode)->i_data_sem);
@@ -5503,7 +5520,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
out_stop:
ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
mutex_unlock(&inode->i_mutex);
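
The collapse-range hunks above settle the lock ordering that the rest of this ext4 series repeats for every operation that drops page-cache pages: i_mutex first, then blocking and draining direct I/O, then the new i_mmap_sem held for write across the flush and truncate, with the journal handle started innermost. Condensed from the hunks above, error handling elided:

    mutex_lock(&inode->i_mutex);                /* serialize the whole operation */
    ext4_inode_block_unlocked_dio(inode);
    inode_dio_wait(inode);                      /* drain existing direct I/O */
    down_write(&EXT4_I(inode)->i_mmap_sem);     /* keep page faults out */
    /* flush the affected ranges, truncate_pagecache(), then start the
     * journal handle and shift extents under i_data_sem */
    up_write(&EXT4_I(inode)->i_mmap_sem);
    ext4_inode_resume_unlocked_dio(inode);
    mutex_unlock(&inode->i_mutex);
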
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0613c256c344..dd65fac5ff2f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -213,7 +213,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
#endif
static const struct vm_operations_struct ext4_file_vm_ops = {
- .fault = filemap_fault,
+ .fault = ext4_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2b3a53a51582..3291e1af0e24 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3524,6 +3524,35 @@ int ext4_can_truncate(struct inode *inode)
}
/*
+ * We have to make sure i_disksize gets properly updated before we truncate
+ * page cache due to hole punching or zero range. Otherwise the i_disksize
+ * update can get lost, as it may have been postponed to the submission of
+ * writeback, but that submission will never happen once we truncate the
+ * page cache.
+ */
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+ loff_t len)
+{
+ handle_t *handle;
+ loff_t size = i_size_read(inode);
+
+ WARN_ON(!mutex_is_locked(&inode->i_mutex));
+ if (offset > size || offset + len < size)
+ return 0;
+
+ if (EXT4_I(inode)->i_disksize >= size)
+ return 0;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ext4_update_i_disksize(inode, size);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
+ return 0;
+}
+
+/*
* ext4_punch_hole: punches a hole in a file by releaseing the blocks
* associated with the given offset and length
*
@@ -3588,17 +3617,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
}
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ /*
+ * Prevent page faults from reinstantiating pages we have released from
+ * page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
first_block_offset = round_up(offset, sb->s_blocksize);
last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
/* Now release the pages and zero block aligned part of pages*/
- if (last_block_offset > first_block_offset)
+ if (last_block_offset > first_block_offset) {
+ ret = ext4_update_disksize_before_punch(inode, offset, length);
+ if (ret)
+ goto out_dio;
truncate_pagecache_range(inode, first_block_offset,
last_block_offset);
-
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
+ }
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
@@ -3645,16 +3683,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- /* Now release the pages again to reduce race window */
- if (last_block_offset > first_block_offset)
- truncate_pagecache_range(inode, first_block_offset,
- last_block_offset);
-
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
out_stop:
ext4_journal_stop(handle);
out_dio:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
mutex_unlock(&inode->i_mutex);
@@ -4775,11 +4809,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
} else
ext4_wait_for_tail_page_commit(inode);
}
+ down_write(&EXT4_I(inode)->i_mmap_sem);
/*
* Truncate pagecache after we've waited for commit
* in data=journal mode to make pages freeable.
*/
truncate_pagecache(inode, inode->i_size);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
}
/*
* We want to call ext4_truncate() even if attr->ia_size ==
@@ -5234,6 +5270,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
+
+ down_read(&EXT4_I(inode)->i_mmap_sem);
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
@@ -5303,6 +5341,19 @@ retry_alloc:
out_ret:
ret = block_page_mkwrite_return(ret);
out:
+ up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
return ret;
}
+
+int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ int err;
+
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ err = filemap_fault(vma, vmf);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+
+ return err;
+}
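
ext4_filemap_fault() supplies the reader side of that rwsem: faults take i_mmap_sem shared around plain filemap_fault(), while truncate, hole punch, and zero range hold it exclusive across their page-cache truncation, so a fault can never repopulate a range that is being punched. The pairing in miniature, using the calls from the hunks above:

    /* Fault side (shared): any number of faults may proceed together. */
    down_read(&EXT4_I(inode)->i_mmap_sem);
    err = filemap_fault(vma, vmf);
    up_read(&EXT4_I(inode)->i_mmap_sem);

    /* Truncate side (exclusive): no fault can run until this drops. */
    down_write(&EXT4_I(inode)->i_mmap_sem);
    truncate_pagecache_range(inode, first_block_offset, last_block_offset);
    up_write(&EXT4_I(inode)->i_mmap_sem);
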
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 41260489d3bc..5b1613a54307 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -26,6 +26,7 @@
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/backing-dev.h>
#include <trace/events/ext4.h>
#ifdef CONFIG_EXT4_DEBUG
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 8082565c59a9..00cda09e4412 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -357,9 +357,10 @@ void ext4_io_submit(struct ext4_io_submit *io)
struct bio *bio = io->io_bio;
if (bio) {
+ int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE;
bio_get(io->io_bio);
- submit_bio(io->io_op, io->io_bio);
- BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
+ submit_bio(io_op, io->io_bio);
bio_put(io->io_bio);
}
io->io_bio = NULL;
@@ -368,7 +369,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc)
{
- io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+ io->io_wbc = wbc;
io->io_bio = NULL;
io->io_end = NULL;
}
@@ -382,6 +383,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
if (!bio)
return -ENOMEM;
+ wbc_init_bio(io->io_wbc, bio);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_end_io = ext4_end_bio;
@@ -410,6 +412,7 @@ submit_and_retry:
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
+ wbc_account_io(io->io_wbc, page, bh->b_size);
io->io_next_block++;
return 0;
}
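
The page-io.c hunk defers the WRITE vs WRITE_SYNC decision to submission time and tags each bio with its cgroup writeback context. A hedged sketch of the per-bio pattern, assuming the wbc_* helpers added later in this patch (fs/fs-writeback.c):

    struct bio *bio = bio_alloc(GFP_NOIO, nvecs);

    wbc_init_bio(wbc, bio);           /* associate bio with wbc->wb's blkcg */
    bio_add_page(bio, page, len, offset);
    wbc_account_io(wbc, page, len);   /* book bytes for foreign-inode detection */
    submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);

fs/mpage.c below follows the same pattern with wbc_init_bio() and wbc_account_io().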
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8a3b9f14d198..abb7ec3f19b9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -24,6 +24,7 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/parser.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
@@ -945,6 +946,7 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&ei->i_orphan);
init_rwsem(&ei->xattr_sem);
init_rwsem(&ei->i_data_sem);
+ init_rwsem(&ei->i_mmap_sem);
inode_init_once(&ei->vfs_inode);
}
@@ -3649,6 +3651,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (test_opt(sb, DELALLOC))
clear_opt(sb, DELALLOC);
+ } else {
+ sb->s_iflags |= SB_I_CGROUPWB;
}
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index 011ba6670d99..c70d06a383e2 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -10,8 +10,10 @@
*/
static inline void ext4_truncate_failed_write(struct inode *inode)
{
+ down_write(&EXT4_I(inode)->i_mmap_sem);
truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
}
/*
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 8ab0cf1930bd..d211602e0f86 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -53,7 +53,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
PAGE_CACHE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == DIRTY_DENTS) {
- if (sbi->sb->s_bdi->dirty_exceeded)
+ if (sbi->sb->s_bdi->wb.dirty_exceeded)
return false;
mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
@@ -70,7 +70,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
} else {
- if (sbi->sb->s_bdi->dirty_exceeded)
+ if (sbi->sb->s_bdi->wb.dirty_exceeded)
return false;
}
return res;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 85d7fa7514b2..aba72f7a8ac4 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -9,6 +9,7 @@
* published by the Free Software Foundation.
*/
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
/* constant macro */
#define NULL_SEGNO ((unsigned int)(~0))
@@ -713,7 +714,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
*/
static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
{
- if (sbi->sb->s_bdi->dirty_exceeded)
+ if (sbi->sb->s_bdi->wb.dirty_exceeded)
return 0;
if (type == DATA)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 442d50a0e33e..a08f1039909a 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,6 +11,7 @@
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include "fat.h"
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c06774658345..509411dd3698 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -18,6 +18,7 @@
#include <linux/parser.h>
#include <linux/uio.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <asm/unaligned.h>
#include "fat.h"
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 32a8bbd7a9ad..518c6294bf6c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -27,6 +27,7 @@
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
+#include <linux/memcontrol.h>
#include "internal.h"
/*
@@ -34,6 +35,10 @@
*/
#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
+struct wb_completion {
+ atomic_t cnt;
+};
+
/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*/
@@ -47,13 +52,29 @@ struct wb_writeback_work {
unsigned int range_cyclic:1;
unsigned int for_background:1;
unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
+ unsigned int auto_free:1; /* free on completion */
+ unsigned int single_wait:1;
+ unsigned int single_done:1;
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
- struct completion *done; /* set if the caller waits */
+ struct wb_completion *done; /* set if the caller waits */
};
/*
+ * If one wants to wait for one or more wb_writeback_works, each work's
+ * ->done should be set to a wb_completion defined using the following
+ * macro. Once all work items are issued with wb_queue_work(), the caller
+ * can wait for the completion of all using wb_wait_for_completion(). Work
+ * items which are waited upon aren't freed automatically on completion.
+ */
+#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \
+ struct wb_completion cmpl = { \
+ .cnt = ATOMIC_INIT(1), \
+ }
+
+
+/*
* If an inode is constantly having its pages dirtied, but then the
* updates stop dirtytime_expire_interval seconds in the past, it's
* possible for the worst case time between when an inode has its
@@ -65,35 +86,6 @@ struct wb_writeback_work {
*/
unsigned int dirtytime_expire_interval = 12 * 60 * 60;
-/**
- * writeback_in_progress - determine whether there is writeback in progress
- * @bdi: the device's backing_dev_info structure.
- *
- * Determine whether there is writeback waiting to be handled against a
- * backing device.
- */
-int writeback_in_progress(struct backing_dev_info *bdi)
-{
- return test_bit(BDI_writeback_running, &bdi->state);
-}
-EXPORT_SYMBOL(writeback_in_progress);
-
-struct backing_dev_info *inode_to_bdi(struct inode *inode)
-{
- struct super_block *sb;
-
- if (!inode)
- return &noop_backing_dev_info;
-
- sb = inode->i_sb;
-#ifdef CONFIG_BLOCK
- if (sb_is_blkdev_sb(sb))
- return blk_get_backing_dev_info(I_BDEV(inode));
-#endif
- return sb->s_bdi;
-}
-EXPORT_SYMBOL_GPL(inode_to_bdi);
-
static inline struct inode *wb_inode(struct list_head *head)
{
return list_entry(head, struct inode, i_wb_list);
@@ -109,45 +101,831 @@ static inline struct inode *wb_inode(struct list_head *head)
EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
-static void bdi_wakeup_thread(struct backing_dev_info *bdi)
+static bool wb_io_lists_populated(struct bdi_writeback *wb)
+{
+ if (wb_has_dirty_io(wb)) {
+ return false;
+ } else {
+ set_bit(WB_has_dirty_io, &wb->state);
+ WARN_ON_ONCE(!wb->avg_write_bandwidth);
+ atomic_long_add(wb->avg_write_bandwidth,
+ &wb->bdi->tot_write_bandwidth);
+ return true;
+ }
+}
+
+static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
- spin_lock_bh(&bdi->wb_lock);
- if (test_bit(BDI_registered, &bdi->state))
- mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
- spin_unlock_bh(&bdi->wb_lock);
+ if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
+ list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
+ clear_bit(WB_has_dirty_io, &wb->state);
+ WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
+ &wb->bdi->tot_write_bandwidth) < 0);
+ }
}
-static void bdi_queue_work(struct backing_dev_info *bdi,
- struct wb_writeback_work *work)
+/**
+ * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list
+ * @inode: inode to be moved
+ * @wb: target bdi_writeback
+ * @head: one of @wb->b_{dirty|io|more_io}
+ *
+ * Move @inode->i_wb_list to @head of @wb and set %WB_has_dirty_io.
+ * Returns %true if @inode is the first occupant of the !dirty_time IO
+ * lists; otherwise, %false.
+ */
+static bool inode_wb_list_move_locked(struct inode *inode,
+ struct bdi_writeback *wb,
+ struct list_head *head)
{
- trace_writeback_queue(bdi, work);
+ assert_spin_locked(&wb->list_lock);
+
+ list_move(&inode->i_wb_list, head);
- spin_lock_bh(&bdi->wb_lock);
- if (!test_bit(BDI_registered, &bdi->state)) {
- if (work->done)
- complete(work->done);
+ /* dirty_time doesn't count as dirty_io until expiration */
+ if (head != &wb->b_dirty_time)
+ return wb_io_lists_populated(wb);
+
+ wb_io_lists_depopulated(wb);
+ return false;
+}
+
+/**
+ * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list
+ * @inode: inode to be removed
+ * @wb: bdi_writeback @inode is being removed from
+ *
+ * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
+ * clear %WB_has_dirty_io if all are empty afterwards.
+ */
+static void inode_wb_list_del_locked(struct inode *inode,
+ struct bdi_writeback *wb)
+{
+ assert_spin_locked(&wb->list_lock);
+
+ list_del_init(&inode->i_wb_list);
+ wb_io_lists_depopulated(wb);
+}
+
+static void wb_wakeup(struct bdi_writeback *wb)
+{
+ spin_lock_bh(&wb->work_lock);
+ if (test_bit(WB_registered, &wb->state))
+ mod_delayed_work(bdi_wq, &wb->dwork, 0);
+ spin_unlock_bh(&wb->work_lock);
+}
+
+static void wb_queue_work(struct bdi_writeback *wb,
+ struct wb_writeback_work *work)
+{
+ trace_writeback_queue(wb->bdi, work);
+
+ spin_lock_bh(&wb->work_lock);
+ if (!test_bit(WB_registered, &wb->state)) {
+ if (work->single_wait)
+ work->single_done = 1;
goto out_unlock;
}
- list_add_tail(&work->list, &bdi->work_list);
- mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+ if (work->done)
+ atomic_inc(&work->done->cnt);
+ list_add_tail(&work->list, &wb->work_list);
+ mod_delayed_work(bdi_wq, &wb->dwork, 0);
out_unlock:
- spin_unlock_bh(&bdi->wb_lock);
+ spin_unlock_bh(&wb->work_lock);
+}
+
+/**
+ * wb_wait_for_completion - wait for completion of bdi_writeback_works
+ * @bdi: bdi work items were issued to
+ * @done: target wb_completion
+ *
+ * Wait for one or more work items issued to @bdi with their ->done field
+ * set to @done, which should have been defined with
+ * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such
+ * work items are completed. Work items which are waited upon aren't freed
+ * automatically on completion.
+ */
+static void wb_wait_for_completion(struct backing_dev_info *bdi,
+ struct wb_completion *done)
+{
+ atomic_dec(&done->cnt); /* put down the initial count */
+ wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
+}
+
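
A hedged usage sketch of the wb_completion machinery above; because the count is shared, one completion can cover any number of queued work items:

    /* Hypothetical caller, modeled on the sync paths later in this patch. */
    DEFINE_WB_COMPLETION_ONSTACK(done);          /* cnt starts at 1 */
    struct wb_writeback_work work = {
            .sync_mode = WB_SYNC_ALL,
            .nr_pages  = LONG_MAX,
            .done      = &done,                  /* wb_queue_work() incs cnt */
    };

    wb_queue_work(wb, &work);                    /* may repeat across many wbs */
    wb_wait_for_completion(bdi, &done);          /* drops the initial count and
                                                    sleeps on bdi->wb_waitq
                                                    until cnt reaches 0 */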
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/* parameters for foreign inode detection, see wb_detach_inode() */
+#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, up to 8 secs w/ 16bit */
+#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */
+#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */
+#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */
+
+#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */
+#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
+ /* each slot's duration is 2s / 16 */
+#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2)
+ /* if foreign slots >= 8, switch */
+#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
+ /* one round can affect up to 5 slots */
+
+void __inode_attach_wb(struct inode *inode, struct page *page)
+{
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb = NULL;
+
+ if (inode_cgwb_enabled(inode)) {
+ struct cgroup_subsys_state *memcg_css;
+
+ if (page) {
+ memcg_css = mem_cgroup_css_from_page(page);
+ wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ } else {
+ /* must pin memcg_css, see wb_get_create() */
+ memcg_css = task_get_css(current, memory_cgrp_id);
+ wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ css_put(memcg_css);
+ }
+ }
+
+ if (!wb)
+ wb = &bdi->wb;
+
+ /*
+ * There may be multiple instances of this function racing to
+ * update the same inode. Use cmpxchg() to tell the winner.
+ */
+ if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
+ wb_put(wb);
+}
+
+/**
+ * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
+ * @inode: inode of interest with i_lock held
+ *
+ * Returns @inode's wb with its list_lock held. @inode->i_lock must be
+ * held on entry and is released on return. The returned wb is guaranteed
+ * to stay @inode's associated wb until its list_lock is released.
+ */
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+ __releases(&inode->i_lock)
+ __acquires(&wb->list_lock)
+{
+ while (true) {
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ /*
+ * inode_to_wb() association is protected by both
+ * @inode->i_lock and @wb->list_lock but list_lock nests
+ * outside i_lock. Drop i_lock and verify that the
+ * association hasn't changed after acquiring list_lock.
+ */
+ wb_get(wb);
+ spin_unlock(&inode->i_lock);
+ spin_lock(&wb->list_lock);
+ wb_put(wb); /* not gonna deref it anymore */
+
+ /* i_wb may have changed in between, can't use inode_to_wb() */
+ if (likely(wb == inode->i_wb))
+ return wb; /* @inode already has ref */
+
+ spin_unlock(&wb->list_lock);
+ cpu_relax();
+ spin_lock(&inode->i_lock);
+ }
+}
+
+/**
+ * inode_to_wb_and_lock_list - determine an inode's wb and lock it
+ * @inode: inode of interest
+ *
+ * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
+ * on entry.
+ */
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+ __acquires(&wb->list_lock)
+{
+ spin_lock(&inode->i_lock);
+ return locked_inode_to_wb_and_lock_list(inode);
+}
+
+struct inode_switch_wbs_context {
+ struct inode *inode;
+ struct bdi_writeback *new_wb;
+
+ struct rcu_head rcu_head;
+ struct work_struct work;
+};
+
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+ struct inode_switch_wbs_context *isw =
+ container_of(work, struct inode_switch_wbs_context, work);
+ struct inode *inode = isw->inode;
+ struct address_space *mapping = inode->i_mapping;
+ struct bdi_writeback *old_wb = inode->i_wb;
+ struct bdi_writeback *new_wb = isw->new_wb;
+ struct radix_tree_iter iter;
+ bool switched = false;
+ void **slot;
+
+ /*
+ * By the time control reaches here, RCU grace period has passed
+ * since I_WB_SWITCH assertion and all wb stat update transactions
+ * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+ * synchronizing against mapping->tree_lock.
+ *
+ * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
+ * gives us exclusion against all wb related operations on @inode
+ * including IO list manipulations and stat updates.
+ */
+ if (old_wb < new_wb) {
+ spin_lock(&old_wb->list_lock);
+ spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&new_wb->list_lock);
+ spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+ }
+ spin_lock(&inode->i_lock);
+ spin_lock_irq(&mapping->tree_lock);
+
+ /*
+ * Once I_FREEING is visible under i_lock, the eviction path owns
+ * the inode and we shouldn't modify ->i_wb_list.
+ */
+ if (unlikely(inode->i_state & I_FREEING))
+ goto skip_switch;
+
+ /*
+ * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
+ * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
+ * pages actually under writeback.
+ */
+ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+ PAGECACHE_TAG_DIRTY) {
+ struct page *page = radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock);
+ if (likely(page) && PageDirty(page)) {
+ __dec_wb_stat(old_wb, WB_RECLAIMABLE);
+ __inc_wb_stat(new_wb, WB_RECLAIMABLE);
+ }
+ }
+
+ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+ PAGECACHE_TAG_WRITEBACK) {
+ struct page *page = radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock);
+ if (likely(page)) {
+ WARN_ON_ONCE(!PageWriteback(page));
+ __dec_wb_stat(old_wb, WB_WRITEBACK);
+ __inc_wb_stat(new_wb, WB_WRITEBACK);
+ }
+ }
+
+ wb_get(new_wb);
+
+ /*
+ * Transfer to @new_wb's IO list if necessary. The specific list
+ * @inode was on is ignored and the inode is put on ->b_dirty which
+ * is always correct including from ->b_dirty_time. The transfer
+ * preserves @inode->dirtied_when ordering.
+ */
+ if (!list_empty(&inode->i_wb_list)) {
+ struct inode *pos;
+
+ inode_wb_list_del_locked(inode, old_wb);
+ inode->i_wb = new_wb;
+ list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+ if (time_after_eq(inode->dirtied_when,
+ pos->dirtied_when))
+ break;
+ inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+ } else {
+ inode->i_wb = new_wb;
+ }
+
+ /* ->i_wb_frn updates may race wbc_detach_inode() but it doesn't matter */
+ inode->i_wb_frn_winner = 0;
+ inode->i_wb_frn_avg_time = 0;
+ inode->i_wb_frn_history = 0;
+ switched = true;
+skip_switch:
+ /*
+ * Paired with load_acquire in unlocked_inode_to_wb_begin() and
+ * ensures that the new wb is visible if they see !I_WB_SWITCH.
+ */
+ smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
+
+ spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&new_wb->list_lock);
+ spin_unlock(&old_wb->list_lock);
+
+ if (switched) {
+ wb_wakeup(new_wb);
+ wb_put(old_wb);
+ }
+ wb_put(new_wb);
+
+ iput(inode);
+ kfree(isw);
+}
+
+static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
+{
+ struct inode_switch_wbs_context *isw = container_of(rcu_head,
+ struct inode_switch_wbs_context, rcu_head);
+
+ /* needs to grab bh-unsafe locks, bounce to work item */
+ INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
+ schedule_work(&isw->work);
+}
+
+/**
+ * inode_switch_wbs - change the wb association of an inode
+ * @inode: target inode
+ * @new_wb_id: ID of the new wb
+ *
+ * Switch @inode's wb association to the wb identified by @new_wb_id. The
+ * switching is performed asynchronously and may fail silently.
+ */
+static void inode_switch_wbs(struct inode *inode, int new_wb_id)
+{
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct cgroup_subsys_state *memcg_css;
+ struct inode_switch_wbs_context *isw;
+
+ /* noop if a switch already seems to be in progress */
+ if (inode->i_state & I_WB_SWITCH)
+ return;
+
+ isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+ if (!isw)
+ return;
+
+ /* find and pin the new wb */
+ rcu_read_lock();
+ memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
+ if (memcg_css)
+ isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ rcu_read_unlock();
+ if (!isw->new_wb)
+ goto out_free;
+
+ /* while holding I_WB_SWITCH, no one else can update the association */
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+ inode_to_wb(inode) == isw->new_wb) {
+ spin_unlock(&inode->i_lock);
+ goto out_free;
+ }
+ inode->i_state |= I_WB_SWITCH;
+ spin_unlock(&inode->i_lock);
+
+ ihold(inode);
+ isw->inode = inode;
+
+ /*
+ * In addition to synchronizing among switchers, I_WB_SWITCH tells
+ * the RCU protected stat update paths to grab the mapping's
+ * tree_lock so that stat transfer can synchronize against them.
+ * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+ */
+ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+ return;
+
+out_free:
+ if (isw->new_wb)
+ wb_put(isw->new_wb);
+ kfree(isw);
+}
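
The switch above is deliberately staged. A hedged summary of the sequencing, restating the steps implemented above:

    /* 1: setting I_WB_SWITCH forces lockless stat updaters onto tree_lock. */
    inode->i_state |= I_WB_SWITCH;
    /* 2: call_rcu() waits out stat transactions already in flight. */
    call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
    /* 3 (work item): with old/new list_lock, i_lock and tree_lock held,
     * walk the PAGECACHE_TAG_DIRTY/_WRITEBACK radix-tree tags to transfer
     * WB_RECLAIMABLE/WB_WRITEBACK counts, splice the inode into
     * new_wb->b_dirty preserving dirtied_when order, then clear
     * I_WB_SWITCH with smp_store_release(). */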
+
+/**
+ * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * @inode is locked and about to be written back under the control of @wbc.
+ * Record @inode's writeback context into @wbc and unlock the i_lock. On
+ * writeback completion, wbc_detach_inode() should be called. This is used
+ * to track the cgroup writeback context.
+ */
+void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+ struct inode *inode)
+{
+ if (!inode_cgwb_enabled(inode)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+
+ wbc->wb = inode_to_wb(inode);
+ wbc->inode = inode;
+
+ wbc->wb_id = wbc->wb->memcg_css->id;
+ wbc->wb_lcand_id = inode->i_wb_frn_winner;
+ wbc->wb_tcand_id = 0;
+ wbc->wb_bytes = 0;
+ wbc->wb_lcand_bytes = 0;
+ wbc->wb_tcand_bytes = 0;
+
+ wb_get(wbc->wb);
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * A dying wb indicates that the memcg-blkcg mapping has changed
+ * and a new wb is already serving the memcg. Switch immediately.
+ */
+ if (unlikely(wb_dying(wbc->wb)))
+ inode_switch_wbs(inode, wbc->wb_id);
+}
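
wbc_attach_and_unlock_inode() and wbc_detach_inode() bracket every writeback attempt. A hedged sketch of the pairing; the real call sites appear in writeback_single_inode() and writeback_sb_inodes() further down:

    spin_lock(&inode->i_lock);
    inode->i_state |= I_SYNC;
    wbc_attach_and_unlock_inode(&wbc, inode);    /* records wb, drops i_lock */

    ret = __writeback_single_inode(inode, &wbc); /* wbc_account_io() runs here */

    wbc_detach_inode(&wbc);                      /* tallies the vote and may
                                                    trigger inode_switch_wbs() */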
+
+/**
+ * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
+ * @wbc: writeback_control of the just finished writeback
+ *
+ * To be called after a writeback attempt of an inode finishes and undoes
+ * wbc_attach_and_unlock_inode(). Can be called under any context.
+ *
+ * As concurrent write sharing of an inode is expected to be very rare and
+ * memcg only tracks page ownership on a first-use basis, severely confining
+ * the usefulness of such sharing, cgroup writeback tracks ownership
+ * per-inode. While the support for concurrent write sharing of an inode
+ * is deemed unnecessary, an inode being written to by different cgroups at
+ * different points in time is a lot more common, and, more importantly,
+ * charging only by first-use can too readily lead to grossly incorrect
+ * behaviors (single foreign page can lead to gigabytes of writeback to be
+ * incorrectly attributed).
+ *
+ * To resolve this issue, cgroup writeback detects the majority dirtier of
+ * an inode and transfers the ownership to it. To avoid unnecessary
+ * oscillation, the detection mechanism keeps track of history and gives
+ * out the switch verdict only if the foreign usage pattern is stable over
+ * a certain amount of time and/or writeback attempts.
+ *
+ * On each writeback attempt, @wbc tries to detect the majority writer
+ * using Boyer-Moore majority vote algorithm. In addition to the byte
+ * count from the majority voting, it also counts the bytes written for the
+ * current wb and the last round's winner wb (max of last round's current
+ * wb, the winner from two rounds ago, and the last round's majority
+ * candidate). Keeping track of the historical winner helps the algorithm
+ * to semi-reliably detect the most active writer even when it's not the
+ * absolute majority.
+ *
+ * Once the winner of the round is determined, whether the winner is
+ * foreign or not and how much IO time the round consumed is recorded in
+ * inode->i_wb_frn_history. If the amount of recorded foreign IO time is
+ * over a certain threshold, the switch verdict is given.
+ */
+void wbc_detach_inode(struct writeback_control *wbc)
+{
+ struct bdi_writeback *wb = wbc->wb;
+ struct inode *inode = wbc->inode;
+ unsigned long avg_time, max_bytes, max_time;
+ u16 history;
+ int max_id;
+
+ if (!wb)
+ return;
+
+ history = inode->i_wb_frn_history;
+ avg_time = inode->i_wb_frn_avg_time;
+
+ /* pick the winner of this round */
+ if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
+ wbc->wb_bytes >= wbc->wb_tcand_bytes) {
+ max_id = wbc->wb_id;
+ max_bytes = wbc->wb_bytes;
+ } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
+ max_id = wbc->wb_lcand_id;
+ max_bytes = wbc->wb_lcand_bytes;
+ } else {
+ max_id = wbc->wb_tcand_id;
+ max_bytes = wbc->wb_tcand_bytes;
+ }
+
+ /*
+ * Calculate the amount of IO time the winner consumed and fold it
+ * into the running average kept per inode. If the consumed IO
+ * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
+ * deciding whether to switch or not. This is to prevent one-off
+ * small dirtiers from skewing the verdict.
+ */
+ max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
+ wb->avg_write_bandwidth);
+ if (avg_time)
+ avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
+ (avg_time >> WB_FRN_TIME_AVG_SHIFT);
+ else
+ avg_time = max_time; /* immediate catch up on first run */
+
+ if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
+ int slots;
+
+ /*
+ * The switch verdict is reached if foreign wb's consume
+ * more than a certain proportion of IO time in a
+ * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot
+ * history mask where each bit represents one sixteenth of
+ * the period. Determine the number of slots to shift into
+ * history from @max_time.
+ */
+ slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
+ (unsigned long)WB_FRN_HIST_MAX_SLOTS);
+ history <<= slots;
+ if (wbc->wb_id != max_id)
+ history |= (1U << slots) - 1;
+
+ /*
+ * Switch if the current wb isn't the consistent winner.
+ * If there are multiple closely competing dirtiers, the
+ * inode may switch across them repeatedly over time, which
+ * is okay. The main goal is avoiding keeping an inode on
+ * the wrong wb for an extended period of time.
+ */
+ if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
+ inode_switch_wbs(inode, max_id);
+ }
+
+ /*
+ * Multiple instances of this function may race to update the
+ * following fields but we don't mind occasional inaccuracies.
+ */
+ inode->i_wb_frn_winner = max_id;
+ inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
+ inode->i_wb_frn_history = history;
+
+ wb_put(wbc->wb);
+ wbc->wb = NULL;
+}
+
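
A worked example of the arithmetic above, using the WB_FRN_* constants as defined: with WB_FRN_TIME_SHIFT = 13, one second of IO time is 8192 units, so WB_FRN_HIST_UNIT = (2 * 8192) / 16 = 1024 units, i.e. each history slot stands for 125 ms. If a round consumes max_time = 3000 units (roughly 366 ms), then slots = DIV_ROUND_UP(3000, 1024) = 3; when the round's winner is foreign, history <<= 3 followed by history |= 0x7 records three foreign slots. Once more than WB_FRN_HIST_THR_SLOTS = 8 of the 16 bits are set, hweight32(history) tips the threshold and inode_switch_wbs() is called.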
+/**
+ * wbc_account_io - account IO issued during writeback
+ * @wbc: writeback_control of the writeback in progress
+ * @page: page being written out
+ * @bytes: number of bytes being written out
+ *
+ * @bytes from @page are about to be written out during the writeback
+ * controlled by @wbc. Keep the book for foreign inode detection. See
+ * wbc_detach_inode().
+ */
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+ size_t bytes)
+{
+ int id;
+
+ /*
+ * pageout() path doesn't attach @wbc to the inode being written
+ * out. This is intentional as we don't want the function to block
+ * behind a slow cgroup. Ultimately, we want pageout() to kick off
+ * regular writeback instead of writing things out itself.
+ */
+ if (!wbc->wb)
+ return;
+
+ rcu_read_lock();
+ id = mem_cgroup_css_from_page(page)->id;
+ rcu_read_unlock();
+
+ if (id == wbc->wb_id) {
+ wbc->wb_bytes += bytes;
+ return;
+ }
+
+ if (id == wbc->wb_lcand_id)
+ wbc->wb_lcand_bytes += bytes;
+
+ /* Boyer-Moore majority vote algorithm */
+ if (!wbc->wb_tcand_bytes)
+ wbc->wb_tcand_id = id;
+ if (id == wbc->wb_tcand_id)
+ wbc->wb_tcand_bytes += bytes;
+ else
+ wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
}
+EXPORT_SYMBOL_GPL(wbc_account_io);
-static void
-__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- bool range_cyclic, enum wb_reason reason)
+/**
+ * inode_congested - test whether an inode is congested
+ * @inode: inode to test for congestion
+ * @cong_bits: mask of WB_[a]sync_congested bits to test
+ *
+ * Tests whether @inode is congested. @cong_bits is the mask of congestion
+ * bits to test and the return value is the mask of set bits.
+ *
+ * If cgroup writeback is enabled for @inode, the congestion state is
+ * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
+ * associated with @inode is congested; otherwise, the root wb's congestion
+ * state is used.
+ */
+int inode_congested(struct inode *inode, int cong_bits)
+{
+ /*
+ * Once set, ->i_wb never becomes NULL while the inode is alive.
+ * Start transaction iff ->i_wb is visible.
+ */
+ if (inode && inode_to_wb_is_valid(inode)) {
+ struct bdi_writeback *wb;
+ bool locked, congested;
+
+ wb = unlocked_inode_to_wb_begin(inode, &locked);
+ congested = wb_congested(wb, cong_bits);
+ unlocked_inode_to_wb_end(inode, locked);
+ return congested;
+ }
+
+ return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
+}
+EXPORT_SYMBOL_GPL(inode_congested);
+
+/**
+ * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
+ * @bdi: bdi the work item was issued to
+ * @work: work item to wait for
+ *
+ * Wait for the completion of @work which was issued to one of @bdi's
+ * bdi_writeback's. The caller must have set @work->single_wait before
+ * issuing it. This wait operates independently of
+ * wb_wait_for_completion() and also disables automatic freeing of @work.
+ */
+static void wb_wait_for_single_work(struct backing_dev_info *bdi,
+ struct wb_writeback_work *work)
+{
+ if (WARN_ON_ONCE(!work->single_wait))
+ return;
+
+ wait_event(bdi->wb_waitq, work->single_done);
+
+ /*
+ * Paired with smp_wmb() in wb_do_writeback() and ensures that all
+ * modifications to @work prior to assertion of ->single_done is
+ * visible to the caller once this function returns.
+ */
+ smp_rmb();
+}
+
+/**
+ * wb_split_bdi_pages - split nr_pages to write according to bandwidth
+ * @wb: target bdi_writeback to split @nr_pages to
+ * @nr_pages: number of pages to write for the whole bdi
+ *
+ * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
+ * relation to the total write bandwidth of all wb's w/ dirty inodes on
+ * @wb->bdi.
+ */
+static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
+{
+ unsigned long this_bw = wb->avg_write_bandwidth;
+ unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+
+ if (nr_pages == LONG_MAX)
+ return LONG_MAX;
+
+ /*
+ * This may be called on clean wb's and proportional distribution
+ * may not make sense, just use the original @nr_pages in those
+ * cases. In general, we wanna err on the side of writing more.
+ */
+ if (!tot_bw || this_bw >= tot_bw)
+ return nr_pages;
+ else
+ return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
+}
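
For example, with wb->avg_write_bandwidth at 25 MB/s out of a bdi-wide tot_write_bandwidth of 100 MB/s, a request for 1024 pages splits to DIV_ROUND_UP_ULL(1024 * 25, 100) = 256 pages for that wb; a clean bdi with no recorded bandwidth, or a wb holding all of it, keeps the full nr_pages.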
+
+/**
+ * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
+ * @wb: target bdi_writeback
+ * @base_work: source wb_writeback_work
+ *
+ * Try to make a clone of @base_work and issue it to @wb. If cloning
+ * succeeds, %true is returned; otherwise, @base_work is issued directly
+ * and %false is returned. In the latter case, the caller is required to
+ * wait for @base_work's completion using wb_wait_for_single_work().
+ *
+ * A clone is auto-freed on completion. @base_work never is.
+ */
+static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
+ struct wb_writeback_work *base_work)
{
struct wb_writeback_work *work;
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ *work = *base_work;
+ work->auto_free = 1;
+ work->single_wait = 0;
+ } else {
+ work = base_work;
+ work->auto_free = 0;
+ work->single_wait = 1;
+ }
+ work->single_done = 0;
+ wb_queue_work(wb, work);
+ return work != base_work;
+}
+
+/**
+ * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
+ * @bdi: target backing_dev_info
+ * @base_work: wb_writeback_work to issue
+ * @skip_if_busy: skip wb's which already have writeback in progress
+ *
+ * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
+ * have dirty inodes. If @base_work->nr_pages isn't %LONG_MAX, it's
+ * distributed to the busy wbs according to each wb's proportion in the
+ * total active write bandwidth of @bdi.
+ */
+static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
+ struct wb_writeback_work *base_work,
+ bool skip_if_busy)
+{
+ long nr_pages = base_work->nr_pages;
+ int next_blkcg_id = 0;
+ struct bdi_writeback *wb;
+ struct wb_iter iter;
+
+ might_sleep();
+
+ if (!bdi_has_dirty_io(bdi))
+ return;
+restart:
+ rcu_read_lock();
+ bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+ if (!wb_has_dirty_io(wb) ||
+ (skip_if_busy && writeback_in_progress(wb)))
+ continue;
+
+ base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
+ if (!wb_clone_and_queue_work(wb, base_work)) {
+ next_blkcg_id = wb->blkcg_css->id + 1;
+ rcu_read_unlock();
+ wb_wait_for_single_work(bdi, base_work);
+ goto restart;
+ }
+ }
+ rcu_read_unlock();
+}
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+ __releases(&inode->i_lock)
+ __acquires(&wb->list_lock)
+{
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ spin_unlock(&inode->i_lock);
+ spin_lock(&wb->list_lock);
+ return wb;
+}
+
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+ __acquires(&wb->list_lock)
+{
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ spin_lock(&wb->list_lock);
+ return wb;
+}
+
+static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
+{
+ return nr_pages;
+}
+
+static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
+ struct wb_writeback_work *base_work,
+ bool skip_if_busy)
+{
+ might_sleep();
+
+ if (bdi_has_dirty_io(bdi) &&
+ (!skip_if_busy || !writeback_in_progress(&bdi->wb))) {
+ base_work->auto_free = 0;
+ base_work->single_wait = 0;
+ base_work->single_done = 0;
+ wb_queue_work(&bdi->wb, base_work);
+ }
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
+void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+ bool range_cyclic, enum wb_reason reason)
+{
+ struct wb_writeback_work *work;
+
+ if (!wb_has_dirty_io(wb))
+ return;
+
/*
* This is WB_SYNC_NONE writeback, so if allocation fails just
* wakeup the thread for old dirty data writeback
*/
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
- trace_writeback_nowork(bdi);
- bdi_wakeup_thread(bdi);
+ trace_writeback_nowork(wb->bdi);
+ wb_wakeup(wb);
return;
}
@@ -155,46 +933,29 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
work->nr_pages = nr_pages;
work->range_cyclic = range_cyclic;
work->reason = reason;
+ work->auto_free = 1;
- bdi_queue_work(bdi, work);
+ wb_queue_work(wb, work);
}
/**
- * bdi_start_writeback - start writeback
- * @bdi: the backing device to write from
- * @nr_pages: the number of pages to write
- * @reason: reason why some writeback work was initiated
- *
- * Description:
- * This does WB_SYNC_NONE opportunistic writeback. The IO is only
- * started when this function returns, we make no guarantees on
- * completion. Caller need not hold sb s_umount semaphore.
- *
- */
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- enum wb_reason reason)
-{
- __bdi_start_writeback(bdi, nr_pages, true, reason);
-}
-
-/**
- * bdi_start_background_writeback - start background writeback
- * @bdi: the backing device to write from
+ * wb_start_background_writeback - start background writeback
+ * @wb: bdi_writback to write from
*
* Description:
* This makes sure WB_SYNC_NONE background writeback happens. When
- * this function returns, it is only guaranteed that for given BDI
+ * this function returns, it is only guaranteed that for given wb
* some IO is happening if we are over background dirty threshold.
* Caller need not hold sb s_umount semaphore.
*/
-void bdi_start_background_writeback(struct backing_dev_info *bdi)
+void wb_start_background_writeback(struct bdi_writeback *wb)
{
/*
* We just wake up the flusher thread. It will perform background
* writeback as soon as there is no other work to do.
*/
- trace_writeback_wake_background(bdi);
- bdi_wakeup_thread(bdi);
+ trace_writeback_wake_background(wb->bdi);
+ wb_wakeup(wb);
}
/*
@@ -202,11 +963,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
*/
void inode_wb_list_del(struct inode *inode)
{
- struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb;
- spin_lock(&bdi->wb.list_lock);
- list_del_init(&inode->i_wb_list);
- spin_unlock(&bdi->wb.list_lock);
+ wb = inode_to_wb_and_lock_list(inode);
+ inode_wb_list_del_locked(inode, wb);
+ spin_unlock(&wb->list_lock);
}
/*
@@ -220,7 +981,6 @@ void inode_wb_list_del(struct inode *inode)
*/
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
- assert_spin_locked(&wb->list_lock);
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
@@ -228,7 +988,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
if (time_before(inode->dirtied_when, tail->dirtied_when))
inode->dirtied_when = jiffies;
}
- list_move(&inode->i_wb_list, &wb->b_dirty);
+ inode_wb_list_move_locked(inode, wb, &wb->b_dirty);
}
/*
@@ -236,8 +996,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
*/
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
- assert_spin_locked(&wb->list_lock);
- list_move(&inode->i_wb_list, &wb->b_more_io);
+ inode_wb_list_move_locked(inode, wb, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
@@ -346,6 +1105,8 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
EXPIRE_DIRTY_ATIME, work);
+ if (moved)
+ wb_io_lists_populated(wb);
trace_writeback_queue_io(wb, work, moved);
}
@@ -471,10 +1232,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
redirty_tail(inode, wb);
} else if (inode->i_state & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
- list_move(&inode->i_wb_list, &wb->b_dirty_time);
+ inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time);
} else {
/* The inode is clean. Remove from writeback lists. */
- list_del_init(&inode->i_wb_list);
+ inode_wb_list_del_locked(inode, wb);
}
}
@@ -605,10 +1366,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
goto out;
inode->i_state |= I_SYNC;
- spin_unlock(&inode->i_lock);
+ wbc_attach_and_unlock_inode(wbc, inode);
ret = __writeback_single_inode(inode, wbc);
+ wbc_detach_inode(wbc);
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
/*
@@ -616,7 +1378,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
* touch it. See comment above for explanation.
*/
if (!(inode->i_state & I_DIRTY_ALL))
- list_del_init(&inode->i_wb_list);
+ inode_wb_list_del_locked(inode, wb);
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
out:
@@ -624,7 +1386,7 @@ out:
return ret;
}
-static long writeback_chunk_size(struct backing_dev_info *bdi,
+static long writeback_chunk_size(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
long pages;
@@ -645,8 +1407,8 @@ static long writeback_chunk_size(struct backing_dev_info *bdi,
if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
pages = LONG_MAX;
else {
- pages = min(bdi->avg_write_bandwidth / 2,
- global_dirty_limit / DIRTY_SCOPE);
+ pages = min(wb->avg_write_bandwidth / 2,
+ global_wb_domain.dirty_limit / DIRTY_SCOPE);
pages = min(pages, work->nr_pages);
pages = round_down(pages + MIN_WRITEBACK_PAGES,
MIN_WRITEBACK_PAGES);
@@ -741,9 +1503,9 @@ static long writeback_sb_inodes(struct super_block *sb,
continue;
}
inode->i_state |= I_SYNC;
- spin_unlock(&inode->i_lock);
+ wbc_attach_and_unlock_inode(&wbc, inode);
- write_chunk = writeback_chunk_size(wb->bdi, work);
+ write_chunk = writeback_chunk_size(wb, work);
wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
@@ -753,6 +1515,7 @@ static long writeback_sb_inodes(struct super_block *sb,
*/
__writeback_single_inode(inode, &wbc);
+ wbc_detach_inode(&wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
wrote += write_chunk - wbc.nr_to_write;
spin_lock(&wb->list_lock);
@@ -830,33 +1593,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
return nr_pages - work.nr_pages;
}
-static bool over_bground_thresh(struct backing_dev_info *bdi)
-{
- unsigned long background_thresh, dirty_thresh;
-
- global_dirty_limits(&background_thresh, &dirty_thresh);
-
- if (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) > background_thresh)
- return true;
-
- if (bdi_stat(bdi, BDI_RECLAIMABLE) >
- bdi_dirty_limit(bdi, background_thresh))
- return true;
-
- return false;
-}
-
-/*
- * Called under wb->list_lock. If there are multiple wb per bdi,
- * only the flusher working on the first wb should do it.
- */
-static void wb_update_bandwidth(struct bdi_writeback *wb,
- unsigned long start_time)
-{
- __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
-}
-
/*
* Explicit flushing or periodic writeback of "old" data.
*
@@ -899,14 +1635,14 @@ static long wb_writeback(struct bdi_writeback *wb,
* after the other works are all done.
*/
if ((work->for_background || work->for_kupdate) &&
- !list_empty(&wb->bdi->work_list))
+ !list_empty(&wb->work_list))
break;
/*
* For background writeout, stop when we are below the
* background dirty threshold
*/
- if (work->for_background && !over_bground_thresh(wb->bdi))
+ if (work->for_background && !wb_over_bg_thresh(wb))
break;
/*
@@ -970,18 +1706,17 @@ static long wb_writeback(struct bdi_writeback *wb,
/*
* Return the next wb_writeback_work struct that hasn't been processed yet.
*/
-static struct wb_writeback_work *
-get_next_work_item(struct backing_dev_info *bdi)
+static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
{
struct wb_writeback_work *work = NULL;
- spin_lock_bh(&bdi->wb_lock);
- if (!list_empty(&bdi->work_list)) {
- work = list_entry(bdi->work_list.next,
+ spin_lock_bh(&wb->work_lock);
+ if (!list_empty(&wb->work_list)) {
+ work = list_entry(wb->work_list.next,
struct wb_writeback_work, list);
list_del_init(&work->list);
}
- spin_unlock_bh(&bdi->wb_lock);
+ spin_unlock_bh(&wb->work_lock);
return work;
}
@@ -998,7 +1733,7 @@ static unsigned long get_nr_dirty_pages(void)
static long wb_check_background_flush(struct bdi_writeback *wb)
{
- if (over_bground_thresh(wb->bdi)) {
+ if (wb_over_bg_thresh(wb)) {
struct wb_writeback_work work = {
.nr_pages = LONG_MAX,
@@ -1053,25 +1788,33 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
*/
static long wb_do_writeback(struct bdi_writeback *wb)
{
- struct backing_dev_info *bdi = wb->bdi;
struct wb_writeback_work *work;
long wrote = 0;
- set_bit(BDI_writeback_running, &wb->bdi->state);
- while ((work = get_next_work_item(bdi)) != NULL) {
+ set_bit(WB_writeback_running, &wb->state);
+ while ((work = get_next_work_item(wb)) != NULL) {
+ struct wb_completion *done = work->done;
+ bool need_wake_up = false;
- trace_writeback_exec(bdi, work);
+ trace_writeback_exec(wb->bdi, work);
wrote += wb_writeback(wb, work);
- /*
- * Notify the caller of completion if this is a synchronous
- * work item, otherwise just free it.
- */
- if (work->done)
- complete(work->done);
- else
+ if (work->single_wait) {
+ WARN_ON_ONCE(work->auto_free);
+ /* paired w/ rmb in wb_wait_for_single_work() */
+ smp_wmb();
+ work->single_done = 1;
+ need_wake_up = true;
+ } else if (work->auto_free) {
kfree(work);
+ }
+
+ if (done && atomic_dec_and_test(&done->cnt))
+ need_wake_up = true;
+
+ if (need_wake_up)
+ wake_up_all(&wb->bdi->wb_waitq);
}
/*
@@ -1079,7 +1822,7 @@ static long wb_do_writeback(struct bdi_writeback *wb)
*/
wrote += wb_check_old_data_flush(wb);
wrote += wb_check_background_flush(wb);
- clear_bit(BDI_writeback_running, &wb->bdi->state);
+ clear_bit(WB_writeback_running, &wb->state);
return wrote;
}
@@ -1088,43 +1831,42 @@ static long wb_do_writeback(struct bdi_writeback *wb)
* Handle writeback of dirty data for the device backed by this bdi. Also
* reschedules periodically and does kupdated style flushing.
*/
-void bdi_writeback_workfn(struct work_struct *work)
+void wb_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(to_delayed_work(work),
struct bdi_writeback, dwork);
- struct backing_dev_info *bdi = wb->bdi;
long pages_written;
- set_worker_desc("flush-%s", dev_name(bdi->dev));
+ set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
current->flags |= PF_SWAPWRITE;
if (likely(!current_is_workqueue_rescuer() ||
- !test_bit(BDI_registered, &bdi->state))) {
+ !test_bit(WB_registered, &wb->state))) {
/*
- * The normal path. Keep writing back @bdi until its
+ * The normal path. Keep writing back @wb until its
* work_list is empty. Note that this path is also taken
- * if @bdi is shutting down even when we're running off the
+ * if @wb is shutting down even when we're running off the
* rescuer as work_list needs to be drained.
*/
do {
pages_written = wb_do_writeback(wb);
trace_writeback_pages_written(pages_written);
- } while (!list_empty(&bdi->work_list));
+ } while (!list_empty(&wb->work_list));
} else {
/*
* bdi_wq can't get enough workers and we're running off
* the emergency worker. Don't hog it. Hopefully, 1024 is
* enough for efficient IO.
*/
- pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+ pages_written = writeback_inodes_wb(wb, 1024,
WB_REASON_FORKER_THREAD);
trace_writeback_pages_written(pages_written);
}
- if (!list_empty(&bdi->work_list))
+ if (!list_empty(&wb->work_list))
mod_delayed_work(bdi_wq, &wb->dwork, 0);
else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
- bdi_wakeup_thread_delayed(bdi);
+ wb_wakeup_delayed(wb);
current->flags &= ~PF_SWAPWRITE;
}
@@ -1142,9 +1884,15 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+ struct bdi_writeback *wb;
+ struct wb_iter iter;
+
if (!bdi_has_dirty_io(bdi))
continue;
- __bdi_start_writeback(bdi, nr_pages, false, reason);
+
+ bdi_for_each_wb(wb, bdi, &iter, 0)
+ wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
+ false, reason);
}
rcu_read_unlock();
}
@@ -1173,9 +1921,12 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
- if (list_empty(&bdi->wb.b_dirty_time))
- continue;
- bdi_wakeup_thread(bdi);
+ struct bdi_writeback *wb;
+ struct wb_iter iter;
+
+ bdi_for_each_wb(wb, bdi, &iter, 0)
+ if (!list_empty(&wb->b_dirty_time))
+ wb_wakeup(wb);
}
rcu_read_unlock();
schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
@@ -1249,7 +2000,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
- struct backing_dev_info *bdi = NULL;
int dirtytime;
trace_writeback_mark_inode_dirty(inode, flags);
@@ -1289,6 +2039,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;
+ inode_attach_wb(inode, NULL);
+
if (flags & I_DIRTY_INODE)
inode->i_state &= ~I_DIRTY_TIME;
inode->i_state |= flags;
@@ -1317,38 +2069,39 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* reposition it (that would break b_dirty time-ordering).
*/
if (!was_dirty) {
+ struct bdi_writeback *wb;
+ struct list_head *dirty_list;
bool wakeup_bdi = false;
- bdi = inode_to_bdi(inode);
- spin_unlock(&inode->i_lock);
- spin_lock(&bdi->wb.list_lock);
- if (bdi_cap_writeback_dirty(bdi)) {
- WARN(!test_bit(BDI_registered, &bdi->state),
- "bdi-%s not registered\n", bdi->name);
+ wb = locked_inode_to_wb_and_lock_list(inode);
- /*
- * If this is the first dirty inode for this
- * bdi, we have to wake-up the corresponding
- * bdi thread to make sure background
- * write-back happens later.
- */
- if (!wb_has_dirty_io(&bdi->wb))
- wakeup_bdi = true;
- }
+ WARN(bdi_cap_writeback_dirty(wb->bdi) &&
+ !test_bit(WB_registered, &wb->state),
+ "bdi-%s not registered\n", wb->bdi->name);
inode->dirtied_when = jiffies;
if (dirtytime)
inode->dirtied_time_when = jiffies;
+
if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
- list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+ dirty_list = &wb->b_dirty;
else
- list_move(&inode->i_wb_list,
- &bdi->wb.b_dirty_time);
- spin_unlock(&bdi->wb.list_lock);
+ dirty_list = &wb->b_dirty_time;
+
+ wakeup_bdi = inode_wb_list_move_locked(inode, wb,
+ dirty_list);
+
+ spin_unlock(&wb->list_lock);
trace_writeback_dirty_inode_enqueue(inode);
- if (wakeup_bdi)
- bdi_wakeup_thread_delayed(bdi);
+ /*
+ * If this is the first dirty inode for this bdi,
+ * we have to wake-up the corresponding bdi thread
+ * to make sure background write-back happens
+ * later.
+ */
+ if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
+ wb_wakeup_delayed(wb);
return;
}
}
@@ -1411,6 +2164,28 @@ static void wait_sb_inodes(struct super_block *sb)
iput(old_inode);
}
+static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
+ enum wb_reason reason, bool skip_if_busy)
+{
+ DEFINE_WB_COMPLETION_ONSTACK(done);
+ struct wb_writeback_work work = {
+ .sb = sb,
+ .sync_mode = WB_SYNC_NONE,
+ .tagged_writepages = 1,
+ .done = &done,
+ .nr_pages = nr,
+ .reason = reason,
+ };
+ struct backing_dev_info *bdi = sb->s_bdi;
+
+ if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
+ return;
+ WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+ bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
+ wb_wait_for_completion(bdi, &done);
+}
+
/**
* writeback_inodes_sb_nr - writeback dirty inodes from given super_block
* @sb: the superblock
@@ -1425,21 +2200,7 @@ void writeback_inodes_sb_nr(struct super_block *sb,
unsigned long nr,
enum wb_reason reason)
{
- DECLARE_COMPLETION_ONSTACK(done);
- struct wb_writeback_work work = {
- .sb = sb,
- .sync_mode = WB_SYNC_NONE,
- .tagged_writepages = 1,
- .done = &done,
- .nr_pages = nr,
- .reason = reason,
- };
-
- if (sb->s_bdi == &noop_backing_dev_info)
- return;
- WARN_ON(!rwsem_is_locked(&sb->s_umount));
- bdi_queue_work(sb->s_bdi, &work);
- wait_for_completion(&done);
+ __writeback_inodes_sb_nr(sb, nr, reason, false);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);
@@ -1467,19 +2228,15 @@ EXPORT_SYMBOL(writeback_inodes_sb);
* Invoke writeback_inodes_sb_nr if no writeback is currently underway.
* Returns 1 if writeback was started, 0 if not.
*/
-int try_to_writeback_inodes_sb_nr(struct super_block *sb,
- unsigned long nr,
- enum wb_reason reason)
+bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
+ enum wb_reason reason)
{
- if (writeback_in_progress(sb->s_bdi))
- return 1;
-
if (!down_read_trylock(&sb->s_umount))
- return 0;
+ return false;
- writeback_inodes_sb_nr(sb, nr, reason);
+ __writeback_inodes_sb_nr(sb, nr, reason, true);
up_read(&sb->s_umount);
- return 1;
+ return true;
}
EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
@@ -1491,7 +2248,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
* Implement by try_to_writeback_inodes_sb_nr()
* Returns 1 if writeback was started, 0 if not.
*/
-int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
+bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
@@ -1506,7 +2263,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
*/
void sync_inodes_sb(struct super_block *sb)
{
- DECLARE_COMPLETION_ONSTACK(done);
+ DEFINE_WB_COMPLETION_ONSTACK(done);
struct wb_writeback_work work = {
.sb = sb,
.sync_mode = WB_SYNC_ALL,
@@ -1516,14 +2273,15 @@ void sync_inodes_sb(struct super_block *sb)
.reason = WB_REASON_SYNC,
.for_sync = 1,
};
+ struct backing_dev_info *bdi = sb->s_bdi;
/* Nothing to do? */
- if (sb->s_bdi == &noop_backing_dev_info)
+ if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));
- bdi_queue_work(sb->s_bdi, &work);
- wait_for_completion(&done);
+ bdi_split_work_to_wbs(bdi, &work, false);
+ wb_wait_for_completion(bdi, &done);
wait_sb_inodes(sb);
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5ef05b5c4cff..8c5e2fa68835 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1445,9 +1445,9 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
list_del(&req->writepages_entry);
for (i = 0; i < req->num_pages; i++) {
- dec_bdi_stat(bdi, BDI_WRITEBACK);
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
- bdi_writeout_inc(bdi);
+ wb_writeout_inc(&bdi->wb);
}
wake_up(&fi->page_waitq);
}
@@ -1634,7 +1634,7 @@ static int fuse_writepage_locked(struct page *page)
req->end = fuse_writepage_end;
req->inode = inode;
- inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
spin_lock(&fc->lock);
@@ -1749,9 +1749,9 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
copy_highpage(old_req->pages[0], page);
spin_unlock(&fc->lock);
- dec_bdi_stat(bdi, BDI_WRITEBACK);
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK_TEMP);
- bdi_writeout_inc(bdi);
+ wb_writeout_inc(&bdi->wb);
fuse_writepage_free(fc, new_req);
fuse_request_free(new_req);
goto out;
@@ -1848,7 +1848,7 @@ static int fuse_writepages_fill(struct page *page,
req->page_descs[req->num_pages].offset = 0;
req->page_descs[req->num_pages].length = PAGE_SIZE;
- inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
err = 0;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c18b49dc5d4f..894fb01a91da 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -748,7 +748,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->sync_mode == WB_SYNC_ALL)
gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH);
- if (bdi->dirty_exceeded)
+ if (bdi->wb.dirty_exceeded)
gfs2_ail1_flush(sdp, wbc);
else
filemap_fdatawrite(metamapping);
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 410b65eea683..4574fdd3d421 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/nls.h>
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 593af2fdcc2d..7302d96ae8bf 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/vfs.h>
diff --git a/fs/inode.c b/fs/inode.c
index 6e342cadef81..a049dc467c1a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -223,6 +223,7 @@ EXPORT_SYMBOL(free_inode_nonrcu);
void __destroy_inode(struct inode *inode)
{
BUG_ON(inode_has_buffers(inode));
+ inode_detach_wb(inode);
security_inode_free(inode);
fsnotify_inode_delete(inode);
locks_free_lock_context(inode->i_flctx);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 1ba5c97943b8..cfbceb116356 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -845,9 +845,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
__func__, ret);
- /* Might as well let the VFS know */
- d_instantiate(new_dentry, d_inode(old_dentry));
- ihold(d_inode(old_dentry));
+ /*
+ * We can't keep the target in dcache after that.
+ * For one thing, we can't afford dentry aliases for directories.
+ * For another, if there was a victim, we _can't_ set new inode
+ * for that sucker and we have to trigger mount eviction - the
+ * caller won't do it on its own since we are returning an error.
+ */
+ d_invalidate(new_dentry);
new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
return ret;
}
diff --git a/fs/mpage.c b/fs/mpage.c
index 3e79220babac..ca0244b69de8 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -605,6 +605,8 @@ alloc_new:
bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
if (bio == NULL)
goto confused;
+
+ wbc_init_bio(wbc, bio);
}
/*
@@ -612,6 +614,7 @@ alloc_new:
* the confused fail path above (OOM) will be very confused when
* it finds all bh marked clean (i.e. it will not write anything)
*/
+ wbc_account_io(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
bio = mpage_bio_submit(WRITE, bio);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 80021c709af9..0c2632386f35 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -633,7 +633,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
d_rehash(newdent);
} else {
spin_lock(&dentry->d_lock);
- NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
+ NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE;
spin_unlock(&dentry->d_lock);
}
} else {
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index fb1fb2774d34..02ec07973bc4 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -32,6 +32,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
+#include <linux/backing-dev.h>
#include <linux/sunrpc/metrics.h>
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9e6475bc5ba2..7e3c4604bea8 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -607,7 +607,7 @@ void nfs_mark_page_unstable(struct page *page)
struct inode *inode = page_file_mapping(page)->host;
inc_zone_page_state(page, NR_UNSTABLE_NFS);
- inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d9851a6a2813..5051926f6356 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -853,7 +853,8 @@ static void
nfs_clear_page_commit(struct page *page)
{
dec_zone_page_state(page, NR_UNSTABLE_NFS);
- dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
+ dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
+ WB_RECLAIMABLE);
}
/* Called holding inode (/cinfo) lock */
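The NFS hunks are representative of the conversion applied throughout this merge: per-bdi statistics become per-wb statistics, with existing callers routed through the root wb embedded in the bdi. The before/after shape, side by side:

	/* before: counters hung off the backing_dev_info itself */
	inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE);

	/* after: counters hang off a bdi_writeback; &bdi->wb is the
	 * root wb, so non-cgroup callers behave exactly as before */
	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);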
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index dc3a9efdaab8..42468e5ab3e7 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -343,11 +343,6 @@ static void nilfs_end_bio_write(struct bio *bio, int err)
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct nilfs_segment_buffer *segbuf = bio->bi_private;
- if (err == -EOPNOTSUPP) {
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
- /* to be detected by nilfs_segbuf_submit_bio() */
- }
-
if (!uptodate)
atomic_inc(&segbuf->sb_err);
@@ -374,15 +369,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
bio->bi_end_io = nilfs_end_bio_write;
bio->bi_private = segbuf;
- bio_get(bio);
submit_bio(mode, bio);
segbuf->sb_nbio++;
- if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
- bio_put(bio);
- err = -EOPNOTSUPP;
- goto failed;
- }
- bio_put(bio);
wi->bio = NULL;
wi->rest_blocks -= wi->end - wi->start;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d8b670cbd909..8f1feca89fb0 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -37,6 +37,7 @@
#include <linux/falloc.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <cluster/masklog.h>
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 692ceda3bc21..a2b1d7ce3e1a 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -618,7 +618,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
- d_drop(dentry);
+ if (!err)
+ d_drop(dentry);
mutex_unlock(&dir->i_mutex);
return err;
@@ -903,6 +904,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
+ /*
+ * The old dentry now lives in a different location. Dentries in
+ * lowerstack are stale. We cannot drop them here because
+ * access to them is lockless. This can only be a pure upper
+ * or opaque directory - numlower is zero. Or an upper non-dir
+ * entry - its pureness is tracked by the opaque flag.
+ */
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index a1b069e5e363..e505b44a9184 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -66,6 +66,8 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
if (upperdentry) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
+ if (!err)
+ ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
mutex_unlock(&upperdentry->d_inode->i_mutex);
} else {
err = ovl_copy_up_last(dentry, attr, false);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index bd6d5c1e667d..39266655d2bd 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -76,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
if (oe->__upperdentry) {
type = __OVL_PATH_UPPER;
- if (oe->numlower) {
- if (S_ISDIR(dentry->d_inode->i_mode))
- type |= __OVL_PATH_MERGE;
- } else if (!oe->opaque) {
+ /*
+ * A non-dir dentry can hold a lower dentry from its previous
+ * location. Its purity depends only on the opaque flag.
+ */
+ if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode))
+ type |= __OVL_PATH_MERGE;
+ else if (!oe->opaque)
type |= __OVL_PATH_PURE;
- }
} else {
if (oe->numlower > 1)
type |= __OVL_PATH_MERGE;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index cf6fa25f884b..9a895b415711 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -21,6 +21,7 @@
#include "xattr.h"
#include <linux/init.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/quotaops.h>
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index dc33f9416340..250579a80d90 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -80,6 +80,7 @@
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/parser.h>
#include <linux/buffer_head.h>
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a56960dd1684..feb7e3f01e8e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1874,6 +1874,7 @@ xfs_vm_set_page_dirty(
loff_t end_offset;
loff_t offset;
int newly_dirty;
+ struct mem_cgroup *memcg;
if (unlikely(!mapping))
return !TestSetPageDirty(page);
@@ -1893,6 +1894,11 @@ xfs_vm_set_page_dirty(
offset += 1 << inode->i_blkbits;
} while (bh != head);
}
+ /*
+ * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized with
+ * per-memcg dirty page counters.
+ */
+ memcg = mem_cgroup_begin_page_stat(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
@@ -1903,13 +1909,15 @@ xfs_vm_set_page_dirty(
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, mapping);
+ account_page_dirtied(page, mapping, memcg);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
+ mem_cgroup_end_page_stat(memcg);
+ if (newly_dirty)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
}
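The xfs change follows the same rule the generic dirty path uses: the PageDirty transition and the dirty accounting must sit inside a mem_cgroup_begin_page_stat()/mem_cgroup_end_page_stat() pair, so the page's memcg binding cannot change mid-update. Stripped to its locking shape (a sketch, not the full function, and omitting the tree_lock handling shown above):

	struct mem_cgroup *memcg;
	int newly_dirty;

	memcg = mem_cgroup_begin_page_stat(page);	/* pin page->memcg */
	newly_dirty = !TestSetPageDirty(page);
	if (newly_dirty)
		account_page_dirtied(page, mapping, memcg);
	mem_cgroup_end_page_stat(memcg);
	if (newly_dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);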
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 3b7591224f4a..7c62fca53e2f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -41,6 +41,7 @@
#include <linux/dcache.h>
#include <linux/falloc.h>
#include <linux/pagevec.h>
+#include <linux/backing-dev.h>
static const struct vm_operations_struct xfs_file_vm_ops;
diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h
index a5de55c04fb2..e539f27ec51b 100644
--- a/include/asm-generic/early_ioremap.h
+++ b/include/asm-generic/early_ioremap.h
@@ -33,6 +33,12 @@ extern void early_ioremap_setup(void);
*/
extern void early_ioremap_reset(void);
+/*
+ * Early copy from unmapped memory to kernel mapped memory.
+ */
+extern void copy_from_early_mem(void *dest, phys_addr_t src,
+ unsigned long size);
+
#else
static inline void early_ioremap_init(void) { }
static inline void early_ioremap_setup(void) { }
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 8bd374d3cf21..6a3ea792aa83 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -246,6 +246,7 @@
.rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \
VMLINUX_SYMBOL(__start_rodata) = .; \
*(.rodata) *(.rodata.*) \
+ *(.data..ro_after_init) /* Read only after init */ \
*(__vermagic) /* Kernel version magic */ \
. = ALIGN(8); \
VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .; \
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
new file mode 100644
index 000000000000..a23209b43842
--- /dev/null
+++ b/include/linux/backing-dev-defs.h
@@ -0,0 +1,256 @@
+#ifndef __LINUX_BACKING_DEV_DEFS_H
+#define __LINUX_BACKING_DEV_DEFS_H
+
+#include <linux/list.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/percpu_counter.h>
+#include <linux/percpu-refcount.h>
+#include <linux/flex_proportions.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+
+struct page;
+struct device;
+struct dentry;
+
+/*
+ * Bits in bdi_writeback.state
+ */
+enum wb_state {
+ WB_registered, /* bdi_register() was done */
+ WB_writeback_running, /* Writeback is in progress */
+ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */
+};
+
+enum wb_congested_state {
+ WB_async_congested, /* The async (write) queue is getting full */
+ WB_sync_congested, /* The sync queue is getting full */
+};
+
+typedef int (congested_fn)(void *, int);
+
+enum wb_stat_item {
+ WB_RECLAIMABLE,
+ WB_WRITEBACK,
+ WB_DIRTIED,
+ WB_WRITTEN,
+ NR_WB_STAT_ITEMS
+};
+
+#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
+/*
+ * For cgroup writeback, multiple wb's may map to the same blkcg. Those
+ * wb's can operate mostly independently but should share the congested
+ * state. To facilitate such sharing, the congested state is tracked using
+ * the following struct which is created on demand, indexed by blkcg ID on
+ * its bdi, and refcounted.
+ */
+struct bdi_writeback_congested {
+ unsigned long state; /* WB_[a]sync_congested flags */
+ atomic_t refcnt; /* nr of attached wb's and blkg */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct backing_dev_info *bdi; /* the associated bdi */
+ int blkcg_id; /* ID of the associated blkcg */
+ struct rb_node rb_node; /* on bdi->cgwb_congested_tree */
+#endif
+};
+
+/*
+ * Each wb (bdi_writeback) can perform writeback operations, is measured
+ * and throttled, independently. Without cgroup writeback, each bdi
+ * (backing_dev_info) is served by its embedded bdi->wb.
+ *
+ * On the default hierarchy, blkcg implicitly enables memcg. This allows
+ * using memcg's page ownership for attributing writeback IOs, and every
+ * memcg - blkcg combination can be served by its own wb by assigning a
+ * dedicated wb to each memcg, which enables isolation across different
+ * cgroups and propagation of IO back pressure down from the IO layer up to
+ * the tasks which are generating the dirty pages to be written back.
+ *
+ * A cgroup wb is indexed on its bdi by the ID of the associated memcg,
+ * refcounted with the number of inodes attached to it, and pins the memcg
+ * and the corresponding blkcg. As the corresponding blkcg for a memcg may
+ * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
+ * is tested for blkcg after lookup and removed from index on mismatch so
+ * that a new wb for the combination can be created.
+ */
+struct bdi_writeback {
+ struct backing_dev_info *bdi; /* our parent bdi */
+
+ unsigned long state; /* Always use atomic bitops on this */
+ unsigned long last_old_flush; /* last old data flush */
+
+ struct list_head b_dirty; /* dirty inodes */
+ struct list_head b_io; /* parked for writeback */
+ struct list_head b_more_io; /* parked for more writeback */
+ struct list_head b_dirty_time; /* time stamps are dirty */
+ spinlock_t list_lock; /* protects the b_* lists */
+
+ struct percpu_counter stat[NR_WB_STAT_ITEMS];
+
+ struct bdi_writeback_congested *congested;
+
+ unsigned long bw_time_stamp; /* last time write bw is updated */
+ unsigned long dirtied_stamp;
+ unsigned long written_stamp; /* pages written at bw_time_stamp */
+ unsigned long write_bandwidth; /* the estimated write bandwidth */
+ unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */
+
+ /*
+ * The base dirty throttle rate, recalculated every 200ms.
+ * All the bdi tasks' dirty rate will be curbed under it.
+ * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
+ * in small steps and is much smoother and more stable than the latter.
+ */
+ unsigned long dirty_ratelimit;
+ unsigned long balanced_dirty_ratelimit;
+
+ struct fprop_local_percpu completions;
+ int dirty_exceeded;
+
+ spinlock_t work_lock; /* protects work_list & dwork scheduling */
+ struct list_head work_list;
+ struct delayed_work dwork; /* work item used for writeback */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct percpu_ref refcnt; /* used only for !root wb's */
+ struct fprop_local_percpu memcg_completions;
+ struct cgroup_subsys_state *memcg_css; /* the associated memcg */
+ struct cgroup_subsys_state *blkcg_css; /* and blkcg */
+ struct list_head memcg_node; /* anchored at memcg->cgwb_list */
+ struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
+
+ union {
+ struct work_struct release_work;
+ struct rcu_head rcu;
+ };
+#endif
+};
+
+struct backing_dev_info {
+ struct list_head bdi_list;
+ unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
+ unsigned int capabilities; /* Device capabilities */
+ congested_fn *congested_fn; /* Function pointer if device is md/dm */
+ void *congested_data; /* Pointer to aux data for congested func */
+
+ char *name;
+
+ unsigned int min_ratio;
+ unsigned int max_ratio, max_prop_frac;
+
+ /*
+ * Sum of avg_write_bw of wbs with dirty inodes. > 0 if there are
+ * any dirty wbs, which is depended upon by bdi_has_dirty_io().
+ */
+ atomic_long_t tot_write_bandwidth;
+
+ struct bdi_writeback wb; /* the root writeback info for this bdi */
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
+ struct rb_root cgwb_congested_tree; /* their congested states */
+ atomic_t usage_cnt; /* counts both cgwbs and cgwb_congested's */
+#else
+ struct bdi_writeback_congested *wb_congested;
+#endif
+ wait_queue_head_t wb_waitq;
+
+ struct device *dev;
+
+ struct timer_list laptop_mode_wb_timer;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debug_dir;
+ struct dentry *debug_stats;
+#endif
+};
+
+enum {
+ BLK_RW_ASYNC = 0,
+ BLK_RW_SYNC = 1,
+};
+
+void clear_wb_congested(struct bdi_writeback_congested *congested, int sync);
+void set_wb_congested(struct bdi_writeback_congested *congested, int sync);
+
+static inline void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+ clear_wb_congested(bdi->wb.congested, sync);
+}
+
+static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+ set_wb_congested(bdi->wb.congested, sync);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/**
+ * wb_tryget - try to increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+ if (wb != &wb->bdi->wb)
+ return percpu_ref_tryget(&wb->refcnt);
+ return true;
+}
+
+/**
+ * wb_get - increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline void wb_get(struct bdi_writeback *wb)
+{
+ if (wb != &wb->bdi->wb)
+ percpu_ref_get(&wb->refcnt);
+}
+
+/**
+ * wb_put - decrement a wb's refcount
+ * @wb: bdi_writeback to put
+ */
+static inline void wb_put(struct bdi_writeback *wb)
+{
+ if (wb != &wb->bdi->wb)
+ percpu_ref_put(&wb->refcnt);
+}
+
+/**
+ * wb_dying - is a wb dying?
+ * @wb: bdi_writeback of interest
+ *
+ * Returns whether @wb is unlinked and being drained.
+ */
+static inline bool wb_dying(struct bdi_writeback *wb)
+{
+ return percpu_ref_is_dying(&wb->refcnt);
+}
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+ return true;
+}
+
+static inline void wb_get(struct bdi_writeback *wb)
+{
+}
+
+static inline void wb_put(struct bdi_writeback *wb)
+{
+}
+
+static inline bool wb_dying(struct bdi_writeback *wb)
+{
+ return false;
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
+#endif /* __LINUX_BACKING_DEV_DEFS_H */
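wb_tryget()/wb_get()/wb_put() give cgroup wbs a percpu-ref lifetime while leaving the embedded root wb, which is freed only with its bdi, outside refcounting. The canonical lookup-and-pin pattern, the same one wb_get_create_current() in backing-dev.h uses, sketched:

	struct bdi_writeback *wb;

	rcu_read_lock();
	wb = wb_find_current(bdi);	/* declared in backing-dev.h */
	if (wb && !wb_tryget(wb))	/* lost a race with wb shutdown */
		wb = NULL;
	rcu_read_unlock();

	if (wb) {
		/* ... queue or account writeback against wb ... */
		wb_put(wb);
	}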
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index d87d8eced064..0fe9df983ab7 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -8,106 +8,14 @@
#ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H
-#include <linux/percpu_counter.h>
-#include <linux/log2.h>
-#include <linux/flex_proportions.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/sched.h>
-#include <linux/timer.h>
+#include <linux/blkdev.h>
#include <linux/writeback.h>
-#include <linux/atomic.h>
-#include <linux/sysctl.h>
-#include <linux/workqueue.h>
-
-struct page;
-struct device;
-struct dentry;
-
-/*
- * Bits in backing_dev_info.state
- */
-enum bdi_state {
- BDI_async_congested, /* The async (write) queue is getting full */
- BDI_sync_congested, /* The sync queue is getting full */
- BDI_registered, /* bdi_register() was done */
- BDI_writeback_running, /* Writeback is in progress */
-};
-
-typedef int (congested_fn)(void *, int);
-
-enum bdi_stat_item {
- BDI_RECLAIMABLE,
- BDI_WRITEBACK,
- BDI_DIRTIED,
- BDI_WRITTEN,
- NR_BDI_STAT_ITEMS
-};
-
-#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
-
-struct bdi_writeback {
- struct backing_dev_info *bdi; /* our parent bdi */
-
- unsigned long last_old_flush; /* last old data flush */
-
- struct delayed_work dwork; /* work item used for writeback */
- struct list_head b_dirty; /* dirty inodes */
- struct list_head b_io; /* parked for writeback */
- struct list_head b_more_io; /* parked for more writeback */
- struct list_head b_dirty_time; /* time stamps are dirty */
- spinlock_t list_lock; /* protects the b_* lists */
-};
-
-struct backing_dev_info {
- struct list_head bdi_list;
- unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
- unsigned long state; /* Always use atomic bitops on this */
- unsigned int capabilities; /* Device capabilities */
- congested_fn *congested_fn; /* Function pointer if device is md/dm */
- void *congested_data; /* Pointer to aux data for congested func */
-
- char *name;
-
- struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
-
- unsigned long bw_time_stamp; /* last time write bw is updated */
- unsigned long dirtied_stamp;
- unsigned long written_stamp; /* pages written at bw_time_stamp */
- unsigned long write_bandwidth; /* the estimated write bandwidth */
- unsigned long avg_write_bandwidth; /* further smoothed write bw */
-
- /*
- * The base dirty throttle rate, re-calculated on every 200ms.
- * All the bdi tasks' dirty rate will be curbed under it.
- * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
- * in small steps and is much more smooth/stable than the latter.
- */
- unsigned long dirty_ratelimit;
- unsigned long balanced_dirty_ratelimit;
-
- struct fprop_local_percpu completions;
- int dirty_exceeded;
-
- unsigned int min_ratio;
- unsigned int max_ratio, max_prop_frac;
-
- struct bdi_writeback wb; /* default writeback info for this bdi */
- spinlock_t wb_lock; /* protects work_list & wb.dwork scheduling */
-
- struct list_head work_list;
-
- struct device *dev;
-
- struct timer_list laptop_mode_wb_timer;
-
-#ifdef CONFIG_DEBUG_FS
- struct dentry *debug_dir;
- struct dentry *debug_stats;
-#endif
-};
-
-struct backing_dev_info *inode_to_bdi(struct inode *inode);
+#include <linux/blk-cgroup.h>
+#include <linux/backing-dev-defs.h>
+#include <linux/slab.h>
int __must_check bdi_init(struct backing_dev_info *bdi);
void bdi_destroy(struct backing_dev_info *bdi);
@@ -117,97 +25,99 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...);
int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- enum wb_reason reason);
-void bdi_start_background_writeback(struct backing_dev_info *bdi);
-void bdi_writeback_workfn(struct work_struct *work);
-int bdi_has_dirty_io(struct backing_dev_info *bdi);
-void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
+void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+ bool range_cyclic, enum wb_reason reason);
+void wb_start_background_writeback(struct bdi_writeback *wb);
+void wb_workfn(struct work_struct *work);
+void wb_wakeup_delayed(struct bdi_writeback *wb);
extern spinlock_t bdi_lock;
extern struct list_head bdi_list;
extern struct workqueue_struct *bdi_wq;
-static inline int wb_has_dirty_io(struct bdi_writeback *wb)
+static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
{
- return !list_empty(&wb->b_dirty) ||
- !list_empty(&wb->b_io) ||
- !list_empty(&wb->b_more_io);
+ return test_bit(WB_has_dirty_io, &wb->state);
+}
+
+static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+ /*
+ * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are
+ * any dirty wbs. See wb_update_write_bandwidth().
+ */
+ return atomic_long_read(&bdi->tot_write_bandwidth);
}
-static inline void __add_bdi_stat(struct backing_dev_info *bdi,
- enum bdi_stat_item item, s64 amount)
+static inline void __add_wb_stat(struct bdi_writeback *wb,
+ enum wb_stat_item item, s64 amount)
{
- __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH);
+ __percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH);
}
-static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
- enum bdi_stat_item item)
+static inline void __inc_wb_stat(struct bdi_writeback *wb,
+ enum wb_stat_item item)
{
- __add_bdi_stat(bdi, item, 1);
+ __add_wb_stat(wb, item, 1);
}
-static inline void inc_bdi_stat(struct backing_dev_info *bdi,
- enum bdi_stat_item item)
+static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
unsigned long flags;
local_irq_save(flags);
- __inc_bdi_stat(bdi, item);
+ __inc_wb_stat(wb, item);
local_irq_restore(flags);
}
-static inline void __dec_bdi_stat(struct backing_dev_info *bdi,
- enum bdi_stat_item item)
+static inline void __dec_wb_stat(struct bdi_writeback *wb,
+ enum wb_stat_item item)
{
- __add_bdi_stat(bdi, item, -1);
+ __add_wb_stat(wb, item, -1);
}
-static inline void dec_bdi_stat(struct backing_dev_info *bdi,
- enum bdi_stat_item item)
+static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
unsigned long flags;
local_irq_save(flags);
- __dec_bdi_stat(bdi, item);
+ __dec_wb_stat(wb, item);
local_irq_restore(flags);
}
-static inline s64 bdi_stat(struct backing_dev_info *bdi,
- enum bdi_stat_item item)
+static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
- return percpu_counter_read_positive(&bdi->bdi_stat[item]);
+ return percpu_counter_read_positive(&wb->stat[item]);
}
-static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
- enum bdi_stat_item item)
+static inline s64 __wb_stat_sum(struct bdi_writeback *wb,
+ enum wb_stat_item item)
{
- return percpu_counter_sum_positive(&bdi->bdi_stat[item]);
+ return percpu_counter_sum_positive(&wb->stat[item]);
}
-static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
- enum bdi_stat_item item)
+static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item)
{
s64 sum;
unsigned long flags;
local_irq_save(flags);
- sum = __bdi_stat_sum(bdi, item);
+ sum = __wb_stat_sum(wb, item);
local_irq_restore(flags);
return sum;
}
-extern void bdi_writeout_inc(struct backing_dev_info *bdi);
+extern void wb_writeout_inc(struct bdi_writeback *wb);
/*
* maximal error of a stat counter.
*/
-static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
+static inline unsigned long wb_stat_error(struct bdi_writeback *wb)
{
#ifdef CONFIG_SMP
- return nr_cpu_ids * BDI_STAT_BATCH;
+ return nr_cpu_ids * WB_STAT_BATCH;
#else
return 1;
#endif
@@ -231,50 +141,57 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
* BDI_CAP_NO_WRITEBACK: Don't write pages back
* BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages
* BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold.
+ *
+ * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback.
*/
#define BDI_CAP_NO_ACCT_DIRTY 0x00000001
#define BDI_CAP_NO_WRITEBACK 0x00000002
#define BDI_CAP_NO_ACCT_WB 0x00000004
#define BDI_CAP_STABLE_WRITES 0x00000008
#define BDI_CAP_STRICTLIMIT 0x00000010
+#define BDI_CAP_CGROUP_WRITEBACK 0x00000020
#define BDI_CAP_NO_ACCT_AND_WRITEBACK \
(BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
extern struct backing_dev_info noop_backing_dev_info;
-int writeback_in_progress(struct backing_dev_info *bdi);
-
-static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
+/**
+ * writeback_in_progress - determine whether there is writeback in progress
+ * @wb: bdi_writeback of interest
+ *
+ * Determine whether there is writeback waiting to be handled against a
+ * bdi_writeback.
+ */
+static inline bool writeback_in_progress(struct bdi_writeback *wb)
{
- if (bdi->congested_fn)
- return bdi->congested_fn(bdi->congested_data, bdi_bits);
- return (bdi->state & bdi_bits);
+ return test_bit(WB_writeback_running, &wb->state);
}
-static inline int bdi_read_congested(struct backing_dev_info *bdi)
+static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
- return bdi_congested(bdi, 1 << BDI_sync_congested);
-}
+ struct super_block *sb;
-static inline int bdi_write_congested(struct backing_dev_info *bdi)
-{
- return bdi_congested(bdi, 1 << BDI_async_congested);
+ if (!inode)
+ return &noop_backing_dev_info;
+
+ sb = inode->i_sb;
+#ifdef CONFIG_BLOCK
+ if (sb_is_blkdev_sb(sb))
+ return blk_get_backing_dev_info(I_BDEV(inode));
+#endif
+ return sb->s_bdi;
}
-static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
{
- return bdi_congested(bdi, (1 << BDI_sync_congested) |
- (1 << BDI_async_congested));
-}
+ struct backing_dev_info *bdi = wb->bdi;
-enum {
- BLK_RW_ASYNC = 0,
- BLK_RW_SYNC = 1,
-};
+ if (bdi->congested_fn)
+ return bdi->congested_fn(bdi->congested_data, cong_bits);
+ return wb->congested->state & cong_bits;
+}
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
-void set_bdi_congested(struct backing_dev_info *bdi, int sync);
long congestion_wait(int sync, long timeout);
long wait_iff_congested(struct zone *zone, int sync, long timeout);
int pdflush_proc_obsolete(struct ctl_table *table, int write,
@@ -318,4 +235,336 @@ static inline int bdi_sched_wait(void *word)
return 0;
}
-#endif /* _LINUX_BACKING_DEV_H */
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp);
+void wb_congested_put(struct bdi_writeback_congested *congested);
+struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+ struct cgroup_subsys_state *memcg_css,
+ gfp_t gfp);
+void wb_memcg_offline(struct mem_cgroup *memcg);
+void wb_blkcg_offline(struct blkcg *blkcg);
+int inode_congested(struct inode *inode, int cong_bits);
+
+/**
+ * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
+ * @inode: inode of interest
+ *
+ * cgroup writeback requires support from both the bdi and filesystem.
+ * Test whether @inode has both.
+ */
+static inline bool inode_cgwb_enabled(struct inode *inode)
+{
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+
+ return bdi_cap_account_dirty(bdi) &&
+ (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
+ (inode->i_sb->s_iflags & SB_I_CGROUPWB);
+}
+
+/**
+ * wb_find_current - find wb for %current on a bdi
+ * @bdi: bdi of interest
+ *
+ * Find the wb of @bdi which matches both the memcg and blkcg of %current.
+ * Must be called under rcu_read_lock(), which protects the returned wb.
+ * Returns NULL if not found.
+ */
+static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+{
+ struct cgroup_subsys_state *memcg_css;
+ struct bdi_writeback *wb;
+
+ memcg_css = task_css(current, memory_cgrp_id);
+ if (!memcg_css->parent)
+ return &bdi->wb;
+
+ wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+
+ /*
+ * %current's blkcg equals the effective blkcg of its memcg. No
+ * need to use the relatively expensive cgroup_get_e_css().
+ */
+ if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+ return wb;
+ return NULL;
+}
+
+/**
+ * wb_get_create_current - get or create wb for %current on a bdi
+ * @bdi: bdi of interest
+ * @gfp: allocation mask
+ *
+ * Equivalent to wb_get_create() on %current's memcg. This function is
+ * called from a relatively hot path and optimizes the common cases using
+ * wb_find_current().
+ */
+static inline struct bdi_writeback *
+wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+{
+ struct bdi_writeback *wb;
+
+ rcu_read_lock();
+ wb = wb_find_current(bdi);
+ if (wb && unlikely(!wb_tryget(wb)))
+ wb = NULL;
+ rcu_read_unlock();
+
+ if (unlikely(!wb)) {
+ struct cgroup_subsys_state *memcg_css;
+
+ memcg_css = task_get_css(current, memory_cgrp_id);
+ wb = wb_get_create(bdi, memcg_css, gfp);
+ css_put(memcg_css);
+ }
+ return wb;
+}
+
+/**
+ * inode_to_wb_is_valid - test whether an inode has a wb associated
+ * @inode: inode of interest
+ *
+ * Returns %true if @inode has a wb associated. May be called without any
+ * locking.
+ */
+static inline bool inode_to_wb_is_valid(struct inode *inode)
+{
+ return inode->i_wb;
+}
+
+/**
+ * inode_to_wb - determine the wb of an inode
+ * @inode: inode of interest
+ *
+ * Returns the wb @inode is currently associated with. The caller must be
+ * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the
+ * associated wb's list_lock.
+ */
+static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+{
+#ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(debug_locks &&
+ (!lockdep_is_held(&inode->i_lock) &&
+ !lockdep_is_held(&inode->i_mapping->tree_lock) &&
+ !lockdep_is_held(&inode->i_wb->list_lock)));
+#endif
+ return inode->i_wb;
+}
+
+/**
+ * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
+ * @inode: target inode
+ * @lockedp: temp bool output param, to be passed to the end function
+ *
+ * The caller wants to access the wb associated with @inode but isn't
+ * holding inode->i_lock, mapping->tree_lock or wb->list_lock. This
+ * function determines the wb associated with @inode and ensures that the
+ * association doesn't change until the transaction is finished with
+ * unlocked_inode_to_wb_end().
+ *
+ * The caller must call unlocked_inode_to_wb_end() with *@lockedp
+ * afterwards and can't sleep during the transaction. IRQs may or may not be
+ * disabled on return.
+ */
+static inline struct bdi_writeback *
+unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+{
+ rcu_read_lock();
+
+ /*
+ * Paired with store_release in inode_switch_wb_work_fn() and
+ * ensures that we see the new wb if we see cleared I_WB_SWITCH.
+ */
+ *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+
+ if (unlikely(*lockedp))
+ spin_lock_irq(&inode->i_mapping->tree_lock);
+
+ /*
+ * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock.
+ * inode_to_wb() would trip its lockdep check here, so deref directly.
+ */
+ return inode->i_wb;
+}
+
+/**
+ * unlocked_inode_to_wb_end - end inode wb access transaction
+ * @inode: target inode
+ * @locked: *@lockedp from unlocked_inode_to_wb_begin()
+ */
+static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+{
+ if (unlikely(locked))
+ spin_unlock_irq(&inode->i_mapping->tree_lock);
+
+ rcu_read_unlock();
+}
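A lockless caller uses the pair as a bracket around its wb accesses; modelled on the stat-update callers elsewhere in this merge (sketch):

	bool locked;
	struct bdi_writeback *wb;

	wb = unlocked_inode_to_wb_begin(inode, &locked);
	dec_wb_stat(wb, WB_RECLAIMABLE);	/* wb association is stable here */
	unlocked_inode_to_wb_end(inode, locked);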
+
+struct wb_iter {
+ int start_blkcg_id;
+ struct radix_tree_iter tree_iter;
+ void **slot;
+};
+
+static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
+ struct backing_dev_info *bdi)
+{
+ struct radix_tree_iter *titer = &iter->tree_iter;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (iter->start_blkcg_id >= 0) {
+ iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id);
+ iter->start_blkcg_id = -1;
+ } else {
+ iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
+ }
+
+ if (!iter->slot)
+ iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
+ if (iter->slot)
+ return *iter->slot;
+ return NULL;
+}
+
+static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
+ struct backing_dev_info *bdi,
+ int start_blkcg_id)
+{
+ iter->start_blkcg_id = start_blkcg_id;
+
+ if (start_blkcg_id)
+ return __wb_iter_next(iter, bdi);
+ else
+ return &bdi->wb;
+}
+
+/**
+ * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order
+ * @wb_cur: cursor struct bdi_writeback pointer
+ * @bdi: bdi to walk wb's of
+ * @iter: pointer to struct wb_iter to be used as iteration buffer
+ * @start_blkcg_id: blkcg ID to start iteration from
+ *
+ * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
+ * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter
+ * to be used as temp storage during iteration. rcu_read_lock() must be
+ * held throughout iteration.
+ */
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \
+ for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \
+ (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static inline bool inode_cgwb_enabled(struct inode *inode)
+{
+ return false;
+}
+
+static inline struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+{
+ atomic_inc(&bdi->wb_congested->refcnt);
+ return bdi->wb_congested;
+}
+
+static inline void wb_congested_put(struct bdi_writeback_congested *congested)
+{
+ if (atomic_dec_and_test(&congested->refcnt))
+ kfree(congested);
+}
+
+static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+{
+ return &bdi->wb;
+}
+
+static inline struct bdi_writeback *
+wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+{
+ return &bdi->wb;
+}
+
+static inline bool inode_to_wb_is_valid(struct inode *inode)
+{
+ return true;
+}
+
+static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+{
+ return &inode_to_bdi(inode)->wb;
+}
+
+static inline struct bdi_writeback *
+unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+{
+ return inode_to_wb(inode);
+}
+
+static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+{
+}
+
+static inline void wb_memcg_offline(struct mem_cgroup *memcg)
+{
+}
+
+static inline void wb_blkcg_offline(struct blkcg *blkcg)
+{
+}
+
+struct wb_iter {
+ int next_id;
+};
+
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \
+ for ((iter)->next_id = (start_blkcg_id); \
+ ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
+
+static inline int inode_congested(struct inode *inode, int cong_bits)
+{
+ return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
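bdi_for_each_wb() is what bdi-wide operations use to reach every cgroup wb; the walk must stay under rcu_read_lock() throughout. An illustrative flusher-style walk (the wakeup reason here is chosen arbitrarily for the example):

	struct bdi_writeback *wb;
	struct wb_iter iter;

	rcu_read_lock();
	bdi_for_each_wb(wb, bdi, &iter, 0)
		if (wb_has_dirty_io(wb))
			wb_start_writeback(wb, wb_stat(wb, WB_RECLAIMABLE),
					   true, WB_REASON_LAPTOP_TIMER);
	rcu_read_unlock();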
+
+static inline int inode_read_congested(struct inode *inode)
+{
+ return inode_congested(inode, 1 << WB_sync_congested);
+}
+
+static inline int inode_write_congested(struct inode *inode)
+{
+ return inode_congested(inode, 1 << WB_async_congested);
+}
+
+static inline int inode_rw_congested(struct inode *inode)
+{
+ return inode_congested(inode, (1 << WB_sync_congested) |
+ (1 << WB_async_congested));
+}
+
+static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits)
+{
+ return wb_congested(&bdi->wb, cong_bits);
+}
+
+static inline int bdi_read_congested(struct backing_dev_info *bdi)
+{
+ return bdi_congested(bdi, 1 << WB_sync_congested);
+}
+
+static inline int bdi_write_congested(struct backing_dev_info *bdi)
+{
+ return bdi_congested(bdi, 1 << WB_async_congested);
+}
+
+static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+{
+ return bdi_congested(bdi, (1 << WB_sync_congested) |
+ (1 << WB_async_congested));
+}
+
+#endif /* _LINUX_BACKING_DEV_H */
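Net effect for congestion checks: callers that have an inode should prefer the new inode_*_congested() helpers, which resolve to the inode's cgroup wb when cgroup writeback is enabled and degrade to the root wb otherwise; the bdi_*_congested() forms remain for per-device checks. For example (sketch):

	/* throttle optional readahead against the cgroup doing the IO */
	if (inode_read_congested(inode))
		return;

	/* legacy per-device check, now answered by the root wb */
	if (bdi_write_congested(inode_to_bdi(inode)))
		congestion_wait(BLK_RW_ASYNC, HZ / 10);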
diff --git a/include/linux/bio.h b/include/linux/bio.h
index da3a127c9958..cbc5d1d0e47c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -469,9 +469,12 @@ extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
extern unsigned int bvec_nr_vecs(unsigned short idx);
#ifdef CONFIG_BLK_CGROUP
+int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
int bio_associate_current(struct bio *bio);
void bio_disassociate_task(struct bio *bio);
#else /* CONFIG_BLK_CGROUP */
+static inline int bio_associate_blkcg(struct bio *bio,
+ struct cgroup_subsys_state *blkcg_css) { return 0; }
static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
static inline void bio_disassociate_task(struct bio *bio) { }
#endif /* CONFIG_BLK_CGROUP */
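bio_associate_blkcg() is the primitive the writeback plumbing builds on: it pins a blkcg css to the bio so the block layer charges the right group. Direct use looks like this (sketch; assumes wb->blkcg_css stays valid for the bio's lifetime, as it does for a pinned wb):

	struct bio *bio = bio_alloc(GFP_NOFS, 1);

	if (bio)
		bio_associate_blkcg(bio, wb->blkcg_css); /* stub returns 0 without CONFIG_BLK_CGROUP */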
diff --git a/block/blk-cgroup.h b/include/linux/blk-cgroup.h
index c567865b5f1d..07a32b813ed8 100644
--- a/block/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -53,6 +53,10 @@ struct blkcg {
/* TODO: per-policy storage in blkcg */
unsigned int cfq_weight; /* belongs to cfq */
unsigned int cfq_leaf_weight;
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head cgwb_list;
+#endif
};
struct blkg_stat {
@@ -95,6 +99,12 @@ struct blkcg_gq {
struct hlist_node blkcg_node;
struct blkcg *blkcg;
+ /*
+ * Each blkg gets congested separately and the congestion state is
+ * propagated to the matching bdi_writeback_congested.
+ */
+ struct bdi_writeback_congested *wb_congested;
+
/* all non-root blkcg_gq's are guaranteed to have access to parent */
struct blkcg_gq *parent;
@@ -134,6 +144,7 @@ struct blkcg_policy {
};
extern struct blkcg blkcg_root;
+extern struct cgroup_subsys_state * const blkcg_root_css;
struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
@@ -194,6 +205,12 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
return task_blkcg(current);
}
+static inline struct cgroup_subsys_state *
+task_get_blkcg_css(struct task_struct *task)
+{
+ return task_get_css(task, blkio_cgrp_id);
+}
+
/**
* blkcg_parent - get the parent of a blkcg
* @blkcg: blkcg of interest
@@ -558,8 +575,8 @@ static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
#else /* CONFIG_BLK_CGROUP */
-struct cgroup;
-struct blkcg;
+struct blkcg {
+};
struct blkg_policy_data {
};
@@ -570,6 +587,16 @@ struct blkcg_gq {
struct blkcg_policy {
};
+#define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
+
+static inline struct cgroup_subsys_state *
+task_get_blkcg_css(struct task_struct *task)
+{
+ return NULL;
+}
+
+#ifdef CONFIG_BLOCK
+
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
static inline void blkcg_drain_queue(struct request_queue *q) { }
@@ -599,5 +626,6 @@ static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q
#define blk_queue_for_each_rl(rl, q) \
for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
+#endif /* CONFIG_BLOCK */
#endif /* CONFIG_BLK_CGROUP */
#endif /* _BLK_CGROUP_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index b7299febc4b4..7fda78fc098a 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -118,7 +118,6 @@ struct bio {
#define BIO_CLONED 4 /* doesn't own data */
#define BIO_BOUNCED 5 /* bio is a bounce bio */
#define BIO_USER_MAPPED 6 /* contains user pages */
-#define BIO_EOPNOTSUPP 7 /* not supported */
#define BIO_NULL_MAPPED 8 /* contains invalid user pages */
#define BIO_QUIET 9 /* Make BIO Quiet */
#define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 37faf63af7f7..5f0f608bf07e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -12,7 +12,7 @@
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/pagemap.h>
-#include <linux/backing-dev.h>
+#include <linux/backing-dev-defs.h>
#include <linux/wait.h>
#include <linux/mempool.h>
#include <linux/bio.h>
@@ -822,25 +822,6 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
struct scsi_ioctl_command __user *);
-/*
- * A queue has just exited congestion. Note this in the global counter of
- * congested queues, and wake up anyone who was waiting for requests to be
- * put back.
- */
-static inline void blk_clear_queue_congested(struct request_queue *q, int sync)
-{
- clear_bdi_congested(&q->backing_dev_info, sync);
-}
-
-/*
- * A queue has just entered congestion. Flag that in the queue's VM-visible
- * state flags and increment the global counter of congested queues.
- */
-static inline void blk_set_queue_congested(struct request_queue *q, int sync)
-{
- set_bdi_congested(&q->backing_dev_info, sync);
-}
-
extern void blk_start_queue(struct request_queue *q);
extern void blk_stop_queue(struct request_queue *q);
extern void blk_sync_queue(struct request_queue *q);
diff --git a/include/linux/cache.h b/include/linux/cache.h
index 17e7e82d2aa7..1be04f8c563a 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -12,10 +12,24 @@
#define SMP_CACHE_BYTES L1_CACHE_BYTES
#endif
+/*
+ * __read_mostly is used to keep rarely changing variables out of frequently
+ * updated cachelines. If an architecture doesn't support it, ignore the
+ * hint.
+ */
#ifndef __read_mostly
#define __read_mostly
#endif
+/*
+ * __ro_after_init is used to mark things that are read-only after init (i.e.
+ * after mark_rodata_ro() has been called). These are effectively read-only,
+ * but may get written to during init, so can't live in .rodata (via "const").
+ */
+#ifndef __ro_after_init
+#define __ro_after_init __attribute__((__section__(".data..ro_after_init")))
+#endif
+
#ifndef ____cacheline_aligned
#define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
#endif
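Declaring a variable __ro_after_init is the whole API; it may be written freely until mark_rodata_ro() runs, typically from an __init or __setup path. A hedged example (foo_limit and its boot parameter are hypothetical):

	static unsigned long foo_limit __ro_after_init;

	static int __init foo_limit_setup(char *str)
	{
		foo_limit = simple_strtoul(str, NULL, 0);	/* last legal write */
		return 1;
	}
	__setup("foo_limit=", foo_limit_setup);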
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7f08633d839a..e3f81b7fcee3 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -325,6 +325,31 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
}
/**
+ * task_get_css - find and get the css for (task, subsys)
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * Find the css for the (@task, @subsys_id) combination, increment a
+ * reference on it and return it. This function is guaranteed to return a
+ * valid css.
+ */
+static inline struct cgroup_subsys_state *
+task_get_css(struct task_struct *task, int subsys_id)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+ while (true) {
+ css = task_css(task, subsys_id);
+ if (likely(css_tryget_online(css)))
+ break;
+ cpu_relax();
+ }
+ rcu_read_unlock();
+ return css;
+}
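Every successful task_get_css() must be balanced by css_put(); wb_get_create_current() in backing-dev.h is one such caller. Minimal pattern:

	struct cgroup_subsys_state *css;

	css = task_get_css(current, memory_cgrp_id);
	/* css is pinned and was online at lookup time */
	css_put(css);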
+
+/**
* task_css_is_root - test whether a task belongs to the root css
* @task: the target task
* @subsys_id: the target subsystem ID
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 2ee4888c1f47..26509b509569 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -574,6 +574,8 @@ ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf);
int cpufreq_boost_trigger_state(int state);
int cpufreq_boost_supported(void);
int cpufreq_boost_enabled(void);
+int cpufreq_enable_boost_support(void);
+bool policy_has_boost_freq(struct cpufreq_policy *policy);
#else
static inline int cpufreq_boost_trigger_state(int state)
{
@@ -587,12 +589,23 @@ static inline int cpufreq_boost_enabled(void)
{
return 0;
}
+
+static inline int cpufreq_enable_boost_support(void)
+{
+ return -EINVAL;
+}
+
+static inline bool policy_has_boost_freq(struct cpufreq_policy *policy)
+{
+ return false;
+}
#endif
/* the following function is for cpufreq core use only */
struct cpufreq_frequency_table *cpufreq_frequency_get_table(unsigned int cpu);
/* the following are really really optional */
extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;
+extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs;
extern struct freq_attr *cpufreq_generic_attr[];
int cpufreq_table_validate_and_show(struct cpufreq_policy *policy,
struct cpufreq_frequency_table *table);
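A driver whose frequency table carries boost entries would wire these up roughly as follows (hypothetical foo_* names; error handling trimmed):

	static int foo_cpufreq_init(struct cpufreq_policy *policy)
	{
		int ret = cpufreq_table_validate_and_show(policy, foo_freq_table);

		if (ret)
			return ret;
		if (policy_has_boost_freq(policy))
			ret = cpufreq_enable_boost_support();
		return ret;
	}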
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index cb25af461054..fe5b1169a6a7 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -80,6 +80,8 @@ struct dentry *debugfs_create_u32(const char *name, umode_t mode,
struct dentry *parent, u32 *value);
struct dentry *debugfs_create_u64(const char *name, umode_t mode,
struct dentry *parent, u64 *value);
+struct dentry *debugfs_create_ulong(const char *name, umode_t mode,
+ struct dentry *parent, unsigned long *value);
struct dentry *debugfs_create_x8(const char *name, umode_t mode,
struct dentry *parent, u8 *value);
struct dentry *debugfs_create_x16(const char *name, umode_t mode,
@@ -93,7 +95,7 @@ struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
struct dentry *parent, atomic_t *value);
struct dentry *debugfs_create_bool(const char *name, umode_t mode,
- struct dentry *parent, u32 *value);
+ struct dentry *parent, bool *value);
struct dentry *debugfs_create_blob(const char *name, umode_t mode,
struct dentry *parent,
@@ -117,6 +119,12 @@ struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name,
bool debugfs_initialized(void);
+ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos);
+
+ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos);
+
#else
#include <linux/err.h>
@@ -238,7 +246,7 @@ static inline struct dentry *debugfs_create_atomic_t(const char *name, umode_t m
static inline struct dentry *debugfs_create_bool(const char *name, umode_t mode,
struct dentry *parent,
- u32 *value)
+ bool *value)
{
return ERR_PTR(-ENODEV);
}
@@ -283,6 +291,20 @@ static inline struct dentry *debugfs_create_devm_seqfile(struct device *dev,
return ERR_PTR(-ENODEV);
}
+static inline ssize_t debugfs_read_file_bool(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ return -ENODEV;
+}
+
+static inline ssize_t debugfs_write_file_bool(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ return -ENODEV;
+}
+
#endif
#endif
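With the signature change the backing variable must genuinely be a bool; the edac and fault-inject hunks below adjust exactly that, since reading a u32 through a bool pointer would use the wrong width. Typical use (names hypothetical):

	static bool foo_enabled;

	debugfs_create_bool("enabled", 0644, foo_debugfs_dir, &foo_enabled);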
diff --git a/include/linux/edac.h b/include/linux/edac.h
index da3b72e95db3..7c6b7ba55589 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -772,7 +772,7 @@ struct mem_ctl_info {
#ifdef CONFIG_EDAC_DEBUG
struct dentry *debugfs;
u8 fake_inject_layer[EDAC_MAX_LAYERS];
- u32 fake_inject_ue;
+ bool fake_inject_ue;
u16 fake_inject_count;
#endif
};
diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 798fad9e420d..3159a7dba034 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -18,7 +18,7 @@ struct fault_attr {
atomic_t times;
atomic_t space;
unsigned long verbose;
- u32 task_filter;
+ bool task_filter;
unsigned long stacktrace_depth;
unsigned long require_start;
unsigned long require_end;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fdc369fa69e8..a3efb3c0d310 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -35,6 +35,7 @@
#include <uapi/linux/fs.h>
struct backing_dev_info;
+struct bdi_writeback;
struct export_operations;
struct hd_geometry;
struct iovec;
@@ -635,6 +636,14 @@ struct inode {
struct hlist_node i_hash;
struct list_head i_wb_list; /* backing dev IO list */
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct bdi_writeback *i_wb; /* the associated cgroup wb */
+
+ /* foreign inode detection, see wbc_detach_inode() */
+ int i_wb_frn_winner;
+ u16 i_wb_frn_avg_time;
+ u16 i_wb_frn_history;
+#endif
struct list_head i_lru; /* inode LRU list */
struct list_head i_sb_list;
union {
@@ -1247,6 +1256,8 @@ struct mm_struct;
#define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */
#define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */
+/* sb->s_iflags */
+#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */
/* Possible states of 'frozen' field */
enum {
@@ -1285,6 +1296,7 @@ struct super_block {
const struct quotactl_ops *s_qcop;
const struct export_operations *s_export_op;
unsigned long s_flags;
+ unsigned long s_iflags; /* internal SB_I_* flags */
unsigned long s_magic;
struct dentry *s_root;
struct rw_semaphore s_umount;
@@ -1820,6 +1832,11 @@ struct super_operations {
*
* I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit().
*
+ * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to
+ * synchronize competing switching instances and to tell
+ * wb stat updates to grab mapping->tree_lock. See
+ * inode_switch_wb_work_fn() for details.
+ *
* Q: What is the difference between I_WILL_FREE and I_FREEING?
*/
#define I_DIRTY_SYNC (1 << 0)
@@ -1839,6 +1856,7 @@ struct super_operations {
#define I_DIRTY_TIME (1 << 11)
#define __I_DIRTY_TIME_EXPIRED 12
#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED)
+#define I_WB_SWITCH (1 << 13)
#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
@@ -2248,7 +2266,13 @@ extern struct super_block *freeze_bdev(struct block_device *);
extern void emergency_thaw_all(void);
extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
extern int fsync_bdev(struct block_device *);
-extern int sb_is_blkdev_sb(struct super_block *sb);
+
+extern struct super_block *blockdev_superblock;
+
+static inline bool sb_is_blkdev_sb(struct super_block *sb)
+{
+ return sb == blockdev_superblock;
+}
#else
static inline void bd_forget(struct inode *inode) {}
static inline int sync_blockdev(struct block_device *bdev) { return 0; }
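A filesystem opts in to cgroup-aware writeback by setting the new internal flag at mount time; inode_cgwb_enabled() additionally requires the bdi to advertise BDI_CAP_CGROUP_WRITEBACK. Sketch of the fill_super side (foo_fill_super is hypothetical):

	static int foo_fill_super(struct super_block *sb, void *data, int silent)
	{
		sb->s_iflags |= SB_I_CGROUPWB;	/* enable cgroup writeback */
		/* ... */
		return 0;
	}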
diff --git a/include/linux/init.h b/include/linux/init.h
index 21b6d768edd7..5e386444194b 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -153,6 +153,10 @@ void prepare_namespace(void);
void __init load_default_modules(void);
int __init init_rootfs(void);
+#ifdef CONFIG_DEBUG_RODATA
+void mark_rodata_ro(void);
+#endif
+
extern void (*late_time_init)(void);
extern bool initcall_debug;
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c8918114804..73b02b0a8f60 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -41,6 +41,7 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
+ MEM_CGROUP_STAT_DIRTY, /* # of dirty pages in page cache */
MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */
MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
MEM_CGROUP_STAT_NSTATS,
@@ -67,6 +68,8 @@ enum mem_cgroup_events_index {
};
#ifdef CONFIG_MEMCG
+extern struct cgroup_subsys_state *mem_cgroup_root_css;
+
void mem_cgroup_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx,
unsigned int nr);
@@ -112,6 +115,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
}
extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
+extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
struct mem_cgroup *,
@@ -195,6 +199,8 @@ void mem_cgroup_split_huge_fixup(struct page *head);
#else /* CONFIG_MEMCG */
struct mem_cgroup;
+#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
+
static inline void mem_cgroup_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx,
unsigned int nr)
@@ -382,6 +388,29 @@ enum {
OVER_LIMIT,
};
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+ unsigned long *pdirty, unsigned long *pwriteback);
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+ return NULL;
+}
+
+static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
+ unsigned long *pavail,
+ unsigned long *pdirty,
+ unsigned long *pwriteback)
+{
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
struct sock;
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
void sock_update_memcg(struct sock *sk);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b2085582d44e..ed271d320a00 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -27,6 +27,7 @@ struct anon_vma_chain;
struct file_ra_state;
struct user_struct;
struct writeback_control;
+struct bdi_writeback;
#ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */
extern unsigned long max_mapnr;
@@ -1239,10 +1240,13 @@ int __set_page_dirty_nobuffers(struct page *page);
int __set_page_dirty_no_writeback(struct page *page);
int redirty_page_for_writepage(struct writeback_control *wbc,
struct page *page);
-void account_page_dirtied(struct page *page, struct address_space *mapping);
-void account_page_cleaned(struct page *page, struct address_space *mapping);
+void account_page_dirtied(struct page *page, struct address_space *mapping,
+ struct mem_cgroup *memcg);
+void account_page_cleaned(struct page *page, struct address_space *mapping,
+ struct mem_cgroup *memcg, struct bdi_writeback *wb);
int set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);
+void cancel_dirty_page(struct page *page);
int clear_page_dirty_for_io(struct page *page);
int get_cmdline(struct task_struct *task, char *buffer, int buflen);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4b3736f7065c..fb0814ca65c7 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -651,7 +651,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
extern void delete_from_page_cache(struct page *page);
-extern void __delete_from_page_cache(struct page *page, void *shadow);
+extern void __delete_from_page_cache(struct page *page, void *shadow,
+ struct mem_cgroup *memcg);
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
/*
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index cec2d4540914..cccaf4a29e9f 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -30,7 +30,13 @@ unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp);
unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp);
+bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp);
+
int dev_pm_opp_get_opp_count(struct device *dev);
+unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev);
+unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev);
+unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev);
+struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev);
struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
unsigned long freq,
@@ -51,6 +57,14 @@ int dev_pm_opp_enable(struct device *dev, unsigned long freq);
int dev_pm_opp_disable(struct device *dev, unsigned long freq);
struct srcu_notifier_head *dev_pm_opp_get_notifier(struct device *dev);
+int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
+ unsigned int count);
+void dev_pm_opp_put_supported_hw(struct device *dev);
+int dev_pm_opp_set_prop_name(struct device *dev, const char *name);
+void dev_pm_opp_put_prop_name(struct device *dev);
+int dev_pm_opp_set_regulator(struct device *dev, const char *name);
+void dev_pm_opp_put_regulator(struct device *dev);
+int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
#else
static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
{
@@ -62,11 +76,36 @@ static inline unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
return 0;
}
+static inline bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp)
+{
+ return false;
+}
+
static inline int dev_pm_opp_get_opp_count(struct device *dev)
{
return 0;
}
+static inline unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
+{
+ return 0;
+}
+
+static inline unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
+{
+ return 0;
+}
+
+static inline unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev)
+{
+ return 0;
+}
+
+static inline struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
+{
+ return NULL;
+}
+
static inline struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
unsigned long freq, bool available)
{
@@ -110,19 +149,71 @@ static inline struct srcu_notifier_head *dev_pm_opp_get_notifier(
{
return ERR_PTR(-EINVAL);
}
+
+static inline int dev_pm_opp_set_supported_hw(struct device *dev,
+ const u32 *versions,
+ unsigned int count)
+{
+ return -EINVAL;
+}
+
+static inline void dev_pm_opp_put_supported_hw(struct device *dev) {}
+
+static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+{
+ return -EINVAL;
+}
+
+static inline void dev_pm_opp_put_prop_name(struct device *dev) {}
+
+static inline int dev_pm_opp_set_regulator(struct device *dev, const char *name)
+{
+ return -EINVAL;
+}
+
+static inline void dev_pm_opp_put_regulator(struct device *dev) {}
+
+static inline int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
+{
+ return -EINVAL;
+}
+
#endif /* CONFIG_PM_OPP */
#if defined(CONFIG_PM_OPP) && defined(CONFIG_OF)
-int of_init_opp_table(struct device *dev);
-void of_free_opp_table(struct device *dev);
+int dev_pm_opp_of_add_table(struct device *dev);
+void dev_pm_opp_of_remove_table(struct device *dev);
+int dev_pm_opp_of_cpumask_add_table(cpumask_var_t cpumask);
+void dev_pm_opp_of_cpumask_remove_table(cpumask_var_t cpumask);
+int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, cpumask_var_t cpumask);
+int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, cpumask_var_t cpumask);
#else
-static inline int of_init_opp_table(struct device *dev)
+static inline int dev_pm_opp_of_add_table(struct device *dev)
{
return -EINVAL;
}
-static inline void of_free_opp_table(struct device *dev)
+static inline void dev_pm_opp_of_remove_table(struct device *dev)
+{
+}
+
+static inline int dev_pm_opp_of_cpumask_add_table(cpumask_var_t cpumask)
+{
+ return -ENOSYS;
+}
+
+static inline void dev_pm_opp_of_cpumask_remove_table(cpumask_var_t cpumask)
+{
+}
+
+static inline int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, cpumask_var_t cpumask)
+{
+ return -ENOSYS;
+}
+
+static inline int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, cpumask_var_t cpumask)
{
+ return -ENOSYS;
}
#endif
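
Taken together, the renamed dev_pm_opp_of_*() helpers and the new dev_pm_opp_set_rate() let a driver hand the whole clock/voltage transition over to the OPP core. A minimal probe-time sketch, assuming a device with an operating-points table in DT and a single "vdd" supply (the function name, supply name and 1 GHz target are illustrative, not part of this patch):

    #include <linux/pm_opp.h>
    #include <linux/device.h>

    static int example_opp_probe(struct device *dev)
    {
        int ret;

        /* Populate the OPP table from the device's DT node. */
        ret = dev_pm_opp_of_add_table(dev);
        if (ret)
            return ret;

        /* Tell the OPP core which supply to scale during transitions. */
        ret = dev_pm_opp_set_regulator(dev, "vdd");
        if (ret)
            goto err_table;

        /* Switch clock and voltage together in the right order. */
        ret = dev_pm_opp_set_rate(dev, 1000000000);
        if (!ret)
            return 0;

        dev_pm_opp_put_regulator(dev);
    err_table:
        dev_pm_opp_of_remove_table(dev);
        return ret;
    }
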
diff --git a/include/linux/psci.h b/include/linux/psci.h
new file mode 100644
index 000000000000..12c4865457ad
--- /dev/null
+++ b/include/linux/psci.h
@@ -0,0 +1,54 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 ARM Limited
+ */
+
+#ifndef __LINUX_PSCI_H
+#define __LINUX_PSCI_H
+
+#include <linux/init.h>
+#include <linux/types.h>
+
+#define PSCI_POWER_STATE_TYPE_STANDBY 0
+#define PSCI_POWER_STATE_TYPE_POWER_DOWN 1
+
+bool psci_tos_resident_on(int cpu);
+bool psci_power_state_loses_context(u32 state);
+bool psci_power_state_is_valid(u32 state);
+
+struct psci_operations {
+ int (*cpu_suspend)(u32 state, unsigned long entry_point);
+ int (*cpu_off)(u32 state);
+ int (*cpu_on)(unsigned long cpuid, unsigned long entry_point);
+ int (*migrate)(unsigned long cpuid);
+ int (*affinity_info)(unsigned long target_affinity,
+ unsigned long lowest_affinity_level);
+ int (*migrate_info_type)(void);
+};
+
+extern struct psci_operations psci_ops;
+
+#if defined(CONFIG_ARM_PSCI_FW)
+int __init psci_dt_init(void);
+#else
+static inline int psci_dt_init(void) { return 0; }
+#endif
+
+#if defined(CONFIG_ARM_PSCI_FW) && defined(CONFIG_ACPI)
+int __init psci_acpi_init(void);
+bool __init acpi_psci_present(void);
+bool __init acpi_psci_use_hvc(void);
+#else
+static inline int psci_acpi_init(void) { return 0; }
+static inline bool acpi_psci_present(void) { return false; }
+#endif
+
+#endif /* __LINUX_PSCI_H */
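
Callers are expected to go through the exported psci_ops table, which psci_dt_init() or psci_acpi_init() fills in during early boot, rather than issuing SMC/HVC calls directly. A hedged sketch of a secondary-CPU bring-up helper (the hwid and entry-point arguments are placeholders):

    #include <linux/psci.h>
    #include <linux/errno.h>

    static int example_cpu_boot(unsigned long hwid, unsigned long entry)
    {
        /* CPU_ON may be absent, e.g. on PSCI 0.1 firmware. */
        if (!psci_ops.cpu_on)
            return -ENODEV;

        return psci_ops.cpu_on(hwid, entry);
    }
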
diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index f8a689ed62a5..e325d4606b62 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -552,6 +552,16 @@ static inline int regulator_count_voltages(struct regulator *regulator)
}
#endif
+static inline int regulator_set_voltage_triplet(struct regulator *regulator,
+ int min_uV, int target_uV,
+ int max_uV)
+{
+ if (regulator_set_voltage(regulator, target_uV, max_uV) == 0)
+ return 0;
+
+ return regulator_set_voltage(regulator, min_uV, max_uV);
+}
+
static inline int regulator_set_voltage_tol(struct regulator *regulator,
int new_uV, int tol_uV)
{
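
regulator_set_voltage_triplet() first requests the preferred window [target_uV, max_uV] and falls back to the full [min_uV, max_uV] range only if that fails, which is the semantic the OPP core's triplet-style supply bindings rely on. An illustrative call (the voltages are made-up example values):

    #include <linux/regulator/consumer.h>

    static int example_set_vdd(struct regulator *reg)
    {
        /* Prefer 1.0 V exactly; accept anything in [0.9 V, 1.1 V]. */
        return regulator_set_voltage_triplet(reg, 900000, 1000000, 1100000);
    }
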
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 8bf439eb3018..e7cec37ccf53 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -42,6 +42,7 @@
/* Default weight of a bound cooling device */
#define THERMAL_WEIGHT_DEFAULT 0
+
/* use a value below absolute zero (< 0 K) to indicate an invalid/uninitialized temperature */
#define THERMAL_TEMP_INVALID -274000
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index a6e1bca88cc6..8454fb35fcbe 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -131,9 +131,6 @@ extern void syscall_unregfunc(void);
void *it_func; \
void *__data; \
\
- if (!cpu_online(raw_smp_processor_id())) \
- return; \
- \
if (!(cond)) \
return; \
prercu; \
@@ -332,15 +329,19 @@ extern void syscall_unregfunc(void);
* "void *__data, proto" as the callback prototype.
*/
#define DECLARE_TRACE_NOARGS(name) \
- __DECLARE_TRACE(name, void, , 1, void *__data, __data)
+ __DECLARE_TRACE(name, void, , \
+ cpu_online(raw_smp_processor_id()), \
+ void *__data, __data)
#define DECLARE_TRACE(name, proto, args) \
- __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), 1, \
- PARAMS(void *__data, proto), \
- PARAMS(__data, args))
+ __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \
+ cpu_online(raw_smp_processor_id()), \
+ PARAMS(void *__data, proto), \
+ PARAMS(__data, args))
#define DECLARE_TRACE_CONDITION(name, proto, args, cond) \
- __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), PARAMS(cond), \
+ __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \
+ cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \
PARAMS(void *__data, proto), \
PARAMS(__data, args))
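
The hunks above move the cpu_online() test out of the shared __DECLARE_TRACE body and into each variant's condition expression, so a DECLARE_TRACE_CONDITION user now gets the CPU check ANDed with its own predicate. A sketch of such a declaration (the tracepoint name and argument are hypothetical):

    #include <linux/tracepoint.h>

    /* trace_example_io_done() fires only for nr_bytes > 0 and, after
     * this change, only when the local CPU is online. */
    DECLARE_TRACE_CONDITION(example_io_done,
        TP_PROTO(int nr_bytes),
        TP_ARGS(nr_bytes),
        TP_CONDITION(nr_bytes > 0));
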
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b2dd371ec0ca..b333c945e571 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -7,6 +7,8 @@
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/fs.h>
+#include <linux/flex_proportions.h>
+#include <linux/backing-dev-defs.h>
DECLARE_PER_CPU(int, dirty_throttle_leaks);
@@ -84,18 +86,95 @@ struct writeback_control {
unsigned for_reclaim:1; /* Invoked from the page allocator */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct bdi_writeback *wb; /* wb this writeback is issued under */
+ struct inode *inode; /* inode being written out */
+
+ /* foreign inode detection, see wbc_detach_inode() */
+ int wb_id; /* current wb id */
+ int wb_lcand_id; /* last foreign candidate wb id */
+ int wb_tcand_id; /* this foreign candidate wb id */
+ size_t wb_bytes; /* bytes written by current wb */
+ size_t wb_lcand_bytes; /* bytes written by last candidate */
+ size_t wb_tcand_bytes; /* bytes written by this candidate */
+#endif
};
/*
+ * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
+ * and are measured against each other in. There always is one global
+ * domain, global_wb_domain, that every wb in the system is a member of.
+ * This allows measuring the relative bandwidth of each wb to distribute
+ * dirtyable memory accordingly.
+ */
+struct wb_domain {
+ spinlock_t lock;
+
+ /*
+ * Scale the writeback cache size proportional to the relative
+ * writeout speed.
+ *
+ * We do this by keeping a floating proportion between BDIs, based
+ * on page writeback completions [end_page_writeback()]. Those
+ * devices that write out pages fastest will get the larger share,
+ * while the slower will get a smaller share.
+ *
+ * We use page writeout completions because we are interested in
+ * getting rid of dirty pages. Having them written out is the
+ * primary goal.
+ *
+ * We introduce a concept of time, a period over which we measure
+ * these events, because demand can/will vary over time. The length
+ * of this period itself is measured in page writeback completions.
+ */
+ struct fprop_global completions;
+ struct timer_list period_timer; /* timer for aging of completions */
+ unsigned long period_time;
+
+ /*
+ * The dirtyable memory and dirty threshold could be suddenly
+ * knocked down by a large amount (eg. on the startup of KVM in a
+ * swapless system). This may throw the system into deep dirty
+ * exceeded state and throttle heavy/light dirtiers alike. To
+ * retain good responsiveness, maintain global_dirty_limit for
+ * tracking slowly down to the knocked down dirty threshold.
+ *
+ * Both fields are protected by ->lock.
+ */
+ unsigned long dirty_limit_tstamp;
+ unsigned long dirty_limit;
+};
+
+/**
+ * wb_domain_size_changed - memory available to a wb_domain has changed
+ * @dom: wb_domain of interest
+ *
+ * This function should be called when the amount of memory available to
+ * @dom has changed. It resets @dom's dirty limit parameters to prevent
+ * the past values which don't match the current configuration from skewing
+ * dirty throttling. Without this, when memory size of a wb_domain is
+ * greatly reduced, the dirty throttling logic may allow too many pages to
+ * be dirtied leading to consecutive unnecessary OOMs and may get stuck in
+ * that situation.
+ */
+static inline void wb_domain_size_changed(struct wb_domain *dom)
+{
+ spin_lock(&dom->lock);
+ dom->dirty_limit_tstamp = jiffies;
+ dom->dirty_limit = 0;
+ spin_unlock(&dom->lock);
+}
+
+/*
* fs/fs-writeback.c
*/
struct bdi_writeback;
void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
enum wb_reason reason);
-int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
-int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
- enum wb_reason reason);
+bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
+bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
+ enum wb_reason reason);
void sync_inodes_sb(struct super_block *);
void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
void inode_wait_for_writeback(struct inode *inode);
@@ -107,6 +186,123 @@ static inline void wait_on_inode(struct inode *inode)
wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
}
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#include <linux/cgroup.h>
+#include <linux/bio.h>
+
+void __inode_attach_wb(struct inode *inode, struct page *page);
+void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+ struct inode *inode)
+ __releases(&inode->i_lock);
+void wbc_detach_inode(struct writeback_control *wbc);
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+ size_t bytes);
+
+/**
+ * inode_attach_wb - associate an inode with its wb
+ * @inode: inode of interest
+ * @page: page being dirtied (may be NULL)
+ *
+ * If @inode doesn't have its wb, associate it with the wb matching the
+ * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o
+ * @inode->i_lock.
+ */
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+ if (!inode->i_wb)
+ __inode_attach_wb(inode, page);
+}
+
+/**
+ * inode_detach_wb - disassociate an inode from its wb
+ * @inode: inode of interest
+ *
+ * @inode is being freed. Detach from its wb.
+ */
+static inline void inode_detach_wb(struct inode *inode)
+{
+ if (inode->i_wb) {
+ wb_put(inode->i_wb);
+ inode->i_wb = NULL;
+ }
+}
+
+/**
+ * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * This function is to be used by __filemap_fdatawrite_range(), which is an
+ * alternative entry point into writeback code, and first ensures @inode is
+ * associated with a bdi_writeback and attaches it to @wbc.
+ */
+static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+ struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ inode_attach_wb(inode, NULL);
+ wbc_attach_and_unlock_inode(wbc, inode);
+}
+
+/**
+ * wbc_init_bio - writeback specific initialization of bio
+ * @wbc: writeback_control for the writeback in progress
+ * @bio: bio to be initialized
+ *
+ * @bio is a part of the writeback in progress controlled by @wbc. Perform
+ * writeback specific initialization. This is used to apply the cgroup
+ * writeback context.
+ */
+static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+{
+ /*
+ * pageout() path doesn't attach @wbc to the inode being written
+ * out. This is intentional as we don't want the function to block
+ * behind a slow cgroup. Ultimately, we want pageout() to kick off
+ * regular writeback instead of writing things out itself.
+ */
+ if (wbc->wb)
+ bio_associate_blkcg(bio, wbc->wb->blkcg_css);
+}
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+}
+
+static inline void inode_detach_wb(struct inode *inode)
+{
+}
+
+static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+ struct inode *inode)
+ __releases(&inode->i_lock)
+{
+ spin_unlock(&inode->i_lock);
+}
+
+static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+ struct inode *inode)
+{
+}
+
+static inline void wbc_detach_inode(struct writeback_control *wbc)
+{
+}
+
+static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+{
+}
+
+static inline void wbc_account_io(struct writeback_control *wbc,
+ struct page *page, size_t bytes)
+{
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
/*
* mm/page-writeback.c
*/
@@ -120,8 +316,12 @@ static inline void laptop_sync_completion(void) { }
#endif
void throttle_vm_writeout(gfp_t gfp_mask);
bool zone_dirty_ok(struct zone *zone);
+int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
+#ifdef CONFIG_CGROUP_WRITEBACK
+void wb_domain_exit(struct wb_domain *dom);
+#endif
-extern unsigned long global_dirty_limit;
+extern struct wb_domain global_wb_domain;
/* These are exported to sysctl. */
extern int dirty_background_ratio;
@@ -155,19 +355,12 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
-unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
- unsigned long dirty);
-
-void __bdi_update_bandwidth(struct backing_dev_info *bdi,
- unsigned long thresh,
- unsigned long bg_thresh,
- unsigned long dirty,
- unsigned long bdi_thresh,
- unsigned long bdi_dirty,
- unsigned long start_time);
+unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
+void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time);
void page_writeback_init(void);
void balance_dirty_pages_ratelimited(struct address_space *mapping);
+bool wb_over_bg_thresh(struct bdi_writeback *wb);
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
void *data);
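
Every bdi_writeback is now measured inside a wb_domain; the global one is built in, while cgroup writeback creates one per memcg. A sketch of the lifecycle under that assumption (the static domain and setup function are illustrative only):

    #include <linux/writeback.h>
    #include <linux/gfp.h>

    static struct wb_domain example_dom;

    static int example_domain_setup(void)
    {
        int err = wb_domain_init(&example_dom, GFP_KERNEL);

        if (err)
            return err;

        /* Call again whenever the memory backing the domain changes,
         * e.g. after hotplug, so stale dirty limits are reset. */
        wb_domain_size_changed(&example_dom);
        return 0;
    }
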
diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h
index 8f81bbbc38fc..e0f4109e64c6 100644
--- a/include/net/iw_handler.h
+++ b/include/net/iw_handler.h
@@ -439,6 +439,12 @@ int dev_get_wireless_info(char *buffer, char **start, off_t offset, int length);
/* Send a single event to user space */
void wireless_send_event(struct net_device *dev, unsigned int cmd,
union iwreq_data *wrqu, const char *extra);
+#ifdef CONFIG_WEXT_CORE
+/* flush all previous wext events - if work is done from netdev notifiers */
+void wireless_nlevent_flush(void);
+#else
+static inline void wireless_nlevent_flush(void) {}
+#endif
/* We may need a function to send a stream of events to user space.
* More on that later... */
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index c178d13d6f4c..a7aa607a4c55 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -360,7 +360,7 @@ TRACE_EVENT(global_dirty_state,
__entry->nr_written = global_page_state(NR_WRITTEN);
__entry->background_thresh = background_thresh;
__entry->dirty_thresh = dirty_thresh;
- __entry->dirty_limit = global_dirty_limit;
+ __entry->dirty_limit = global_wb_domain.dirty_limit;
),
TP_printk("dirty=%lu writeback=%lu unstable=%lu "
@@ -399,13 +399,13 @@ TRACE_EVENT(bdi_dirty_ratelimit,
TP_fast_assign(
strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
- __entry->write_bw = KBps(bdi->write_bandwidth);
- __entry->avg_write_bw = KBps(bdi->avg_write_bandwidth);
+ __entry->write_bw = KBps(bdi->wb.write_bandwidth);
+ __entry->avg_write_bw = KBps(bdi->wb.avg_write_bandwidth);
__entry->dirty_rate = KBps(dirty_rate);
- __entry->dirty_ratelimit = KBps(bdi->dirty_ratelimit);
+ __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
__entry->task_ratelimit = KBps(task_ratelimit);
__entry->balanced_dirty_ratelimit =
- KBps(bdi->balanced_dirty_ratelimit);
+ KBps(bdi->wb.balanced_dirty_ratelimit);
),
TP_printk("bdi %s: "
@@ -462,8 +462,9 @@ TRACE_EVENT(balance_dirty_pages,
unsigned long freerun = (thresh + bg_thresh) / 2;
strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
- __entry->limit = global_dirty_limit;
- __entry->setpoint = (global_dirty_limit + freerun) / 2;
+ __entry->limit = global_wb_domain.dirty_limit;
+ __entry->setpoint = (global_wb_domain.dirty_limit +
+ freerun) / 2;
__entry->dirty = dirty;
__entry->bdi_setpoint = __entry->setpoint *
bdi_thresh / (thresh + 1);
diff --git a/include/uapi/linux/psci.h b/include/uapi/linux/psci.h
index 310d83e0a91b..3d7a0fc021a7 100644
--- a/include/uapi/linux/psci.h
+++ b/include/uapi/linux/psci.h
@@ -46,6 +46,11 @@
#define PSCI_0_2_FN64_MIGRATE PSCI_0_2_FN64(5)
#define PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU PSCI_0_2_FN64(7)
+#define PSCI_1_0_FN_PSCI_FEATURES PSCI_0_2_FN(10)
+#define PSCI_1_0_FN_SYSTEM_SUSPEND PSCI_0_2_FN(14)
+
+#define PSCI_1_0_FN64_SYSTEM_SUSPEND PSCI_0_2_FN64(14)
+
/* PSCI v0.2 power state encoding for CPU_SUSPEND function */
#define PSCI_0_2_POWER_STATE_ID_MASK 0xffff
#define PSCI_0_2_POWER_STATE_ID_SHIFT 0
@@ -56,6 +61,13 @@
#define PSCI_0_2_POWER_STATE_AFFL_MASK \
(0x3 << PSCI_0_2_POWER_STATE_AFFL_SHIFT)
+/* PSCI extended power state encoding for CPU_SUSPEND function */
+#define PSCI_1_0_EXT_POWER_STATE_ID_MASK 0xfffffff
+#define PSCI_1_0_EXT_POWER_STATE_ID_SHIFT 0
+#define PSCI_1_0_EXT_POWER_STATE_TYPE_SHIFT 30
+#define PSCI_1_0_EXT_POWER_STATE_TYPE_MASK \
+ (0x1 << PSCI_1_0_EXT_POWER_STATE_TYPE_SHIFT)
+
/* PSCI v0.2 affinity level state returned by AFFINITY_INFO */
#define PSCI_0_2_AFFINITY_LEVEL_ON 0
#define PSCI_0_2_AFFINITY_LEVEL_OFF 1
@@ -76,6 +88,11 @@
#define PSCI_VERSION_MINOR(ver) \
((ver) & PSCI_VERSION_MINOR_MASK)
+/* PSCI features decoding (>=1.0) */
+#define PSCI_1_0_FEATURES_CPU_SUSPEND_PF_SHIFT 1
+#define PSCI_1_0_FEATURES_CPU_SUSPEND_PF_MASK \
+ (0x1 << PSCI_1_0_FEATURES_CPU_SUSPEND_PF_SHIFT)
+
/* PSCI return values (inclusive of all PSCI versions) */
#define PSCI_RET_SUCCESS 0
#define PSCI_RET_NOT_SUPPORTED -1
@@ -86,5 +103,6 @@
#define PSCI_RET_INTERNAL_FAILURE -6
#define PSCI_RET_NOT_PRESENT -7
#define PSCI_RET_DISABLED -8
+#define PSCI_RET_INVALID_ADDRESS -9
#endif /* _UAPI_LINUX_PSCI_H */
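
The new PSCI 1.0 macros describe the extended StateID format: bits [27:0] hold the state ID and bit 30 the state type. A small decoding sketch, assuming the PSCI_POWER_STATE_TYPE_* values from <linux/psci.h> (the helper itself is not part of the patch):

    #include <linux/psci.h>
    #include <uapi/linux/psci.h>

    /* True if an extended power state requests a full power-down. */
    static bool example_ext_state_is_powerdown(u32 state)
    {
        u32 type = (state & PSCI_1_0_EXT_POWER_STATE_TYPE_MASK) >>
                   PSCI_1_0_EXT_POWER_STATE_TYPE_SHIFT;

        return type == PSCI_POWER_STATE_TYPE_POWER_DOWN;
    }
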
diff --git a/init/Kconfig b/init/Kconfig
index a70b5002df06..1c731b53d0ec 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1142,6 +1142,11 @@ config DEBUG_BLK_CGROUP
Enable some debugging help. Currently it exports additional stat
files in a cgroup which can be useful for debugging.
+config CGROUP_WRITEBACK
+ bool
+ depends on MEMCG && BLK_CGROUP
+ default y
+
endif # CGROUPS
config CHECKPOINT_RESTORE
diff --git a/init/main.c b/init/main.c
index 0486a8e11fc0..8e61f8c6ecbc 100644
--- a/init/main.c
+++ b/init/main.c
@@ -93,9 +93,6 @@ static int kernel_init(void *);
extern void init_IRQ(void);
extern void fork_init(void);
extern void radix_tree_init(void);
-#ifndef CONFIG_DEBUG_RODATA
-static inline void mark_rodata_ro(void) { }
-#endif
/*
* Debug helper: via this flag we know that we are in 'early bootup code'
@@ -926,6 +923,28 @@ static int try_to_run_init_process(const char *init_filename)
static noinline void __init kernel_init_freeable(void);
+#ifdef CONFIG_DEBUG_RODATA
+static bool rodata_enabled = true;
+static int __init set_debug_rodata(char *str)
+{
+ return strtobool(str, &rodata_enabled);
+}
+__setup("rodata=", set_debug_rodata);
+
+static void mark_readonly(void)
+{
+ if (rodata_enabled)
+ mark_rodata_ro();
+ else
+ pr_info("Kernel memory protection disabled.\n");
+}
+#else
+static inline void mark_readonly(void)
+{
+ pr_warn("This architecture does not have kernel memory protection.\n");
+}
+#endif
+
static int __ref kernel_init(void *unused)
{
int ret;
@@ -934,7 +953,7 @@ static int __ref kernel_init(void *unused)
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
free_initmem();
- mark_rodata_ro();
+ mark_readonly();
system_state = SYSTEM_RUNNING;
numa_default_policy();
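
In effect CONFIG_DEBUG_RODATA stops being a build-time-only decision: a kernel built with it can still be booted with, for example,

    rodata=off

to leave kernel text and rodata writable, which is exactly what the kdb hunk below now points users at for software breakpoints.
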
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index e1dbf4a2c69e..90ff129c88a2 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
} else {
kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
__func__, bp->bp_addr);
-#ifdef CONFIG_DEBUG_RODATA
if (!bp->bp_type) {
kdb_printf("Software breakpoints are unavailable.\n"
- " Change the kernel CONFIG_DEBUG_RODATA=n\n"
+ " Boot the kernel with rodata=off\n"
" OR use hw breaks: help bph\n");
}
-#endif
return 1;
}
return 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1e7392d7b29a..7f5202756a67 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1562,14 +1562,14 @@ event_sched_out(struct perf_event *event,
perf_pmu_disable(event->pmu);
+ event->tstamp_stopped = tstamp;
+ event->pmu->del(event, 0);
+ event->oncpu = -1;
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
- event->tstamp_stopped = tstamp;
- event->pmu->del(event, 0);
- event->oncpu = -1;
if (!is_software_event(event))
cpuctx->active_oncpu--;
@@ -7642,6 +7642,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
}
+ /* symmetric to unaccount_event() in _free_event() */
+ account_event(event);
+
return event;
err_per_task:
@@ -8005,8 +8008,6 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
- account_event(event);
-
/*
* Special case software events and allow them to be part of
* any hardware group.
@@ -8222,7 +8223,12 @@ err_context:
perf_unpin_context(ctx);
put_ctx(ctx);
err_alloc:
- free_event(event);
+ /*
+ * If event_file is set, the fput() above will have called ->release()
+ * and that will take care of freeing the event.
+ */
+ if (!event_file)
+ free_event(event);
err_cpus:
put_online_cpus();
err_task:
@@ -8266,8 +8272,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
/* Mark owner so we could distinguish it from user events. */
event->owner = EVENT_OWNER_KERNEL;
- account_event(event);
-
ctx = find_get_context(event->pmu, task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 517a568f038d..3e0ea4520fc9 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -100,7 +100,7 @@ static LIST_HEAD(free_entries);
static DEFINE_SPINLOCK(free_entries_lock);
/* Global disable flag - will be set in case of an error */
-static u32 global_disable __read_mostly;
+static bool global_disable __read_mostly;
/* Early initialization disable flag, set at the end of dma_debug_init */
static bool dma_debug_initialized __read_mostly;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 000e7b3b9896..e1bd6e864152 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -18,6 +18,7 @@ struct backing_dev_info noop_backing_dev_info = {
.name = "noop",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
+EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
@@ -48,7 +49,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
struct bdi_writeback *wb = &bdi->wb;
unsigned long background_thresh;
unsigned long dirty_thresh;
- unsigned long bdi_thresh;
+ unsigned long wb_thresh;
unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
struct inode *inode;
@@ -66,7 +67,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
spin_unlock(&wb->list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
- bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+ wb_thresh = wb_calc_thresh(wb, dirty_thresh);
#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
@@ -84,19 +85,19 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"b_dirty_time: %10lu\n"
"bdi_list: %10u\n"
"state: %10lx\n",
- (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
- (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
- K(bdi_thresh),
+ (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
+ (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
+ K(wb_thresh),
K(dirty_thresh),
K(background_thresh),
- (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
- (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
- (unsigned long) K(bdi->write_bandwidth),
+ (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
+ (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
+ (unsigned long) K(wb->write_bandwidth),
nr_dirty,
nr_io,
nr_more_io,
nr_dirty_time,
- !list_empty(&bdi->bdi_list), bdi->state);
+ !list_empty(&bdi->bdi_list), bdi->wb.state);
#undef K
return 0;
@@ -255,13 +256,8 @@ static int __init default_bdi_init(void)
}
subsys_initcall(default_bdi_init);
-int bdi_has_dirty_io(struct backing_dev_info *bdi)
-{
- return wb_has_dirty_io(&bdi->wb);
-}
-
/*
- * This function is used when the first inode for this bdi is marked dirty. It
+ * This function is used when the first inode for this wb is marked dirty. It
* wakes up the corresponding bdi thread, which should then take care of the
* periodic background write-out of dirty inodes. Since the write-out would
* start only 'dirty_writeback_interval' centisecs from now anyway, we just
@@ -274,162 +270,582 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
* We have to be careful not to postpone flush work if it is scheduled for
* earlier. Thus we use queue_delayed_work().
*/
-void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+void wb_wakeup_delayed(struct bdi_writeback *wb)
{
unsigned long timeout;
timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
- spin_lock_bh(&bdi->wb_lock);
- if (test_bit(BDI_registered, &bdi->state))
- queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
- spin_unlock_bh(&bdi->wb_lock);
+ spin_lock_bh(&wb->work_lock);
+ if (test_bit(WB_registered, &wb->state))
+ queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+ spin_unlock_bh(&wb->work_lock);
}
/*
- * Remove bdi from bdi_list, and ensure that it is no longer visible
+ * Initial write bandwidth: 100 MB/s
*/
-static void bdi_remove_from_list(struct backing_dev_info *bdi)
+#define INIT_BW (100 << (20 - PAGE_SHIFT))
+
+static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
+ int blkcg_id, gfp_t gfp)
{
- spin_lock_bh(&bdi_lock);
- list_del_rcu(&bdi->bdi_list);
- spin_unlock_bh(&bdi_lock);
+ int i, err;
- synchronize_rcu_expedited();
-}
+ memset(wb, 0, sizeof(*wb));
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
- const char *fmt, ...)
-{
- va_list args;
- struct device *dev;
+ wb->bdi = bdi;
+ wb->last_old_flush = jiffies;
+ INIT_LIST_HEAD(&wb->b_dirty);
+ INIT_LIST_HEAD(&wb->b_io);
+ INIT_LIST_HEAD(&wb->b_more_io);
+ INIT_LIST_HEAD(&wb->b_dirty_time);
+ spin_lock_init(&wb->list_lock);
- if (bdi->dev) /* The driver needs to use separate queues per device */
- return 0;
+ wb->bw_time_stamp = jiffies;
+ wb->balanced_dirty_ratelimit = INIT_BW;
+ wb->dirty_ratelimit = INIT_BW;
+ wb->write_bandwidth = INIT_BW;
+ wb->avg_write_bandwidth = INIT_BW;
- va_start(args, fmt);
- dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
- va_end(args);
- if (IS_ERR(dev))
- return PTR_ERR(dev);
+ spin_lock_init(&wb->work_lock);
+ INIT_LIST_HEAD(&wb->work_list);
+ INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
- bdi->dev = dev;
+ wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
+ if (!wb->congested)
+ return -ENOMEM;
- bdi_debug_register(bdi, dev_name(dev));
- set_bit(BDI_registered, &bdi->state);
+ err = fprop_local_init_percpu(&wb->completions, gfp);
+ if (err)
+ goto out_put_cong;
- spin_lock_bh(&bdi_lock);
- list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
- spin_unlock_bh(&bdi_lock);
+ for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
+ err = percpu_counter_init(&wb->stat[i], 0, gfp);
+ if (err)
+ goto out_destroy_stat;
+ }
- trace_writeback_bdi_register(bdi);
return 0;
-}
-EXPORT_SYMBOL(bdi_register);
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
-{
- return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+out_destroy_stat:
+ while (i--)
+ percpu_counter_destroy(&wb->stat[i]);
+ fprop_local_destroy_percpu(&wb->completions);
+out_put_cong:
+ wb_congested_put(wb->congested);
+ return err;
}
-EXPORT_SYMBOL(bdi_register_dev);
/*
* Remove bdi from the global list and shutdown any threads we have running
*/
-static void bdi_wb_shutdown(struct backing_dev_info *bdi)
+static void wb_shutdown(struct bdi_writeback *wb)
{
/* Make sure nobody queues further work */
- spin_lock_bh(&bdi->wb_lock);
- if (!test_and_clear_bit(BDI_registered, &bdi->state)) {
- spin_unlock_bh(&bdi->wb_lock);
+ spin_lock_bh(&wb->work_lock);
+ if (!test_and_clear_bit(WB_registered, &wb->state)) {
+ spin_unlock_bh(&wb->work_lock);
return;
}
- spin_unlock_bh(&bdi->wb_lock);
+ spin_unlock_bh(&wb->work_lock);
/*
- * Make sure nobody finds us on the bdi_list anymore
+ * Drain work list and shutdown the delayed_work. !WB_registered
+ * tells wb_workfn() that @wb is dying and its work_list needs to
+ * be drained no matter what.
*/
- bdi_remove_from_list(bdi);
+ mod_delayed_work(bdi_wq, &wb->dwork, 0);
+ flush_delayed_work(&wb->dwork);
+ WARN_ON(!list_empty(&wb->work_list));
+}
+
+static void wb_exit(struct bdi_writeback *wb)
+{
+ int i;
+
+ WARN_ON(delayed_work_pending(&wb->dwork));
+
+ for (i = 0; i < NR_WB_STAT_ITEMS; i++)
+ percpu_counter_destroy(&wb->stat[i]);
+
+ fprop_local_destroy_percpu(&wb->completions);
+ wb_congested_put(wb->congested);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#include <linux/memcontrol.h>
+
+/*
+ * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
+ * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU
+ * protected. cgwb_release_wait is used to wait for the completion of cgwb
+ * releases from the bdi destruction path.
+ */
+static DEFINE_SPINLOCK(cgwb_lock);
+static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
+
+/**
+ * wb_congested_get_create - get or create a wb_congested
+ * @bdi: associated bdi
+ * @blkcg_id: ID of the associated blkcg
+ * @gfp: allocation mask
+ *
+ * Look up the wb_congested for @blkcg_id on @bdi. If missing, create one.
+ * The returned wb_congested has its reference count incremented. Returns
+ * NULL on failure.
+ */
+struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+{
+ struct bdi_writeback_congested *new_congested = NULL, *congested;
+ struct rb_node **node, *parent;
+ unsigned long flags;
+retry:
+ spin_lock_irqsave(&cgwb_lock, flags);
+
+ node = &bdi->cgwb_congested_tree.rb_node;
+ parent = NULL;
+
+ while (*node != NULL) {
+ parent = *node;
+ congested = container_of(parent, struct bdi_writeback_congested,
+ rb_node);
+ if (congested->blkcg_id < blkcg_id)
+ node = &parent->rb_left;
+ else if (congested->blkcg_id > blkcg_id)
+ node = &parent->rb_right;
+ else
+ goto found;
+ }
+
+ if (new_congested) {
+ /* !found and storage for new one already allocated, insert */
+ congested = new_congested;
+ new_congested = NULL;
+ rb_link_node(&congested->rb_node, parent, node);
+ rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
+ goto found;
+ }
+
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+
+ /* allocate storage for new one and retry */
+ new_congested = kzalloc(sizeof(*new_congested), gfp);
+ if (!new_congested)
+ return NULL;
+
+ atomic_set(&new_congested->refcnt, 0);
+ new_congested->bdi = bdi;
+ new_congested->blkcg_id = blkcg_id;
+ goto retry;
+
+found:
+ atomic_inc(&congested->refcnt);
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+ kfree(new_congested);
+ return congested;
+}
+
+/**
+ * wb_congested_put - put a wb_congested
+ * @congested: wb_congested to put
+ *
+ * Put @congested and destroy it if the refcnt reaches zero.
+ */
+void wb_congested_put(struct bdi_writeback_congested *congested)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ /* bdi might already have been destroyed leaving @congested unlinked */
+ if (congested->bdi) {
+ rb_erase(&congested->rb_node,
+ &congested->bdi->cgwb_congested_tree);
+ congested->bdi = NULL;
+ }
+
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+ kfree(congested);
+}
+
+static void cgwb_release_workfn(struct work_struct *work)
+{
+ struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
+ release_work);
+ struct backing_dev_info *bdi = wb->bdi;
+
+ wb_shutdown(wb);
+
+ css_put(wb->memcg_css);
+ css_put(wb->blkcg_css);
+
+ fprop_local_destroy_percpu(&wb->memcg_completions);
+ percpu_ref_exit(&wb->refcnt);
+ wb_exit(wb);
+ kfree_rcu(wb, rcu);
+
+ if (atomic_dec_and_test(&bdi->usage_cnt))
+ wake_up_all(&cgwb_release_wait);
+}
+
+static void cgwb_release(struct percpu_ref *refcnt)
+{
+ struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
+ refcnt);
+ schedule_work(&wb->release_work);
+}
+
+static void cgwb_kill(struct bdi_writeback *wb)
+{
+ lockdep_assert_held(&cgwb_lock);
+
+ WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
+ list_del(&wb->memcg_node);
+ list_del(&wb->blkcg_node);
+ percpu_ref_kill(&wb->refcnt);
+}
+
+static int cgwb_create(struct backing_dev_info *bdi,
+ struct cgroup_subsys_state *memcg_css, gfp_t gfp)
+{
+ struct mem_cgroup *memcg;
+ struct cgroup_subsys_state *blkcg_css;
+ struct blkcg *blkcg;
+ struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
+ struct bdi_writeback *wb;
+ unsigned long flags;
+ int ret = 0;
+
+ memcg = mem_cgroup_from_css(memcg_css);
+ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
+ blkcg = css_to_blkcg(blkcg_css);
+ memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+ blkcg_cgwb_list = &blkcg->cgwb_list;
+
+ /* look up again under lock and discard on blkcg mismatch */
+ spin_lock_irqsave(&cgwb_lock, flags);
+ wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+ if (wb && wb->blkcg_css != blkcg_css) {
+ cgwb_kill(wb);
+ wb = NULL;
+ }
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+ if (wb)
+ goto out_put;
+
+ /* need to create a new one */
+ wb = kmalloc(sizeof(*wb), gfp);
+ if (!wb)
+ return -ENOMEM;
+
+ ret = wb_init(wb, bdi, blkcg_css->id, gfp);
+ if (ret)
+ goto err_free;
+
+ ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
+ if (ret)
+ goto err_wb_exit;
+
+ ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
+ if (ret)
+ goto err_ref_exit;
+
+ wb->memcg_css = memcg_css;
+ wb->blkcg_css = blkcg_css;
+ INIT_WORK(&wb->release_work, cgwb_release_workfn);
+ set_bit(WB_registered, &wb->state);
/*
- * Drain work list and shutdown the delayed_work. At this point,
- * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi
- * is dying and its work_list needs to be drained no matter what.
+ * The root wb determines the registered state of the whole bdi and
+ * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
+ * whether they're still online. Don't link @wb if any is dead.
+ * See wb_memcg_offline() and wb_blkcg_offline().
*/
- mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
- flush_delayed_work(&bdi->wb.dwork);
+ ret = -ENODEV;
+ spin_lock_irqsave(&cgwb_lock, flags);
+ if (test_bit(WB_registered, &bdi->wb.state) &&
+ blkcg_cgwb_list->next && memcg_cgwb_list->next) {
+ /* we might have raced another instance of this function */
+ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
+ if (!ret) {
+ atomic_inc(&bdi->usage_cnt);
+ list_add(&wb->memcg_node, memcg_cgwb_list);
+ list_add(&wb->blkcg_node, blkcg_cgwb_list);
+ css_get(memcg_css);
+ css_get(blkcg_css);
+ }
+ }
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+ if (ret) {
+ if (ret == -EEXIST)
+ ret = 0;
+ goto err_fprop_exit;
+ }
+ goto out_put;
+
+err_fprop_exit:
+ fprop_local_destroy_percpu(&wb->memcg_completions);
+err_ref_exit:
+ percpu_ref_exit(&wb->refcnt);
+err_wb_exit:
+ wb_exit(wb);
+err_free:
+ kfree(wb);
+out_put:
+ css_put(blkcg_css);
+ return ret;
}
-static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+/**
+ * wb_get_create - get wb for a given memcg, create if necessary
+ * @bdi: target bdi
+ * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
+ * @gfp: allocation mask to use
+ *
+ * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
+ * create one. The returned wb has its refcount incremented.
+ *
+ * This function uses css_get() on @memcg_css and thus expects its refcnt
+ * to be positive on invocation. IOW, rcu_read_lock() protection on
+ * @memcg_css isn't enough. try_get it before calling this function.
+ *
+ * A wb is keyed by its associated memcg. As blkcg implicitly enables
+ * memcg on the default hierarchy, memcg association is guaranteed to be
+ * more specific (equal or descendant to the associated blkcg) and thus can
+ * identify both the memcg and blkcg associations.
+ *
+ * Because the blkcg associated with a memcg may change as blkcg is enabled
+ * and disabled closer to root in the hierarchy, each wb keeps track of
+ * both the memcg and blkcg associated with it and verifies the blkcg on
+ * each lookup. On mismatch, the existing wb is discarded and a new one is
+ * created.
+ */
+struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+ struct cgroup_subsys_state *memcg_css,
+ gfp_t gfp)
{
- memset(wb, 0, sizeof(*wb));
+ struct bdi_writeback *wb;
+
+ might_sleep_if(gfp & __GFP_WAIT);
+
+ if (!memcg_css->parent)
+ return &bdi->wb;
+
+ do {
+ rcu_read_lock();
+ wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+ if (wb) {
+ struct cgroup_subsys_state *blkcg_css;
+
+ /* see whether the blkcg association has changed */
+ blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
+ &blkio_cgrp_subsys);
+ if (unlikely(wb->blkcg_css != blkcg_css ||
+ !wb_tryget(wb)))
+ wb = NULL;
+ css_put(blkcg_css);
+ }
+ rcu_read_unlock();
+ } while (!wb && !cgwb_create(bdi, memcg_css, gfp));
+
+ return wb;
+}
- wb->bdi = bdi;
- wb->last_old_flush = jiffies;
- INIT_LIST_HEAD(&wb->b_dirty);
- INIT_LIST_HEAD(&wb->b_io);
- INIT_LIST_HEAD(&wb->b_more_io);
- INIT_LIST_HEAD(&wb->b_dirty_time);
- spin_lock_init(&wb->list_lock);
- INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
+static int cgwb_bdi_init(struct backing_dev_info *bdi)
+{
+ int ret;
+
+ INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
+ bdi->cgwb_congested_tree = RB_ROOT;
+ atomic_set(&bdi->usage_cnt, 1);
+
+ ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
+ if (!ret) {
+ bdi->wb.memcg_css = mem_cgroup_root_css;
+ bdi->wb.blkcg_css = blkcg_root_css;
+ }
+ return ret;
}
-/*
- * Initial write bandwidth: 100 MB/s
+static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+{
+ struct radix_tree_iter iter;
+ struct bdi_writeback_congested *congested, *congested_n;
+ void **slot;
+
+ WARN_ON(test_bit(WB_registered, &bdi->wb.state));
+
+ spin_lock_irq(&cgwb_lock);
+
+ radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
+ cgwb_kill(*slot);
+
+ rbtree_postorder_for_each_entry_safe(congested, congested_n,
+ &bdi->cgwb_congested_tree, rb_node) {
+ rb_erase(&congested->rb_node, &bdi->cgwb_congested_tree);
+ congested->bdi = NULL; /* mark @congested unlinked */
+ }
+
+ spin_unlock_irq(&cgwb_lock);
+
+ /*
+ * All cgwb's and their congested states must be shutdown and
+ * released before returning. Drain the usage counter to wait for
+ * all cgwb's and cgwb_congested's ever created on @bdi.
+ */
+ atomic_dec(&bdi->usage_cnt);
+ wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
+}
+
+/**
+ * wb_memcg_offline - kill all wb's associated with a memcg being offlined
+ * @memcg: memcg being offlined
+ *
+ * Also prevents creation of any new wb's associated with @memcg.
*/
-#define INIT_BW (100 << (20 - PAGE_SHIFT))
+void wb_memcg_offline(struct mem_cgroup *memcg)
+{
+ LIST_HEAD(to_destroy);
+ struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+ struct bdi_writeback *wb, *next;
+
+ spin_lock_irq(&cgwb_lock);
+ list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
+ cgwb_kill(wb);
+ memcg_cgwb_list->next = NULL; /* prevent new wb's */
+ spin_unlock_irq(&cgwb_lock);
+}
-int bdi_init(struct backing_dev_info *bdi)
+/**
+ * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
+ * @blkcg: blkcg being offlined
+ *
+ * Also prevents creation of any new wb's associated with @blkcg.
+ */
+void wb_blkcg_offline(struct blkcg *blkcg)
{
- int i, err;
+ LIST_HEAD(to_destroy);
+ struct bdi_writeback *wb, *next;
+
+ spin_lock_irq(&cgwb_lock);
+ list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+ cgwb_kill(wb);
+ blkcg->cgwb_list.next = NULL; /* prevent new wb's */
+ spin_unlock_irq(&cgwb_lock);
+}
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static int cgwb_bdi_init(struct backing_dev_info *bdi)
+{
+ int err;
+
+ bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL);
+ if (!bdi->wb_congested)
+ return -ENOMEM;
+
+ err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
+ if (err) {
+ kfree(bdi->wb_congested);
+ return err;
+ }
+ return 0;
+}
+
+static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+int bdi_init(struct backing_dev_info *bdi)
+{
bdi->dev = NULL;
bdi->min_ratio = 0;
bdi->max_ratio = 100;
bdi->max_prop_frac = FPROP_FRAC_BASE;
- spin_lock_init(&bdi->wb_lock);
INIT_LIST_HEAD(&bdi->bdi_list);
- INIT_LIST_HEAD(&bdi->work_list);
+ init_waitqueue_head(&bdi->wb_waitq);
- bdi_wb_init(&bdi->wb, bdi);
+ return cgwb_bdi_init(bdi);
+}
+EXPORT_SYMBOL(bdi_init);
- for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
- err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
- if (err)
- goto err;
- }
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+ const char *fmt, ...)
+{
+ va_list args;
+ struct device *dev;
- bdi->dirty_exceeded = 0;
+ if (bdi->dev) /* The driver needs to use separate queues per device */
+ return 0;
- bdi->bw_time_stamp = jiffies;
- bdi->written_stamp = 0;
+ va_start(args, fmt);
+ dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
+ va_end(args);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
- bdi->balanced_dirty_ratelimit = INIT_BW;
- bdi->dirty_ratelimit = INIT_BW;
- bdi->write_bandwidth = INIT_BW;
- bdi->avg_write_bandwidth = INIT_BW;
+ bdi->dev = dev;
- err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
+ bdi_debug_register(bdi, dev_name(dev));
+ set_bit(WB_registered, &bdi->wb.state);
- if (err) {
-err:
- while (i--)
- percpu_counter_destroy(&bdi->bdi_stat[i]);
- }
+ spin_lock_bh(&bdi_lock);
+ list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
- return err;
+ trace_writeback_bdi_register(bdi);
+ return 0;
}
-EXPORT_SYMBOL(bdi_init);
+EXPORT_SYMBOL(bdi_register);
-void bdi_destroy(struct backing_dev_info *bdi)
+int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
- int i;
+ return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+}
+EXPORT_SYMBOL(bdi_register_dev);
+
+/*
+ * Remove bdi from bdi_list, and ensure that it is no longer visible
+ */
+static void bdi_remove_from_list(struct backing_dev_info *bdi)
+{
+ spin_lock_bh(&bdi_lock);
+ list_del_rcu(&bdi->bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ synchronize_rcu_expedited();
+}
+
+/*
+ * Called when the device behind @bdi has been removed or ejected.
+ *
+ * We can't really do much here except for reducing the dirty ratio at
+ * the moment. In the future we should be able to set a flag so that
+ * the filesystem can handle errors at mark_inode_dirty time instead
+ * of only at writeback time.
+ */
+void bdi_unregister(struct backing_dev_info *bdi)
+{
+ if (WARN_ON_ONCE(!bdi->dev))
+ return;
- bdi_wb_shutdown(bdi);
bdi_set_min_ratio(bdi, 0);
+}
+EXPORT_SYMBOL(bdi_unregister);
- WARN_ON(!list_empty(&bdi->work_list));
- WARN_ON(delayed_work_pending(&bdi->wb.dwork));
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+ /* make sure nobody finds us on the bdi_list anymore */
+ bdi_remove_from_list(bdi);
+ wb_shutdown(&bdi->wb);
+ cgwb_bdi_destroy(bdi);
if (bdi->dev) {
bdi_debug_unregister(bdi);
@@ -437,9 +853,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
bdi->dev = NULL;
}
- for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
- percpu_counter_destroy(&bdi->bdi_stat[i]);
- fprop_local_destroy_percpu(&bdi->completions);
+ wb_exit(&bdi->wb);
}
EXPORT_SYMBOL(bdi_destroy);
@@ -472,31 +886,31 @@ static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
};
-static atomic_t nr_bdi_congested[2];
+static atomic_t nr_wb_congested[2];
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+void clear_wb_congested(struct bdi_writeback_congested *congested, int sync)
{
- enum bdi_state bit;
wait_queue_head_t *wqh = &congestion_wqh[sync];
+ enum wb_state bit;
- bit = sync ? BDI_sync_congested : BDI_async_congested;
- if (test_and_clear_bit(bit, &bdi->state))
- atomic_dec(&nr_bdi_congested[sync]);
+ bit = sync ? WB_sync_congested : WB_async_congested;
+ if (test_and_clear_bit(bit, &congested->state))
+ atomic_dec(&nr_wb_congested[sync]);
smp_mb__after_atomic();
if (waitqueue_active(wqh))
wake_up(wqh);
}
-EXPORT_SYMBOL(clear_bdi_congested);
+EXPORT_SYMBOL(clear_wb_congested);
-void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+void set_wb_congested(struct bdi_writeback_congested *congested, int sync)
{
- enum bdi_state bit;
+ enum wb_state bit;
- bit = sync ? BDI_sync_congested : BDI_async_congested;
- if (!test_and_set_bit(bit, &bdi->state))
- atomic_inc(&nr_bdi_congested[sync]);
+ bit = sync ? WB_sync_congested : WB_async_congested;
+ if (!test_and_set_bit(bit, &congested->state))
+ atomic_inc(&nr_wb_congested[sync]);
}
-EXPORT_SYMBOL(set_bdi_congested);
+EXPORT_SYMBOL(set_wb_congested);
/**
* congestion_wait - wait for a backing_dev to become uncongested
@@ -555,7 +969,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
* encountered in the current zone, yield if necessary instead
* of sleeping on the congestion queue
*/
- if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+ if (atomic_read(&nr_wb_congested[sync]) == 0 ||
!test_bit(ZONE_CONGESTED, &zone->flags)) {
cond_resched();
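
The congested state that used to sit directly in backing_dev_info is now a refcounted object shared per (bdi, blkcg) pair, so every successful lookup must be balanced by a put. A hedged usage sketch (the wrapper function and its arguments are placeholders; the declarations live in the backing-dev headers of the full patch):

    #include <linux/backing-dev.h>

    static void example_congested_cycle(struct backing_dev_info *bdi,
                                        int blkcg_id)
    {
        struct bdi_writeback_congested *congested;

        congested = wb_congested_get_create(bdi, blkcg_id, GFP_KERNEL);
        if (!congested)
            return;

        set_wb_congested(congested, 1);    /* mark sync congested */
        clear_wb_congested(congested, 1);  /* ...and clear it again */
        wb_congested_put(congested);       /* balance the lookup */
    }
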
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index e10ccd299d66..a0baeb4be934 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -217,6 +217,28 @@ early_memremap(resource_size_t phys_addr, unsigned long size)
return (__force void *)__early_ioremap(phys_addr, size,
FIXMAP_PAGE_NORMAL);
}
+
+#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
+
+void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
+{
+ unsigned long slop, clen;
+ char *p;
+
+ while (size) {
+ slop = src & ~PAGE_MASK;
+ clen = size;
+ if (clen > MAX_MAP_CHUNK - slop)
+ clen = MAX_MAP_CHUNK - slop;
+ p = early_memremap(src & PAGE_MASK, clen + slop);
+ memcpy(dest, p + slop, clen);
+ early_memunmap(p, clen + slop);
+ dest += clen;
+ src += clen;
+ size -= clen;
+ }
+}
+
#else /* CONFIG_MMU */
void __init __iomem *
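
copy_from_early_mem() walks the source region in MAX_MAP_CHUNK pieces because only NR_FIX_BTMAPS fixmap slots exist that early in boot; callers see a single flat copy. An illustrative call (buffer, physical address and size are placeholders):

    /* e.g. pulling an initrd-sized blob out of physical memory
     * before the normal ioremap machinery is available: */
    copy_from_early_mem(dest_buf, phys_src, blob_size);
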
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 4a3907cf79f8..b8a5bc66b0c0 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -115,7 +115,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
case POSIX_FADV_NOREUSE:
break;
case POSIX_FADV_DONTNEED:
- if (!bdi_write_congested(bdi))
+ if (!inode_write_congested(mapping->host))
__filemap_fdatawrite_range(mapping, offset, endbyte,
WB_SYNC_NONE);
diff --git a/mm/failslab.c b/mm/failslab.c
index fefaabaab76d..98fb490311eb 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -3,12 +3,12 @@
static struct {
struct fault_attr attr;
- u32 ignore_gfp_wait;
- int cache_filter;
+ bool ignore_gfp_wait;
+ bool cache_filter;
} failslab = {
.attr = FAULT_ATTR_INITIALIZER,
- .ignore_gfp_wait = 1,
- .cache_filter = 0,
+ .ignore_gfp_wait = true,
+ .cache_filter = false,
};
bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
diff --git a/mm/filemap.c b/mm/filemap.c
index 7d4fa2bf6ac2..ceec993cf0da 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -100,6 +100,7 @@
* ->tree_lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
+ * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
@@ -176,9 +177,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
/*
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold the mapping's tree_lock.
+ * is safe. The caller must hold the mapping's tree_lock and
+ * mem_cgroup_begin_page_stat().
*/
-void __delete_from_page_cache(struct page *page, void *shadow)
+void __delete_from_page_cache(struct page *page, void *shadow,
+ struct mem_cgroup *memcg)
{
struct address_space *mapping = page->mapping;
@@ -212,7 +215,8 @@ void __delete_from_page_cache(struct page *page, void *shadow)
* anyway will be cleared before returning page into buddy allocator.
*/
if (WARN_ON_ONCE(PageDirty(page)))
- account_page_cleaned(page, mapping);
+ account_page_cleaned(page, mapping, memcg,
+ inode_to_wb(mapping->host));
}
/**
@@ -226,14 +230,20 @@ void __delete_from_page_cache(struct page *page, void *shadow)
void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ struct mem_cgroup *memcg;
+ unsigned long flags;
+
void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
freepage = mapping->a_ops->freepage;
- spin_lock_irq(&mapping->tree_lock);
- __delete_from_page_cache(page, NULL);
- spin_unlock_irq(&mapping->tree_lock);
+
+ memcg = mem_cgroup_begin_page_stat(page);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ __delete_from_page_cache(page, NULL, memcg);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
if (freepage)
freepage(page);
@@ -283,7 +293,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
if (!mapping_cap_writeback_dirty(mapping))
return 0;
+ wbc_attach_fdatawrite_inode(&wbc, mapping->host);
ret = do_writepages(mapping, &wbc);
+ wbc_detach_inode(&wbc);
return ret;
}
@@ -472,6 +484,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (!error) {
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *);
+ struct mem_cgroup *memcg;
+ unsigned long flags;
pgoff_t offset = old->index;
freepage = mapping->a_ops->freepage;
@@ -480,15 +494,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
new->mapping = mapping;
new->index = offset;
- spin_lock_irq(&mapping->tree_lock);
- __delete_from_page_cache(old, NULL);
+ memcg = mem_cgroup_begin_page_stat(old);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ __delete_from_page_cache(old, NULL, memcg);
error = radix_tree_insert(&mapping->page_tree, offset, new);
BUG_ON(error);
mapping->nrpages++;
__inc_zone_page_state(new, NR_FILE_PAGES);
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
mem_cgroup_migrate(old, new, true);
radix_tree_preload_end();
if (freepage)
diff --git a/mm/madvise.c b/mm/madvise.c
index d551475517bf..64bb8a22110c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c56a6bbb6909..9d2056ae8f82 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -79,6 +79,7 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
+struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
@@ -93,6 +94,7 @@ static const char * const mem_cgroup_stat_names[] = {
"rss",
"rss_huge",
"mapped_file",
+ "dirty",
"writeback",
"swap",
};
@@ -325,11 +327,6 @@ struct mem_cgroup {
* percpu counter.
*/
struct mem_cgroup_stat_cpu __percpu *stat;
- /*
- * used when a cpu is offlined or other synchronizations
- * See mem_cgroup_read_stat().
- */
- struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
@@ -349,6 +346,11 @@ struct mem_cgroup {
atomic_t numainfo_updating;
#endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head cgwb_list;
+ struct wb_domain cgwb_domain;
+#endif
+
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
@@ -599,6 +601,39 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
return &memcg->css;
}
+/**
+ * mem_cgroup_css_from_page - css of the memcg associated with a page
+ * @page: page of interest
+ *
+ * If memcg is bound to the default hierarchy, css of the memcg associated
+ * with @page is returned. The returned css remains associated with @page
+ * until it is released.
+ *
+ * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
+ * is returned.
+ *
+ * XXX: The above description of behavior on the default hierarchy isn't
+ * strictly true yet as replace_page_cache_page() can modify the
+ * association before @page is released even on the default hierarchy;
+ * however, the current and planned usages don't mix the two functions
+ * and replace_page_cache_page() will soon be updated to make the invariant
+ * actually true.
+ */
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+
+ memcg = page->mem_cgroup;
+
+ if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+ memcg = root_mem_cgroup;
+
+ rcu_read_unlock();
+ return &memcg->css;
+}
+
static struct mem_cgroup_per_zone *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
@@ -798,15 +833,8 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
long val = 0;
int cpu;
- get_online_cpus();
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
val += per_cpu(memcg->stat->count[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
- spin_lock(&memcg->pcp_counter_lock);
- val += memcg->nocpu_base.count[idx];
- spin_unlock(&memcg->pcp_counter_lock);
-#endif
- put_online_cpus();
return val;
}
@@ -816,15 +844,8 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
unsigned long val = 0;
int cpu;
- get_online_cpus();
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
val += per_cpu(memcg->stat->events[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
- spin_lock(&memcg->pcp_counter_lock);
- val += memcg->nocpu_base.events[idx];
- spin_unlock(&memcg->pcp_counter_lock);
-#endif
- put_online_cpus();
return val;
}
@@ -2014,6 +2035,7 @@ again:
return memcg;
}
+EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
/**
* mem_cgroup_end_page_stat - finish a page state statistics transaction
@@ -2032,6 +2054,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
rcu_read_unlock();
}
+EXPORT_SYMBOL(mem_cgroup_end_page_stat);
/**
* mem_cgroup_update_page_stat - update page state statistics
@@ -2175,37 +2198,12 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
mutex_unlock(&percpu_charge_mutex);
}
-/*
- * This function drains percpu counter value from DEAD cpu and
- * move it to local cpu. Note that this function can be preempted.
- */
-static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
-{
- int i;
-
- spin_lock(&memcg->pcp_counter_lock);
- for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- long x = per_cpu(memcg->stat->count[i], cpu);
-
- per_cpu(memcg->stat->count[i], cpu) = 0;
- memcg->nocpu_base.count[i] += x;
- }
- for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
- unsigned long x = per_cpu(memcg->stat->events[i], cpu);
-
- per_cpu(memcg->stat->events[i], cpu) = 0;
- memcg->nocpu_base.events[i] += x;
- }
- spin_unlock(&memcg->pcp_counter_lock);
-}
-
static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
unsigned long action,
void *hcpu)
{
int cpu = (unsigned long)hcpu;
struct memcg_stock_pcp *stock;
- struct mem_cgroup *iter;
if (action == CPU_ONLINE)
return NOTIFY_OK;
@@ -2213,9 +2211,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
return NOTIFY_OK;
- for_each_mem_cgroup(iter)
- mem_cgroup_drain_pcp_counter(iter, cpu);
-
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
return NOTIFY_OK;
@@ -4003,6 +3998,98 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
}
#endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
+{
+ return &memcg->cgwb_list;
+}
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+ return wb_domain_init(&memcg->cgwb_domain, gfp);
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+ wb_domain_exit(&memcg->cgwb_domain);
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+ wb_domain_size_changed(&memcg->cgwb_domain);
+}
+
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+
+ if (!memcg->css.parent)
+ return NULL;
+
+ return &memcg->cgwb_domain;
+}
+
+/**
+ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+ * @wb: bdi_writeback in question
+ * @pavail: out parameter for number of available pages
+ * @pdirty: out parameter for number of dirty pages
+ * @pwriteback: out parameter for number of pages under writeback
+ *
+ * Determine the numbers of available, dirty, and writeback pages in @wb's
+ * memcg. Dirty and writeback are self-explanatory. Available is a bit
+ * more involved.
+ *
+ * A memcg's headroom is "min(max, high) - used". The available memory is
+ * calculated as the lowest headroom of itself and the ancestors plus the
+ * number of pages already being used for file pages. Note that this
+ * doesn't consider the actual amount of available memory in the system.
+ * The caller should further cap *@pavail accordingly.
+ */
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+ unsigned long *pdirty, unsigned long *pwriteback)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ struct mem_cgroup *parent;
+ unsigned long head_room = PAGE_COUNTER_MAX;
+ unsigned long file_pages;
+
+ *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+
+ /* this should eventually include NR_UNSTABLE_NFS */
+ *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+
+ file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+ (1 << LRU_ACTIVE_FILE));
+ while ((parent = parent_mem_cgroup(memcg))) {
+ unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+ unsigned long used = page_counter_read(&memcg->memory);
+
+ head_room = min(head_room, ceiling - min(ceiling, used));
+ memcg = parent;
+ }
+
+ *pavail = file_pages + head_room;
+}
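
A hedged, standalone sketch of the headroom walk above, using invented limits and usages (illustration only, not part of the patch):

    #include <stdio.h>

    int main(void)
    {
            /* min(limit, high) and usage for the memcg and two ancestors */
            unsigned long ceiling[] = { 1000, 800, 2000 };
            unsigned long used[]    = {  700, 750, 1200 };
            unsigned long file_pages = 300;
            unsigned long head_room = ~0UL;

            for (int i = 0; i < 3; i++) {
                    unsigned long u = used[i] < ceiling[i] ? used[i] : ceiling[i];
                    unsigned long hr = ceiling[i] - u;      /* clamps at 0 */

                    if (hr < head_room)
                            head_room = hr;
            }

            /* tightest level is 800 - 750 = 50, so avail = 300 + 50 */
            printf("pavail = %lu\n", file_pages + head_room);
            return 0;
    }
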
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+ return 0;
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
/*
* DO NOT USE IN NEW FILES.
*
@@ -4387,9 +4474,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
if (!memcg->stat)
goto out_free;
+
+ if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+ goto out_free_stat;
+
spin_lock_init(&memcg->pcp_counter_lock);
return memcg;
+out_free_stat:
+ free_percpu(memcg->stat);
out_free:
kfree(memcg);
return NULL;
@@ -4416,6 +4509,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
free_mem_cgroup_per_zone_info(memcg, node);
free_percpu(memcg->stat);
+ memcg_wb_domain_exit(memcg);
kfree(memcg);
}
@@ -4448,6 +4542,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
/* root ? */
if (parent_css == NULL) {
root_mem_cgroup = memcg;
+ mem_cgroup_root_css = &memcg->css;
page_counter_init(&memcg->memory, NULL);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4466,7 +4561,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
#endif
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+ INIT_LIST_HEAD(&memcg->cgwb_list);
+#endif
return &memcg->css;
free_out:
@@ -4554,6 +4651,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
vmpressure_cleanup(&memcg->vmpressure);
memcg_deactivate_kmem(memcg);
+
+ wb_memcg_offline(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4587,6 +4686,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
memcg->low = 0;
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
+ memcg_wb_domain_size_changed(memcg);
}
#ifdef CONFIG_MMU
@@ -4756,6 +4856,7 @@ static int mem_cgroup_move_account(struct page *page,
{
unsigned long flags;
int ret;
+ bool anon;
VM_BUG_ON(from == to);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -4781,15 +4882,33 @@ static int mem_cgroup_move_account(struct page *page,
if (page->mem_cgroup != from)
goto out_unlock;
+ anon = PageAnon(page);
+
spin_lock_irqsave(&from->move_lock, flags);
- if (!PageAnon(page) && page_mapped(page)) {
+ if (!anon && page_mapped(page)) {
__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
nr_pages);
__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
nr_pages);
}
+ /*
+ * move_lock is grabbed above and the caller set from->moving_account,
+ * so mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+ * The mapping should thus be stable for dirty pages.
+ */
+ if (!anon && PageDirty(page)) {
+ struct address_space *mapping = page_mapping(page);
+
+ if (mapping_cap_account_dirty(mapping)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+ nr_pages);
+ }
+ }
+
if (PageWriteback(page)) {
__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
nr_pages);
@@ -5305,6 +5424,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
memcg->high = high;
+ memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -5337,6 +5457,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (err)
return err;
+ memcg_wb_domain_size_changed(memcg);
return nbytes;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index eb59f7eea508..33c2bf721e97 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -122,31 +122,31 @@ EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
-unsigned long global_dirty_limit;
+struct wb_domain global_wb_domain;
-/*
- * Scale the writeback cache size proportional to the relative writeout speeds.
- *
- * We do this by keeping a floating proportion between BDIs, based on page
- * writeback completions [end_page_writeback()]. Those devices that write out
- * pages fastest will get the larger share, while the slower will get a smaller
- * share.
- *
- * We use page writeout completions because we are interested in getting rid of
- * dirty pages. Having them written out is the primary goal.
- *
- * We introduce a concept of time, a period over which we measure these events,
- * because demand can/will vary over time. The length of this period itself is
- * measured in page writeback completions.
- *
- */
-static struct fprop_global writeout_completions;
+/* consolidated parameters for balance_dirty_pages() and its subroutines */
+struct dirty_throttle_control {
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct wb_domain *dom;
+ struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */
+#endif
+ struct bdi_writeback *wb;
+ struct fprop_local_percpu *wb_completions;
-static void writeout_period(unsigned long t);
-/* Timer for aging of writeout_completions */
-static struct timer_list writeout_period_timer =
- TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
-static unsigned long writeout_period_time = 0;
+ unsigned long avail; /* dirtyable */
+ unsigned long dirty; /* file_dirty + write + nfs */
+ unsigned long thresh; /* dirty threshold */
+ unsigned long bg_thresh; /* dirty background threshold */
+
+ unsigned long wb_dirty; /* per-wb counterparts */
+ unsigned long wb_thresh;
+ unsigned long wb_bg_thresh;
+
+ unsigned long pos_ratio;
+};
+
+#define DTC_INIT_COMMON(__wb) .wb = (__wb), \
+ .wb_completions = &(__wb)->completions
/*
* Length of period for aging writeout fractions of bdis. This is an
@@ -155,6 +155,97 @@ static unsigned long writeout_period_time = 0;
*/
#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#define GDTC_INIT(__wb) .dom = &global_wb_domain, \
+ DTC_INIT_COMMON(__wb)
+#define GDTC_INIT_NO_WB .dom = &global_wb_domain
+#define MDTC_INIT(__wb, __gdtc) .dom = mem_cgroup_wb_domain(__wb), \
+ .gdtc = __gdtc, \
+ DTC_INIT_COMMON(__wb)
+
+static bool mdtc_valid(struct dirty_throttle_control *dtc)
+{
+ return dtc->dom;
+}
+
+static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
+{
+ return dtc->dom;
+}
+
+static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
+{
+ return mdtc->gdtc;
+}
+
+static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+{
+ return &wb->memcg_completions;
+}
+
+static void wb_min_max_ratio(struct bdi_writeback *wb,
+ unsigned long *minp, unsigned long *maxp)
+{
+ unsigned long this_bw = wb->avg_write_bandwidth;
+ unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+ unsigned long long min = wb->bdi->min_ratio;
+ unsigned long long max = wb->bdi->max_ratio;
+
+ /*
+ * @wb may already be clean by the time control reaches here and
+ * the total may not include its bw.
+ */
+ if (this_bw < tot_bw) {
+ if (min) {
+ min *= this_bw;
+ do_div(min, tot_bw);
+ }
+ if (max < 100) {
+ max *= this_bw;
+ do_div(max, tot_bw);
+ }
+ }
+
+ *minp = min;
+ *maxp = max;
+}
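
A standalone sketch of the bandwidth scaling in wb_min_max_ratio(), with invented numbers: a wb contributing 30 of the bdi's 100 bandwidth units only gets 30% of the bdi-wide ratios:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long this_bw = 30, tot_bw = 100;
            unsigned long long min = 10, max = 60;  /* bdi->min/max_ratio */

            if (this_bw < tot_bw) {
                    if (min)
                            min = min * this_bw / tot_bw;
                    if (max < 100)
                            max = max * this_bw / tot_bw;
            }

            printf("min = %llu, max = %llu\n", min, max);   /* 3 and 18 */
            return 0;
    }
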
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+#define GDTC_INIT(__wb) DTC_INIT_COMMON(__wb)
+#define GDTC_INIT_NO_WB
+#define MDTC_INIT(__wb, __gdtc)
+
+static bool mdtc_valid(struct dirty_throttle_control *dtc)
+{
+ return false;
+}
+
+static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
+{
+ return &global_wb_domain;
+}
+
+static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
+{
+ return NULL;
+}
+
+static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+{
+ return NULL;
+}
+
+static void wb_min_max_ratio(struct bdi_writeback *wb,
+ unsigned long *minp, unsigned long *maxp)
+{
+ *minp = wb->bdi->min_ratio;
+ *maxp = wb->bdi->max_ratio;
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
/*
* In a memory zone, there is a certain amount of pages we consider
* available for the page cache, which is essentially the number of
@@ -250,42 +341,88 @@ static unsigned long global_dirtyable_memory(void)
return x + 1; /* Ensure that we never return 0 */
}
-/*
- * global_dirty_limits - background-writeback and dirty-throttling thresholds
+/**
+ * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
+ * @dtc: dirty_throttle_control of interest
*
- * Calculate the dirty thresholds based on sysctl parameters
- * - vm.dirty_background_ratio or vm.dirty_background_bytes
- * - vm.dirty_ratio or vm.dirty_bytes
- * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * Calculate @dtc->thresh and ->bg_thresh considering
+ * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
+ * must ensure that @dtc->avail is set before calling this function. The
+ * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
* real-time tasks.
*/
-void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
- const unsigned long available_memory = global_dirtyable_memory();
- unsigned long background;
- unsigned long dirty;
+ const unsigned long available_memory = dtc->avail;
+ struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
+ unsigned long bytes = vm_dirty_bytes;
+ unsigned long bg_bytes = dirty_background_bytes;
+ unsigned long ratio = vm_dirty_ratio;
+ unsigned long bg_ratio = dirty_background_ratio;
+ unsigned long thresh;
+ unsigned long bg_thresh;
struct task_struct *tsk;
- if (vm_dirty_bytes)
- dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+ /* gdtc is !NULL iff @dtc is for memcg domain */
+ if (gdtc) {
+ unsigned long global_avail = gdtc->avail;
+
+ /*
+ * The byte settings can't be applied directly to memcg
+ * domains. Convert them to ratios by scaling against
+ * globally available memory.
+ */
+ if (bytes)
+ ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 /
+ global_avail, 100UL);
+ if (bg_bytes)
+ bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 /
+ global_avail, 100UL);
+ bytes = bg_bytes = 0;
+ }
+
+ if (bytes)
+ thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
else
- dirty = (vm_dirty_ratio * available_memory) / 100;
+ thresh = (ratio * available_memory) / 100;
- if (dirty_background_bytes)
- background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+ if (bg_bytes)
+ bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
else
- background = (dirty_background_ratio * available_memory) / 100;
+ bg_thresh = (bg_ratio * available_memory) / 100;
- if (background >= dirty)
- background = dirty / 2;
+ if (bg_thresh >= thresh)
+ bg_thresh = thresh / 2;
tsk = current;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
- background += background / 4;
- dirty += dirty / 4;
+ bg_thresh += bg_thresh / 4;
+ thresh += thresh / 4;
}
- *pbackground = background;
- *pdirty = dirty;
- trace_global_dirty_state(background, dirty);
+ dtc->thresh = thresh;
+ dtc->bg_thresh = bg_thresh;
+
+ /* we should eventually report the domain in the TP */
+ if (!gdtc)
+ trace_global_dirty_state(bg_thresh, thresh);
+}
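
A standalone sketch of the bytes-to-ratio conversion domain_dirty_limits() applies to memcg domains, assuming a 4K PAGE_SIZE and invented settings:

    #include <stdio.h>

    #define PAGE_SIZE          4096UL
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            unsigned long bytes = 512UL << 20;      /* vm_dirty_bytes = 512MB */
            unsigned long global_avail = 1UL << 20; /* 4GB in 4K pages */
            unsigned long ratio = DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 /
                                  global_avail;

            if (ratio > 100)
                    ratio = 100;

            /* 512MB of a 4GB global domain becomes a 12% memcg ratio */
            printf("ratio = %lu%%\n", ratio);
            return 0;
    }
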
+
+/**
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ * @pbackground: out parameter for bg_thresh
+ * @pdirty: out parameter for thresh
+ *
+ * Calculate bg_thresh and thresh for global_wb_domain. See
+ * domain_dirty_limits() for details.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+ struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+
+ gdtc.avail = global_dirtyable_memory();
+ domain_dirty_limits(&gdtc);
+
+ *pbackground = gdtc.bg_thresh;
+ *pdirty = gdtc.thresh;
}
/**
@@ -392,47 +529,52 @@ static unsigned long wp_next_time(unsigned long cur_time)
return cur_time;
}
-/*
- * Increment the BDI's writeout completion count and the global writeout
- * completion count. Called from test_clear_page_writeback().
- */
-static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+static void wb_domain_writeout_inc(struct wb_domain *dom,
+ struct fprop_local_percpu *completions,
+ unsigned int max_prop_frac)
{
- __inc_bdi_stat(bdi, BDI_WRITTEN);
- __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
- bdi->max_prop_frac);
+ __fprop_inc_percpu_max(&dom->completions, completions,
+ max_prop_frac);
/* First event after period switching was turned off? */
- if (!unlikely(writeout_period_time)) {
+ if (!unlikely(dom->period_time)) {
/*
* We can race with other __bdi_writeout_inc calls here but
* it does not cause any harm since the resulting time when
* timer will fire and what is in writeout_period_time will be
* roughly the same.
*/
- writeout_period_time = wp_next_time(jiffies);
- mod_timer(&writeout_period_timer, writeout_period_time);
+ dom->period_time = wp_next_time(jiffies);
+ mod_timer(&dom->period_timer, dom->period_time);
}
}
-void bdi_writeout_inc(struct backing_dev_info *bdi)
+/*
+ * Increment @wb's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __wb_writeout_inc(struct bdi_writeback *wb)
{
- unsigned long flags;
+ struct wb_domain *cgdom;
- local_irq_save(flags);
- __bdi_writeout_inc(bdi);
- local_irq_restore(flags);
+ __inc_wb_stat(wb, WB_WRITTEN);
+ wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
+ wb->bdi->max_prop_frac);
+
+ cgdom = mem_cgroup_wb_domain(wb);
+ if (cgdom)
+ wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
+ wb->bdi->max_prop_frac);
}
-EXPORT_SYMBOL_GPL(bdi_writeout_inc);
-/*
- * Obtain an accurate fraction of the BDI's portion.
- */
-static void bdi_writeout_fraction(struct backing_dev_info *bdi,
- long *numerator, long *denominator)
+void wb_writeout_inc(struct bdi_writeback *wb)
{
- fprop_fraction_percpu(&writeout_completions, &bdi->completions,
- numerator, denominator);
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __wb_writeout_inc(wb);
+ local_irq_restore(flags);
}
+EXPORT_SYMBOL_GPL(wb_writeout_inc);
/*
* On idle system, we can be called long after we scheduled because we use
@@ -440,22 +582,46 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
*/
static void writeout_period(unsigned long t)
{
- int miss_periods = (jiffies - writeout_period_time) /
+ struct wb_domain *dom = (void *)t;
+ int miss_periods = (jiffies - dom->period_time) /
VM_COMPLETIONS_PERIOD_LEN;
- if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
- writeout_period_time = wp_next_time(writeout_period_time +
+ if (fprop_new_period(&dom->completions, miss_periods + 1)) {
+ dom->period_time = wp_next_time(dom->period_time +
miss_periods * VM_COMPLETIONS_PERIOD_LEN);
- mod_timer(&writeout_period_timer, writeout_period_time);
+ mod_timer(&dom->period_timer, dom->period_time);
} else {
/*
* Aging has zeroed all fractions. Stop wasting CPU on period
* updates.
*/
- writeout_period_time = 0;
+ dom->period_time = 0;
}
}
+int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
+{
+ memset(dom, 0, sizeof(*dom));
+
+ spin_lock_init(&dom->lock);
+
+ init_timer_deferrable(&dom->period_timer);
+ dom->period_timer.function = writeout_period;
+ dom->period_timer.data = (unsigned long)dom;
+
+ dom->dirty_limit_tstamp = jiffies;
+
+ return fprop_global_init(&dom->completions, gfp);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+void wb_domain_exit(struct wb_domain *dom)
+{
+ del_timer_sync(&dom->period_timer);
+ fprop_global_destroy(&dom->completions);
+}
+#endif
+
/*
* bdi_min_ratio keeps the sum of the minimum dirty shares of all
* registered backing devices, which, for obvious reasons, can not
@@ -510,17 +676,26 @@ static unsigned long dirty_freerun_ceiling(unsigned long thresh,
return (thresh + bg_thresh) / 2;
}
-static unsigned long hard_dirty_limit(unsigned long thresh)
+static unsigned long hard_dirty_limit(struct wb_domain *dom,
+ unsigned long thresh)
{
- return max(thresh, global_dirty_limit);
+ return max(thresh, dom->dirty_limit);
+}
+
+/* memory available to a memcg domain is capped by system-wide clean memory */
+static void mdtc_cap_avail(struct dirty_throttle_control *mdtc)
+{
+ struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
+ unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+
+ mdtc->avail = min(mdtc->avail, clean);
}
/**
- * bdi_dirty_limit - @bdi's share of dirty throttling threshold
- * @bdi: the backing_dev_info to query
- * @dirty: global dirty limit in pages
+ * __wb_calc_thresh - @wb's share of dirty throttling threshold
+ * @dtc: dirty_throttle_control of interest
*
- * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+ * Returns @wb's dirty limit in pages. The term "dirty" in the context of
* dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
*
* Note that balance_dirty_pages() will only seriously take it as a hard limit
@@ -528,34 +703,47 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
* control. For example, when the device is completely stalled due to some error
* conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
* In the other normal situations, it acts more gently by throttling the tasks
- * more (rather than completely block them) when the bdi dirty pages go high.
+ * more (rather than completely blocking them) when the wb dirty pages go high.
*
* It allocates high/low dirty limits to fast/slow devices, in order to prevent
* - starving fast devices
 * - piling up dirty pages (that will take a long time to sync) on slow devices
*
- * The bdi's share of dirty limit will be adapting to its throughput and
+ * The wb's share of dirty limit will be adapting to its throughput and
* bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
*/
-unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
- u64 bdi_dirty;
+ struct wb_domain *dom = dtc_dom(dtc);
+ unsigned long thresh = dtc->thresh;
+ u64 wb_thresh;
long numerator, denominator;
+ unsigned long wb_min_ratio, wb_max_ratio;
/*
- * Calculate this BDI's share of the dirty ratio.
+ * Calculate this BDI's share of the thresh ratio.
*/
- bdi_writeout_fraction(bdi, &numerator, &denominator);
+ fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
+ &numerator, &denominator);
+
+ wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
+ wb_thresh *= numerator;
+ do_div(wb_thresh, denominator);
- bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
- bdi_dirty *= numerator;
- do_div(bdi_dirty, denominator);
+ wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
- bdi_dirty += (dirty * bdi->min_ratio) / 100;
- if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
- bdi_dirty = dirty * bdi->max_ratio / 100;
+ wb_thresh += (thresh * wb_min_ratio) / 100;
+ if (wb_thresh > (thresh * wb_max_ratio) / 100)
+ wb_thresh = thresh * wb_max_ratio / 100;
- return bdi_dirty;
+ return wb_thresh;
+}
+
+unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
+{
+ struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
+ .thresh = thresh };
+ return __wb_calc_thresh(&gdtc);
}
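
A standalone sketch of the __wb_calc_thresh() arithmetic, assuming a completion fraction of 1/4 and wb min/max ratios of 5%/50% (all numbers invented):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long thresh = 10000;      /* domain threshold, pages */
            long numerator = 1, denominator = 4;    /* fprop completion fraction */
            unsigned long long bdi_min_ratio = 0;
            unsigned long long wb_min_ratio = 5, wb_max_ratio = 50;
            unsigned long long wb_thresh;

            wb_thresh = thresh * (100 - bdi_min_ratio) / 100;
            wb_thresh = wb_thresh * numerator / denominator;        /* 2500 */
            wb_thresh += thresh * wb_min_ratio / 100;               /* + 500 */
            if (wb_thresh > thresh * wb_max_ratio / 100)
                    wb_thresh = thresh * wb_max_ratio / 100;

            printf("wb_thresh = %llu\n", wb_thresh);        /* 3000 */
            return 0;
    }
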
/*
@@ -594,7 +782,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
*
* (o) global/bdi setpoints
*
- * We want the dirty pages be balanced around the global/bdi setpoints.
+ * We want the dirty pages to be balanced around the global/wb setpoints.
* When the number of dirty pages is higher/lower than the setpoint, the
* dirty position control ratio (and hence task dirty ratelimit) will be
* decreased/increased to bring the dirty pages back to the setpoint.
@@ -604,8 +792,8 @@ static long long pos_ratio_polynom(unsigned long setpoint,
* if (dirty < setpoint) scale up pos_ratio
* if (dirty > setpoint) scale down pos_ratio
*
- * if (bdi_dirty < bdi_setpoint) scale up pos_ratio
- * if (bdi_dirty > bdi_setpoint) scale down pos_ratio
+ * if (wb_dirty < wb_setpoint) scale up pos_ratio
+ * if (wb_dirty > wb_setpoint) scale down pos_ratio
*
* task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
*
@@ -630,7 +818,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
* 0 +------------.------------------.----------------------*------------->
* freerun^ setpoint^ limit^ dirty pages
*
- * (o) bdi control line
+ * (o) wb control line
*
* ^ pos_ratio
* |
@@ -656,33 +844,32 @@ static long long pos_ratio_polynom(unsigned long setpoint,
* | . .
* | . .
* 0 +----------------------.-------------------------------.------------->
- * bdi_setpoint^ x_intercept^
+ * wb_setpoint^ x_intercept^
*
- * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
+ * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
* be smoothly throttled down to normal if it starts high in situations like
* - start writing to a slow SD card and a fast disk at the same time. The SD
- * card's bdi_dirty may rush to many times higher than bdi_setpoint.
- * - the bdi dirty thresh drops quickly due to change of JBOD workload
+ * card's wb_dirty may rush to many times higher than wb_setpoint.
+ * - the wb dirty thresh drops quickly due to change of JBOD workload
*/
-static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
- unsigned long thresh,
- unsigned long bg_thresh,
- unsigned long dirty,
- unsigned long bdi_thresh,
- unsigned long bdi_dirty)
-{
- unsigned long write_bw = bdi->avg_write_bandwidth;
- unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
- unsigned long limit = hard_dirty_limit(thresh);
+static void wb_position_ratio(struct dirty_throttle_control *dtc)
+{
+ struct bdi_writeback *wb = dtc->wb;
+ unsigned long write_bw = wb->avg_write_bandwidth;
+ unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
+ unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
+ unsigned long wb_thresh = dtc->wb_thresh;
unsigned long x_intercept;
unsigned long setpoint; /* dirty pages' target balance point */
- unsigned long bdi_setpoint;
+ unsigned long wb_setpoint;
unsigned long span;
long long pos_ratio; /* for scaling up/down the rate limit */
long x;
- if (unlikely(dirty >= limit))
- return 0;
+ dtc->pos_ratio = 0;
+
+ if (unlikely(dtc->dirty >= limit))
+ return;
/*
* global setpoint
@@ -690,165 +877,167 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
* See comment for pos_ratio_polynom().
*/
setpoint = (freerun + limit) / 2;
- pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);
+ pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
/*
* The strictlimit feature is a tool preventing mistrusted filesystems
* from growing a large number of dirty pages before throttling. For
- * such filesystems balance_dirty_pages always checks bdi counters
- * against bdi limits. Even if global "nr_dirty" is under "freerun".
+ * such filesystems balance_dirty_pages always checks wb counters
+ * against wb limits. Even if global "nr_dirty" is under "freerun".
* This is especially important for fuse which sets bdi->max_ratio to
* 1% by default. Without strictlimit feature, fuse writeback may
 * consume an arbitrary amount of RAM because it is accounted in
* NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
*
- * Here, in bdi_position_ratio(), we calculate pos_ratio based on
- * two values: bdi_dirty and bdi_thresh. Let's consider an example:
+ * Here, in wb_position_ratio(), we calculate pos_ratio based on
+ * two values: wb_dirty and wb_thresh. Let's consider an example:
* total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
* limits are set by default to 10% and 20% (background and throttle).
- * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
- * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
- * about ~6K pages (as the average of background and throttle bdi
+ * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
+ * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
+ * about ~6K pages (as the average of background and throttle wb
* limits). The 3rd order polynomial will provide positive feedback if
- * bdi_dirty is under bdi_setpoint and vice versa.
+ * wb_dirty is under wb_setpoint and vice versa.
*
* Note, that we cannot use global counters in these calculations
- * because we want to throttle process writing to a strictlimit BDI
+ * because we want to throttle process writing to a strictlimit wb
* much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
* in the example above).
*/
- if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
- long long bdi_pos_ratio;
- unsigned long bdi_bg_thresh;
+ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+ long long wb_pos_ratio;
- if (bdi_dirty < 8)
- return min_t(long long, pos_ratio * 2,
- 2 << RATELIMIT_CALC_SHIFT);
+ if (dtc->wb_dirty < 8) {
+ dtc->pos_ratio = min_t(long long, pos_ratio * 2,
+ 2 << RATELIMIT_CALC_SHIFT);
+ return;
+ }
- if (bdi_dirty >= bdi_thresh)
- return 0;
+ if (dtc->wb_dirty >= wb_thresh)
+ return;
- bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
- bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
- bdi_bg_thresh);
+ wb_setpoint = dirty_freerun_ceiling(wb_thresh,
+ dtc->wb_bg_thresh);
- if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
- return 0;
+ if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
+ return;
- bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
- bdi_thresh);
+ wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
+ wb_thresh);
/*
- * Typically, for strictlimit case, bdi_setpoint << setpoint
- * and pos_ratio >> bdi_pos_ratio. In the other words global
+ * Typically, for strictlimit case, wb_setpoint << setpoint
+ * and pos_ratio >> wb_pos_ratio. In other words, the global
 * state ("dirty") is not the limiting factor and we have to
- * make decision based on bdi counters. But there is an
+ * make a decision based on wb counters. But there is an
* important case when global pos_ratio should get precedence:
* global limits are exceeded (e.g. due to activities on other
- * BDIs) while given strictlimit BDI is below limit.
+ * wb's) while the given strictlimit wb is below its limit.
*
- * "pos_ratio * bdi_pos_ratio" would work for the case above,
+ * "pos_ratio * wb_pos_ratio" would work for the case above,
* but it would look too non-natural for the case of all
- * activity in the system coming from a single strictlimit BDI
+ * activity in the system coming from a single strictlimit wb
* with bdi->max_ratio == 100%.
*
* Note that min() below somewhat changes the dynamics of the
* control system. Normally, pos_ratio value can be well over 3
- * (when globally we are at freerun and bdi is well below bdi
+ * (when globally we are at freerun and wb is well below wb
* setpoint). Now the maximum pos_ratio in the same situation
* is 2. We might want to tweak this if we observe the control
* system is too slow to adapt.
*/
- return min(pos_ratio, bdi_pos_ratio);
+ dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
+ return;
}
/*
* We have computed basic pos_ratio above based on global situation. If
- * the bdi is over/under its share of dirty pages, we want to scale
+ * the wb is over/under its share of dirty pages, we want to scale
* pos_ratio further down/up. That is done by the following mechanism.
*/
/*
- * bdi setpoint
+ * wb setpoint
*
- * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
+ * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
*
- * x_intercept - bdi_dirty
+ * x_intercept - wb_dirty
* := --------------------------
- * x_intercept - bdi_setpoint
+ * x_intercept - wb_setpoint
*
- * The main bdi control line is a linear function that subjects to
+ * The main wb control line is a linear function that subjects to
*
- * (1) f(bdi_setpoint) = 1.0
- * (2) k = - 1 / (8 * write_bw) (in single bdi case)
- * or equally: x_intercept = bdi_setpoint + 8 * write_bw
+ * (1) f(wb_setpoint) = 1.0
+ * (2) k = - 1 / (8 * write_bw) (in single wb case)
+ * or equally: x_intercept = wb_setpoint + 8 * write_bw
*
- * For single bdi case, the dirty pages are observed to fluctuate
+ * For single wb case, the dirty pages are observed to fluctuate
* regularly within range
- * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
+ * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
 * for various filesystems, where (2) can yield a reasonable 12.5%
* fluctuation range for pos_ratio.
*
- * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
+ * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
* own size, so move the slope over accordingly and choose a slope that
- * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
+ * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
*/
- if (unlikely(bdi_thresh > thresh))
- bdi_thresh = thresh;
+ if (unlikely(wb_thresh > dtc->thresh))
+ wb_thresh = dtc->thresh;
/*
- * It's very possible that bdi_thresh is close to 0 not because the
+ * It's very possible that wb_thresh is close to 0 not because the
 * device is slow, but because it has remained inactive for a long time.
* Honour such devices a reasonable good (hopefully IO efficient)
* threshold, so that the occasional writes won't be blocked and active
* writes can rampup the threshold quickly.
*/
- bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
+ wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
/*
- * scale global setpoint to bdi's:
- * bdi_setpoint = setpoint * bdi_thresh / thresh
+ * scale global setpoint to wb's:
+ * wb_setpoint = setpoint * wb_thresh / thresh
*/
- x = div_u64((u64)bdi_thresh << 16, thresh | 1);
- bdi_setpoint = setpoint * (u64)x >> 16;
+ x = div_u64((u64)wb_thresh << 16, dtc->thresh + 1);
+ wb_setpoint = setpoint * (u64)x >> 16;
/*
- * Use span=(8*write_bw) in single bdi case as indicated by
- * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
+ * Use span=(8*write_bw) in single wb case as indicated by
+ * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
*
- * bdi_thresh thresh - bdi_thresh
- * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
- * thresh thresh
+ * wb_thresh thresh - wb_thresh
+ * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
+ * thresh thresh
*/
- span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
- x_intercept = bdi_setpoint + span;
+ span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
+ x_intercept = wb_setpoint + span;
- if (bdi_dirty < x_intercept - span / 4) {
- pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
- (x_intercept - bdi_setpoint) | 1);
+ if (dtc->wb_dirty < x_intercept - span / 4) {
+ pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
+ x_intercept - wb_setpoint + 1);
} else
pos_ratio /= 4;
/*
- * bdi reserve area, safeguard against dirty pool underrun and disk idle
+ * wb reserve area, safeguard against dirty pool underrun and disk idle
* It may push the desired control point of global dirty pages higher
* than setpoint.
*/
- x_intercept = bdi_thresh / 2;
- if (bdi_dirty < x_intercept) {
- if (bdi_dirty > x_intercept / 8)
- pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
+ x_intercept = wb_thresh / 2;
+ if (dtc->wb_dirty < x_intercept) {
+ if (dtc->wb_dirty > x_intercept / 8)
+ pos_ratio = div_u64(pos_ratio * x_intercept,
+ dtc->wb_dirty);
else
pos_ratio *= 8;
}
- return pos_ratio;
+ dtc->pos_ratio = pos_ratio;
}
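
A standalone sketch of the linear wb control line above, with invented numbers and pos_ratio in fixed point with an assumed 10-bit shift: halfway between wb_setpoint and x_intercept the ratio drops to roughly 0.5:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long pos_ratio = 1 << 10;         /* 1.0 */
            unsigned long wb_setpoint = 1000, span = 8000;
            unsigned long x_intercept = wb_setpoint + span; /* 9000 */
            unsigned long wb_dirty = 5000;

            pos_ratio = pos_ratio * (x_intercept - wb_dirty) /
                        (x_intercept - wb_setpoint + 1);

            printf("pos_ratio = %llu/1024\n", pos_ratio);   /* ~511, i.e. ~0.5 */
            return 0;
    }
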
-static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
- unsigned long elapsed,
- unsigned long written)
+static void wb_update_write_bandwidth(struct bdi_writeback *wb,
+ unsigned long elapsed,
+ unsigned long written)
{
const unsigned long period = roundup_pow_of_two(3 * HZ);
- unsigned long avg = bdi->avg_write_bandwidth;
- unsigned long old = bdi->write_bandwidth;
+ unsigned long avg = wb->avg_write_bandwidth;
+ unsigned long old = wb->write_bandwidth;
u64 bw;
/*
@@ -861,14 +1050,14 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
* @written may have decreased due to account_page_redirty().
* Avoid underflowing @bw calculation.
*/
- bw = written - min(written, bdi->written_stamp);
+ bw = written - min(written, wb->written_stamp);
bw *= HZ;
if (unlikely(elapsed > period)) {
do_div(bw, elapsed);
avg = bw;
goto out;
}
- bw += (u64)bdi->write_bandwidth * (period - elapsed);
+ bw += (u64)wb->write_bandwidth * (period - elapsed);
bw >>= ilog2(period);
/*
@@ -881,21 +1070,22 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
avg += (old - avg) >> 3;
out:
- bdi->write_bandwidth = bw;
- bdi->avg_write_bandwidth = avg;
+ /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
+ avg = max(avg, 1LU);
+ if (wb_has_dirty_io(wb)) {
+ long delta = avg - wb->avg_write_bandwidth;
+ WARN_ON_ONCE(atomic_long_add_return(delta,
+ &wb->bdi->tot_write_bandwidth) <= 0);
+ }
+ wb->write_bandwidth = bw;
+ wb->avg_write_bandwidth = avg;
}
-/*
- * The global dirtyable memory and dirty threshold could be suddenly knocked
- * down by a large amount (eg. on the startup of KVM in a swapless system).
- * This may throw the system into deep dirty exceeded state and throttle
- * heavy/light dirtiers alike. To retain good responsiveness, maintain
- * global_dirty_limit for tracking slowly down to the knocked down dirty
- * threshold.
- */
-static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+static void update_dirty_limit(struct dirty_throttle_control *dtc)
{
- unsigned long limit = global_dirty_limit;
+ struct wb_domain *dom = dtc_dom(dtc);
+ unsigned long thresh = dtc->thresh;
+ unsigned long limit = dom->dirty_limit;
/*
* Follow up in one step.
@@ -908,63 +1098,57 @@ static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
/*
* Follow down slowly. Use the higher one as the target, because thresh
* may drop below dirty. This is exactly the reason to introduce
- * global_dirty_limit which is guaranteed to lie above the dirty pages.
+ * dom->dirty_limit which is guaranteed to lie above the dirty pages.
*/
- thresh = max(thresh, dirty);
+ thresh = max(thresh, dtc->dirty);
if (limit > thresh) {
limit -= (limit - thresh) >> 5;
goto update;
}
return;
update:
- global_dirty_limit = limit;
+ dom->dirty_limit = limit;
}
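
A standalone sketch of the "follow down slowly" step: each update moves dom->dirty_limit 1/32 of the way toward a suddenly lowered threshold (invented numbers):

    #include <stdio.h>

    int main(void)
    {
            unsigned long limit = 100000, thresh = 20000;

            for (int i = 0; i < 5; i++) {
                    if (limit > thresh)
                            limit -= (limit - thresh) >> 5;
                    printf("step %d: limit = %lu\n", i, limit);
            }
            return 0;
    }
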
-static void global_update_bandwidth(unsigned long thresh,
- unsigned long dirty,
+static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
unsigned long now)
{
- static DEFINE_SPINLOCK(dirty_lock);
- static unsigned long update_time = INITIAL_JIFFIES;
+ struct wb_domain *dom = dtc_dom(dtc);
/*
* check locklessly first to optimize away locking for the most time
*/
- if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+ if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
return;
- spin_lock(&dirty_lock);
- if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
- update_dirty_limit(thresh, dirty);
- update_time = now;
+ spin_lock(&dom->lock);
+ if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
+ update_dirty_limit(dtc);
+ dom->dirty_limit_tstamp = now;
}
- spin_unlock(&dirty_lock);
+ spin_unlock(&dom->lock);
}
/*
- * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
+ * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
*
- * Normal bdi tasks will be curbed at or below it in long term.
+ * Normal wb tasks will be curbed at or below it in long term.
* Obviously it should be around (write_bw / N) when there are N dd tasks.
*/
-static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
- unsigned long thresh,
- unsigned long bg_thresh,
- unsigned long dirty,
- unsigned long bdi_thresh,
- unsigned long bdi_dirty,
- unsigned long dirtied,
- unsigned long elapsed)
-{
- unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
- unsigned long limit = hard_dirty_limit(thresh);
+static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
+ unsigned long dirtied,
+ unsigned long elapsed)
+{
+ struct bdi_writeback *wb = dtc->wb;
+ unsigned long dirty = dtc->dirty;
+ unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
+ unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
unsigned long setpoint = (freerun + limit) / 2;
- unsigned long write_bw = bdi->avg_write_bandwidth;
- unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
+ unsigned long write_bw = wb->avg_write_bandwidth;
+ unsigned long dirty_ratelimit = wb->dirty_ratelimit;
unsigned long dirty_rate;
unsigned long task_ratelimit;
unsigned long balanced_dirty_ratelimit;
- unsigned long pos_ratio;
unsigned long step;
unsigned long x;
@@ -972,20 +1156,18 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
* The dirty rate will match the writeout rate in long term, except
* when dirty pages are truncated by userspace or re-dirtied by FS.
*/
- dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+ dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
- pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
- bdi_thresh, bdi_dirty);
/*
* task_ratelimit reflects each dd's dirty rate for the past 200ms.
*/
task_ratelimit = (u64)dirty_ratelimit *
- pos_ratio >> RATELIMIT_CALC_SHIFT;
+ dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */
/*
* A linear estimation of the "balanced" throttle rate. The theory is,
- * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
+ * if there are N dd tasks, each throttled at task_ratelimit, the wb's
* dirty_rate will be measured to be (N * task_ratelimit). So the below
* formula will yield the balanced rate limit (write_bw / N).
*
@@ -1024,7 +1206,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
/*
* We could safely do this and return immediately:
*
- * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
+ * wb->dirty_ratelimit = balanced_dirty_ratelimit;
*
* However to get a more stable dirty_ratelimit, the below elaborated
* code makes use of task_ratelimit to filter out singular points and
@@ -1058,32 +1240,31 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
step = 0;
/*
- * For strictlimit case, calculations above were based on bdi counters
- * and limits (starting from pos_ratio = bdi_position_ratio() and up to
+ * For strictlimit case, calculations above were based on wb counters
+ * and limits (starting from pos_ratio = wb_position_ratio() and up to
* balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
- * Hence, to calculate "step" properly, we have to use bdi_dirty as
- * "dirty" and bdi_setpoint as "setpoint".
+ * Hence, to calculate "step" properly, we have to use wb_dirty as
+ * "dirty" and wb_setpoint as "setpoint".
*
- * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
- * it's possible that bdi_thresh is close to zero due to inactivity
- * of backing device (see the implementation of bdi_dirty_limit()).
+ * We rampup dirty_ratelimit forcibly if wb_dirty is low because
+ * it's possible that wb_thresh is close to zero due to inactivity
+ * of the backing device.
*/
- if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
- dirty = bdi_dirty;
- if (bdi_dirty < 8)
- setpoint = bdi_dirty + 1;
+ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+ dirty = dtc->wb_dirty;
+ if (dtc->wb_dirty < 8)
+ setpoint = dtc->wb_dirty + 1;
else
- setpoint = (bdi_thresh +
- bdi_dirty_limit(bdi, bg_thresh)) / 2;
+ setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
}
if (dirty < setpoint) {
- x = min3(bdi->balanced_dirty_ratelimit,
+ x = min3(wb->balanced_dirty_ratelimit,
balanced_dirty_ratelimit, task_ratelimit);
if (dirty_ratelimit < x)
step = x - dirty_ratelimit;
} else {
- x = max3(bdi->balanced_dirty_ratelimit,
+ x = max3(wb->balanced_dirty_ratelimit,
balanced_dirty_ratelimit, task_ratelimit);
if (dirty_ratelimit > x)
step = dirty_ratelimit - x;
@@ -1105,69 +1286,67 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
else
dirty_ratelimit -= step;
- bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
- bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
+ wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+ wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
- trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
+ trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
}
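
A standalone sketch of the task_ratelimit scaling used above, assuming RATELIMIT_CALC_SHIFT is 10 (the constant is not defined in this hunk): a base rate of 4000 pages/s at pos_ratio 0.5 throttles each task at about 2000 pages/s:

    #include <stdio.h>

    #define RATELIMIT_CALC_SHIFT 10 /* assumed value */

    int main(void)
    {
            unsigned long long dirty_ratelimit = 4000;      /* pages/s */
            unsigned long long pos_ratio = 512;             /* 0.5 in fixed point */
            unsigned long long task_ratelimit =
                    (dirty_ratelimit * pos_ratio) >> RATELIMIT_CALC_SHIFT;

            printf("task_ratelimit = %llu pages/s\n", task_ratelimit); /* 2000 */
            return 0;
    }
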
-void __bdi_update_bandwidth(struct backing_dev_info *bdi,
- unsigned long thresh,
- unsigned long bg_thresh,
- unsigned long dirty,
- unsigned long bdi_thresh,
- unsigned long bdi_dirty,
- unsigned long start_time)
+static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
+ struct dirty_throttle_control *mdtc,
+ unsigned long start_time,
+ bool update_ratelimit)
{
+ struct bdi_writeback *wb = gdtc->wb;
unsigned long now = jiffies;
- unsigned long elapsed = now - bdi->bw_time_stamp;
+ unsigned long elapsed = now - wb->bw_time_stamp;
unsigned long dirtied;
unsigned long written;
+ lockdep_assert_held(&wb->list_lock);
+
/*
* rate-limit, only update once every 200ms.
*/
if (elapsed < BANDWIDTH_INTERVAL)
return;
- dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
- written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+ dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
+ written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
/*
* Skip quiet periods when disk bandwidth is under-utilized.
* (at least 1s idle time between two flusher runs)
*/
- if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+ if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
goto snapshot;
- if (thresh) {
- global_update_bandwidth(thresh, dirty, now);
- bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
- bdi_thresh, bdi_dirty,
- dirtied, elapsed);
+ if (update_ratelimit) {
+ domain_update_bandwidth(gdtc, now);
+ wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
+
+ /*
+ * @mdtc is always NULL if !CGROUP_WRITEBACK but the
+ * compiler has no way to figure that out. Help it.
+ */
+ if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
+ domain_update_bandwidth(mdtc, now);
+ wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
+ }
}
- bdi_update_write_bandwidth(bdi, elapsed, written);
+ wb_update_write_bandwidth(wb, elapsed, written);
snapshot:
- bdi->dirtied_stamp = dirtied;
- bdi->written_stamp = written;
- bdi->bw_time_stamp = now;
+ wb->dirtied_stamp = dirtied;
+ wb->written_stamp = written;
+ wb->bw_time_stamp = now;
}
-static void bdi_update_bandwidth(struct backing_dev_info *bdi,
- unsigned long thresh,
- unsigned long bg_thresh,
- unsigned long dirty,
- unsigned long bdi_thresh,
- unsigned long bdi_dirty,
- unsigned long start_time)
+void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
{
- if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
- return;
- spin_lock(&bdi->wb.list_lock);
- __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
- bdi_thresh, bdi_dirty, start_time);
- spin_unlock(&bdi->wb.list_lock);
+ struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
+
+ __wb_update_bandwidth(&gdtc, NULL, start_time, false);
}
/*
@@ -1187,10 +1366,10 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
return 1;
}
-static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
- unsigned long bdi_dirty)
+static unsigned long wb_max_pause(struct bdi_writeback *wb,
+ unsigned long wb_dirty)
{
- unsigned long bw = bdi->avg_write_bandwidth;
+ unsigned long bw = wb->avg_write_bandwidth;
unsigned long t;
/*
@@ -1200,20 +1379,20 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
*
* 8 serves as the safety ratio.
*/
- t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
+ t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
t++;
return min_t(unsigned long, t, MAX_PAUSE);
}
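
A standalone sketch of the wb_max_pause() formula, assuming HZ=1000 and a 200ms MAX_PAUSE (both assumptions, neither is shown in this patch):

    #include <stdio.h>

    #define HZ        1000
    #define MAX_PAUSE (HZ / 5)      /* assumed 200ms cap */

    int main(void)
    {
            unsigned long wb_dirty = 10000, bw = 1280;      /* pages, pages/s */
            /* roundup_pow_of_two(1 + HZ / 8) == 128 for HZ = 1000 */
            unsigned long t = wb_dirty / (1 + bw / 128) + 1;

            if (t > MAX_PAUSE)
                    t = MAX_PAUSE;
            printf("max pause = %lu jiffies\n", t);         /* clamped to 200 */
            return 0;
    }
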
-static long bdi_min_pause(struct backing_dev_info *bdi,
- long max_pause,
- unsigned long task_ratelimit,
- unsigned long dirty_ratelimit,
- int *nr_dirtied_pause)
+static long wb_min_pause(struct bdi_writeback *wb,
+ long max_pause,
+ unsigned long task_ratelimit,
+ unsigned long dirty_ratelimit,
+ int *nr_dirtied_pause)
{
- long hi = ilog2(bdi->avg_write_bandwidth);
- long lo = ilog2(bdi->dirty_ratelimit);
+ long hi = ilog2(wb->avg_write_bandwidth);
+ long lo = ilog2(wb->dirty_ratelimit);
long t; /* target pause */
long pause; /* estimated next pause */
int pages; /* target nr_dirtied_pause */
@@ -1281,34 +1460,27 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
}
-static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
- unsigned long dirty_thresh,
- unsigned long background_thresh,
- unsigned long *bdi_dirty,
- unsigned long *bdi_thresh,
- unsigned long *bdi_bg_thresh)
+static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{
- unsigned long bdi_reclaimable;
+ struct bdi_writeback *wb = dtc->wb;
+ unsigned long wb_reclaimable;
/*
- * bdi_thresh is not treated as some limiting factor as
+ * wb_thresh is not treated as some limiting factor as
* dirty_thresh, due to reasons
- * - in JBOD setup, bdi_thresh can fluctuate a lot
+ * - in JBOD setup, wb_thresh can fluctuate a lot
* - in a system with HDD and USB key, the USB key may somehow
- * go into state (bdi_dirty >> bdi_thresh) either because
- * bdi_dirty starts high, or because bdi_thresh drops low.
+ * go into state (wb_dirty >> wb_thresh) either because
+ * wb_dirty starts high, or because wb_thresh drops low.
* In this case we don't want to hard throttle the USB key
- * dirtiers for 100 seconds until bdi_dirty drops under
- * bdi_thresh. Instead the auxiliary bdi control line in
- * bdi_position_ratio() will let the dirtier task progress
- * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
+ * dirtiers for 100 seconds until wb_dirty drops under
+ * wb_thresh. Instead the auxiliary wb control line in
+ * wb_position_ratio() will let the dirtier task progress
+ * at some rate <= (write_bw / 2) for bringing down wb_dirty.
*/
- *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
-
- if (bdi_bg_thresh)
- *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
- background_thresh,
- dirty_thresh) : 0;
+ dtc->wb_thresh = __wb_calc_thresh(dtc);
+ dtc->wb_bg_thresh = dtc->thresh ?
+ div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
/*
* In order to avoid the stacked BDI deadlock we need
@@ -1320,14 +1492,12 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
* actually dirty; with m+n sitting in the percpu
* deltas.
*/
- if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
- bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
- *bdi_dirty = bdi_reclaimable +
- bdi_stat_sum(bdi, BDI_WRITEBACK);
+ if (dtc->wb_thresh < 2 * wb_stat_error(wb)) {
+ wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+ dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
} else {
- bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- *bdi_dirty = bdi_reclaimable +
- bdi_stat(bdi, BDI_WRITEBACK);
+ wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+ dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
}
}
@@ -1339,12 +1509,16 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
* perform some writeout.
*/
static void balance_dirty_pages(struct address_space *mapping,
+ struct bdi_writeback *wb,
unsigned long pages_dirtied)
{
+ struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+ struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+ struct dirty_throttle_control * const gdtc = &gdtc_stor;
+ struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+ &mdtc_stor : NULL;
+ struct dirty_throttle_control *sdtc;
unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
- unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
- unsigned long background_thresh;
- unsigned long dirty_thresh;
long period;
long pause;
long max_pause;
@@ -1353,18 +1527,14 @@ static void balance_dirty_pages(struct address_space *mapping,
bool dirty_exceeded = false;
unsigned long task_ratelimit;
unsigned long dirty_ratelimit;
- unsigned long pos_ratio;
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct backing_dev_info *bdi = wb->bdi;
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
for (;;) {
unsigned long now = jiffies;
- unsigned long uninitialized_var(bdi_thresh);
- unsigned long thresh;
- unsigned long uninitialized_var(bdi_dirty);
- unsigned long dirty;
- unsigned long bg_thresh;
+ unsigned long dirty, thresh, bg_thresh;
+ unsigned long m_dirty, m_thresh, m_bg_thresh;
/*
* Unstable writes are a feature of certain networked
@@ -1374,65 +1544,127 @@ static void balance_dirty_pages(struct address_space *mapping,
*/
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
- nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
+ gdtc->avail = global_dirtyable_memory();
+ gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
- global_dirty_limits(&background_thresh, &dirty_thresh);
+ domain_dirty_limits(gdtc);
if (unlikely(strictlimit)) {
- bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
- &bdi_dirty, &bdi_thresh, &bg_thresh);
+ wb_dirty_limits(gdtc);
- dirty = bdi_dirty;
- thresh = bdi_thresh;
+ dirty = gdtc->wb_dirty;
+ thresh = gdtc->wb_thresh;
+ bg_thresh = gdtc->wb_bg_thresh;
} else {
- dirty = nr_dirty;
- thresh = dirty_thresh;
- bg_thresh = background_thresh;
+ dirty = gdtc->dirty;
+ thresh = gdtc->thresh;
+ bg_thresh = gdtc->bg_thresh;
+ }
+
+ if (mdtc) {
+ unsigned long writeback;
+
+ /*
+ * If @wb belongs to !root memcg, repeat the same
+ * basic calculations for the memcg domain.
+ */
+ mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty,
+ &writeback);
+ mdtc_cap_avail(mdtc);
+ mdtc->dirty += writeback;
+
+ domain_dirty_limits(mdtc);
+
+ if (unlikely(strictlimit)) {
+ wb_dirty_limits(mdtc);
+ m_dirty = mdtc->wb_dirty;
+ m_thresh = mdtc->wb_thresh;
+ m_bg_thresh = mdtc->wb_bg_thresh;
+ } else {
+ m_dirty = mdtc->dirty;
+ m_thresh = mdtc->thresh;
+ m_bg_thresh = mdtc->bg_thresh;
+ }
}
/*
* Throttle it only when the background writeback cannot
* catch-up. This avoids (excessively) small writeouts
- * when the bdi limits are ramping up in case of !strictlimit.
+ * when the wb limits are ramping up in case of !strictlimit.
*
- * In strictlimit case make decision based on the bdi counters
- * and limits. Small writeouts when the bdi limits are ramping
+ * In strictlimit case make decision based on the wb counters
+ * and limits. Small writeouts when the wb limits are ramping
* up are the price we consciously pay for strictlimit-ing.
+ *
+ * If memcg domain is in effect, @dirty should be under
+ * both global and memcg freerun ceilings.
*/
- if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
+ if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
+ (!mdtc ||
+ m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
+ unsigned long intv = dirty_poll_interval(dirty, thresh);
+ unsigned long m_intv = ULONG_MAX;
+
current->dirty_paused_when = now;
current->nr_dirtied = 0;
- current->nr_dirtied_pause =
- dirty_poll_interval(dirty, thresh);
+ if (mdtc)
+ m_intv = dirty_poll_interval(m_dirty, m_thresh);
+ current->nr_dirtied_pause = min(intv, m_intv);
break;
}
- if (unlikely(!writeback_in_progress(bdi)))
- bdi_start_background_writeback(bdi);
+ if (unlikely(!writeback_in_progress(wb)))
+ wb_start_background_writeback(wb);
+ /*
+ * Calculate global domain's pos_ratio and select the
+ * global dtc by default.
+ */
if (!strictlimit)
- bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
- &bdi_dirty, &bdi_thresh, NULL);
-
- dirty_exceeded = (bdi_dirty > bdi_thresh) &&
- ((nr_dirty > dirty_thresh) || strictlimit);
- if (dirty_exceeded && !bdi->dirty_exceeded)
- bdi->dirty_exceeded = 1;
-
- bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
- nr_dirty, bdi_thresh, bdi_dirty,
- start_time);
-
- dirty_ratelimit = bdi->dirty_ratelimit;
- pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
- background_thresh, nr_dirty,
- bdi_thresh, bdi_dirty);
- task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
+ wb_dirty_limits(gdtc);
+
+ dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
+ ((gdtc->dirty > gdtc->thresh) || strictlimit);
+
+ wb_position_ratio(gdtc);
+ sdtc = gdtc;
+
+ if (mdtc) {
+ /*
+ * If memcg domain is in effect, calculate its
+ * pos_ratio. @wb should satisfy constraints from
+ * both global and memcg domains. Choose the one
+ * w/ lower pos_ratio.
+ */
+ if (!strictlimit)
+ wb_dirty_limits(mdtc);
+
+ dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
+ ((mdtc->dirty > mdtc->thresh) || strictlimit);
+
+ wb_position_ratio(mdtc);
+ if (mdtc->pos_ratio < gdtc->pos_ratio)
+ sdtc = mdtc;
+ }
+
+ if (dirty_exceeded && !wb->dirty_exceeded)
+ wb->dirty_exceeded = 1;
+
+ if (time_is_before_jiffies(wb->bw_time_stamp +
+ BANDWIDTH_INTERVAL)) {
+ spin_lock(&wb->list_lock);
+ __wb_update_bandwidth(gdtc, mdtc, start_time, true);
+ spin_unlock(&wb->list_lock);
+ }
+
+ /* throttle according to the chosen dtc */
+ dirty_ratelimit = wb->dirty_ratelimit;
+ task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
RATELIMIT_CALC_SHIFT;
- max_pause = bdi_max_pause(bdi, bdi_dirty);
- min_pause = bdi_min_pause(bdi, max_pause,
- task_ratelimit, dirty_ratelimit,
- &nr_dirtied_pause);
+ max_pause = wb_max_pause(wb, sdtc->wb_dirty);
+ min_pause = wb_min_pause(wb, max_pause,
+ task_ratelimit, dirty_ratelimit,
+ &nr_dirtied_pause);
if (unlikely(task_ratelimit == 0)) {
period = max_pause;
@@ -1452,11 +1684,11 @@ static void balance_dirty_pages(struct address_space *mapping,
*/
if (pause < min_pause) {
trace_balance_dirty_pages(bdi,
- dirty_thresh,
- background_thresh,
- nr_dirty,
- bdi_thresh,
- bdi_dirty,
+ sdtc->thresh,
+ sdtc->bg_thresh,
+ sdtc->dirty,
+ sdtc->wb_thresh,
+ sdtc->wb_dirty,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
@@ -1481,11 +1713,11 @@ static void balance_dirty_pages(struct address_space *mapping,
pause:
trace_balance_dirty_pages(bdi,
- dirty_thresh,
- background_thresh,
- nr_dirty,
- bdi_thresh,
- bdi_dirty,
+ sdtc->thresh,
+ sdtc->bg_thresh,
+ sdtc->dirty,
+ sdtc->wb_thresh,
+ sdtc->wb_dirty,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
@@ -1500,33 +1732,33 @@ pause:
current->nr_dirtied_pause = nr_dirtied_pause;
/*
- * This is typically equal to (nr_dirty < dirty_thresh) and can
- * also keep "1000+ dd on a slow USB stick" under control.
+ * This is typically equal to (dirty < thresh) and can also
+ * keep "1000+ dd on a slow USB stick" under control.
*/
if (task_ratelimit)
break;
/*
 * In the case of an unresponsive NFS server and the NFS dirty
- * pages exceeds dirty_thresh, give the other good bdi's a pipe
+ * pages exceeding dirty_thresh, give the other good wbs a pipe
* to go through, so that tasks on them still remain responsive.
*
		 * In theory 1 page is enough to keep the consumer-producer
* pipe going: the flusher cleans 1 page => the task dirties 1
- * more page. However bdi_dirty has accounting errors. So use
- * the larger and more IO friendly bdi_stat_error.
+ * more page. However wb_dirty has accounting errors. So use
+ * the larger and more IO friendly wb_stat_error.
*/
- if (bdi_dirty <= bdi_stat_error(bdi))
+ if (sdtc->wb_dirty <= wb_stat_error(wb))
break;
if (fatal_signal_pending(current))
break;
}
- if (!dirty_exceeded && bdi->dirty_exceeded)
- bdi->dirty_exceeded = 0;
+ if (!dirty_exceeded && wb->dirty_exceeded)
+ wb->dirty_exceeded = 0;
- if (writeback_in_progress(bdi))
+ if (writeback_in_progress(wb))
return;
/*
@@ -1540,8 +1772,8 @@ pause:
if (laptop_mode)
return;
- if (nr_reclaimable > background_thresh)
- bdi_start_background_writeback(bdi);
+ if (nr_reclaimable > gdtc->bg_thresh)
+ wb_start_background_writeback(wb);
}
static DEFINE_PER_CPU(int, bdp_ratelimits);
@@ -1577,15 +1809,22 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct inode *inode = mapping->host;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb = NULL;
int ratelimit;
int *p;
if (!bdi_cap_account_dirty(bdi))
return;
+ if (inode_cgwb_enabled(inode))
+ wb = wb_get_create_current(bdi, GFP_KERNEL);
+ if (!wb)
+ wb = &bdi->wb;
+
ratelimit = current->nr_dirtied_pause;
- if (bdi->dirty_exceeded)
+ if (wb->dirty_exceeded)
ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
preempt_disable();
@@ -1617,10 +1856,59 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
preempt_enable();
if (unlikely(current->nr_dirtied >= ratelimit))
- balance_dirty_pages(mapping, current->nr_dirtied);
+ balance_dirty_pages(mapping, wb, current->nr_dirtied);
+
+ wb_put(wb);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+/**
+ * wb_over_bg_thresh - does @wb need to be written back?
+ * @wb: bdi_writeback of interest
+ *
+ * Determines whether background writeback should keep writing @wb or it's
+ * clean enough. Returns %true if writeback should continue.
+ */
+bool wb_over_bg_thresh(struct bdi_writeback *wb)
+{
+ struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+ struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+ struct dirty_throttle_control * const gdtc = &gdtc_stor;
+ struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+ &mdtc_stor : NULL;
+
+ /*
+ * Similar to balance_dirty_pages() but ignores pages being written
+ * as we're trying to decide whether to put more under writeback.
+ */
+ gdtc->avail = global_dirtyable_memory();
+ gdtc->dirty = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
+ domain_dirty_limits(gdtc);
+
+ if (gdtc->dirty > gdtc->bg_thresh)
+ return true;
+
+ if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc))
+ return true;
+
+ if (mdtc) {
+ unsigned long writeback;
+
+ mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback);
+ mdtc_cap_avail(mdtc);
+ domain_dirty_limits(mdtc); /* ditto, ignore writeback */
+
+ if (mdtc->dirty > mdtc->bg_thresh)
+ return true;
+
+ if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc))
+ return true;
+ }
+
+ return false;
+}
+
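
A natural consumer of this predicate is the background flusher loop. The sketch below is hypothetical (background_flush() and writeback_some_pages() are illustrative names, not part of this patch), but it shows the intended calling pattern:

	/* Hypothetical caller: keep flushing @wb while either the global
	 * or the memcg domain is still over its background threshold. */
	static long background_flush(struct bdi_writeback *wb)
	{
		long written = 0;

		while (wb_over_bg_thresh(wb))
			written += writeback_some_pages(wb); /* illustrative */
		return written;
	}
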
void throttle_vm_writeout(gfp_t gfp_mask)
{
unsigned long background_thresh;
@@ -1628,7 +1916,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
for ( ; ; ) {
global_dirty_limits(&background_thresh, &dirty_thresh);
- dirty_thresh = hard_dirty_limit(dirty_thresh);
+ dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
/*
* Boost the allowable dirty threshold a bit for page
@@ -1667,14 +1955,20 @@ void laptop_mode_timer_fn(unsigned long data)
struct request_queue *q = (struct request_queue *)data;
int nr_pages = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
+ struct bdi_writeback *wb;
+ struct wb_iter iter;
/*
* We want to write everything out, not just down to the dirty
* threshold
*/
- if (bdi_has_dirty_io(&q->backing_dev_info))
- bdi_start_writeback(&q->backing_dev_info, nr_pages,
- WB_REASON_LAPTOP_TIMER);
+ if (!bdi_has_dirty_io(&q->backing_dev_info))
+ return;
+
+ bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0)
+ if (wb_has_dirty_io(wb))
+ wb_start_writeback(wb, nr_pages, true,
+ WB_REASON_LAPTOP_TIMER);
}
/*
@@ -1718,10 +2012,12 @@ void laptop_sync_completion(void)
void writeback_set_ratelimit(void)
{
+ struct wb_domain *dom = &global_wb_domain;
unsigned long background_thresh;
unsigned long dirty_thresh;
+
global_dirty_limits(&background_thresh, &dirty_thresh);
- global_dirty_limit = dirty_thresh;
+ dom->dirty_limit = dirty_thresh;
ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
if (ratelimit_pages < 16)
ratelimit_pages = 16;
@@ -1770,7 +2066,7 @@ void __init page_writeback_init(void)
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
- fprop_global_init(&writeout_completions, GFP_KERNEL);
+ BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
}
/**
@@ -2090,19 +2386,29 @@ int __set_page_dirty_no_writeback(struct page *page)
/*
* Helper function for set_page_dirty family.
+ *
+ * Caller must hold mem_cgroup_begin_page_stat().
+ *
* NOTE: This relies on being atomic wrt interrupts.
*/
-void account_page_dirtied(struct page *page, struct address_space *mapping)
+void account_page_dirtied(struct page *page, struct address_space *mapping,
+ struct mem_cgroup *memcg)
{
+ struct inode *inode = mapping->host;
+
trace_writeback_dirty_page(page, mapping);
if (mapping_cap_account_dirty(mapping)) {
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct bdi_writeback *wb;
+ inode_attach_wb(inode, page);
+ wb = inode_to_wb(inode);
+
+ mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
- __inc_bdi_stat(bdi, BDI_RECLAIMABLE);
- __inc_bdi_stat(bdi, BDI_DIRTIED);
+ __inc_wb_stat(wb, WB_RECLAIMABLE);
+ __inc_wb_stat(wb, WB_DIRTIED);
task_io_account_write(PAGE_CACHE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
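
The new memcg argument comes with the locking rule documented above ("Caller must hold mem_cgroup_begin_page_stat()"); __set_page_dirty_nobuffers() below follows it, and the expected nesting is, in sketch form:

	/* Sketch of the required nesting around the dirty accounting. */
	struct mem_cgroup *memcg;
	unsigned long flags;

	memcg = mem_cgroup_begin_page_stat(page); /* pin page<->memcg binding */
	spin_lock_irqsave(&mapping->tree_lock, flags);
	account_page_dirtied(page, mapping, memcg);
	spin_unlock_irqrestore(&mapping->tree_lock, flags);
	mem_cgroup_end_page_stat(memcg);
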
@@ -2113,21 +2419,18 @@ EXPORT_SYMBOL(account_page_dirtied);
/*
 * Helper function for deaccounting a dirty page without writeback.
*
- * Doing this should *normally* only ever be done when a page
- * is truncated, and is not actually mapped anywhere at all. However,
- * fs/buffer.c does this when it notices that somebody has cleaned
- * out all the buffers on a page without actually doing it through
- * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
+ * Caller must hold mem_cgroup_begin_page_stat().
*/
-void account_page_cleaned(struct page *page, struct address_space *mapping)
+void account_page_cleaned(struct page *page, struct address_space *mapping,
+ struct mem_cgroup *memcg, struct bdi_writeback *wb)
{
if (mapping_cap_account_dirty(mapping)) {
+ mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
+ dec_wb_stat(wb, WB_RECLAIMABLE);
task_io_account_cancelled_write(PAGE_CACHE_SIZE);
}
}
-EXPORT_SYMBOL(account_page_cleaned);
/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
@@ -2143,26 +2446,34 @@ EXPORT_SYMBOL(account_page_cleaned);
*/
int __set_page_dirty_nobuffers(struct page *page)
{
+ struct mem_cgroup *memcg;
+
+ memcg = mem_cgroup_begin_page_stat(page);
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
unsigned long flags;
- if (!mapping)
+ if (!mapping) {
+ mem_cgroup_end_page_stat(memcg);
return 1;
+ }
spin_lock_irqsave(&mapping->tree_lock, flags);
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
- account_page_dirtied(page, mapping);
+ account_page_dirtied(page, mapping, memcg);
radix_tree_tag_set(&mapping->page_tree, page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
+
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
return 1;
}
+ mem_cgroup_end_page_stat(memcg);
return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -2177,10 +2488,17 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers);
void account_page_redirty(struct page *page)
{
struct address_space *mapping = page->mapping;
+
if (mapping && mapping_cap_account_dirty(mapping)) {
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ bool locked;
+
+ wb = unlocked_inode_to_wb_begin(inode, &locked);
current->nr_dirtied--;
dec_zone_page_state(page, NR_DIRTIED);
- dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED);
+ dec_wb_stat(wb, WB_DIRTIED);
+ unlocked_inode_to_wb_end(inode, locked);
}
}
EXPORT_SYMBOL(account_page_redirty);
@@ -2266,6 +2584,43 @@ int set_page_dirty_lock(struct page *page)
EXPORT_SYMBOL(set_page_dirty_lock);
/*
+ * This cancels just the dirty bit on the kernel page itself, it does NOT
+ * actually remove dirty bits on any mmap's that may be around. It also
+ * leaves the page tagged dirty, so any sync activity will still find it on
+ * the dirty lists, and in particular, clear_page_dirty_for_io() will still
+ * look at the dirty bits in the VM.
+ *
+ * Doing this should *normally* only ever be done when a page is truncated,
+ * and is not actually mapped anywhere at all. However, fs/buffer.c does
+ * this when it notices that somebody has cleaned out all the buffers on a
+ * page without actually doing it through the VM. Can you say "ext3 is
+ * horribly ugly"? Thought you could.
+ */
+void cancel_dirty_page(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+
+ if (mapping_cap_account_dirty(mapping)) {
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct mem_cgroup *memcg;
+ bool locked;
+
+ memcg = mem_cgroup_begin_page_stat(page);
+ wb = unlocked_inode_to_wb_begin(inode, &locked);
+
+ if (TestClearPageDirty(page))
+ account_page_cleaned(page, mapping, memcg, wb);
+
+ unlocked_inode_to_wb_end(inode, locked);
+ mem_cgroup_end_page_stat(memcg);
+ } else {
+ ClearPageDirty(page);
+ }
+}
+EXPORT_SYMBOL(cancel_dirty_page);
+
+/*
* Clear a page's dirty flag, while caring for dirty memory accounting.
* Returns true if the page was previously dirty.
*
@@ -2282,10 +2637,16 @@ EXPORT_SYMBOL(set_page_dirty_lock);
int clear_page_dirty_for_io(struct page *page)
{
struct address_space *mapping = page_mapping(page);
+ int ret = 0;
BUG_ON(!PageLocked(page));
if (mapping && mapping_cap_account_dirty(mapping)) {
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct mem_cgroup *memcg;
+ bool locked;
+
/*
* Yes, Virginia, this is indeed insane.
*
@@ -2321,13 +2682,17 @@ int clear_page_dirty_for_io(struct page *page)
* always locked coming in here, so we get the desired
* exclusion.
*/
+ memcg = mem_cgroup_begin_page_stat(page);
+ wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page)) {
+ mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(inode_to_bdi(mapping->host),
- BDI_RECLAIMABLE);
- return 1;
+ dec_wb_stat(wb, WB_RECLAIMABLE);
+ ret = 1;
}
- return 0;
+ unlocked_inode_to_wb_end(inode, locked);
+ mem_cgroup_end_page_stat(memcg);
+ return ret;
}
return TestClearPageDirty(page);
}
@@ -2341,7 +2706,8 @@ int test_clear_page_writeback(struct page *page)
memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct inode *inode = mapping->host;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2351,8 +2717,10 @@ int test_clear_page_writeback(struct page *page)
page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi)) {
- __dec_bdi_stat(bdi, BDI_WRITEBACK);
- __bdi_writeout_inc(bdi);
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ __dec_wb_stat(wb, WB_WRITEBACK);
+ __wb_writeout_inc(wb);
}
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -2376,7 +2744,8 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct inode *inode = mapping->host;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2386,7 +2755,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi))
- __inc_bdi_stat(bdi, BDI_WRITEBACK);
+ __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
}
if (!PageDirty(page))
radix_tree_tag_clear(&mapping->page_tree,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 41bd90d60cb4..4d3e8df68351 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1834,13 +1834,13 @@ failed:
static struct {
struct fault_attr attr;
- u32 ignore_gfp_highmem;
- u32 ignore_gfp_wait;
+ bool ignore_gfp_highmem;
+ bool ignore_gfp_wait;
u32 min_order;
} fail_page_alloc = {
.attr = FAULT_ATTR_INITIALIZER,
- .ignore_gfp_wait = 1,
- .ignore_gfp_highmem = 1,
+ .ignore_gfp_wait = true,
+ .ignore_gfp_highmem = true,
.min_order = 1,
};
diff --git a/mm/readahead.c b/mm/readahead.c
index 935675844b2e..60cd846a9a44 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping,
/*
* Defer asynchronous read-ahead on IO congestion.
*/
- if (bdi_read_congested(inode_to_bdi(mapping->host)))
+ if (inode_read_congested(mapping->host))
return;
/* do read-ahead */
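
inode_read_congested() is the cgroup-writeback-aware replacement introduced by this series: congestion is judged against the wb actually serving the inode rather than the whole bdi. Paraphrased from include/linux/backing-dev.h as added in this series:

	/* Paraphrased sketch: inode_congested() falls back to the bdi's
	 * root wb when cgroup writeback is not enabled for @inode. */
	static inline int inode_read_congested(struct inode *inode)
	{
		return inode_congested(inode, 1 << WB_sync_congested);
	}
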
diff --git a/mm/rmap.c b/mm/rmap.c
index 24dd3f9fee27..8fc556ce2dcb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -30,6 +30,8 @@
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
+ * mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ * mapping->tree_lock (widely used)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
diff --git a/mm/truncate.c b/mm/truncate.c
index 09598db42681..5f196420020c 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -119,9 +119,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
* the VM has canceled the dirty bit (eg ext3 journaling).
* Hence dirty accounting check is placed after invalidation.
*/
- if (TestClearPageDirty(page))
- account_page_cleaned(page, mapping);
-
+ cancel_dirty_page(page);
ClearPageMappedToDisk(page);
delete_from_page_cache(page);
return 0;
@@ -515,19 +513,24 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
+ struct mem_cgroup *memcg;
+ unsigned long flags;
+
if (page->mapping != mapping)
return 0;
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
- spin_lock_irq(&mapping->tree_lock);
+ memcg = mem_cgroup_begin_page_stat(page);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
if (PageDirty(page))
goto failed;
BUG_ON(page_has_private(page));
- __delete_from_page_cache(page, NULL);
- spin_unlock_irq(&mapping->tree_lock);
+ __delete_from_page_cache(page, NULL, memcg);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
@@ -535,7 +538,8 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
page_cache_release(page); /* pagecache ref */
return 1;
failed:
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
return 0;
}
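
The spin_lock_irq() to spin_lock_irqsave() conversion here is not cosmetic: the critical section now opens with mem_cgroup_begin_page_stat(), which can itself leave interrupts disabled until the matching end call, so tree_lock may no longer assume IRQs are enabled on entry. Paraphrasing the tail of mem_cgroup_begin_page_stat() in mm/memcontrol.c:

	/* Paraphrased sketch: the memcg move_lock is taken with irqsave and
	 * the saved flags are only restored by mem_cgroup_end_page_stat(),
	 * so the caller may run with IRQs off in between. */
	spin_lock_irqsave(&memcg->move_lock, flags);
	memcg->move_lock_task = current;
	memcg->move_lock_flags = flags;
	return memcg;
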
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1a17bd7c0ce5..b0c61df92b81 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -154,11 +154,42 @@ static bool global_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup;
}
+
+/**
+ * sane_reclaim - is the usual dirty throttling mechanism operational?
+ * @sc: scan_control in question
+ *
+ * The normal page dirty throttling mechanism in balance_dirty_pages() is
+ * completely broken with the legacy memcg and direct stalling in
+ * shrink_page_list() is used for throttling instead, which lacks all the
+ * niceties such as fairness, adaptive pausing, bandwidth proportional
+ * allocation and configurability.
+ *
+ * This function tests whether the vmscan currently in progress can assume
+ * that the normal dirty throttling mechanism is operational.
+ */
+static bool sane_reclaim(struct scan_control *sc)
+{
+ struct mem_cgroup *memcg = sc->target_mem_cgroup;
+
+ if (!memcg)
+ return true;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+ return true;
+#endif
+ return false;
+}
#else
static bool global_reclaim(struct scan_control *sc)
{
return true;
}
+
+static bool sane_reclaim(struct scan_control *sc)
+{
+ return true;
+}
#endif
static unsigned long zone_reclaimable_pages(struct zone *zone)
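
For reference, cgroup_on_dfl() tests whether the memcg sits on the unified (v2) cgroup hierarchy, which is where the cgroup writeback support in this series applies; legacy-hierarchy memcgs keep the direct-stall behaviour described in the comment above.
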
@@ -452,14 +483,13 @@ static inline int is_page_cache_freeable(struct page *page)
return page_count(page) - page_has_private(page) == 2;
}
-static int may_write_to_queue(struct backing_dev_info *bdi,
- struct scan_control *sc)
+static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
{
if (current->flags & PF_SWAPWRITE)
return 1;
- if (!bdi_write_congested(bdi))
+ if (!inode_write_congested(inode))
return 1;
- if (bdi == current->backing_dev_info)
+ if (inode_to_bdi(inode) == current->backing_dev_info)
return 1;
return 0;
}
@@ -538,7 +568,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
+ if (!may_write_to_inode(mapping->host, sc))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
@@ -579,10 +609,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed)
{
+ unsigned long flags;
+ struct mem_cgroup *memcg;
+
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- spin_lock_irq(&mapping->tree_lock);
+ memcg = mem_cgroup_begin_page_stat(page);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
/*
* The non racy check for a busy page.
*
@@ -620,7 +654,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
swapcache_free(swap);
} else {
void (*freepage)(struct page *);
@@ -640,8 +675,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping))
shadow = workingset_eviction(mapping, page);
- __delete_from_page_cache(page, shadow);
- spin_unlock_irq(&mapping->tree_lock);
+ __delete_from_page_cache(page, shadow, memcg);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
if (freepage != NULL)
freepage(page);
@@ -650,7 +686,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
return 1;
cannot_free:
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
return 0;
}
@@ -917,7 +954,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
mapping = page_mapping(page);
if (((dirty || writeback) && mapping &&
- bdi_write_congested(inode_to_bdi(mapping->host))) ||
+ inode_write_congested(mapping->host)) ||
(writeback && PageReclaim(page)))
nr_congested++;
@@ -935,11 +972,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* note that the LRU is being scanned too quickly and the
* caller can stall after page list has been processed.
*
- * 2) Global reclaim encounters a page, memcg encounters a
- * page that is not marked for immediate reclaim or
- * the caller does not have __GFP_FS (or __GFP_IO if it's
- * simply going to swap, not to fs). In this case mark
- * the page for immediate reclaim and continue scanning.
+ * 2) Global or new memcg reclaim encounters a page that is
+ * not marked for immediate reclaim, or the caller does not
+ * have __GFP_FS (or __GFP_IO if it's simply going to swap,
+ * not to fs). In this case mark the page for immediate
+ * reclaim and continue scanning.
*
* Require may_enter_fs because we would wait on fs, which
* may not have submitted IO yet. And the loop driver might
@@ -948,7 +985,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* __GFP_IO|__GFP_FS for this reason); but more thought
* would probably show more reasons.
*
- * 3) memcg encounters a page that is not already marked
+ * 3) Legacy memcg encounters a page that is not already marked
* PageReclaim. memcg does not have any dirty pages
* throttling so we could easily OOM just because too many
* pages are in writeback and there is nothing else to
@@ -963,7 +1000,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
/* Case 2 above */
- } else if (global_reclaim(sc) ||
+ } else if (sane_reclaim(sc) ||
!PageReclaim(page) || !may_enter_fs) {
/*
* This is slightly racy - end_page_writeback()
@@ -1412,7 +1449,7 @@ static int too_many_isolated(struct zone *zone, int file,
if (current_is_kswapd())
return 0;
- if (!global_reclaim(sc))
+ if (!sane_reclaim(sc))
return 0;
if (file) {
@@ -1604,10 +1641,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
set_bit(ZONE_WRITEBACK, &zone->flags);
/*
- * memcg will stall in page writeback so only consider forcibly
- * stalling for global reclaim
+ * Legacy memcg will stall in page writeback so avoid forcibly
+ * stalling here.
*/
- if (global_reclaim(sc)) {
+ if (sane_reclaim(sc)) {
/*
* Tag a zone as congested if all the dirty pages scanned were
* backed by a congested BDI and wait_iff_congested will stall.
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 5c564a68fb50..d71edcbd0c58 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -289,7 +289,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
}
/* prepare A-MPDU MLME for Rx aggregation */
- tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL);
+ tid_agg_rx = kzalloc(sizeof(*tid_agg_rx), GFP_KERNEL);
if (!tid_agg_rx)
goto end;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c0a9187bc3a9..cdf8609a6240 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -90,7 +90,7 @@ struct ieee80211_fragment_entry {
unsigned int last_frag;
unsigned int extra_len;
struct sk_buff_head skb_list;
- int ccmp; /* Whether fragments were encrypted with CCMP */
+ bool check_sequential_pn; /* needed for CCMP/GCMP */
u8 last_pn[6]; /* PN of the last fragment if CCMP was used */
};
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 3ece7d1034c8..b54f398cda5d 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -711,7 +711,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta)
* computing cur_tp
*/
tmp_mrs = &mi->r[idx].stats;
- tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma);
+ tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10;
tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024;
return tmp_cur_tp;
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 7430a1df2ab1..1ec889dc2e46 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -691,7 +691,7 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb)
if (likely(sta->ampdu_mlme.tid_tx[tid]))
return;
- ieee80211_start_tx_ba_session(pubsta, tid, 5000);
+ ieee80211_start_tx_ba_session(pubsta, tid, 0);
}
static void
@@ -1328,7 +1328,8 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta)
prob = mi->groups[i].rates[j].prob_ewma;
		/* convert tp_avg from pkts per second to kbps */
- tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * AVG_PKT_SIZE * 8 / 1024;
+ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10;
+ tp_avg = tp_avg * AVG_PKT_SIZE * 8 / 1024;
return tp_avg;
}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index f6f8d9880873..32cea15823d8 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1725,7 +1725,7 @@ ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata,
entry->seq = seq;
entry->rx_queue = rx_queue;
entry->last_frag = frag;
- entry->ccmp = 0;
+ entry->check_sequential_pn = false;
entry->extra_len = 0;
return entry;
@@ -1821,15 +1821,27 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
rx->seqno_idx, &(rx->skb));
if (rx->key &&
(rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP ||
- rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256) &&
+ rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256 ||
+ rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP ||
+ rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP_256) &&
ieee80211_has_protected(fc)) {
int queue = rx->security_idx;
- /* Store CCMP PN so that we can verify that the next
- * fragment has a sequential PN value. */
- entry->ccmp = 1;
+
+ /* Store CCMP/GCMP PN so that we can verify that the
+ * next fragment has a sequential PN value.
+ */
+ entry->check_sequential_pn = true;
memcpy(entry->last_pn,
rx->key->u.ccmp.rx_pn[queue],
IEEE80211_CCMP_PN_LEN);
+ BUILD_BUG_ON(offsetof(struct ieee80211_key,
+ u.ccmp.rx_pn) !=
+ offsetof(struct ieee80211_key,
+ u.gcmp.rx_pn));
+ BUILD_BUG_ON(sizeof(rx->key->u.ccmp.rx_pn[queue]) !=
+ sizeof(rx->key->u.gcmp.rx_pn[queue]));
+ BUILD_BUG_ON(IEEE80211_CCMP_PN_LEN !=
+ IEEE80211_GCMP_PN_LEN);
}
return RX_QUEUED;
}
@@ -1844,15 +1856,21 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
return RX_DROP_MONITOR;
}
- /* Verify that MPDUs within one MSDU have sequential PN values.
- * (IEEE 802.11i, 8.3.3.4.5) */
- if (entry->ccmp) {
+ /* "The receiver shall discard MSDUs and MMPDUs whose constituent
+ * MPDU PN values are not incrementing in steps of 1."
+ * see IEEE P802.11-REVmc/D5.0, 12.5.3.4.4, item d (for CCMP)
+ * and IEEE P802.11-REVmc/D5.0, 12.5.5.4.4, item d (for GCMP)
+ */
+ if (entry->check_sequential_pn) {
int i;
u8 pn[IEEE80211_CCMP_PN_LEN], *rpn;
int queue;
+
if (!rx->key ||
(rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP &&
- rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256))
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256 &&
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP &&
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP_256))
return RX_DROP_UNUSABLE;
memcpy(pn, entry->last_pn, IEEE80211_CCMP_PN_LEN);
for (i = IEEE80211_CCMP_PN_LEN - 1; i >= 0; i--) {
@@ -3359,6 +3377,7 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx,
return false;
/* ignore action frames to TDLS-peers */
if (ieee80211_is_action(hdr->frame_control) &&
+ !is_broadcast_ether_addr(bssid) &&
!ether_addr_equal(bssid, hdr->addr1))
return false;
}
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 2a0bbd22854b..71e9b84847f3 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1138,6 +1138,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
return NOTIFY_DONE;
}
+ wireless_nlevent_flush();
+
return NOTIFY_OK;
}
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index c8717c1d082e..b50ee5d622e1 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -342,6 +342,40 @@ static const int compat_event_type_size[] = {
/* IW event code */
+void wireless_nlevent_flush(void)
+{
+ struct sk_buff *skb;
+ struct net *net;
+
+ ASSERT_RTNL();
+
+ for_each_net(net) {
+ while ((skb = skb_dequeue(&net->wext_nlevents)))
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
+ GFP_KERNEL);
+ }
+}
+EXPORT_SYMBOL_GPL(wireless_nlevent_flush);
+
+static int wext_netdev_notifier_call(struct notifier_block *nb,
+ unsigned long state, void *ptr)
+{
+ /*
+ * When a netdev changes state in any way, flush all pending messages
+ * to avoid them going out in a strange order, e.g. RTM_NEWLINK after
+	 * RTM_DELLINK, or an event with IFF_UP set after one without it during dev_close()
+ * or similar - all of which could otherwise happen due to delays from
+ * schedule_work().
+ */
+ wireless_nlevent_flush();
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block wext_netdev_notifier = {
+ .notifier_call = wext_netdev_notifier_call,
+};
+
static int __net_init wext_pernet_init(struct net *net)
{
skb_queue_head_init(&net->wext_nlevents);
@@ -360,7 +394,12 @@ static struct pernet_operations wext_pernet_ops = {
static int __init wireless_nlevent_init(void)
{
- return register_pernet_subsys(&wext_pernet_ops);
+ int err = register_pernet_subsys(&wext_pernet_ops);
+
+ if (err)
+ return err;
+
+ return register_netdevice_notifier(&wext_netdev_notifier);
}
subsys_initcall(wireless_nlevent_init);
@@ -368,17 +407,8 @@ subsys_initcall(wireless_nlevent_init);
/* Process events generated by the wireless layer or the driver. */
static void wireless_nlevent_process(struct work_struct *work)
{
- struct sk_buff *skb;
- struct net *net;
-
rtnl_lock();
-
- for_each_net(net) {
- while ((skb = skb_dequeue(&net->wext_nlevents)))
- rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
- GFP_KERNEL);
- }
-
+ wireless_nlevent_flush();
rtnl_unlock();
}
diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan
index 3f874d24234f..37323b0df374 100644
--- a/scripts/Makefile.kasan
+++ b/scripts/Makefile.kasan
@@ -5,10 +5,12 @@ else
call_threshold := 0
endif
+KASAN_SHADOW_OFFSET ?= $(CONFIG_KASAN_SHADOW_OFFSET)
+
CFLAGS_KASAN_MINIMAL := -fsanitize=kernel-address
CFLAGS_KASAN := $(call cc-option, -fsanitize=kernel-address \
- -fasan-shadow-offset=$(CONFIG_KASAN_SHADOW_OFFSET) \
+ -fasan-shadow-offset=$(KASAN_SHADOW_OFFSET) \
--param asan-stack=1 --param asan-globals=1 \
--param asan-instrumentation-with-call-threshold=$(call_threshold))
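
The ?= assignment lets an architecture's top-level Makefile provide the shadow offset directly instead of requiring a CONFIG_KASAN_SHADOW_OFFSET Kconfig symbol as x86 uses; this appears to be what the arm64 KASAN support merged here relies on.
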
diff --git a/scripts/ld-version.sh b/scripts/ld-version.sh
index 198580d245e0..1659b409ef10 100755
--- a/scripts/ld-version.sh
+++ b/scripts/ld-version.sh
@@ -1,7 +1,7 @@
#!/usr/bin/awk -f
# extract linker version number from stdin and turn into single number
{
- gsub(".*)", "");
+ gsub(".*\\)", "");
split($1,a, ".");
print a[1]*10000000 + a[2]*100000 + a[3]*10000 + a[4]*100 + a[5];
exit
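
For context: the unescaped ')' made the pattern invalid POSIX ERE, which some awk implementations reject at compile time. With the escape, the script strips everything up to the literal ')' in banners such as "GNU ld (GNU Binutils) 2.25.1" and folds the remaining version into one comparable number, e.g. 2*10000000 + 25*100000 + 1*10000 = 22510000.
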
diff --git a/sound/soc/codecs/wm8958-dsp2.c b/sound/soc/codecs/wm8958-dsp2.c
index c799cca5abeb..6b864c0fc2b6 100644
--- a/sound/soc/codecs/wm8958-dsp2.c
+++ b/sound/soc/codecs/wm8958-dsp2.c
@@ -459,7 +459,7 @@ static int wm8958_put_mbc_enum(struct snd_kcontrol *kcontrol,
struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
struct wm8994 *control = wm8994->wm8994;
- int value = ucontrol->value.integer.value[0];
+ int value = ucontrol->value.enumerated.item[0];
int reg;
/* Don't allow on the fly reconfiguration */
@@ -549,7 +549,7 @@ static int wm8958_put_vss_enum(struct snd_kcontrol *kcontrol,
struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
struct wm8994 *control = wm8994->wm8994;
- int value = ucontrol->value.integer.value[0];
+ int value = ucontrol->value.enumerated.item[0];
int reg;
/* Don't allow on the fly reconfiguration */
@@ -582,7 +582,7 @@ static int wm8958_put_vss_hpf_enum(struct snd_kcontrol *kcontrol,
struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
struct wm8994 *control = wm8994->wm8994;
- int value = ucontrol->value.integer.value[0];
+ int value = ucontrol->value.enumerated.item[0];
int reg;
/* Don't allow on the fly reconfiguration */
@@ -749,7 +749,7 @@ static int wm8958_put_enh_eq_enum(struct snd_kcontrol *kcontrol,
struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
struct wm8994 *control = wm8994->wm8994;
- int value = ucontrol->value.integer.value[0];
+ int value = ucontrol->value.enumerated.item[0];
int reg;
/* Don't allow on the fly reconfiguration */
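
All four wm8958 hunks (and the wm8994, wm_adsp and soc-dapm hunks below) fix the same class of bug: enumerated controls were read and written through the integer member of the control-value union. A simplified sketch of that union (paraphrased from include/uapi/sound/asound.h) shows why the members are not interchangeable:

	/* Simplified sketch of struct snd_ctl_elem_value's payload. */
	union {
		struct { long value[128]; } integer;	       /* 8-byte slots on 64-bit */
		struct { unsigned int item[128]; } enumerated; /* 4-byte slots */
		/* ... */
	} value;

On little-endian machines the first item happens to line up with the low half of the first long, which is why this went unnoticed; on big-endian 64-bit systems value.integer.value[0] reads the wrong half and the enum value is corrupted.
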
diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c
index a1c04dab6684..a484ca8421af 100644
--- a/sound/soc/codecs/wm8994.c
+++ b/sound/soc/codecs/wm8994.c
@@ -361,7 +361,7 @@ static int wm8994_put_drc_enum(struct snd_kcontrol *kcontrol,
struct wm8994 *control = wm8994->wm8994;
struct wm8994_pdata *pdata = &control->pdata;
int drc = wm8994_get_drc(kcontrol->id.name);
- int value = ucontrol->value.integer.value[0];
+ int value = ucontrol->value.enumerated.item[0];
if (drc < 0)
return drc;
@@ -468,7 +468,7 @@ static int wm8994_put_retune_mobile_enum(struct snd_kcontrol *kcontrol,
struct wm8994 *control = wm8994->wm8994;
struct wm8994_pdata *pdata = &control->pdata;
int block = wm8994_get_retune_mobile_block(kcontrol->id.name);
- int value = ucontrol->value.integer.value[0];
+ int value = ucontrol->value.enumerated.item[0];
if (block < 0)
return block;
diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c
index d01c2095452f..431d94397219 100644
--- a/sound/soc/codecs/wm_adsp.c
+++ b/sound/soc/codecs/wm_adsp.c
@@ -248,7 +248,7 @@ static int wm_adsp_fw_get(struct snd_kcontrol *kcontrol,
struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
struct wm_adsp *adsp = snd_soc_codec_get_drvdata(codec);
- ucontrol->value.integer.value[0] = adsp[e->shift_l].fw;
+ ucontrol->value.enumerated.item[0] = adsp[e->shift_l].fw;
return 0;
}
@@ -260,16 +260,16 @@ static int wm_adsp_fw_put(struct snd_kcontrol *kcontrol,
struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
struct wm_adsp *adsp = snd_soc_codec_get_drvdata(codec);
- if (ucontrol->value.integer.value[0] == adsp[e->shift_l].fw)
+ if (ucontrol->value.enumerated.item[0] == adsp[e->shift_l].fw)
return 0;
- if (ucontrol->value.integer.value[0] >= WM_ADSP_NUM_FW)
+ if (ucontrol->value.enumerated.item[0] >= WM_ADSP_NUM_FW)
return -EINVAL;
if (adsp[e->shift_l].running)
return -EBUSY;
- adsp[e->shift_l].fw = ucontrol->value.integer.value[0];
+ adsp[e->shift_l].fw = ucontrol->value.enumerated.item[0];
return 0;
}
diff --git a/sound/soc/samsung/i2s.c b/sound/soc/samsung/i2s.c
index b92ab40d2be6..5e8ccb0a7028 100644
--- a/sound/soc/samsung/i2s.c
+++ b/sound/soc/samsung/i2s.c
@@ -480,10 +480,11 @@ static int i2s_set_sysclk(struct snd_soc_dai *dai,
unsigned int cdcon_mask = 1 << i2s_regs->cdclkcon_off;
unsigned int rsrc_mask = 1 << i2s_regs->rclksrc_off;
u32 mod, mask, val = 0;
+ unsigned long flags;
- spin_lock(i2s->lock);
+ spin_lock_irqsave(i2s->lock, flags);
mod = readl(i2s->addr + I2SMOD);
- spin_unlock(i2s->lock);
+ spin_unlock_irqrestore(i2s->lock, flags);
switch (clk_id) {
case SAMSUNG_I2S_OPCLK:
@@ -574,11 +575,11 @@ static int i2s_set_sysclk(struct snd_soc_dai *dai,
return -EINVAL;
}
- spin_lock(i2s->lock);
+ spin_lock_irqsave(i2s->lock, flags);
mod = readl(i2s->addr + I2SMOD);
mod = (mod & ~mask) | val;
writel(mod, i2s->addr + I2SMOD);
- spin_unlock(i2s->lock);
+ spin_unlock_irqrestore(i2s->lock, flags);
return 0;
}
@@ -589,6 +590,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
struct i2s_dai *i2s = to_info(dai);
int lrp_shift, sdf_shift, sdf_mask, lrp_rlow, mod_slave;
u32 mod, tmp = 0;
+ unsigned long flags;
lrp_shift = i2s->variant_regs->lrp_off;
sdf_shift = i2s->variant_regs->sdf_off;
@@ -648,7 +650,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
return -EINVAL;
}
- spin_lock(i2s->lock);
+ spin_lock_irqsave(i2s->lock, flags);
mod = readl(i2s->addr + I2SMOD);
/*
* Don't change the I2S mode if any controller is active on this
@@ -656,7 +658,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
*/
if (any_active(i2s) &&
((mod & (sdf_mask | lrp_rlow | mod_slave)) != tmp)) {
- spin_unlock(i2s->lock);
+ spin_unlock_irqrestore(i2s->lock, flags);
dev_err(&i2s->pdev->dev,
"%s:%d Other DAI busy\n", __func__, __LINE__);
return -EAGAIN;
@@ -665,7 +667,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
mod &= ~(sdf_mask | lrp_rlow | mod_slave);
mod |= tmp;
writel(mod, i2s->addr + I2SMOD);
- spin_unlock(i2s->lock);
+ spin_unlock_irqrestore(i2s->lock, flags);
return 0;
}
@@ -675,6 +677,7 @@ static int i2s_hw_params(struct snd_pcm_substream *substream,
{
struct i2s_dai *i2s = to_info(dai);
u32 mod, mask = 0, val = 0;
+ unsigned long flags;
if (!is_secondary(i2s))
mask |= (MOD_DC2_EN | MOD_DC1_EN);
@@ -743,11 +746,11 @@ static int i2s_hw_params(struct snd_pcm_substream *substream,
return -EINVAL;
}
- spin_lock(i2s->lock);
+ spin_lock_irqsave(i2s->lock, flags);
mod = readl(i2s->addr + I2SMOD);
mod = (mod & ~mask) | val;
writel(mod, i2s->addr + I2SMOD);
- spin_unlock(i2s->lock);
+ spin_unlock_irqrestore(i2s->lock, flags);
samsung_asoc_init_dma_data(dai, &i2s->dma_playback, &i2s->dma_capture);
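
The repeated spin_lock() to spin_lock_irqsave() conversions in this driver all apply one rule: a lock that may also be taken from interrupt context must be held with local interrupts disabled in process context, otherwise an IRQ arriving on the same CPU can deadlock on the lock it already holds. The generic pattern, as a sketch (dev_lock is an illustrative name):

	unsigned long flags;

	/* Block local IRQs for the critical section so an interrupt
	 * handler taking the same lock cannot run on this CPU mid-hold. */
	spin_lock_irqsave(&dev_lock, flags);
	/* ... read-modify-write the shared I2SMOD register ... */
	spin_unlock_irqrestore(&dev_lock, flags);
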
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index b6c12dccb259..28df6adf362b 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -3324,7 +3324,7 @@ static int snd_soc_dapm_dai_link_get(struct snd_kcontrol *kcontrol,
{
struct snd_soc_dapm_widget *w = snd_kcontrol_chip(kcontrol);
- ucontrol->value.integer.value[0] = w->params_select;
+ ucontrol->value.enumerated.item[0] = w->params_select;
return 0;
}
@@ -3338,13 +3338,13 @@ static int snd_soc_dapm_dai_link_put(struct snd_kcontrol *kcontrol,
if (w->power)
return -EBUSY;
- if (ucontrol->value.integer.value[0] == w->params_select)
+ if (ucontrol->value.enumerated.item[0] == w->params_select)
return 0;
- if (ucontrol->value.integer.value[0] >= w->num_params)
+ if (ucontrol->value.enumerated.item[0] >= w->num_params)
return -EINVAL;
- w->params_select = ucontrol->value.integer.value[0];
+ w->params_select = ucontrol->value.enumerated.item[0];
return 0;
}