aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJon Medhurst <tixy@linaro.org>2013-05-09 14:06:50 +0100
committerJon Medhurst <tixy@linaro.org>2013-05-09 14:06:50 +0100
commit0be921c23c74a211cf9cf01cbf654fcb796c1a2e (patch)
tree0e5b89be9f2f9507e83a26f1c8ddf8cf6ce04884
parentf1ab999e4776b5cf0acd9d466a61c14c61783cd8 (diff)
parent2ce16b32ad5d5d69591ea9fe9917c8fe5598a22c (diff)
Merge branch 'mcpm-merge-nico' into lsk-3.9-vexpress
-rw-r--r--Documentation/arm/cluster-pm-race-avoidance.txt498
-rw-r--r--Documentation/arm/vlocks.txt211
-rw-r--r--arch/arm/Kconfig15
-rw-r--r--arch/arm/common/Makefile2
-rw-r--r--arch/arm/common/mcpm_entry.c263
-rw-r--r--arch/arm/common/mcpm_head.S219
-rw-r--r--arch/arm/common/mcpm_platsmp.c89
-rw-r--r--arch/arm/common/vlock.S108
-rw-r--r--arch/arm/common/vlock.h29
-rw-r--r--arch/arm/include/asm/cacheflush.h75
-rw-r--r--arch/arm/include/asm/mach/arch.h5
-rw-r--r--arch/arm/include/asm/mcpm.h209
-rw-r--r--arch/arm/kernel/asm-offsets.c4
-rw-r--r--arch/arm/kernel/setup.c5
-rw-r--r--arch/arm/mach-vexpress/core.h2
-rw-r--r--arch/arm/mach-vexpress/platsmp.c13
-rw-r--r--arch/arm/mach-vexpress/v2m.c1
17 files changed, 1747 insertions, 1 deletions
diff --git a/Documentation/arm/cluster-pm-race-avoidance.txt b/Documentation/arm/cluster-pm-race-avoidance.txt
new file mode 100644
index 00000000000..750b6fc24af
--- /dev/null
+++ b/Documentation/arm/cluster-pm-race-avoidance.txt
@@ -0,0 +1,498 @@
+Cluster-wide Power-up/power-down race avoidance algorithm
+=========================================================
+
+This file documents the algorithm which is used to coordinate CPU and
+cluster setup and teardown operations and to manage hardware coherency
+controls safely.
+
+The section "Rationale" explains what the algorithm is for and why it is
+needed. "Basic model" explains general concepts using a simplified view
+of the system. The other sections explain the actual details of the
+algorithm in use.
+
+
+Rationale
+---------
+
+In a system containing multiple CPUs, it is desirable to have the
+ability to turn off individual CPUs when the system is idle, reducing
+power consumption and thermal dissipation.
+
+In a system containing multiple clusters of CPUs, it is also desirable
+to have the ability to turn off entire clusters.
+
+Turning entire clusters off and on is a risky business, because it
+involves performing potentially destructive operations affecting a group
+of independently running CPUs, while the OS continues to run. This
+means that we need some coordination in order to ensure that critical
+cluster-level operations are only performed when it is truly safe to do
+so.
+
+Simple locking may not be sufficient to solve this problem, because
+mechanisms like Linux spinlocks may rely on coherency mechanisms which
+are not immediately enabled when a cluster powers up. Since enabling or
+disabling those mechanisms may itself be a non-atomic operation (such as
+writing some hardware registers and invalidating large caches), other
+methods of coordination are required in order to guarantee safe
+power-down and power-up at the cluster level.
+
+The mechanism presented in this document describes a coherent memory
+based protocol for performing the needed coordination. It aims to be as
+lightweight as possible, while providing the required safety properties.
+
+
+Basic model
+-----------
+
+Each cluster and CPU is assigned a state, as follows:
+
+ DOWN
+ COMING_UP
+ UP
+ GOING_DOWN
+
+ +---------> UP ----------+
+ | v
+
+ COMING_UP GOING_DOWN
+
+ ^ |
+ +--------- DOWN <--------+
+
+
+DOWN: The CPU or cluster is not coherent, and is either powered off or
+ suspended, or is ready to be powered off or suspended.
+
+COMING_UP: The CPU or cluster has committed to moving to the UP state.
+ It may be part way through the process of initialisation and
+ enabling coherency.
+
+UP: The CPU or cluster is active and coherent at the hardware
+ level. A CPU in this state is not necessarily being used
+ actively by the kernel.
+
+GOING_DOWN: The CPU or cluster has committed to moving to the DOWN
+ state. It may be part way through the process of teardown and
+ coherency exit.
+
+
+Each CPU has one of these states assigned to it at any point in time.
+The CPU states are described in the "CPU state" section, below.
+
+Each cluster is also assigned a state, but it is necessary to split the
+state value into two parts (the "cluster" state and "inbound" state) and
+to introduce additional states in order to avoid races between different
+CPUs in the cluster simultaneously modifying the state. The cluster-
+level states are described in the "Cluster state" section.
+
+To help distinguish the CPU states from cluster states in this
+discussion, the state names are given a CPU_ prefix for the CPU states,
+and a CLUSTER_ or INBOUND_ prefix for the cluster states.
+
+
+CPU state
+---------
+
+In this algorithm, each individual core in a multi-core processor is
+referred to as a "CPU". CPUs are assumed to be single-threaded:
+therefore, a CPU can only be doing one thing at a single point in time.
+
+This means that CPUs fit the basic model closely.
+
+The algorithm defines the following states for each CPU in the system:
+
+ CPU_DOWN
+ CPU_COMING_UP
+ CPU_UP
+ CPU_GOING_DOWN
+
+ cluster setup and
+ CPU setup complete policy decision
+ +-----------> CPU_UP ------------+
+ | v
+
+ CPU_COMING_UP CPU_GOING_DOWN
+
+ ^ |
+ +----------- CPU_DOWN <----------+
+ policy decision CPU teardown complete
+ or hardware event
+
+
+The definitions of the four states correspond closely to the states of
+the basic model.
+
+Transitions between states occur as follows.
+
+A trigger event (spontaneous) means that the CPU can transition to the
+next state as a result of making local progress only, with no
+requirement for any external event to happen.
+
+
+CPU_DOWN:
+
+ A CPU reaches the CPU_DOWN state when it is ready for
+ power-down. On reaching this state, the CPU will typically
+ power itself down or suspend itself, via a WFI instruction or a
+ firmware call.
+
+ Next state: CPU_COMING_UP
+ Conditions: none
+
+ Trigger events:
+
+ a) an explicit hardware power-up operation, resulting
+ from a policy decision on another CPU;
+
+ b) a hardware event, such as an interrupt.
+
+
+CPU_COMING_UP:
+
+ A CPU cannot start participating in hardware coherency until the
+ cluster is set up and coherent. If the cluster is not ready,
+ then the CPU will wait in the CPU_COMING_UP state until the
+ cluster has been set up.
+
+ Next state: CPU_UP
+ Conditions: The CPU's parent cluster must be in CLUSTER_UP.
+ Trigger events: Transition of the parent cluster to CLUSTER_UP.
+
+ Refer to the "Cluster state" section for a description of the
+ CLUSTER_UP state.
+
+
+CPU_UP:
+ When a CPU reaches the CPU_UP state, it is safe for the CPU to
+ start participating in local coherency.
+
+ This is done by jumping to the kernel's CPU resume code.
+
+ Note that the definition of this state is slightly different
+ from the basic model definition: CPU_UP does not mean that the
+ CPU is coherent yet, but it does mean that it is safe to resume
+ the kernel. The kernel handles the rest of the resume
+ procedure, so the remaining steps are not visible as part of the
+ race avoidance algorithm.
+
+ The CPU remains in this state until an explicit policy decision
+ is made to shut down or suspend the CPU.
+
+ Next state: CPU_GOING_DOWN
+ Conditions: none
+ Trigger events: explicit policy decision
+
+
+CPU_GOING_DOWN:
+
+ While in this state, the CPU exits coherency, including any
+ operations required to achieve this (such as cleaning data
+ caches).
+
+ Next state: CPU_DOWN
+ Conditions: local CPU teardown complete
+ Trigger events: (spontaneous)
+
+
+Cluster state
+-------------
+
+A cluster is a group of connected CPUs with some common resources.
+Because a cluster contains multiple CPUs, it can be doing multiple
+things at the same time. This has some implications. In particular, a
+CPU can start up while another CPU is tearing the cluster down.
+
+In this discussion, the "outbound side" is the view of the cluster state
+as seen by a CPU tearing the cluster down. The "inbound side" is the
+view of the cluster state as seen by a CPU setting the CPU up.
+
+In order to enable safe coordination in such situations, it is important
+that a CPU which is setting up the cluster can advertise its state
+independently of the CPU which is tearing down the cluster. For this
+reason, the cluster state is split into two parts:
+
+ "cluster" state: The global state of the cluster; or the state
+ on the outbound side:
+
+ CLUSTER_DOWN
+ CLUSTER_UP
+ CLUSTER_GOING_DOWN
+
+ "inbound" state: The state of the cluster on the inbound side.
+
+ INBOUND_NOT_COMING_UP
+ INBOUND_COMING_UP
+
+
+ The different pairings of these states results in six possible
+ states for the cluster as a whole:
+
+ CLUSTER_UP
+ +==========> INBOUND_NOT_COMING_UP -------------+
+ # |
+ |
+ CLUSTER_UP <----+ |
+ INBOUND_COMING_UP | v
+
+ ^ CLUSTER_GOING_DOWN CLUSTER_GOING_DOWN
+ # INBOUND_COMING_UP <=== INBOUND_NOT_COMING_UP
+
+ CLUSTER_DOWN | |
+ INBOUND_COMING_UP <----+ |
+ |
+ ^ |
+ +=========== CLUSTER_DOWN <------------+
+ INBOUND_NOT_COMING_UP
+
+ Transitions -----> can only be made by the outbound CPU, and
+ only involve changes to the "cluster" state.
+
+ Transitions ===##> can only be made by the inbound CPU, and only
+ involve changes to the "inbound" state, except where there is no
+ further transition possible on the outbound side (i.e., the
+ outbound CPU has put the cluster into the CLUSTER_DOWN state).
+
+ The race avoidance algorithm does not provide a way to determine
+ which exact CPUs within the cluster play these roles. This must
+ be decided in advance by some other means. Refer to the section
+ "Last man and first man selection" for more explanation.
+
+
+ CLUSTER_DOWN/INBOUND_NOT_COMING_UP is the only state where the
+ cluster can actually be powered down.
+
+ The parallelism of the inbound and outbound CPUs is observed by
+ the existence of two different paths from CLUSTER_GOING_DOWN/
+ INBOUND_NOT_COMING_UP (corresponding to GOING_DOWN in the basic
+ model) to CLUSTER_DOWN/INBOUND_COMING_UP (corresponding to
+ COMING_UP in the basic model). The second path avoids cluster
+ teardown completely.
+
+ CLUSTER_UP/INBOUND_COMING_UP is equivalent to UP in the basic
+ model. The final transition to CLUSTER_UP/INBOUND_NOT_COMING_UP
+ is trivial and merely resets the state machine ready for the
+ next cycle.
+
+ Details of the allowable transitions follow.
+
+ The next state in each case is notated
+
+ <cluster state>/<inbound state> (<transitioner>)
+
+ where the <transitioner> is the side on which the transition
+ can occur; either the inbound or the outbound side.
+
+
+CLUSTER_DOWN/INBOUND_NOT_COMING_UP:
+
+ Next state: CLUSTER_DOWN/INBOUND_COMING_UP (inbound)
+ Conditions: none
+ Trigger events:
+
+ a) an explicit hardware power-up operation, resulting
+ from a policy decision on another CPU;
+
+ b) a hardware event, such as an interrupt.
+
+
+CLUSTER_DOWN/INBOUND_COMING_UP:
+
+ In this state, an inbound CPU sets up the cluster, including
+ enabling of hardware coherency at the cluster level and any
+ other operations (such as cache invalidation) which are required
+ in order to achieve this.
+
+ The purpose of this state is to do sufficient cluster-level
+ setup to enable other CPUs in the cluster to enter coherency
+ safely.
+
+ Next state: CLUSTER_UP/INBOUND_COMING_UP (inbound)
+ Conditions: cluster-level setup and hardware coherency complete
+ Trigger events: (spontaneous)
+
+
+CLUSTER_UP/INBOUND_COMING_UP:
+
+ Cluster-level setup is complete and hardware coherency is
+ enabled for the cluster. Other CPUs in the cluster can safely
+ enter coherency.
+
+ This is a transient state, leading immediately to
+ CLUSTER_UP/INBOUND_NOT_COMING_UP. All other CPUs on the cluster
+ should consider treat these two states as equivalent.
+
+ Next state: CLUSTER_UP/INBOUND_NOT_COMING_UP (inbound)
+ Conditions: none
+ Trigger events: (spontaneous)
+
+
+CLUSTER_UP/INBOUND_NOT_COMING_UP:
+
+ Cluster-level setup is complete and hardware coherency is
+ enabled for the cluster. Other CPUs in the cluster can safely
+ enter coherency.
+
+ The cluster will remain in this state until a policy decision is
+ made to power the cluster down.
+
+ Next state: CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP (outbound)
+ Conditions: none
+ Trigger events: policy decision to power down the cluster
+
+
+CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP:
+
+ An outbound CPU is tearing the cluster down. The selected CPU
+ must wait in this state until all CPUs in the cluster are in the
+ CPU_DOWN state.
+
+ When all CPUs are in the CPU_DOWN state, the cluster can be torn
+ down, for example by cleaning data caches and exiting
+ cluster-level coherency.
+
+ To avoid wasteful unnecessary teardown operations, the outbound
+ should check the inbound cluster state for asynchronous
+ transitions to INBOUND_COMING_UP. Alternatively, individual
+ CPUs can be checked for entry into CPU_COMING_UP or CPU_UP.
+
+
+ Next states:
+
+ CLUSTER_DOWN/INBOUND_NOT_COMING_UP (outbound)
+ Conditions: cluster torn down and ready to power off
+ Trigger events: (spontaneous)
+
+ CLUSTER_GOING_DOWN/INBOUND_COMING_UP (inbound)
+ Conditions: none
+ Trigger events:
+
+ a) an explicit hardware power-up operation,
+ resulting from a policy decision on another
+ CPU;
+
+ b) a hardware event, such as an interrupt.
+
+
+CLUSTER_GOING_DOWN/INBOUND_COMING_UP:
+
+ The cluster is (or was) being torn down, but another CPU has
+ come online in the meantime and is trying to set up the cluster
+ again.
+
+ If the outbound CPU observes this state, it has two choices:
+
+ a) back out of teardown, restoring the cluster to the
+ CLUSTER_UP state;
+
+ b) finish tearing the cluster down and put the cluster
+ in the CLUSTER_DOWN state; the inbound CPU will
+ set up the cluster again from there.
+
+ Choice (a) permits the removal of some latency by avoiding
+ unnecessary teardown and setup operations in situations where
+ the cluster is not really going to be powered down.
+
+
+ Next states:
+
+ CLUSTER_UP/INBOUND_COMING_UP (outbound)
+ Conditions: cluster-level setup and hardware
+ coherency complete
+ Trigger events: (spontaneous)
+
+ CLUSTER_DOWN/INBOUND_COMING_UP (outbound)
+ Conditions: cluster torn down and ready to power off
+ Trigger events: (spontaneous)
+
+
+Last man and First man selection
+--------------------------------
+
+The CPU which performs cluster tear-down operations on the outbound side
+is commonly referred to as the "last man".
+
+The CPU which performs cluster setup on the inbound side is commonly
+referred to as the "first man".
+
+The race avoidance algorithm documented above does not provide a
+mechanism to choose which CPUs should play these roles.
+
+
+Last man:
+
+When shutting down the cluster, all the CPUs involved are initially
+executing Linux and hence coherent. Therefore, ordinary spinlocks can
+be used to select a last man safely, before the CPUs become
+non-coherent.
+
+
+First man:
+
+Because CPUs may power up asynchronously in response to external wake-up
+events, a dynamic mechanism is needed to make sure that only one CPU
+attempts to play the first man role and do the cluster-level
+initialisation: any other CPUs must wait for this to complete before
+proceeding.
+
+Cluster-level initialisation may involve actions such as configuring
+coherency controls in the bus fabric.
+
+The current implementation in mcpm_head.S uses a separate mutual exclusion
+mechanism to do this arbitration. This mechanism is documented in
+detail in vlocks.txt.
+
+
+Features and Limitations
+------------------------
+
+Implementation:
+
+ The current ARM-based implementation is split between
+ arch/arm/common/mcpm_head.S (low-level inbound CPU operations) and
+ arch/arm/common/mcpm_entry.c (everything else):
+
+ __mcpm_cpu_going_down() signals the transition of a CPU to the
+ CPU_GOING_DOWN state.
+
+ __mcpm_cpu_down() signals the transition of a CPU to the CPU_DOWN
+ state.
+
+ A CPU transitions to CPU_COMING_UP and then to CPU_UP via the
+ low-level power-up code in mcpm_head.S. This could
+ involve CPU-specific setup code, but in the current
+ implementation it does not.
+
+ __mcpm_outbound_enter_critical() and __mcpm_outbound_leave_critical()
+ handle transitions from CLUSTER_UP to CLUSTER_GOING_DOWN
+ and from there to CLUSTER_DOWN or back to CLUSTER_UP (in
+ the case of an aborted cluster power-down).
+
+ These functions are more complex than the __mcpm_cpu_*()
+ functions due to the extra inter-CPU coordination which
+ is needed for safe transitions at the cluster level.
+
+ A cluster transitions from CLUSTER_DOWN back to CLUSTER_UP via
+ the low-level power-up code in mcpm_head.S. This
+ typically involves platform-specific setup code,
+ provided by the platform-specific power_up_setup
+ function registered via mcpm_sync_init.
+
+Deep topologies:
+
+ As currently described and implemented, the algorithm does not
+ support CPU topologies involving more than two levels (i.e.,
+ clusters of clusters are not supported). The algorithm could be
+ extended by replicating the cluster-level states for the
+ additional topological levels, and modifying the transition
+ rules for the intermediate (non-outermost) cluster levels.
+
+
+Colophon
+--------
+
+Originally created and documented by Dave Martin for Linaro Limited, in
+collaboration with Nicolas Pitre and Achin Gupta.
+
+Copyright (C) 2012-2013 Linaro Limited
+Distributed under the terms of Version 2 of the GNU General Public
+License, as defined in linux/COPYING.
diff --git a/Documentation/arm/vlocks.txt b/Documentation/arm/vlocks.txt
new file mode 100644
index 00000000000..415960a9bab
--- /dev/null
+++ b/Documentation/arm/vlocks.txt
@@ -0,0 +1,211 @@
+vlocks for Bare-Metal Mutual Exclusion
+======================================
+
+Voting Locks, or "vlocks" provide a simple low-level mutual exclusion
+mechanism, with reasonable but minimal requirements on the memory
+system.
+
+These are intended to be used to coordinate critical activity among CPUs
+which are otherwise non-coherent, in situations where the hardware
+provides no other mechanism to support this and ordinary spinlocks
+cannot be used.
+
+
+vlocks make use of the atomicity provided by the memory system for
+writes to a single memory location. To arbitrate, every CPU "votes for
+itself", by storing a unique number to a common memory location. The
+final value seen in that memory location when all the votes have been
+cast identifies the winner.
+
+In order to make sure that the election produces an unambiguous result
+in finite time, a CPU will only enter the election in the first place if
+no winner has been chosen and the election does not appear to have
+started yet.
+
+
+Algorithm
+---------
+
+The easiest way to explain the vlocks algorithm is with some pseudo-code:
+
+
+ int currently_voting[NR_CPUS] = { 0, };
+ int last_vote = -1; /* no votes yet */
+
+ bool vlock_trylock(int this_cpu)
+ {
+ /* signal our desire to vote */
+ currently_voting[this_cpu] = 1;
+ if (last_vote != -1) {
+ /* someone already volunteered himself */
+ currently_voting[this_cpu] = 0;
+ return false; /* not ourself */
+ }
+
+ /* let's suggest ourself */
+ last_vote = this_cpu;
+ currently_voting[this_cpu] = 0;
+
+ /* then wait until everyone else is done voting */
+ for_each_cpu(i) {
+ while (currently_voting[i] != 0)
+ /* wait */;
+ }
+
+ /* result */
+ if (last_vote == this_cpu)
+ return true; /* we won */
+ return false;
+ }
+
+ bool vlock_unlock(void)
+ {
+ last_vote = -1;
+ }
+
+
+The currently_voting[] array provides a way for the CPUs to determine
+whether an election is in progress, and plays a role analogous to the
+"entering" array in Lamport's bakery algorithm [1].
+
+However, once the election has started, the underlying memory system
+atomicity is used to pick the winner. This avoids the need for a static
+priority rule to act as a tie-breaker, or any counters which could
+overflow.
+
+As long as the last_vote variable is globally visible to all CPUs, it
+will contain only one value that won't change once every CPU has cleared
+its currently_voting flag.
+
+
+Features and limitations
+------------------------
+
+ * vlocks are not intended to be fair. In the contended case, it is the
+ _last_ CPU which attempts to get the lock which will be most likely
+ to win.
+
+ vlocks are therefore best suited to situations where it is necessary
+ to pick a unique winner, but it does not matter which CPU actually
+ wins.
+
+ * Like other similar mechanisms, vlocks will not scale well to a large
+ number of CPUs.
+
+ vlocks can be cascaded in a voting hierarchy to permit better scaling
+ if necessary, as in the following hypothetical example for 4096 CPUs:
+
+ /* first level: local election */
+ my_town = towns[(this_cpu >> 4) & 0xf];
+ I_won = vlock_trylock(my_town, this_cpu & 0xf);
+ if (I_won) {
+ /* we won the town election, let's go for the state */
+ my_state = states[(this_cpu >> 8) & 0xf];
+ I_won = vlock_lock(my_state, this_cpu & 0xf));
+ if (I_won) {
+ /* and so on */
+ I_won = vlock_lock(the_whole_country, this_cpu & 0xf];
+ if (I_won) {
+ /* ... */
+ }
+ vlock_unlock(the_whole_country);
+ }
+ vlock_unlock(my_state);
+ }
+ vlock_unlock(my_town);
+
+
+ARM implementation
+------------------
+
+The current ARM implementation [2] contains some optimisations beyond
+the basic algorithm:
+
+ * By packing the members of the currently_voting array close together,
+ we can read the whole array in one transaction (providing the number
+ of CPUs potentially contending the lock is small enough). This
+ reduces the number of round-trips required to external memory.
+
+ In the ARM implementation, this means that we can use a single load
+ and comparison:
+
+ LDR Rt, [Rn]
+ CMP Rt, #0
+
+ ...in place of code equivalent to:
+
+ LDRB Rt, [Rn]
+ CMP Rt, #0
+ LDRBEQ Rt, [Rn, #1]
+ CMPEQ Rt, #0
+ LDRBEQ Rt, [Rn, #2]
+ CMPEQ Rt, #0
+ LDRBEQ Rt, [Rn, #3]
+ CMPEQ Rt, #0
+
+ This cuts down on the fast-path latency, as well as potentially
+ reducing bus contention in contended cases.
+
+ The optimisation relies on the fact that the ARM memory system
+ guarantees coherency between overlapping memory accesses of
+ different sizes, similarly to many other architectures. Note that
+ we do not care which element of currently_voting appears in which
+ bits of Rt, so there is no need to worry about endianness in this
+ optimisation.
+
+ If there are too many CPUs to read the currently_voting array in
+ one transaction then multiple transations are still required. The
+ implementation uses a simple loop of word-sized loads for this
+ case. The number of transactions is still fewer than would be
+ required if bytes were loaded individually.
+
+
+ In principle, we could aggregate further by using LDRD or LDM, but
+ to keep the code simple this was not attempted in the initial
+ implementation.
+
+
+ * vlocks are currently only used to coordinate between CPUs which are
+ unable to enable their caches yet. This means that the
+ implementation removes many of the barriers which would be required
+ when executing the algorithm in cached memory.
+
+ packing of the currently_voting array does not work with cached
+ memory unless all CPUs contending the lock are cache-coherent, due
+ to cache writebacks from one CPU clobbering values written by other
+ CPUs. (Though if all the CPUs are cache-coherent, you should be
+ probably be using proper spinlocks instead anyway).
+
+
+ * The "no votes yet" value used for the last_vote variable is 0 (not
+ -1 as in the pseudocode). This allows statically-allocated vlocks
+ to be implicitly initialised to an unlocked state simply by putting
+ them in .bss.
+
+ An offset is added to each CPU's ID for the purpose of setting this
+ variable, so that no CPU uses the value 0 for its ID.
+
+
+Colophon
+--------
+
+Originally created and documented by Dave Martin for Linaro Limited, for
+use in ARM-based big.LITTLE platforms, with review and input gratefully
+received from Nicolas Pitre and Achin Gupta. Thanks to Nicolas for
+grabbing most of this text out of the relevant mail thread and writing
+up the pseudocode.
+
+Copyright (C) 2012-2013 Linaro Limited
+Distributed under the terms of Version 2 of the GNU General Public
+License, as defined in linux/COPYING.
+
+
+References
+----------
+
+[1] Lamport, L. "A New Solution of Dijkstra's Concurrent Programming
+ Problem", Communications of the ACM 17, 8 (August 1974), 453-455.
+
+ http://en.wikipedia.org/wiki/Lamport%27s_bakery_algorithm
+
+[2] linux/arch/arm/common/vlock.S, www.kernel.org.
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 1cacda426a0..a8f14c03907 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1606,6 +1606,21 @@ config HAVE_ARM_TWD
help
This options enables support for the ARM timer and watchdog unit
+config MCPM
+ bool "Multi-Cluster Power Management"
+ depends on CPU_V7 && SMP
+ help
+ This option provides the common power management infrastructure
+ for (multi-)cluster based systems, such as big.LITTLE based
+ systems.
+
+config BIG_LITTLE
+ bool "big.LITTLE support (Experimental)"
+ depends on CPU_V7 && SMP
+ select MCPM
+ help
+ This option enables support for the big.LITTLE architecture.
+
choice
prompt "Memory split"
default VMSPLIT_3G
diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile
index dc8dd0de5c0..bd48ab52544 100644
--- a/arch/arm/common/Makefile
+++ b/arch/arm/common/Makefile
@@ -11,3 +11,5 @@ obj-$(CONFIG_SHARP_PARAM) += sharpsl_param.o
obj-$(CONFIG_SHARP_SCOOP) += scoop.o
obj-$(CONFIG_PCI_HOST_ITE8152) += it8152.o
obj-$(CONFIG_ARM_TIMER_SP804) += timer-sp.o
+obj-$(CONFIG_MCPM) += mcpm_head.o mcpm_entry.o mcpm_platsmp.o vlock.o
+CFLAGS_REMOVE_mcpm_entry.o = -pg
diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c
new file mode 100644
index 00000000000..370236dd1a0
--- /dev/null
+++ b/arch/arm/common/mcpm_entry.c
@@ -0,0 +1,263 @@
+/*
+ * arch/arm/common/mcpm_entry.c -- entry point for multi-cluster PM
+ *
+ * Created by: Nicolas Pitre, March 2012
+ * Copyright: (C) 2012-2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/irqflags.h>
+
+#include <asm/mcpm.h>
+#include <asm/cacheflush.h>
+#include <asm/idmap.h>
+#include <asm/cputype.h>
+
+extern unsigned long mcpm_entry_vectors[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER];
+
+void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
+{
+ unsigned long val = ptr ? virt_to_phys(ptr) : 0;
+ mcpm_entry_vectors[cluster][cpu] = val;
+ sync_cache_w(&mcpm_entry_vectors[cluster][cpu]);
+}
+
+static const struct mcpm_platform_ops *platform_ops;
+
+int __init mcpm_platform_register(const struct mcpm_platform_ops *ops)
+{
+ if (platform_ops)
+ return -EBUSY;
+ platform_ops = ops;
+ return 0;
+}
+
+int mcpm_cpu_power_up(unsigned int cpu, unsigned int cluster)
+{
+ if (!platform_ops)
+ return -EUNATCH; /* try not to shadow power_up errors */
+ might_sleep();
+ return platform_ops->power_up(cpu, cluster);
+}
+
+typedef void (*phys_reset_t)(unsigned long);
+
+void mcpm_cpu_power_down(void)
+{
+ phys_reset_t phys_reset;
+
+ BUG_ON(!platform_ops);
+ BUG_ON(!irqs_disabled());
+
+ /*
+ * Do this before calling into the power_down method,
+ * as it might not always be safe to do afterwards.
+ */
+ setup_mm_for_reboot();
+
+ platform_ops->power_down();
+
+ /*
+ * It is possible for a power_up request to happen concurrently
+ * with a power_down request for the same CPU. In this case the
+ * power_down method might not be able to actually enter a
+ * powered down state with the WFI instruction if the power_up
+ * method has removed the required reset condition. The
+ * power_down method is then allowed to return. We must perform
+ * a re-entry in the kernel as if the power_up method just had
+ * deasserted reset on the CPU.
+ *
+ * To simplify race issues, the platform specific implementation
+ * must accommodate for the possibility of unordered calls to
+ * power_down and power_up with a usage count. Therefore, if a
+ * call to power_up is issued for a CPU that is not down, then
+ * the next call to power_down must not attempt a full shutdown
+ * but only do the minimum (normally disabling L1 cache and CPU
+ * coherency) and return just as if a concurrent power_up request
+ * had happened as described above.
+ */
+
+ phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
+ phys_reset(virt_to_phys(mcpm_entry_point));
+
+ /* should never get here */
+ BUG();
+}
+
+void mcpm_cpu_suspend(u64 expected_residency)
+{
+ phys_reset_t phys_reset;
+
+ BUG_ON(!platform_ops);
+ BUG_ON(!irqs_disabled());
+
+ /* Very similar to mcpm_cpu_power_down() */
+ setup_mm_for_reboot();
+ platform_ops->suspend(expected_residency);
+ phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
+ phys_reset(virt_to_phys(mcpm_entry_point));
+ BUG();
+}
+
+int mcpm_cpu_powered_up(void)
+{
+ if (!platform_ops)
+ return -EUNATCH;
+ if (platform_ops->powered_up)
+ platform_ops->powered_up();
+ return 0;
+}
+
+struct sync_struct mcpm_sync;
+
+/*
+ * __mcpm_cpu_going_down: Indicates that the cpu is being torn down.
+ * This must be called at the point of committing to teardown of a CPU.
+ * The CPU cache (SCTRL.C bit) is expected to still be active.
+ */
+void __mcpm_cpu_going_down(unsigned int cpu, unsigned int cluster)
+{
+ mcpm_sync.clusters[cluster].cpus[cpu].cpu = CPU_GOING_DOWN;
+ sync_cache_w(&mcpm_sync.clusters[cluster].cpus[cpu].cpu);
+}
+
+/*
+ * __mcpm_cpu_down: Indicates that cpu teardown is complete and that the
+ * cluster can be torn down without disrupting this CPU.
+ * To avoid deadlocks, this must be called before a CPU is powered down.
+ * The CPU cache (SCTRL.C bit) is expected to be off.
+ * However L2 cache might or might not be active.
+ */
+void __mcpm_cpu_down(unsigned int cpu, unsigned int cluster)
+{
+ dmb();
+ mcpm_sync.clusters[cluster].cpus[cpu].cpu = CPU_DOWN;
+ sync_cache_w(&mcpm_sync.clusters[cluster].cpus[cpu].cpu);
+ dsb_sev();
+}
+
+/*
+ * __mcpm_outbound_leave_critical: Leave the cluster teardown critical section.
+ * @state: the final state of the cluster:
+ * CLUSTER_UP: no destructive teardown was done and the cluster has been
+ * restored to the previous state (CPU cache still active); or
+ * CLUSTER_DOWN: the cluster has been torn-down, ready for power-off
+ * (CPU cache disabled, L2 cache either enabled or disabled).
+ */
+void __mcpm_outbound_leave_critical(unsigned int cluster, int state)
+{
+ dmb();
+ mcpm_sync.clusters[cluster].cluster = state;
+ sync_cache_w(&mcpm_sync.clusters[cluster].cluster);
+ dsb_sev();
+}
+
+/*
+ * __mcpm_outbound_enter_critical: Enter the cluster teardown critical section.
+ * This function should be called by the last man, after local CPU teardown
+ * is complete. CPU cache expected to be active.
+ *
+ * Returns:
+ * false: the critical section was not entered because an inbound CPU was
+ * observed, or the cluster is already being set up;
+ * true: the critical section was entered: it is now safe to tear down the
+ * cluster.
+ */
+bool __mcpm_outbound_enter_critical(unsigned int cpu, unsigned int cluster)
+{
+ unsigned int i;
+ struct mcpm_sync_struct *c = &mcpm_sync.clusters[cluster];
+
+ /* Warn inbound CPUs that the cluster is being torn down: */
+ c->cluster = CLUSTER_GOING_DOWN;
+ sync_cache_w(&c->cluster);
+
+ /* Back out if the inbound cluster is already in the critical region: */
+ sync_cache_r(&c->inbound);
+ if (c->inbound == INBOUND_COMING_UP)
+ goto abort;
+
+ /*
+ * Wait for all CPUs to get out of the GOING_DOWN state, so that local
+ * teardown is complete on each CPU before tearing down the cluster.
+ *
+ * If any CPU has been woken up again from the DOWN state, then we
+ * shouldn't be taking the cluster down at all: abort in that case.
+ */
+ sync_cache_r(&c->cpus);
+ for (i = 0; i < MAX_CPUS_PER_CLUSTER; i++) {
+ int cpustate;
+
+ if (i == cpu)
+ continue;
+
+ while (1) {
+ cpustate = c->cpus[i].cpu;
+ if (cpustate != CPU_GOING_DOWN)
+ break;
+
+ wfe();
+ sync_cache_r(&c->cpus[i].cpu);
+ }
+
+ switch (cpustate) {
+ case CPU_DOWN:
+ continue;
+
+ default:
+ goto abort;
+ }
+ }
+
+ return true;
+
+abort:
+ __mcpm_outbound_leave_critical(cluster, CLUSTER_UP);
+ return false;
+}
+
+int __mcpm_cluster_state(unsigned int cluster)
+{
+ sync_cache_r(&mcpm_sync.clusters[cluster].cluster);
+ return mcpm_sync.clusters[cluster].cluster;
+}
+
+extern unsigned long mcpm_power_up_setup_phys;
+
+int __init mcpm_sync_init(
+ void (*power_up_setup)(unsigned int affinity_level))
+{
+ unsigned int i, j, mpidr, this_cluster;
+
+ BUILD_BUG_ON(MCPM_SYNC_CLUSTER_SIZE * MAX_NR_CLUSTERS != sizeof mcpm_sync);
+ BUG_ON((unsigned long)&mcpm_sync & (__CACHE_WRITEBACK_GRANULE - 1));
+
+ /*
+ * Set initial CPU and cluster states.
+ * Only one cluster is assumed to be active at this point.
+ */
+ for (i = 0; i < MAX_NR_CLUSTERS; i++) {
+ mcpm_sync.clusters[i].cluster = CLUSTER_DOWN;
+ mcpm_sync.clusters[i].inbound = INBOUND_NOT_COMING_UP;
+ for (j = 0; j < MAX_CPUS_PER_CLUSTER; j++)
+ mcpm_sync.clusters[i].cpus[j].cpu = CPU_DOWN;
+ }
+ mpidr = read_cpuid_mpidr();
+ this_cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+ for_each_online_cpu(i)
+ mcpm_sync.clusters[this_cluster].cpus[i].cpu = CPU_UP;
+ mcpm_sync.clusters[this_cluster].cluster = CLUSTER_UP;
+ sync_cache_w(&mcpm_sync);
+
+ if (power_up_setup) {
+ mcpm_power_up_setup_phys = virt_to_phys(power_up_setup);
+ sync_cache_w(&mcpm_power_up_setup_phys);
+ }
+
+ return 0;
+}
diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S
new file mode 100644
index 00000000000..8178705c4b2
--- /dev/null
+++ b/arch/arm/common/mcpm_head.S
@@ -0,0 +1,219 @@
+/*
+ * arch/arm/common/mcpm_head.S -- kernel entry point for multi-cluster PM
+ *
+ * Created by: Nicolas Pitre, March 2012
+ * Copyright: (C) 2012-2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *
+ * Refer to Documentation/arm/cluster-pm-race-avoidance.txt
+ * for details of the synchronisation algorithms used here.
+ */
+
+#include <linux/linkage.h>
+#include <asm/mcpm.h>
+
+#include "vlock.h"
+
+.if MCPM_SYNC_CLUSTER_CPUS
+.error "cpus must be the first member of struct mcpm_sync_struct"
+.endif
+
+ .macro pr_dbg string
+#if defined(CONFIG_DEBUG_LL) && defined(DEBUG)
+ b 1901f
+1902: .asciz "CPU"
+1903: .asciz " cluster"
+1904: .asciz ": \string"
+ .align
+1901: adr r0, 1902b
+ bl printascii
+ mov r0, r9
+ bl printhex8
+ adr r0, 1903b
+ bl printascii
+ mov r0, r10
+ bl printhex8
+ adr r0, 1904b
+ bl printascii
+#endif
+ .endm
+
+ .arm
+ .align
+
+ENTRY(mcpm_entry_point)
+
+ THUMB( adr r12, BSYM(1f) )
+ THUMB( bx r12 )
+ THUMB( .thumb )
+1:
+ mrc p15, 0, r0, c0, c0, 5 @ MPIDR
+ ubfx r9, r0, #0, #8 @ r9 = cpu
+ ubfx r10, r0, #8, #8 @ r10 = cluster
+ mov r3, #MAX_CPUS_PER_CLUSTER
+ mla r4, r3, r10, r9 @ r4 = canonical CPU index
+ cmp r4, #(MAX_CPUS_PER_CLUSTER * MAX_NR_CLUSTERS)
+ blo 2f
+
+ /* We didn't expect this CPU. Try to cheaply make it quiet. */
+1: wfi
+ wfe
+ b 1b
+
+2: pr_dbg "kernel mcpm_entry_point\n"
+
+ /*
+ * MMU is off so we need to get to various variables in a
+ * position independent way.
+ */
+ adr r5, 3f
+ ldmia r5, {r6, r7, r8, r11}
+ add r6, r5, r6 @ r6 = mcpm_entry_vectors
+ ldr r7, [r5, r7] @ r7 = mcpm_power_up_setup_phys
+ add r8, r5, r8 @ r8 = mcpm_sync
+ add r11, r5, r11 @ r11 = first_man_locks
+
+ mov r0, #MCPM_SYNC_CLUSTER_SIZE
+ mla r8, r0, r10, r8 @ r8 = sync cluster base
+
+ @ Signal that this CPU is coming UP:
+ mov r0, #CPU_COMING_UP
+ mov r5, #MCPM_SYNC_CPU_SIZE
+ mla r5, r9, r5, r8 @ r5 = sync cpu address
+ strb r0, [r5]
+
+ @ At this point, the cluster cannot unexpectedly enter the GOING_DOWN
+ @ state, because there is at least one active CPU (this CPU).
+
+ mov r0, #VLOCK_SIZE
+ mla r11, r0, r10, r11 @ r11 = cluster first man lock
+ mov r0, r11
+ mov r1, r9 @ cpu
+ bl vlock_trylock @ implies DMB
+
+ cmp r0, #0 @ failed to get the lock?
+ bne mcpm_setup_wait @ wait for cluster setup if so
+
+ ldrb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER]
+ cmp r0, #CLUSTER_UP @ cluster already up?
+ bne mcpm_setup @ if not, set up the cluster
+
+ @ Otherwise, release the first man lock and skip setup:
+ mov r0, r11
+ bl vlock_unlock
+ b mcpm_setup_complete
+
+mcpm_setup:
+ @ Control dependency implies strb not observable before previous ldrb.
+
+ @ Signal that the cluster is being brought up:
+ mov r0, #INBOUND_COMING_UP
+ strb r0, [r8, #MCPM_SYNC_CLUSTER_INBOUND]
+ dmb
+
+ @ Any CPU trying to take the cluster into CLUSTER_GOING_DOWN from this
+ @ point onwards will observe INBOUND_COMING_UP and abort.
+
+ @ Wait for any previously-pending cluster teardown operations to abort
+ @ or complete:
+mcpm_teardown_wait:
+ ldrb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER]
+ cmp r0, #CLUSTER_GOING_DOWN
+ bne first_man_setup
+ wfe
+ b mcpm_teardown_wait
+
+first_man_setup:
+ dmb
+
+ @ If the outbound gave up before teardown started, skip cluster setup:
+
+ cmp r0, #CLUSTER_UP
+ beq mcpm_setup_leave
+
+ @ power_up_setup is now responsible for setting up the cluster:
+
+ cmp r7, #0
+ mov r0, #1 @ second (cluster) affinity level
+ blxne r7 @ Call power_up_setup if defined
+ dmb
+
+ mov r0, #CLUSTER_UP
+ strb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER]
+ dmb
+
+mcpm_setup_leave:
+ @ Leave the cluster setup critical section:
+
+ mov r0, #INBOUND_NOT_COMING_UP
+ strb r0, [r8, #MCPM_SYNC_CLUSTER_INBOUND]
+ dsb
+ sev
+
+ mov r0, r11
+ bl vlock_unlock @ implies DMB
+ b mcpm_setup_complete
+
+ @ In the contended case, non-first men wait here for cluster setup
+ @ to complete:
+mcpm_setup_wait:
+ ldrb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER]
+ cmp r0, #CLUSTER_UP
+ wfene
+ bne mcpm_setup_wait
+ dmb
+
+mcpm_setup_complete:
+ @ If a platform-specific CPU setup hook is needed, it is
+ @ called from here.
+
+ cmp r7, #0
+ mov r0, #0 @ first (CPU) affinity level
+ blxne r7 @ Call power_up_setup if defined
+ dmb
+
+ @ Mark the CPU as up:
+
+ mov r0, #CPU_UP
+ strb r0, [r5]
+
+ @ Observability order of CPU_UP and opening of the gate does not matter.
+
+mcpm_entry_gated:
+ ldr r5, [r6, r4, lsl #2] @ r5 = CPU entry vector
+ cmp r5, #0
+ wfeeq
+ beq mcpm_entry_gated
+ dmb
+
+ pr_dbg "released\n"
+ bx r5
+
+ .align 2
+
+3: .word mcpm_entry_vectors - .
+ .word mcpm_power_up_setup_phys - 3b
+ .word mcpm_sync - 3b
+ .word first_man_locks - 3b
+
+ENDPROC(mcpm_entry_point)
+
+ .bss
+
+ .align CACHE_WRITEBACK_ORDER
+ .type first_man_locks, #object
+first_man_locks:
+ .space VLOCK_SIZE * MAX_NR_CLUSTERS
+ .align CACHE_WRITEBACK_ORDER
+
+ .type mcpm_entry_vectors, #object
+ENTRY(mcpm_entry_vectors)
+ .space 4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
+
+ .type mcpm_power_up_setup_phys, #object
+ENTRY(mcpm_power_up_setup_phys)
+ .space 4 @ set by mcpm_sync_init()
diff --git a/arch/arm/common/mcpm_platsmp.c b/arch/arm/common/mcpm_platsmp.c
new file mode 100644
index 00000000000..3caed0db698
--- /dev/null
+++ b/arch/arm/common/mcpm_platsmp.c
@@ -0,0 +1,89 @@
+/*
+ * linux/arch/arm/mach-vexpress/mcpm_platsmp.c
+ *
+ * Created by: Nicolas Pitre, November 2012
+ * Copyright: (C) 2012-2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Code to handle secondary CPU bringup and hotplug for the cluster power API.
+ */
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+
+#include <asm/mcpm.h>
+#include <asm/smp.h>
+#include <asm/smp_plat.h>
+
+static void __init simple_smp_init_cpus(void)
+{
+}
+
+static int __cpuinit mcpm_boot_secondary(unsigned int cpu, struct task_struct *idle)
+{
+ unsigned int mpidr, pcpu, pcluster, ret;
+ extern void secondary_startup(void);
+
+ mpidr = cpu_logical_map(cpu);
+ pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+ pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+ pr_debug("%s: logical CPU %d is physical CPU %d cluster %d\n",
+ __func__, cpu, pcpu, pcluster);
+
+ mcpm_set_entry_vector(pcpu, pcluster, NULL);
+ ret = mcpm_cpu_power_up(pcpu, pcluster);
+ if (ret)
+ return ret;
+ mcpm_set_entry_vector(pcpu, pcluster, secondary_startup);
+ arch_send_wakeup_ipi_mask(cpumask_of(cpu));
+ dsb_sev();
+ return 0;
+}
+
+static void __cpuinit mcpm_secondary_init(unsigned int cpu)
+{
+ mcpm_cpu_powered_up();
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static int mcpm_cpu_disable(unsigned int cpu)
+{
+ /*
+ * We assume all CPUs may be shut down.
+ * This would be the hook to use for eventual Secure
+ * OS migration requests as described in the PSCI spec.
+ */
+ return 0;
+}
+
+static void mcpm_cpu_die(unsigned int cpu)
+{
+ unsigned int mpidr, pcpu, pcluster;
+ mpidr = read_cpuid_mpidr();
+ pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+ pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+ mcpm_set_entry_vector(pcpu, pcluster, NULL);
+ mcpm_cpu_power_down();
+}
+
+#endif
+
+static struct smp_operations __initdata mcpm_smp_ops = {
+ .smp_init_cpus = simple_smp_init_cpus,
+ .smp_boot_secondary = mcpm_boot_secondary,
+ .smp_secondary_init = mcpm_secondary_init,
+#ifdef CONFIG_HOTPLUG_CPU
+ .cpu_disable = mcpm_cpu_disable,
+ .cpu_die = mcpm_cpu_die,
+#endif
+};
+
+void __init mcpm_smp_set_ops(void)
+{
+ smp_set_ops(&mcpm_smp_ops);
+}
diff --git a/arch/arm/common/vlock.S b/arch/arm/common/vlock.S
new file mode 100644
index 00000000000..ff198583f68
--- /dev/null
+++ b/arch/arm/common/vlock.S
@@ -0,0 +1,108 @@
+/*
+ * vlock.S - simple voting lock implementation for ARM
+ *
+ * Created by: Dave Martin, 2012-08-16
+ * Copyright: (C) 2012-2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * This algorithm is described in more detail in
+ * Documentation/arm/vlocks.txt.
+ */
+
+#include <linux/linkage.h>
+#include "vlock.h"
+
+/* Select different code if voting flags can fit in a single word. */
+#if VLOCK_VOTING_SIZE > 4
+#define FEW(x...)
+#define MANY(x...) x
+#else
+#define FEW(x...) x
+#define MANY(x...)
+#endif
+
+@ voting lock for first-man coordination
+
+.macro voting_begin rbase:req, rcpu:req, rscratch:req
+ mov \rscratch, #1
+ strb \rscratch, [\rbase, \rcpu]
+ dmb
+.endm
+
+.macro voting_end rbase:req, rcpu:req, rscratch:req
+ dmb
+ mov \rscratch, #0
+ strb \rscratch, [\rbase, \rcpu]
+ dsb
+ sev
+.endm
+
+/*
+ * The vlock structure must reside in Strongly-Ordered or Device memory.
+ * This implementation deliberately eliminates most of the barriers which
+ * would be required for other memory types, and assumes that independent
+ * writes to neighbouring locations within a cacheline do not interfere
+ * with one another.
+ */
+
+@ r0: lock structure base
+@ r1: CPU ID (0-based index within cluster)
+ENTRY(vlock_trylock)
+ add r1, r1, #VLOCK_VOTING_OFFSET
+
+ voting_begin r0, r1, r2
+
+ ldrb r2, [r0, #VLOCK_OWNER_OFFSET] @ check whether lock is held
+ cmp r2, #VLOCK_OWNER_NONE
+ bne trylock_fail @ fail if so
+
+ @ Control dependency implies strb not observable before previous ldrb.
+
+ strb r1, [r0, #VLOCK_OWNER_OFFSET] @ submit my vote
+
+ voting_end r0, r1, r2 @ implies DMB
+
+ @ Wait for the current round of voting to finish:
+
+ MANY( mov r3, #VLOCK_VOTING_OFFSET )
+0:
+ MANY( ldr r2, [r0, r3] )
+ FEW( ldr r2, [r0, #VLOCK_VOTING_OFFSET] )
+ cmp r2, #0
+ wfene
+ bne 0b
+ MANY( add r3, r3, #4 )
+ MANY( cmp r3, #VLOCK_VOTING_OFFSET + VLOCK_VOTING_SIZE )
+ MANY( bne 0b )
+
+ @ Check who won:
+
+ dmb
+ ldrb r2, [r0, #VLOCK_OWNER_OFFSET]
+ eor r0, r1, r2 @ zero if I won, else nonzero
+ bx lr
+
+trylock_fail:
+ voting_end r0, r1, r2
+ mov r0, #1 @ nonzero indicates that I lost
+ bx lr
+ENDPROC(vlock_trylock)
+
+@ r0: lock structure base
+ENTRY(vlock_unlock)
+ dmb
+ mov r1, #VLOCK_OWNER_NONE
+ strb r1, [r0, #VLOCK_OWNER_OFFSET]
+ dsb
+ sev
+ bx lr
+ENDPROC(vlock_unlock)
diff --git a/arch/arm/common/vlock.h b/arch/arm/common/vlock.h
new file mode 100644
index 00000000000..3b441475a59
--- /dev/null
+++ b/arch/arm/common/vlock.h
@@ -0,0 +1,29 @@
+/*
+ * vlock.h - simple voting lock implementation
+ *
+ * Created by: Dave Martin, 2012-08-16
+ * Copyright: (C) 2012-2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __VLOCK_H
+#define __VLOCK_H
+
+#include <asm/mcpm.h>
+
+/* Offsets and sizes are rounded to a word (4 bytes) */
+#define VLOCK_OWNER_OFFSET 0
+#define VLOCK_VOTING_OFFSET 4
+#define VLOCK_VOTING_SIZE ((MAX_CPUS_PER_CLUSTER + 3) / 4 * 4)
+#define VLOCK_SIZE (VLOCK_VOTING_OFFSET + VLOCK_VOTING_SIZE)
+#define VLOCK_OWNER_NONE 0
+
+#endif /* ! __VLOCK_H */
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index e1489c54cd1..bff71388e72 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -363,4 +363,79 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
flush_cache_all();
}
+/*
+ * Memory synchronization helpers for mixed cached vs non cached accesses.
+ *
+ * Some synchronization algorithms have to set states in memory with the
+ * cache enabled or disabled depending on the code path. It is crucial
+ * to always ensure proper cache maintenance to update main memory right
+ * away in that case.
+ *
+ * Any cached write must be followed by a cache clean operation.
+ * Any cached read must be preceded by a cache invalidate operation.
+ * Yet, in the read case, a cache flush i.e. atomic clean+invalidate
+ * operation is needed to avoid discarding possible concurrent writes to the
+ * accessed memory.
+ *
+ * Also, in order to prevent a cached writer from interfering with an
+ * adjacent non-cached writer, each state variable must be located to
+ * a separate cache line.
+ */
+
+/*
+ * This needs to be >= the max cache writeback size of all
+ * supported platforms included in the current kernel configuration.
+ * This is used to align state variables to their own cache lines.
+ */
+#define __CACHE_WRITEBACK_ORDER 6 /* guessed from existing platforms */
+#define __CACHE_WRITEBACK_GRANULE (1 << __CACHE_WRITEBACK_ORDER)
+
+/*
+ * There is no __cpuc_clean_dcache_area but we use it anyway for
+ * code intent clarity, and alias it to __cpuc_flush_dcache_area.
+ */
+#define __cpuc_clean_dcache_area __cpuc_flush_dcache_area
+
+/*
+ * Ensure preceding writes to *p by this CPU are visible to
+ * subsequent reads by other CPUs:
+ */
+static inline void __sync_cache_range_w(volatile void *p, size_t size)
+{
+ char *_p = (char *)p;
+
+ __cpuc_clean_dcache_area(_p, size);
+ outer_clean_range(__pa(_p), __pa(_p + size));
+}
+
+/*
+ * Ensure preceding writes to *p by other CPUs are visible to
+ * subsequent reads by this CPU. We must be careful not to
+ * discard data simultaneously written by another CPU, hence the
+ * usage of flush rather than invalidate operations.
+ */
+static inline void __sync_cache_range_r(volatile void *p, size_t size)
+{
+ char *_p = (char *)p;
+
+#ifdef CONFIG_OUTER_CACHE
+ if (outer_cache.flush_range) {
+ /*
+ * Ensure dirty data migrated from other CPUs into our cache
+ * are cleaned out safely before the outer cache is cleaned:
+ */
+ __cpuc_clean_dcache_area(_p, size);
+
+ /* Clean and invalidate stale data for *p from outer ... */
+ outer_flush_range(__pa(_p), __pa(_p + size));
+ }
+#endif
+
+ /* ... and inner cache: */
+ __cpuc_flush_dcache_area(_p, size);
+}
+
+#define sync_cache_w(ptr) __sync_cache_range_w(ptr, sizeof *(ptr))
+#define sync_cache_r(ptr) __sync_cache_range_r(ptr, sizeof *(ptr))
+
#endif
diff --git a/arch/arm/include/asm/mach/arch.h b/arch/arm/include/asm/mach/arch.h
index 308ad7d6f98..75bf07910b8 100644
--- a/arch/arm/include/asm/mach/arch.h
+++ b/arch/arm/include/asm/mach/arch.h
@@ -8,6 +8,8 @@
* published by the Free Software Foundation.
*/
+#include <linux/types.h>
+
#ifndef __ASSEMBLY__
struct tag;
@@ -16,8 +18,10 @@ struct pt_regs;
struct smp_operations;
#ifdef CONFIG_SMP
#define smp_ops(ops) (&(ops))
+#define smp_init_ops(ops) (&(ops))
#else
#define smp_ops(ops) (struct smp_operations *)NULL
+#define smp_init_ops(ops) (bool (*)(void))NULL
#endif
struct machine_desc {
@@ -41,6 +45,7 @@ struct machine_desc {
unsigned char reserve_lp2 :1; /* never has lp2 */
char restart_mode; /* default restart mode */
struct smp_operations *smp; /* SMP operations */
+ bool (*smp_init)(void);
void (*fixup)(struct tag *, char **,
struct meminfo *);
void (*reserve)(void);/* reserve mem blocks */
diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h
new file mode 100644
index 00000000000..0f7b7620e9a
--- /dev/null
+++ b/arch/arm/include/asm/mcpm.h
@@ -0,0 +1,209 @@
+/*
+ * arch/arm/include/asm/mcpm.h
+ *
+ * Created by: Nicolas Pitre, April 2012
+ * Copyright: (C) 2012-2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef MCPM_H
+#define MCPM_H
+
+/*
+ * Maximum number of possible clusters / CPUs per cluster.
+ *
+ * This should be sufficient for quite a while, while keeping the
+ * (assembly) code simpler. When this starts to grow then we'll have
+ * to consider dynamic allocation.
+ */
+#define MAX_CPUS_PER_CLUSTER 4
+#define MAX_NR_CLUSTERS 2
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <asm/cacheflush.h>
+
+/*
+ * Platform specific code should use this symbol to set up secondary
+ * entry location for processors to use when released from reset.
+ */
+extern void mcpm_entry_point(void);
+
+/*
+ * This is used to indicate where the given CPU from given cluster should
+ * branch once it is ready to re-enter the kernel using ptr, or NULL if it
+ * should be gated. A gated CPU is held in a WFE loop until its vector
+ * becomes non NULL.
+ */
+void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr);
+
+/*
+ * CPU/cluster power operations API for higher subsystems to use.
+ */
+
+/**
+ * mcpm_cpu_power_up - make given CPU in given cluster runable
+ *
+ * @cpu: CPU number within given cluster
+ * @cluster: cluster number for the CPU
+ *
+ * The identified CPU is brought out of reset. If the cluster was powered
+ * down then it is brought up as well, taking care not to let the other CPUs
+ * in the cluster run, and ensuring appropriate cluster setup.
+ *
+ * Caller must ensure the appropriate entry vector is initialized with
+ * mcpm_set_entry_vector() prior to calling this.
+ *
+ * This must be called in a sleepable context. However, the implementation
+ * is strongly encouraged to return early and let the operation happen
+ * asynchronously, especially when significant delays are expected.
+ *
+ * If the operation cannot be performed then an error code is returned.
+ */
+int mcpm_cpu_power_up(unsigned int cpu, unsigned int cluster);
+
+/**
+ * mcpm_cpu_power_down - power the calling CPU down
+ *
+ * The calling CPU is powered down.
+ *
+ * If this CPU is found to be the "last man standing" in the cluster
+ * then the cluster is prepared for power-down too.
+ *
+ * This must be called with interrupts disabled.
+ *
+ * This does not return. Re-entry in the kernel is expected via
+ * mcpm_entry_point.
+ */
+void mcpm_cpu_power_down(void);
+
+/**
+ * mcpm_cpu_suspend - bring the calling CPU in a suspended state
+ *
+ * @expected_residency: duration in microseconds the CPU is expected
+ * to remain suspended, or 0 if unknown/infinity.
+ *
+ * The calling CPU is suspended. The expected residency argument is used
+ * as a hint by the platform specific backend to implement the appropriate
+ * sleep state level according to the knowledge it has on wake-up latency
+ * for the given hardware.
+ *
+ * If this CPU is found to be the "last man standing" in the cluster
+ * then the cluster may be prepared for power-down too, if the expected
+ * residency makes it worthwhile.
+ *
+ * This must be called with interrupts disabled.
+ *
+ * This does not return. Re-entry in the kernel is expected via
+ * mcpm_entry_point.
+ */
+void mcpm_cpu_suspend(u64 expected_residency);
+
+/**
+ * mcpm_cpu_powered_up - housekeeping workafter a CPU has been powered up
+ *
+ * This lets the platform specific backend code perform needed housekeeping
+ * work. This must be called by the newly activated CPU as soon as it is
+ * fully operational in kernel space, before it enables interrupts.
+ *
+ * If the operation cannot be performed then an error code is returned.
+ */
+int mcpm_cpu_powered_up(void);
+
+/*
+ * Platform specific methods used in the implementation of the above API.
+ */
+struct mcpm_platform_ops {
+ int (*power_up)(unsigned int cpu, unsigned int cluster);
+ void (*power_down)(void);
+ void (*suspend)(u64);
+ void (*powered_up)(void);
+};
+
+/**
+ * mcpm_platform_register - register platform specific power methods
+ *
+ * @ops: mcpm_platform_ops structure to register
+ *
+ * An error is returned if the registration has been done previously.
+ */
+int __init mcpm_platform_register(const struct mcpm_platform_ops *ops);
+
+/* Synchronisation structures for coordinating safe cluster setup/teardown: */
+
+/*
+ * When modifying this structure, make sure you update the MCPM_SYNC_ defines
+ * to match.
+ */
+struct mcpm_sync_struct {
+ /* individual CPU states */
+ struct {
+ s8 cpu __aligned(__CACHE_WRITEBACK_GRANULE);
+ } cpus[MAX_CPUS_PER_CLUSTER];
+
+ /* cluster state */
+ s8 cluster __aligned(__CACHE_WRITEBACK_GRANULE);
+
+ /* inbound-side state */
+ s8 inbound __aligned(__CACHE_WRITEBACK_GRANULE);
+};
+
+struct sync_struct {
+ struct mcpm_sync_struct clusters[MAX_NR_CLUSTERS];
+};
+
+extern unsigned long sync_phys; /* physical address of *mcpm_sync */
+
+void __mcpm_cpu_going_down(unsigned int cpu, unsigned int cluster);
+void __mcpm_cpu_down(unsigned int cpu, unsigned int cluster);
+void __mcpm_outbound_leave_critical(unsigned int cluster, int state);
+bool __mcpm_outbound_enter_critical(unsigned int this_cpu, unsigned int cluster);
+int __mcpm_cluster_state(unsigned int cluster);
+
+int __init mcpm_sync_init(
+ void (*power_up_setup)(unsigned int affinity_level));
+
+void __init mcpm_smp_set_ops(void);
+
+#else
+
+/*
+ * asm-offsets.h causes trouble when included in .c files, and cacheflush.h
+ * cannot be included in asm files. Let's work around the conflict like this.
+ */
+#include <asm/asm-offsets.h>
+#define __CACHE_WRITEBACK_GRANULE CACHE_WRITEBACK_GRANULE
+
+#endif /* ! __ASSEMBLY__ */
+
+/* Definitions for mcpm_sync_struct */
+#define CPU_DOWN 0x11
+#define CPU_COMING_UP 0x12
+#define CPU_UP 0x13
+#define CPU_GOING_DOWN 0x14
+
+#define CLUSTER_DOWN 0x21
+#define CLUSTER_UP 0x22
+#define CLUSTER_GOING_DOWN 0x23
+
+#define INBOUND_NOT_COMING_UP 0x31
+#define INBOUND_COMING_UP 0x32
+
+/*
+ * Offsets for the mcpm_sync_struct members, for use in asm.
+ * We don't want to make them global to the kernel via asm-offsets.c.
+ */
+#define MCPM_SYNC_CLUSTER_CPUS 0
+#define MCPM_SYNC_CPU_SIZE __CACHE_WRITEBACK_GRANULE
+#define MCPM_SYNC_CLUSTER_CLUSTER \
+ (MCPM_SYNC_CLUSTER_CPUS + MCPM_SYNC_CPU_SIZE * MAX_CPUS_PER_CLUSTER)
+#define MCPM_SYNC_CLUSTER_INBOUND \
+ (MCPM_SYNC_CLUSTER_CLUSTER + __CACHE_WRITEBACK_GRANULE)
+#define MCPM_SYNC_CLUSTER_SIZE \
+ (MCPM_SYNC_CLUSTER_INBOUND + __CACHE_WRITEBACK_GRANULE)
+
+#endif
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 923eec7105c..3f088225e71 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -149,6 +149,10 @@ int main(void)
DEFINE(DMA_BIDIRECTIONAL, DMA_BIDIRECTIONAL);
DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE);
DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE);
+ BLANK();
+ DEFINE(CACHE_WRITEBACK_ORDER, __CACHE_WRITEBACK_ORDER);
+ DEFINE(CACHE_WRITEBACK_GRANULE, __CACHE_WRITEBACK_GRANULE);
+ BLANK();
#ifdef CONFIG_KVM_ARM_HOST
DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
DEFINE(VCPU_MIDR, offsetof(struct kvm_vcpu, arch.midr));
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 63ae04bdda1..166e427ed2b 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -787,7 +787,10 @@ void __init setup_arch(char **cmdline_p)
arm_dt_init_cpu_maps();
#ifdef CONFIG_SMP
if (is_smp()) {
- smp_set_ops(mdesc->smp);
+ if (!mdesc->smp_init || !mdesc->smp_init()) {
+ if(mdesc->smp)
+ smp_set_ops(mdesc->smp);
+ }
smp_init_cpus();
}
#endif
diff --git a/arch/arm/mach-vexpress/core.h b/arch/arm/mach-vexpress/core.h
index f134cd4a85f..bde4374ab6d 100644
--- a/arch/arm/mach-vexpress/core.h
+++ b/arch/arm/mach-vexpress/core.h
@@ -6,6 +6,8 @@
void vexpress_dt_smp_map_io(void);
+bool vexpress_smp_init_ops(void);
+
extern struct smp_operations vexpress_smp_ops;
extern void vexpress_cpu_die(unsigned int cpu);
diff --git a/arch/arm/mach-vexpress/platsmp.c b/arch/arm/mach-vexpress/platsmp.c
index dc1ace55d55..21368ba6ca2 100644
--- a/arch/arm/mach-vexpress/platsmp.c
+++ b/arch/arm/mach-vexpress/platsmp.c
@@ -12,9 +12,11 @@
#include <linux/errno.h>
#include <linux/smp.h>
#include <linux/io.h>
+#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/vexpress.h>
+#include <asm/mcpm.h>
#include <asm/smp_scu.h>
#include <asm/mach/map.h>
@@ -203,3 +205,14 @@ struct smp_operations __initdata vexpress_smp_ops = {
.cpu_die = vexpress_cpu_die,
#endif
};
+
+bool __init vexpress_smp_init_ops(void)
+{
+#ifdef CONFIG_MCPM
+ if(of_find_compatible_node(NULL, NULL, "arm,cci")) {
+ mcpm_smp_set_ops();
+ return true;
+ }
+#endif
+ return false;
+}
diff --git a/arch/arm/mach-vexpress/v2m.c b/arch/arm/mach-vexpress/v2m.c
index ca0699575c8..772b7a179dd 100644
--- a/arch/arm/mach-vexpress/v2m.c
+++ b/arch/arm/mach-vexpress/v2m.c
@@ -504,6 +504,7 @@ static const char * const v2m_dt_match[] __initconst = {
DT_MACHINE_START(VEXPRESS_DT, "ARM-Versatile Express")
.dt_compat = v2m_dt_match,
.smp = smp_ops(vexpress_smp_ops),
+ .smp_init = smp_init_ops(vexpress_smp_init_ops),
.map_io = v2m_dt_map_io,
.init_early = v2m_dt_init_early,
.init_irq = irqchip_init,