Merge branch 'linux-linaro-lsk' into linux-linaro-lsk-rt

author: Mark Brown <broonie@kernel.org> 2015-01-18 20:50:33 +0000
committer: Mark Brown <broonie@kernel.org> 2015-01-18 20:50:33 +0000
commit: 99687559e46be79e7b0b8f57570928e341b24a6d (patch)
tree: 81dfd6f7a0930769a1c801edc370da8ed5d48021
parent: 041be7a254157109dec33e38ff9fe5c6fea60299 (diff)
parent: 2eb736d6b425cb932b038fd555243b9b0e59c036 (diff)
42 files changed, 2312 insertions, 597 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 279594b83781..d64c1a0e9dd0 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1,6 +1,7 @@
 config ARM64
 	def_bool y
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
+	select ARCH_HAS_OPP
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select ARCH_HAS_OPP
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
@@ -33,6 +34,7 @@ config ARM64
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_C_RECORDMCOUNT
+	select HAVE_CC_STACKPROTECTOR
 	select HAVE_DEBUG_BUGVERBOSE
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_API_DEBUG
diff --git a/arch/arm64/include/asm/cpu_ops.h b/arch/arm64/include/asm/cpu_ops.h
index 152413076503..47dfa31ad71a 100644
--- a/arch/arm64/include/asm/cpu_ops.h
+++ b/arch/arm64/include/asm/cpu_ops.h
@@ -28,6 +28,8 @@ struct device_node;
  *		enable-method property.
  * @cpu_init:	Reads any data necessary for a specific enable-method from the
  *		devicetree, for a given cpu node and proposed logical id.
+ * @cpu_init_idle: Reads any data necessary to initialize CPU idle states from
+ *		devicetree, for a given cpu node and proposed logical id.
  * @cpu_prepare: Early one-time preparation step for a cpu. If there is a
  *		mechanism for doing so, tests whether it is possible to boot
  *		the given CPU.
@@ -39,6 +41,7 @@ struct device_node;
  * 		from the cpu to be killed.
  * @cpu_die:	Makes a cpu leave the kernel. Must not fail. Called from the
  *		cpu being killed.
+ * @cpu_kill:  Ensures a cpu has left the kernel. Called from another cpu.
  * @cpu_suspend: Suspends a cpu and saves the required context. May fail owing
  *               to wrong parameters or error conditions. Called from the
  *               CPU being suspended. Must be called with IRQs disabled.
@@ -46,12 +49,14 @@ struct device_node;
 struct cpu_operations {
 	const char	*name;
 	int		(*cpu_init)(struct device_node *, unsigned int);
+	int		(*cpu_init_idle)(struct device_node *, unsigned int);
 	int		(*cpu_prepare)(unsigned int);
 	int		(*cpu_boot)(unsigned int);
 	void		(*cpu_postboot)(void);
 #ifdef CONFIG_HOTPLUG_CPU
 	int		(*cpu_disable)(unsigned int cpu);
 	void		(*cpu_die)(unsigned int cpu);
+	int		(*cpu_kill)(unsigned int cpu);
 #endif
 #ifdef CONFIG_ARM64_CPU_SUSPEND
 	int		(*cpu_suspend)(unsigned long);
diff --git a/arch/arm64/include/asm/cpuidle.h b/arch/arm64/include/asm/cpuidle.h
new file mode 100644
index 000000000000..b52a9932e2b1
--- /dev/null
+++ b/arch/arm64/include/asm/cpuidle.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_CPUIDLE_H
+#define __ASM_CPUIDLE_H
+
+#ifdef CONFIG_CPU_IDLE
+extern int cpu_init_idle(unsigned int cpu);
+#else
+static inline int cpu_init_idle(unsigned int cpu)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+#endif
diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index 00a41aab4a37..fa6a0c5a8de3 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -23,8 +23,6 @@
 
 #include <asm-generic/dma-coherent.h>
 
-#define ARCH_HAS_DMA_GET_REQUIRED_MASK
-
 #define DMA_ERROR_CODE	(~(dma_addr_t)0)
 extern struct dma_map_ops *dma_ops;
 extern struct dma_map_ops coherent_swiotlb_dma_ops;
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index fb4b26509276..7099e3b84778 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -257,7 +257,7 @@ static inline pmd_t pte_pmd(pte_t pte)
 #define pmd_mkwrite(pmd)	pte_pmd(pte_mkwrite(pmd_pte(pmd)))
 #define pmd_mkdirty(pmd)	pte_pmd(pte_mkdirty(pmd_pte(pmd)))
 #define pmd_mkyoung(pmd)	pte_pmd(pte_mkyoung(pmd_pte(pmd)))
-#define pmd_mknotpresent(pmd)	(__pmd(pmd_val(pmd) &= ~PMD_TYPE_MASK))
+#define pmd_mknotpresent(pmd)	(__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
 
 #define __HAVE_ARCH_PMD_WRITE
 #define pmd_write(pmd)		pte_write(pmd_pte(pmd))
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 45b20cd6cbca..349448c0caeb 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -136,8 +136,8 @@ extern struct task_struct *cpu_switch_to(struct task_struct *prev,
 #define task_pt_regs(p) \
 	((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1)
 
-#define KSTK_EIP(tsk)	task_pt_regs(tsk)->pc
-#define KSTK_ESP(tsk)	task_pt_regs(tsk)->sp
+#define KSTK_EIP(tsk)	((unsigned long)task_pt_regs(tsk)->pc)
+#define KSTK_ESP(tsk)	((unsigned long)task_pt_regs(tsk)->sp)
 
 /*
  * Prefetching support
diff --git a/arch/arm64/include/asm/psci.h b/arch/arm64/include/asm/psci.h
index 9a4b663670ff..e5312ea0ec1a 100644
--- a/arch/arm64/include/asm/psci.h
+++ b/arch/arm64/include/asm/psci.h
@@ -14,10 +14,6 @@
 #ifndef __ASM_PSCI_H
 #define __ASM_PSCI_H
 
-struct cpuidle_driver;
-void psci_init(void);
-
-int __init psci_dt_register_idle_states(struct cpuidle_driver *,
-					struct device_node *[]);
+int psci_init(void);
 
 #endif /* __ASM_PSCI_H */
diff --git a/arch/arm64/include/asm/stackprotector.h b/arch/arm64/include/asm/stackprotector.h
new file mode 100644
index 000000000000..fe5e287dc56b
--- /dev/null
+++ b/arch/arm64/include/asm/stackprotector.h
@@ -0,0 +1,38 @@
+/*
+ * GCC stack protector support.
+ *
+ * Stack protector works by putting predefined pattern at the start of
+ * the stack frame and verifying that it hasn't been overwritten when
+ * returning from the function.  The pattern is called stack canary
+ * and gcc expects it to be defined by a global variable called
+ * "__stack_chk_guard" on ARM.  This unfortunately means that on SMP
+ * we cannot have a different canary value per task.
+ */
+
+#ifndef __ASM_STACKPROTECTOR_H
+#define __ASM_STACKPROTECTOR_H
+
+#include <linux/random.h>
+#include <linux/version.h>
+
+extern unsigned long __stack_chk_guard;
+
+/*
+ * Initialize the stackprotector canary value.
+ *
+ * NOTE: this must only be called from functions that never return,
+ * and it must always be inlined.
+ */
+static __always_inline void boot_init_stack_canary(void)
+{
+	unsigned long canary;
+
+	/* Try to get a semi random initial value. */
+	get_random_bytes(&canary, sizeof(canary));
+	canary ^= LINUX_VERSION_CODE;
+
+	current->stack_canary = canary;
+	__stack_chk_guard = current->stack_canary;
+}
+
+#endif	/* __ASM_STACKPROTECTOR_H */
diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index 3ee8b303d9a9..64d2d4884a9d 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -22,6 +22,18 @@ extern char *strrchr(const char *, int c);
 #define __HAVE_ARCH_STRCHR
 extern char *strchr(const char *, int c);
 
+#define __HAVE_ARCH_STRCMP
+extern int strcmp(const char *, const char *);
+
+#define __HAVE_ARCH_STRNCMP
+extern int strncmp(const char *, const char *, __kernel_size_t);
+
+#define __HAVE_ARCH_STRLEN
+extern __kernel_size_t strlen(const char *);
+
+#define __HAVE_ARCH_STRNLEN
+extern __kernel_size_t strnlen(const char *, __kernel_size_t);
+
 #define __HAVE_ARCH_MEMCPY
 extern void *memcpy(void *, const void *, __kernel_size_t);
 
@@ -34,4 +46,7 @@ extern void *memchr(const void *, int, __kernel_size_t);
 #define __HAVE_ARCH_MEMSET
 extern void *memset(void *, int, __kernel_size_t);
 
+#define __HAVE_ARCH_MEMCMP
+extern int memcmp(const void *, const void *, size_t);
+
 #endif
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index ac389d32ccde..a8ad571c4758 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -30,6 +30,7 @@ arm64-obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 arm64-obj-$(CONFIG_ARM_CPU_TOPOLOGY)  += topology.o
 arm64-obj-$(CONFIG_ARM64_CPU_SUSPEND)	+= sleep.o suspend.o
 arm64-obj-$(CONFIG_JUMP_LABEL)		+= jump_label.o
+arm64-obj-$(CONFIG_CPU_IDLE)		+= cpuidle.o
 arm64-obj-$(CONFIG_KGDB)		+= kgdb.o
 arm64-obj-$(CONFIG_EFI)			+= efi.o efi-stub.o efi-entry.o
 
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 7f0512feaa13..a85843ddbde8 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -44,10 +44,15 @@ EXPORT_SYMBOL(memstart_addr);
 	/* string / mem functions */
 EXPORT_SYMBOL(strchr);
 EXPORT_SYMBOL(strrchr);
+EXPORT_SYMBOL(strcmp);
+EXPORT_SYMBOL(strncmp);
+EXPORT_SYMBOL(strlen);
+EXPORT_SYMBOL(strnlen);
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(memchr);
+EXPORT_SYMBOL(memcmp);
 
 	/* atomic bitops */
 EXPORT_SYMBOL(set_bit);
diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c
new file mode 100644
index 000000000000..19d17f51db37
--- /dev/null
+++ b/arch/arm64/kernel/cpuidle.c
@@ -0,0 +1,31 @@
+/*
+ * ARM64 CPU idle arch support
+ *
+ * Copyright (C) 2014 ARM Ltd.
+ * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/of.h>
+#include <linux/of_device.h>
+
+#include <asm/cpuidle.h>
+#include <asm/cpu_ops.h>
+
+int cpu_init_idle(unsigned int cpu)
+{
+	int ret = -EOPNOTSUPP;
+	struct device_node *cpu_node = of_cpu_device_node_get(cpu);
+
+	if (!cpu_node)
+		return -ENODEV;
+
+	if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_init_idle)
+		ret = cpu_ops[cpu]->cpu_init_idle(cpu_node, cpu);
+
+	of_node_put(cpu_node);
+	return ret;
+}
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index bbc9fe1658fa..50e0cc849b8e 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -349,6 +349,8 @@ ENTRY(set_cpu_boot_mode_flag)
 	b.ne	1f
 	add	x1, x1, #4
 1:	str	w20, [x1]			// This CPU has booted in EL1
+	dmb	sy
+	dc	ivac, x1			// Invalidate potentially stale cache line
 	ret
 ENDPROC(set_cpu_boot_mode_flag)
 
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 75c6b1e0d606..150134bf8738 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -52,6 +52,12 @@
 #include <asm/processor.h>
 #include <asm/stacktrace.h>
 
+#ifdef CONFIG_CC_STACKPROTECTOR
+#include <linux/stackprotector.h>
+unsigned long __stack_chk_guard __read_mostly;
+EXPORT_SYMBOL(__stack_chk_guard);
+#endif
+
 static void setup_restart(void)
 {
 	/*
diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c
index 0e32ab453e5b..e58a67974f98 100644
--- a/arch/arm64/kernel/psci.c
+++ b/arch/arm64/kernel/psci.c
@@ -20,6 +20,10 @@
 #include <linux/of.h>
 #include <linux/smp.h>
 #include <linux/slab.h>
+#include <linux/reboot.h>
+#include <linux/pm.h>
+#include <linux/delay.h>
+#include <uapi/linux/psci.h>
 
 #include <asm/compiler.h>
 #include <asm/cpu_ops.h>
@@ -27,6 +31,7 @@
 #include <asm/psci.h>
 #include <asm/smp_plat.h>
 #include <asm/suspend.h>
+#include <asm/system_misc.h>
 
 #define PSCI_POWER_STATE_TYPE_STANDBY		0
 #define PSCI_POWER_STATE_TYPE_POWER_DOWN	1
@@ -43,17 +48,23 @@ struct psci_operations {
 	int (*cpu_off)(struct psci_power_state state);
 	int (*cpu_on)(unsigned long cpuid, unsigned long entry_point);
 	int (*migrate)(unsigned long cpuid);
+	int (*affinity_info)(unsigned long target_affinity,
+			unsigned long lowest_affinity_level);
+	int (*migrate_info_type)(void);
 };
 
 static struct psci_operations psci_ops;
 
 static int (*invoke_psci_fn)(u64, u64, u64, u64);
+typedef int (*psci_initcall_t)(const struct device_node *);
 
 enum psci_function {
 	PSCI_FN_CPU_SUSPEND,
 	PSCI_FN_CPU_ON,
 	PSCI_FN_CPU_OFF,
 	PSCI_FN_MIGRATE,
+	PSCI_FN_AFFINITY_INFO,
+	PSCI_FN_MIGRATE_INFO_TYPE,
 	PSCI_FN_MAX,
 };
 
@@ -61,53 +72,41 @@ static DEFINE_PER_CPU_READ_MOSTLY(struct psci_power_state *, psci_power_state);
 
 static u32 psci_function_id[PSCI_FN_MAX];
 
-#define PSCI_RET_SUCCESS		0
-#define PSCI_RET_EOPNOTSUPP		-1
-#define PSCI_RET_EINVAL			-2
-#define PSCI_RET_EPERM			-3
-
 static int psci_to_linux_errno(int errno)
 {
 	switch (errno) {
 	case PSCI_RET_SUCCESS:
 		return 0;
-	case PSCI_RET_EOPNOTSUPP:
+	case PSCI_RET_NOT_SUPPORTED:
 		return -EOPNOTSUPP;
-	case PSCI_RET_EINVAL:
+	case PSCI_RET_INVALID_PARAMS:
 		return -EINVAL;
-	case PSCI_RET_EPERM:
+	case PSCI_RET_DENIED:
 		return -EPERM;
 	};
 
 	return -EINVAL;
 }
 
-#define PSCI_POWER_STATE_ID_MASK	0xffff
-#define PSCI_POWER_STATE_ID_SHIFT	0
-#define PSCI_POWER_STATE_TYPE_MASK	0x1
-#define PSCI_POWER_STATE_TYPE_SHIFT	16
-#define PSCI_POWER_STATE_AFFL_MASK	0x3
-#define PSCI_POWER_STATE_AFFL_SHIFT	24
-
 static u32 psci_power_state_pack(struct psci_power_state state)
 {
-	return	((state.id & PSCI_POWER_STATE_ID_MASK)
-			<< PSCI_POWER_STATE_ID_SHIFT)	|
-		((state.type & PSCI_POWER_STATE_TYPE_MASK)
-			<< PSCI_POWER_STATE_TYPE_SHIFT)	|
-		((state.affinity_level & PSCI_POWER_STATE_AFFL_MASK)
-			<< PSCI_POWER_STATE_AFFL_SHIFT);
+	return ((state.id << PSCI_0_2_POWER_STATE_ID_SHIFT)
+			& PSCI_0_2_POWER_STATE_ID_MASK) |
+		((state.type << PSCI_0_2_POWER_STATE_TYPE_SHIFT)
+		 & PSCI_0_2_POWER_STATE_TYPE_MASK) |
+		((state.affinity_level << PSCI_0_2_POWER_STATE_AFFL_SHIFT)
+		 & PSCI_0_2_POWER_STATE_AFFL_MASK);
 }
 
 static void psci_power_state_unpack(u32 power_state,
 				    struct psci_power_state *state)
 {
-	state->id = (power_state >> PSCI_POWER_STATE_ID_SHIFT)
-			& PSCI_POWER_STATE_ID_MASK;
-	state->type = (power_state >> PSCI_POWER_STATE_TYPE_SHIFT)
-			& PSCI_POWER_STATE_TYPE_MASK;
-	state->affinity_level = (power_state >> PSCI_POWER_STATE_AFFL_SHIFT)
-			& PSCI_POWER_STATE_AFFL_MASK;
+	state->id = (power_state & PSCI_0_2_POWER_STATE_ID_MASK) >>
+			PSCI_0_2_POWER_STATE_ID_SHIFT; /* mask is pre-shifted */
+	state->type = (power_state & PSCI_0_2_POWER_STATE_TYPE_MASK) >>
+			PSCI_0_2_POWER_STATE_TYPE_SHIFT;
+	state->affinity_level = (power_state & PSCI_0_2_POWER_STATE_AFFL_MASK)
+			>> PSCI_0_2_POWER_STATE_AFFL_SHIFT;
 }
 
 /*
@@ -144,6 +143,14 @@ static noinline int __invoke_psci_fn_smc(u64 function_id, u64 arg0, u64 arg1,
 	return function_id;
 }
 
+static int psci_get_version(void)
+{
+	int err;
+
+	err = invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0);
+	return err;
+}
+
 static int psci_cpu_suspend(struct psci_power_state state,
 			    unsigned long entry_point)
 {
@@ -187,107 +194,135 @@ static int psci_migrate(unsigned long cpuid)
 	return psci_to_linux_errno(err);
 }
 
-static const struct of_device_id psci_of_match[] __initconst = {
-	{ .compatible = "arm,psci",	},
-	{},
-};
+static int psci_affinity_info(unsigned long target_affinity,
+		unsigned long lowest_affinity_level)
+{
+	int err;
+	u32 fn;
 
-int __init psci_dt_register_idle_states(struct cpuidle_driver *drv,
-					struct device_node *state_nodes[])
+	fn = psci_function_id[PSCI_FN_AFFINITY_INFO];
+	err = invoke_psci_fn(fn, target_affinity, lowest_affinity_level, 0);
+	return err;
+}
+
+static int psci_migrate_info_type(void)
 {
-	int cpu, i;
-	struct psci_power_state *psci_states;
-	const struct cpu_operations *cpu_ops_ptr;
+	int err;
+	u32 fn;
+
+	fn = psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE];
+	err = invoke_psci_fn(fn, 0, 0, 0);
+	return err;
+}
+
+static int get_set_conduit_method(struct device_node *np)
+{
+	const char *method;
+
+	pr_info("probing for conduit method from DT.\n");
+
+	if (of_property_read_string(np, "method", &method)) {
+		pr_warn("missing \"method\" property\n");
+		return -ENXIO;
+	}
 
-	if (!state_nodes)
+	if (!strcmp("hvc", method)) {
+		invoke_psci_fn = __invoke_psci_fn_hvc;
+	} else if (!strcmp("smc", method)) {
+		invoke_psci_fn = __invoke_psci_fn_smc;
+	} else {
+		pr_warn("invalid \"method\" property: %s\n", method);
 		return -EINVAL;
-	/*
-	 * This is belt-and-braces: make sure that if the idle
-	 * specified protocol is psci, the cpu_ops have been
-	 * initialized to psci operations. Anything else is
-	 * a recipe for mayhem.
-	 */
-	for_each_cpu(cpu, drv->cpumask) {
-		cpu_ops_ptr = cpu_ops[cpu];
-		if (WARN_ON(!cpu_ops_ptr || strcmp(cpu_ops_ptr->name, "psci")))
-			return -EOPNOTSUPP;
 	}
+	return 0;
+}
 
-	psci_states = kcalloc(drv->state_count, sizeof(*psci_states),
-			      GFP_KERNEL);
+static void psci_sys_reset(char str, const char *cmd)
+{
+	invoke_psci_fn(PSCI_0_2_FN_SYSTEM_RESET, 0, 0, 0);
+}
 
-	if (!psci_states) {
-		pr_warn("psci idle state allocation failed\n");
-		return -ENOMEM;
-	}
+static void psci_sys_poweroff(void)
+{
+	invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0);
+}
+
+/*
+ * PSCI Function IDs for v0.2+ are well defined so use
+ * standard values.
+ */
+static int psci_0_2_init(struct device_node *np)
+{
+	int err, ver;
+
+	err = get_set_conduit_method(np);
 
-	for_each_cpu(cpu, drv->cpumask) {
-		if (per_cpu(psci_power_state, cpu)) {
-			pr_warn("idle states already initialized on cpu %u\n",
-				cpu);
-			continue;
+	if (err)
+		goto out_put_node;
+
+	ver = psci_get_version();
+
+	if (ver == PSCI_RET_NOT_SUPPORTED) {
+		/* PSCI v0.2 mandates implementation of PSCI_ID_VERSION. */
+		pr_err("PSCI firmware does not comply with the v0.2 spec.\n");
+		err = -EOPNOTSUPP;
+		goto out_put_node;
+	} else {
+		pr_info("PSCIv%d.%d detected in firmware.\n",
+				PSCI_VERSION_MAJOR(ver),
+				PSCI_VERSION_MINOR(ver));
+
+		if (PSCI_VERSION_MAJOR(ver) == 0 &&
+				PSCI_VERSION_MINOR(ver) < 2) {
+			err = -EINVAL;
+			pr_err("Conflicting PSCI version detected.\n");
+			goto out_put_node;
 		}
-		per_cpu(psci_power_state, cpu) = psci_states;
 	}
 
+	pr_info("Using standard PSCI v0.2 function IDs\n");
+	psci_function_id[PSCI_FN_CPU_SUSPEND] = PSCI_0_2_FN64_CPU_SUSPEND;
+	psci_ops.cpu_suspend = psci_cpu_suspend;
 
-	for (i = 0; i < drv->state_count; i++) {
-		u32 psci_power_state;
+	psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF;
+	psci_ops.cpu_off = psci_cpu_off;
 
-		if (!state_nodes[i]) {
-			/*
-			 * An index with a missing node pointer falls back to
-			 * simple STANDBYWFI
-			 */
-			psci_states[i].type = PSCI_POWER_STATE_TYPE_STANDBY;
-			continue;
-		}
+	psci_function_id[PSCI_FN_CPU_ON] = PSCI_0_2_FN64_CPU_ON;
+	psci_ops.cpu_on = psci_cpu_on;
 
-		if (of_property_read_u32(state_nodes[i], "entry-method-param",
-					 &psci_power_state)) {
-			pr_warn(" * %s missing entry-method-param property\n",
-				state_nodes[i]->full_name);
-			/*
-			 * If entry-method-param property is missing, fall
-			 * back to STANDBYWFI state
-			 */
-			psci_states[i].type = PSCI_POWER_STATE_TYPE_STANDBY;
-			continue;
-		}
+	psci_function_id[PSCI_FN_MIGRATE] = PSCI_0_2_FN64_MIGRATE;
+	psci_ops.migrate = psci_migrate;
 
-		pr_debug("psci-power-state %#x index %u\n",
-			 psci_power_state, i);
-		psci_power_state_unpack(psci_power_state, &psci_states[i]);
-	}
+	psci_function_id[PSCI_FN_AFFINITY_INFO] = PSCI_0_2_FN64_AFFINITY_INFO;
+	psci_ops.affinity_info = psci_affinity_info;
 
-	return 0;
+	psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE] =
+		PSCI_0_2_FN_MIGRATE_INFO_TYPE;
+	psci_ops.migrate_info_type = psci_migrate_info_type;
+
+	arm_pm_restart = psci_sys_reset;
+
+	pm_power_off = psci_sys_poweroff;
+
+out_put_node:
+	of_node_put(np);
+	return err;
 }
 
-void __init psci_init(void)
+/*
+ * PSCI < v0.2 get PSCI Function IDs via DT.
+ */
+static int psci_0_1_init(struct device_node *np)
 {
-	struct device_node *np;
-	const char *method;
 	u32 id;
+	int err;
 
-	np = of_find_matching_node(NULL, psci_of_match);
-	if (!np)
-		return;
-
-	pr_info("probing function IDs from device-tree\n");
+	err = get_set_conduit_method(np);
 
-	if (of_property_read_string(np, "method", &method)) {
-		pr_warning("missing \"method\" property\n");
+	if (err)
 		goto out_put_node;
-	}
 
-	if (!strcmp("hvc", method)) {
-		invoke_psci_fn = __invoke_psci_fn_hvc;
-	} else if (!strcmp("smc", method)) {
-		invoke_psci_fn = __invoke_psci_fn_smc;
-	} else {
-		pr_warning("invalid \"method\" property: %s\n", method);
-		goto out_put_node;
-	}
+	pr_info("Using PSCI v0.1 Function IDs from DT\n");
 
 	if (!of_property_read_u32(np, "cpu_suspend", &id)) {
 		psci_function_id[PSCI_FN_CPU_SUSPEND] = id;
@@ -311,7 +346,28 @@ void __init psci_init(void)
 
 out_put_node:
 	of_node_put(np);
-	return;
+	return err;
+}
+
+static const struct of_device_id psci_of_match[] __initconst = {
+	{ .compatible = "arm,psci",	.data = psci_0_1_init},
+	{ .compatible = "arm,psci-0.2",	.data = psci_0_2_init},
+	{},
+};
+
+int __init psci_init(void)
+{
+	struct device_node *np;
+	const struct of_device_id *matched_np;
+	psci_initcall_t init_fn;
+
+	np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);
+
+	if (!np)
+		return -ENODEV;
+
+	init_fn = (psci_initcall_t)matched_np->data;
+	return init_fn(np);
 }
 
 #ifdef CONFIG_SMP
@@ -364,6 +420,35 @@ static void cpu_psci_cpu_die(unsigned int cpu)
 
 	pr_crit("unable to power off CPU%u (%d)\n", cpu, ret);
 }
+
+static int cpu_psci_cpu_kill(unsigned int cpu)
+{
+	int err, i;
+
+	if (!psci_ops.affinity_info)
+		return 1;
+	/*
+	 * cpu_kill could race with cpu_die and we can
+	 * potentially end up declaring this cpu undead
+	 * while it is dying. So, try again a few times.
+	 */
+
+	for (i = 0; i < 10; i++) {
+		err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);
+		if (err == PSCI_0_2_AFFINITY_LEVEL_OFF) {
+			pr_info("CPU%d killed.\n", cpu);
+			return 1;
+		}
+
+		msleep(10);
+		pr_info("Retrying again to check for CPU kill\n");
+	}
+
+	pr_warn("CPU%d may not have shut down cleanly (AFFINITY_INFO reports %d)\n",
+			cpu, err);
+	/* Make op_cpu_kill() fail. */
+	return 0;
+}
 #endif
 
 #ifdef CONFIG_ARM64_CPU_SUSPEND
@@ -386,6 +471,7 @@ const struct cpu_operations cpu_psci_ops = {
 #ifdef CONFIG_HOTPLUG_CPU
 	.cpu_disable	= cpu_psci_cpu_disable,
 	.cpu_die	= cpu_psci_cpu_die,
+	.cpu_kill	= cpu_psci_cpu_kill,
 #endif
 #ifdef CONFIG_ARM64_CPU_SUSPEND
 	.cpu_suspend	= cpu_psci_cpu_suspend,
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index bc09a147454f..8ba6b0fa1753 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -655,11 +655,16 @@ static int compat_gpr_get(struct task_struct *target,
 			reg = task_pt_regs(target)->regs[idx];
 		}
 
-		ret = copy_to_user(ubuf, &reg, sizeof(reg));
-		if (ret)
-			break;
-
-		ubuf += sizeof(reg);
+		if (kbuf) {
+			memcpy(kbuf, &reg, sizeof(reg));
+			kbuf += sizeof(reg);
+		} else {
+			ret = copy_to_user(ubuf, &reg, sizeof(reg));
+			if (ret)
+				break;
+
+			ubuf += sizeof(reg);
+		}
 	}
 
 	return ret;
@@ -689,11 +694,16 @@ static int compat_gpr_set(struct task_struct *target,
 		unsigned int idx = start + i;
 		compat_ulong_t reg;
 
-		ret = copy_from_user(&reg, ubuf, sizeof(reg));
-		if (ret)
-			return ret;
+		if (kbuf) {
+			memcpy(&reg, kbuf, sizeof(reg));
+			kbuf += sizeof(reg);
+		} else {
+			ret = copy_from_user(&reg, ubuf, sizeof(reg));
+			if (ret)
+				return ret;
 
-		ubuf += sizeof(reg);
+			ubuf += sizeof(reg);
+		}
 
 		switch (idx) {
 		case 15:
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 7c868a2ac38b..0ac31a581f02 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -231,6 +231,19 @@ int __cpu_disable(void)
 	return 0;
 }
 
+static int op_cpu_kill(unsigned int cpu)
+{
+	/*
+	 * If we have no means of synchronising with the dying CPU, then assume
+	 * that it is really dead. We can only wait for an arbitrary length of
+	 * time and hope that it's dead, so let's skip the wait and just hope.
+	 */
+	if (!cpu_ops[cpu]->cpu_kill)
+		return 1;
+
+	return cpu_ops[cpu]->cpu_kill(cpu);
+}
+
 static DECLARE_COMPLETION(cpu_died);
 
 /*
@@ -244,6 +257,15 @@ void __cpu_die(unsigned int cpu)
 		return;
 	}
 	pr_notice("CPU%u: shutdown\n", cpu);
+
+	/*
+	 * Now that the dying CPU is beyond the point of no return w.r.t.
+	 * in-kernel synchronisation, try to get the firmware to help us to
+	 * verify that it has really left the kernel before we consider
+	 * clobbering anything it might still be using.
+	 */
+	if (!op_cpu_kill(cpu))
+		pr_warn("CPU%d may not have shut down cleanly\n", cpu);
 }
 
 /*
diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c
index 03dc3718eb13..e1c7a540361b 100644
--- a/arch/arm64/kernel/time.c
+++ b/arch/arm64/kernel/time.c
@@ -18,6 +18,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/clockchips.h>
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 55d0e035205f..de93a4bbdb7c 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -13,7 +13,7 @@
 #define ARM_EXIT_DISCARD(x)	x
 
 OUTPUT_ARCH(aarch64)
-ENTRY(stext)
+ENTRY(_text)
 
 jiffies = jiffies_64;
 
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 328ce1a99daa..d98d3e39879e 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,5 @@
 lib-y		:= bitops.o clear_user.o delay.o copy_from_user.o	\
 		   copy_to_user.o copy_in_user.o copy_page.o		\
 		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\
+		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
 		   strchr.o strrchr.o
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
new file mode 100644
index 000000000000..6ea0776ba6de
--- /dev/null
+++ b/arch/arm64/lib/memcmp.S
@@ -0,0 +1,258 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+* compare memory areas(when two memory areas' offset are different,
+* alignment handled by the hardware)
+*
+* Parameters:
+*  x0 - const memory area 1 pointer
+*  x1 - const memory area 2 pointer
+*  x2 - the maximal compare byte length
+* Returns:
+*  x0 - a compare result, maybe less than, equal to, or greater than ZERO
+*/
+
+/* Parameters and result.  */
+src1		.req	x0
+src2		.req	x1
+limit		.req	x2
+result		.req	x0
+
+/* Internal variables.  */
+data1		.req	x3
+data1w		.req	w3
+data2		.req	x4
+data2w		.req	w4
+has_nul		.req	x5
+diff		.req	x6
+endloop		.req	x7
+tmp1		.req	x8
+tmp2		.req	x9
+tmp3		.req	x10
+pos		.req	x11
+limit_wd	.req	x12
+mask		.req	x13
+
+ENTRY(memcmp)
+	cbz	limit, .Lret0
+	eor	tmp1, src1, src2
+	tst	tmp1, #7
+	b.ne	.Lmisaligned8
+	ands	tmp1, src1, #7
+	b.ne	.Lmutual_align
+	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */
+	lsr	limit_wd, limit_wd, #3 /* Convert to Dwords.  */
+	/*
+	* The input source addresses are at alignment boundary.
+	* Directly compare eight bytes each time.
+	*/
+.Lloop_aligned:
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+.Lstart_realigned:
+	subs	limit_wd, limit_wd, #1
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	csinv	endloop, diff, xzr, cs	/* Last Dword or differences.  */
+	cbz	endloop, .Lloop_aligned
+
+	/* Not reached the limit, must have found a diff.  */
+	tbz	limit_wd, #63, .Lnot_limit
+
+	/* Limit % 8 == 0 => the diff is in the last 8 bytes. */
+	ands	limit, limit, #7
+	b.eq	.Lnot_limit
+	/*
+	* Fewer than 8 bytes remain. Extract the valid data from the last
+	* eight bytes of the intended memory range.
+	*/
+	lsl	limit, limit, #3	/* bytes-> bits.  */
+	mov	mask, #~0
+CPU_BE( lsr	mask, mask, limit )
+CPU_LE( lsl	mask, mask, limit )
+	bic	data1, data1, mask
+	bic	data2, data2, mask
+
+	orr	diff, diff, mask
+	b	.Lnot_limit
+
+.Lmutual_align:
+	/*
+	* Sources are mutually aligned, but are not currently at an
+	* alignment boundary. Round down the addresses and then mask off
+	* the bytes that precede the start point.
+	*/
+	bic	src1, src1, #7
+	bic	src2, src2, #7
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+	/*
+	* We cannot add the alignment offset (tmp1) to limit here, since the
+	* addition could make the limit overflow.
+	*/
+	sub	limit_wd, limit, #1/*limit != 0, so no underflow.*/
+	and	tmp3, limit_wd, #7
+	lsr	limit_wd, limit_wd, #3
+	add	tmp3, tmp3, tmp1
+	add	limit_wd, limit_wd, tmp3, lsr #3
+	add	limit, limit, tmp1/* Adjust the limit for the extra.  */
+
+	lsl	tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/
+	neg	tmp1, tmp1/* Bits to alignment -64.  */
+	mov	tmp2, #~0
+	/*mask off the non-intended bytes before the start address.*/
+CPU_BE( lsl	tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/
+	/* Little-endian.  Early bytes are at LSB.  */
+CPU_LE( lsr	tmp2, tmp2, tmp1 )
+
+	orr	data1, data1, tmp2
+	orr	data2, data2, tmp2
+	b	.Lstart_realigned
+
+	/*src1 and src2 have different alignment offset.*/
+.Lmisaligned8:
+	cmp	limit, #8
+	b.lo	.Ltiny8proc /*limit < 8: compare byte by byte*/
+
+	and	tmp1, src1, #7
+	neg	tmp1, tmp1
+	add	tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/
+	and	tmp2, src2, #7
+	neg	tmp2, tmp2
+	add	tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/
+	subs	tmp3, tmp1, tmp2
+	csel	pos, tmp1, tmp2, hi /*Choose the maximum.*/
+
+	sub	limit, limit, pos
+	/*compare the proceeding bytes in the first 8 byte segment.*/
+.Ltinycmp:
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	subs	pos, pos, #1
+	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000.  */
+	b.eq	.Ltinycmp
+	cbnz	pos, 1f /*diff occurred before the last byte.*/
+	cmp	data1w, data2w
+	b.eq	.Lstart_align
+1:
+	sub	result, data1, data2
+	ret
+
+.Lstart_align:
+	lsr	limit_wd, limit, #3
+	cbz	limit_wd, .Lremain8
+
+	ands	xzr, src1, #7
+	b.eq	.Lrecal_offset
+	/*process more leading bytes to make src1 aligned...*/
+	add	src1, src1, tmp3 /*backwards src1 to alignment boundary*/
+	add	src2, src2, tmp3
+	sub	limit, limit, tmp3
+	lsr	limit_wd, limit, #3
+	cbz	limit_wd, .Lremain8
+	/*load 8 bytes from aligned SRC1..*/
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+
+	subs	limit_wd, limit_wd, #1
+	eor	diff, data1, data2  /*Non-zero if differences found.*/
+	csinv	endloop, diff, xzr, ne
+	cbnz	endloop, .Lunequal_proc
+	/*How far is the current SRC2 from the alignment boundary...*/
+	and	tmp3, tmp3, #7
+
+.Lrecal_offset:/*src1 is aligned now..*/
+	neg	pos, tmp3
+.Lloopcmp_proc:
+	/*
+	* Divide the eight bytes into two parts. First, step src2 back
+	* to an alignment boundary, load eight bytes and compare from
+	* the SRC2 alignment boundary. If all 8 bytes are equal, then start
+	* the second part's comparison. Otherwise finish the comparison.
+	* This special handling guarantees that all the accesses stay within
+	* the thread/task space, avoiding out-of-range accesses.
+	*/
+	ldr	data1, [src1,pos]
+	ldr	data2, [src2,pos]
+	eor	diff, data1, data2  /* Non-zero if differences found.  */
+	cbnz	diff, .Lnot_limit
+
+	/*The second part process*/
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+	eor	diff, data1, data2  /* Non-zero if differences found.  */
+	subs	limit_wd, limit_wd, #1
+	csinv	endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
+	cbz	endloop, .Lloopcmp_proc
+.Lunequal_proc:
+	cbz	diff, .Lremain8
+
+/* A difference occurred in the latest comparison. */
+.Lnot_limit:
+/*
+* For little endian, reverse the least significant equal bits into the MSB, so
+* that the following CLZ can find how many equal bits exist.
+*/
+CPU_LE( rev	diff, diff )
+CPU_LE( rev	data1, data1 )
+CPU_LE( rev	data2, data2 )
+
+	/*
+	* The MS-non-zero bit of DIFF marks either the first bit
+	* that is different, or the end of the significant data.
+	* Shifting left now will bring the critical information into the
+	* top bits.
+	*/
+	clz	pos, diff
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/*
+	* We need to zero-extend (char is unsigned) the value and then
+	* perform a signed subtraction.
+	*/
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+
+.Lremain8:
+	/* Limit % 8 == 0 =>. all data are equal.*/
+	ands	limit, limit, #7
+	b.eq	.Lret0
+
+.Ltiny8proc:
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	subs	limit, limit, #1
+
+	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000. */
+	b.eq	.Ltiny8proc
+	sub	result, data1, data2
+	ret
+.Lret0:
+	mov	result, #0
+	ret
+ENDPROC(memcmp)
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 27b5003609b6..8a9a96d3ddae 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -1,5 +1,13 @@
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -16,6 +24,7 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 /*
  * Copy a buffer from src to dest (alignment handled by the hardware)
@@ -27,27 +36,166 @@
  * Returns:
  *	x0 - dest
  */
+dstin	.req	x0	/* dest argument; never written, so it is also the return value */
+src	.req	x1
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+tmp3	.req	x5
+tmp3w	.req	w5
+dst	.req	x6	/* working destination pointer, advanced by the stores */
+
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
+
 ENTRY(memcpy)
-	mov	x4, x0
-	subs	x2, x2, #8
-	b.mi	2f
-1:	ldr	x3, [x1], #8
-	subs	x2, x2, #8
-	str	x3, [x4], #8
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	ldr	w3, [x1], #4
-	sub	x2, x2, #4
-	str	w3, [x4], #4
-3:	adds	x2, x2, #2
-	b.mi	4f
-	ldrh	w3, [x1], #2
-	sub	x2, x2, #2
-	strh	w3, [x4], #2
-4:	adds	x2, x2, #1
-	b.mi	5f
-	ldrb	w3, [x1]
-	strb	w3, [x4]
-5:	ret
+	mov	dst, dstin
+	cmp	count, #16
+	/* Fewer than 16 bytes: copy directly; the accesses may be unaligned. */
+	b.lo	.Ltiny15
+
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15/* Bytes needed to make src 16-byte aligned. */
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+	/*
+	* Copy the leading bytes from src to dst in increasing address
+	* order. This way the source data cannot be overwritten before it
+	* is read when the distance between src and dst is less than 16.
+	* Each load below is naturally aligned for its size.
+	*/
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src], #1
+	strb	tmp1w, [dst], #1
+1:
+	tbz	tmp2, #1, 2f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+2:
+	tbz	tmp2, #2, 3f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+3:
+	tbz	tmp2, #3, .LSrcAligned
+	ldr	tmp1, [src],#8
+	str	tmp1, [dst],#8
+
+.LSrcAligned:
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+	/*
+	* Deal with small copies quickly by dropping straight into the
+	* exit block.
+	*/
+.Ltail63:
+	/*
+	* Copy up to 48 bytes of data. At this point we only need the
+	* bottom 6 bits of count to be accurate.
+	*/
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+1:
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+2:
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+.Ltiny15:
+	/*
+	* Break the final (<16 byte) copy into several loads/stores that
+	* access memory in increasing address order, rather than loading and
+	* storing 16 bytes from (src-16) to (dst-16) after stepping src back
+	* to an aligned address, as the original cortex-strings memcpy does.
+	* Keeping the original approach would require memmove to guarantee
+	* that src is at least 16 bytes above dst, otherwise some source
+	* data would be overwritten when memmove calls memcpy directly.
+	* Dropping it keeps memmove simple and decouples memcpy from
+	* memmove's preconditions.
+	*/
+	tbz	count, #3, 1f
+	ldr	tmp1, [src], #8
+	str	tmp1, [dst], #8
+1:
+	tbz	count, #2, 2f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+2:
+	tbz	count, #1, 3f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+3:
+	tbz	count, #0, .Lexitfunc
+	ldrb	tmp1w, [src]
+	strb	tmp1w, [dst]
+
+.Lexitfunc:
+	ret
+
+.Lcpy_over64:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	* Less than 128 bytes to copy, so handle 64 here and then jump
+	* to the tail.
+	*/
+	ldp	A_l, A_h, [src],#16
+	stp	A_l, A_h, [dst],#16
+	ldp	B_l, B_h, [src],#16
+	ldp	C_l, C_h, [src],#16
+	stp	B_l, B_h, [dst],#16
+	stp	C_l, C_h, [dst],#16
+	ldp	D_l, D_h, [src],#16
+	stp	D_l, D_h, [dst],#16
+
+	tst	count, #0x3f	/* Low 6 bits still hold the tail length. */
+	b.ne	.Ltail63
+	ret
+
+	/*
+	* Critical loop.  Start at a new cache line boundary.  Assuming
+	* 64 bytes per line this ensures the entire loop is in one line.
+	*/
+	.p2align	L1_CACHE_SHIFT
+.Lcpy_body_large:
+	/* Preload the first 64 bytes of data. */
+	ldp	A_l, A_h, [src],#16
+	ldp	B_l, B_h, [src],#16
+	ldp	C_l, C_h, [src],#16
+	ldp	D_l, D_h, [src],#16
+1:
+	/*
+	* Interleave the loads of the next 64-byte block with the stores of
+	* the previously loaded 64 bytes.
+	*/
+	stp	A_l, A_h, [dst],#16
+	ldp	A_l, A_h, [src],#16
+	stp	B_l, B_h, [dst],#16
+	ldp	B_l, B_h, [src],#16
+	stp	C_l, C_h, [dst],#16
+	ldp	C_l, C_h, [src],#16
+	stp	D_l, D_h, [dst],#16
+	ldp	D_l, D_h, [src],#16
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst],#16
+	stp	B_l, B_h, [dst],#16
+	stp	C_l, C_h, [dst],#16
+	stp	D_l, D_h, [dst],#16
+
+	tst	count, #0x3f	/* Low 6 bits still hold the tail length. */
+	b.ne	.Ltail63
+	ret
 ENDPROC(memcpy)
diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S
index b79fdfa42d39..57b19ea2dad4 100644
--- a/arch/arm64/lib/memmove.S
+++ b/arch/arm64/lib/memmove.S
@@ -1,5 +1,13 @@
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -16,6 +24,7 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 /*
  * Move a buffer from src to test (alignment handled by the hardware).
@@ -28,30 +37,161 @@
  * Returns:
  *	x0 - dest
  */
+dstin	.req	x0	/* dest argument; never written, so it is also the return value */
+src	.req	x1
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+tmp3	.req	x5
+tmp3w	.req	w5
+dst	.req	x6	/* working destination pointer, moved downwards by the stores */
+
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
+
 ENTRY(memmove)
-	cmp	x0, x1
-	b.ls	memcpy
-	add	x4, x0, x2
-	add	x1, x1, x2
-	subs	x2, x2, #8
-	b.mi	2f
-1:	ldr	x3, [x1, #-8]!
-	subs	x2, x2, #8
-	str	x3, [x4, #-8]!
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	ldr	w3, [x1, #-4]!
-	sub	x2, x2, #4
-	str	w3, [x4, #-4]!
-3:	adds	x2, x2, #2
-	b.mi	4f
-	ldrh	w3, [x1, #-2]!
-	sub	x2, x2, #2
-	strh	w3, [x4, #-2]!
-4:	adds	x2, x2, #1
-	b.mi	5f
-	ldrb	w3, [x1, #-1]
-	strb	w3, [x4, #-1]
-5:	ret
+	cmp	dstin, src
+	b.lo	memcpy		/* dst below src: use memcpy's ascending copy. */
+	add	tmp1, src, count
+	cmp	dstin, tmp1
+	b.hs	memcpy		/* No overlap.  */
+
+	add	dst, dstin, count
+	add	src, src, count
+	cmp	count, #16
+	b.lo	.Ltail15  /* accesses are probably unaligned. */
+
+	ands	tmp2, src, #15     /* Bytes to reach alignment.  */
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+	/*
+	* Copy the trailing bytes first so that src becomes 16-byte aligned.
+	* The cost of these extra instructions is acceptable, and the loads
+	* that follow then start from an aligned address.
+	*/
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src, #-1]!
+	strb	tmp1w, [dst, #-1]!
+1:
+	tbz	tmp2, #1, 2f
+	ldrh	tmp1w, [src, #-2]!
+	strh	tmp1w, [dst, #-2]!
+2:
+	tbz	tmp2, #2, 3f
+	ldr	tmp1w, [src, #-4]!
+	str	tmp1w, [dst, #-4]!
+3:
+	tbz	tmp2, #3, .LSrcAligned
+	ldr	tmp1, [src, #-8]!
+	str	tmp1, [dst, #-8]!
+
+.LSrcAligned:
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+
+	/*
+	* Deal with small copies quickly by dropping straight into the
+	* exit block.
+	*/
+.Ltail63:
+	/*
+	* Copy up to 48 bytes of data. At this point we only need the
+	* bottom 6 bits of count to be accurate.
+	*/
+	ands	tmp1, count, #0x30
+	b.eq	.Ltail15
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp	A_l, A_h, [src, #-16]!
+	stp	A_l, A_h, [dst, #-16]!
+1:
+	ldp	A_l, A_h, [src, #-16]!
+	stp	A_l, A_h, [dst, #-16]!
+2:
+	ldp	A_l, A_h, [src, #-16]!
+	stp	A_l, A_h, [dst, #-16]!
+
+.Ltail15:
+	tbz	count, #3, 1f
+	ldr	tmp1, [src, #-8]!
+	str	tmp1, [dst, #-8]!
+1:
+	tbz	count, #2, 2f
+	ldr	tmp1w, [src, #-4]!
+	str	tmp1w, [dst, #-4]!
+2:
+	tbz	count, #1, 3f
+	ldrh	tmp1w, [src, #-2]!
+	strh	tmp1w, [dst, #-2]!
+3:
+	tbz	count, #0, .Lexitfunc
+	ldrb	tmp1w, [src, #-1]
+	strb	tmp1w, [dst, #-1]
+
+.Lexitfunc:
+	ret
+
+.Lcpy_over64:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	* Less than 128 bytes to copy, so handle 64 bytes here and then jump
+	* to the tail.
+	*/
+	ldp	A_l, A_h, [src, #-16]
+	stp	A_l, A_h, [dst, #-16]
+	ldp	B_l, B_h, [src, #-32]
+	ldp	C_l, C_h, [src, #-48]
+	stp	B_l, B_h, [dst, #-32]
+	stp	C_l, C_h, [dst, #-48]
+	ldp	D_l, D_h, [src, #-64]!
+	stp	D_l, D_h, [dst, #-64]!
+
+	tst	count, #0x3f	/* Low 6 bits still hold the tail length. */
+	b.ne	.Ltail63
+	ret
+
+	/*
+	* Critical loop. Start at a new cache line boundary. Assuming
+	* 64 bytes per line this ensures the entire loop is in one line.
+	*/
+	.p2align	L1_CACHE_SHIFT
+.Lcpy_body_large:
+	/* Preload 64 bytes of data. */
+	ldp	A_l, A_h, [src, #-16]
+	ldp	B_l, B_h, [src, #-32]
+	ldp	C_l, C_h, [src, #-48]
+	ldp	D_l, D_h, [src, #-64]!
+1:
+	/*
+	* Interleave the loads of the next 64-byte block with the stores of
+	* the previously loaded 64 bytes.
+	*/
+	stp	A_l, A_h, [dst, #-16]
+	ldp	A_l, A_h, [src, #-16]
+	stp	B_l, B_h, [dst, #-32]
+	ldp	B_l, B_h, [src, #-32]
+	stp	C_l, C_h, [dst, #-48]
+	ldp	C_l, C_h, [src, #-48]
+	stp	D_l, D_h, [dst, #-64]!
+	ldp	D_l, D_h, [src, #-64]!
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst, #-16]
+	stp	B_l, B_h, [dst, #-32]
+	stp	C_l, C_h, [dst, #-48]
+	stp	D_l, D_h, [dst, #-64]!
+
+	tst	count, #0x3f	/* Low 6 bits still hold the tail length. */
+	b.ne	.Ltail63
+	ret
 ENDPROC(memmove)
diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index 87e4a68fbbbc..7c72dfd36b63 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -1,5 +1,13 @@
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -16,6 +24,7 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 /*
  * Fill in the buffer with character c (alignment handled by the hardware)
@@ -27,27 +36,181 @@
  * Returns:
  *	x0 - buf
  */
+
+dstin		.req	x0
+val		.req	w1
+count		.req	x2
+tmp1		.req	x3
+tmp1w		.req	w3
+tmp2		.req	x4
+tmp2w		.req	w4
+zva_len_x	.req	x5
+zva_len		.req	w5
+zva_bits_x	.req	x6
+
+A_l		.req	x7
+A_lw		.req	w7
+dst		.req	x8
+tmp3w		.req	w9
+tmp3		.req	x9
+
 ENTRY(memset)
-	mov	x4, x0
-	and	w1, w1, #0xff
-	orr	w1, w1, w1, lsl #8
-	orr	w1, w1, w1, lsl #16
-	orr	x1, x1, x1, lsl #32
-	subs	x2, x2, #8
-	b.mi	2f
-1:	str	x1, [x4], #8
-	subs	x2, x2, #8
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	sub	x2, x2, #4
-	str	w1, [x4], #4
-3:	adds	x2, x2, #2
-	b.mi	4f
-	sub	x2, x2, #2
-	strh	w1, [x4], #2
-4:	adds	x2, x2, #1
-	b.mi	5f
-	strb	w1, [x4]
-5:	ret
+	mov	dst, dstin	/* Preserve return value.  */
+	and	A_lw, val, #255	/* Replicate the fill byte into all 8 bytes of A_l. */
+	orr	A_lw, A_lw, A_lw, lsl #8
+	orr	A_lw, A_lw, A_lw, lsl #16
+	orr	A_l, A_l, A_l, lsl #32
+
+	cmp	count, #15
+	b.hi	.Lover16_proc
+	/* All of these stores may be unaligned. */
+	tbz	count, #3, 1f
+	str	A_l, [dst], #8
+1:
+	tbz	count, #2, 2f
+	str	A_lw, [dst], #4
+2:
+	tbz	count, #1, 3f
+	strh	A_lw, [dst], #2
+3:
+	tbz	count, #0, 4f
+	strb	A_lw, [dst]
+4:
+	ret
+
+.Lover16_proc:
+	/* Check whether the start address is 16-byte aligned. */
+	neg	tmp2, dst
+	ands	tmp2, tmp2, #15
+	b.eq	.Laligned
+/*
+* count is at least 16, so one stp can store the first 16 bytes; dst is
+* then advanced to the next 16-byte boundary. Bytes between that boundary
+* and the end of this store are simply written again by later stores.
+*/
+	stp	A_l, A_l, [dst] /* unaligned store */
+	/* Make dst 16-byte aligned. */
+	sub	count, count, tmp2
+	add	dst, dst, tmp2
+
+.Laligned:
+	cbz	A_l, .Lzero_mem
+
+.Ltail_maybe_long:
+	cmp	count, #64
+	b.ge	.Lnot_short
+.Ltail63:
+	ands	tmp1, count, #0x30
+	b.eq	3f
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	stp	A_l, A_l, [dst], #16
+1:
+	stp	A_l, A_l, [dst], #16
+2:
+	stp	A_l, A_l, [dst], #16
+/*
+* Fewer than 16 bytes remain: use one stp to write the final 16 bytes.
+* Some bytes are written twice and the access may be unaligned.
+*/
+3:
+	ands	count, count, #15
+	cbz	count, 4f
+	add	dst, dst, count
+	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
+4:
+	ret
+
+	/*
+	* Critical loop. Start at a new cache line boundary. Assuming
+	* 64 bytes per line, this ensures the entire loop is in one line.
+	*/
+	.p2align	L1_CACHE_SHIFT
+.Lnot_short:
+	sub	dst, dst, #16/* Pre-bias.  */
+	sub	count, count, #64
+1:
+	stp	A_l, A_l, [dst, #16]
+	stp	A_l, A_l, [dst, #32]
+	stp	A_l, A_l, [dst, #48]
+	stp	A_l, A_l, [dst, #64]!
+	subs	count, count, #64
+	b.ge	1b
+	tst	count, #0x3f
+	add	dst, dst, #16	/* Undo the pre-bias. */
+	b.ne	.Ltail63
+.Lexitfunc:
+	ret
+
+	/*
+	* For zeroing memory, check to see if we can use the ZVA feature to
+	* zero entire 'cache' lines.
+	*/
+.Lzero_mem:
+	cmp	count, #63
+	b.le	.Ltail63
+	/*
+	* For zeroing small amounts of memory, it's not worth setting up
+	* the line-clear code.
+	*/
+	cmp	count, #128
+	b.lt	.Lnot_short /* on fall-through count is at least 128 bytes */
+
+	mrs	tmp1, dczid_el0
+	tbnz	tmp1, #4, .Lnot_short
+	mov	tmp3w, #4
+	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
+	lsl	zva_len, tmp3w, zva_len
+
+	ands	tmp3w, zva_len, #63
+	/*
+	* Ensure zva_len (a power of two) is not less than 64: using ZVA
+	* is not worthwhile if the block size is less than 64 bytes.
+	*/
+	b.ne	.Lnot_short
+.Lzero_by_line:
+	/*
+	* Compute how far we need to go to become suitably aligned. We're
+	* already at quad-word alignment.
+	*/
+	cmp	count, zva_len_x
+	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
+	sub	zva_bits_x, zva_len_x, #1
+	neg	tmp2, dst
+	ands	tmp2, tmp2, zva_bits_x
+	b.eq	2f			/* Already aligned.  */
+	/* Not aligned, check that there's enough to zero after alignment.*/
+	sub	tmp1, count, tmp2
+	/*
+	* Guarantee that the length left for ZVA is at least 64 bytes and
+	* at least zva_len, so that the code at 2: stays within the buffer.*/
+	cmp	tmp1, #64
+	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
+	b.lt	.Lnot_short
+	/*
+	* We know that there's at least 64 bytes to zero and that it's safe
+	* to overrun by 64 bytes.
+	*/
+	mov	count, tmp1
+1:
+	stp	A_l, A_l, [dst]
+	stp	A_l, A_l, [dst, #16]
+	stp	A_l, A_l, [dst, #32]
+	subs	tmp2, tmp2, #64
+	stp	A_l, A_l, [dst, #48]
+	add	dst, dst, #64
+	b.ge	1b
+	/* We've overrun a bit, so adjust dst downwards.*/
+	add	dst, dst, tmp2
+2:
+	sub	count, count, zva_len_x
+3:
+	dc	zva, dst
+	add	dst, dst, zva_len_x
+	subs	count, count, zva_len_x
+	b.ge	3b
+	ands	count, count, zva_bits_x
+	b.ne	.Ltail_maybe_long
+	ret
 ENDPROC(memset)
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
new file mode 100644
index 000000000000..42f828b06c59
--- /dev/null
+++ b/arch/arm64/lib/strcmp.S
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * compare two strings
+ *
+ * Parameters:
+ *	x0 - const string 1 pointer
+ *	x1 - const string 2 pointer
+ * Returns:
+ *	x0 - an integer less than, equal to, or greater than zero
+ *	if s1 is found, respectively, to be less than, to match,
+ *	or be greater than s2.
+ */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result.  */
+src1		.req	x0
+src2		.req	x1
+result		.req	x0
+
+/* Internal variables.  */
+data1		.req	x2
+data1w		.req	w2
+data2		.req	x3
+data2w		.req	w3
+has_nul		.req	x4
+diff		.req	x5
+syndrome	.req	x6
+tmp1		.req	x7
+tmp2		.req	x8
+tmp3		.req	x9
+zeroones	.req	x10
+pos		.req	x11
+
+ENTRY(strcmp)
+	eor	tmp1, src1, src2
+	mov	zeroones, #REP8_01
+	tst	tmp1, #7
+	b.ne	.Lmisaligned8
+	ands	tmp1, src1, #7
+	b.ne	.Lmutual_align
+
+	/*
+	* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	* can be done in parallel across the entire word.
+	*/
+.Lloop_aligned:
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+.Lstart_realigned:
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	orr	syndrome, diff, has_nul
+	cbz	syndrome, .Lloop_aligned
+	b	.Lcal_cmpresult
+
+.Lmutual_align:
+	/*
+	* Sources are mutually aligned, but are not currently at an
+	* alignment boundary.  Round down the addresses and then mask off
+	* the bytes that precede the start point.
+	*/
+	bic	src1, src1, #7
+	bic	src2, src2, #7
+	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
+	ldr	data1, [src1], #8
+	neg	tmp1, tmp1		/* Bits to alignment -64.  */
+	ldr	data2, [src2], #8
+	mov	tmp2, #~0
+	/* Big-endian.  Early bytes are at MSB.  */
+CPU_BE( lsl	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */
+	/* Little-endian.  Early bytes are at LSB.  */
+CPU_LE( lsr	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */
+
+	orr	data1, data1, tmp2
+	orr	data2, data2, tmp2
+	b	.Lstart_realigned
+
+.Lmisaligned8:
+	/*
+	* Work out how many leading bytes to compare one byte at a time.
+	* After that, one of the two string pointers is 8-byte aligned.
+	*/
+	and	tmp1, src1, #7
+	neg	tmp1, tmp1
+	add	tmp1, tmp1, #8
+	and	tmp2, src2, #7
+	neg	tmp2, tmp2
+	add	tmp2, tmp2, #8
+	subs	tmp3, tmp1, tmp2
+	csel	pos, tmp1, tmp2, hi /*Choose the maximum. */
+.Ltinycmp:
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	subs	pos, pos, #1
+	ccmp	data1w, #1, #0, ne  /* NZCV = 0b0000.  */
+	ccmp	data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
+	b.eq	.Ltinycmp
+	cbnz	pos, 1f /* pos != 0: stopped on a NUL or a mismatch. */
+	cmp	data1w, #1
+	ccmp	data1w, data2w, #0, cs
+	b.eq	.Lstart_align /* the last bytes were equal and not NUL. */
+1:
+	sub	result, data1, data2
+	ret
+
+.Lstart_align:
+	ands	xzr, src1, #7
+	b.eq	.Lrecal_offset
+	/* src1 is unaligned: tmp3 is negative here, step back to align it. */
+	add	src1, src1, tmp3
+	add	src2, src2, tmp3
+	/* Load 8 bytes from the aligned str1 and the unaligned str2. */
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	bic	has_nul, tmp1, tmp2
+	eor	diff, data1, data2 /* Non-zero if differences found.  */
+	orr	syndrome, diff, has_nul
+	cbnz	syndrome, .Lcal_cmpresult
+	/* How far is the current str2 from the alignment boundary? */
+	and	tmp3, tmp3, #7
+.Lrecal_offset:
+	neg	pos, tmp3
+.Lloopcmp_proc:
+	/*
+	* Split each eight-byte step into two parts. First, step src2 back
+	* to an alignment boundary, load eight bytes from that boundary and
+	* compare them with the corresponding bytes from src1.
+	* If all 8 bytes are equal, go on to the second part's comparison.
+	* Otherwise finish the comparison.
+	* This special handling is meant to guarantee that all accesses stay
+	* within the thread/task space and never read out of range.
+	*/
+	ldr	data1, [src1,pos]
+	ldr	data2, [src2,pos]
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	bic	has_nul, tmp1, tmp2
+	eor	diff, data1, data2  /* Non-zero if differences found.  */
+	orr	syndrome, diff, has_nul
+	cbnz	syndrome, .Lcal_cmpresult
+
+	/* The second part. */
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	bic	has_nul, tmp1, tmp2
+	eor	diff, data1, data2  /* Non-zero if differences found.  */
+	orr	syndrome, diff, has_nul
+	cbz	syndrome, .Lloopcmp_proc
+
+.Lcal_cmpresult:
+	/*
+	* On little-endian, reverse the byte order to big-endian so that
+	* CLZ can count the leading zero bits of the syndrome.
+	*/
+CPU_LE( rev	syndrome, syndrome )
+CPU_LE( rev	data1, data1 )
+CPU_LE( rev	data2, data2 )
+
+	/*
+	* For big-endian we cannot use the trick with the syndrome value
+	* as carry-propagation can corrupt the upper bits if the trailing
+	* bytes in the string contain 0x01.
+	* However, if there is no NUL byte in the dword, we can generate
+	* the result directly.  We cannot just subtract the bytes as the
+	* MSB might be significant.
+	*/
+CPU_BE( cbnz	has_nul, 1f )
+CPU_BE( cmp	data1, data2 )
+CPU_BE( cset	result, ne )
+CPU_BE( cneg	result, result, lo )
+CPU_BE( ret )
+CPU_BE( 1: )
+	/* Re-compute the NUL-byte detection, using a byte-reversed value. */
+CPU_BE(	rev	tmp3, data1 )
+CPU_BE(	sub	tmp1, tmp3, zeroones )
+CPU_BE(	orr	tmp2, tmp3, #REP8_7f )
+CPU_BE(	bic	has_nul, tmp1, tmp2 )
+CPU_BE(	rev	has_nul, has_nul )
+CPU_BE(	orr	syndrome, diff, has_nul )
+
+	clz	pos, syndrome
+	/*
+	* The MS-non-zero bit of the syndrome marks either the first bit
+	* that is different, or the top bit of the first zero byte.
+	* Shifting left now will bring the critical information into the
+	* top bits.
+	*/
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/*
+	* But we need to zero-extend (char is unsigned) the value and then
+	* perform a signed 32-bit subtraction.
+	*/
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+ENDPROC(strcmp)
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S
new file mode 100644
index 000000000000..987b68b9ce44
--- /dev/null
+++ b/arch/arm64/lib/strlen.S
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * calculate the length of a string
+ *
+ * Parameters:
+ *	x0 - const string pointer
+ * Returns:
+ *	x0 - the return length of specific string
+ */
+
+/* Arguments and results.  */
+srcin		.req	x0
+len		.req	x0
+
+/* Locals and temporaries.  */
+src		.req	x1
+data1		.req	x2
+data2		.req	x3
+data2a		.req	x4
+has_nul1	.req	x5
+has_nul2	.req	x6
+tmp1		.req	x7
+tmp2		.req	x8
+tmp3		.req	x9
+tmp4		.req	x10
+zeroones	.req	x11
+pos		.req	x12
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ENTRY(strlen)
+	mov	zeroones, #REP8_01
+	bic	src, srcin, #15
+	ands	tmp1, srcin, #15
+	b.ne	.Lmisaligned
+	/*
+	* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	* can be done in parallel across the entire word.
+	*/
+	/*
+	* The inner loop deals with two Dwords at a time. This has a
+	* slightly higher start-up cost, but we should win quite quickly,
+	* especially on cores with a high number of issue slots per
+	* cycle, as we get much better parallelism out of the operations.
+	*/
+.Lloop:
+	ldp	data1, data2, [src], #16
+.Lrealigned:
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bics	has_nul2, tmp3, tmp4
+	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
+	b.eq	.Lloop
+
+	sub	len, src, srcin
+	cbz	has_nul1, .Lnul_in_data2
+CPU_BE(	mov	data2, data1 )	/* Prepare data to re-calculate the syndrome. */
+	sub	len, len, #8
+	mov	has_nul2, has_nul1
+.Lnul_in_data2:
+	/*
+	* For big-endian, carry propagation (if the final byte in the
+	* string is 0x01) means we cannot use has_nul directly.  The
+	* easiest way to get the correct byte is to byte-swap the data
+	* and calculate the syndrome a second time.
+	*/
+CPU_BE( rev	data2, data2 )
+CPU_BE( sub	tmp1, data2, zeroones )
+CPU_BE( orr	tmp2, data2, #REP8_7f )
+CPU_BE( bic	has_nul2, tmp1, tmp2 )
+
+	sub	len, len, #8
+	rev	has_nul2, has_nul2
+	clz	pos, has_nul2
+	add	len, len, pos, lsr #3		/* Bits to bytes.  */
+	ret
+
+.Lmisaligned:
+	cmp	tmp1, #8	/* Flags are consumed by the csinv/csel below. */
+	neg	tmp1, tmp1
+	ldp	data1, data2, [src], #16
+	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
+	mov	tmp2, #~0
+	/* Big-endian.  Early bytes are at MSB.  */
+CPU_BE( lsl	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */
+	/* Little-endian.  Early bytes are at LSB.  */
+CPU_LE( lsr	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */
+
+	orr	data1, data1, tmp2
+	orr	data2a, data2, tmp2
+	csinv	data1, data1, xzr, le
+	csel	data2, data2, data2a, le
+	b	.Lrealigned
+ENDPROC(strlen)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
new file mode 100644
index 000000000000..0224cf5a5533
--- /dev/null
+++ b/arch/arm64/lib/strncmp.S
@@ -0,0 +1,310 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * compare two strings
+ *
+ * Parameters:
+ *  x0 - const string 1 pointer
+ *  x1 - const string 2 pointer
+ *  x2 - the maximal length to be compared
+ * Returns:
+ *  x0 - an integer less than, equal to, or greater than zero if s1 is found,
+ *     respectively, to be less than, to match, or be greater than s2.
+ */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result.  */
+src1		.req	x0
+src2		.req	x1
+limit		.req	x2
+result		.req	x0
+
+/* Internal variables.  */
+data1		.req	x3
+data1w		.req	w3
+data2		.req	x4
+data2w		.req	w4
+has_nul		.req	x5
+diff		.req	x6
+syndrome	.req	x7
+tmp1		.req	x8
+tmp2		.req	x9
+tmp3		.req	x10
+zeroones	.req	x11
+pos		.req	x12
+limit_wd	.req	x13
+mask		.req	x14
+endloop		.req	x15
+
+ENTRY(strncmp)
+	cbz	limit, .Lret0
+	eor	tmp1, src1, src2
+	mov	zeroones, #REP8_01
+	tst	tmp1, #7
+	b.ne	.Lmisaligned8
+	ands	tmp1, src1, #7
+	b.ne	.Lmutual_align
+	/* Calculate the number of full and partial words -1.  */
+	/*
+	* Subtract 1 first: otherwise, when limit is a multiple of 8,
+	* the last Dword would be misjudged.
+	*/
+	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */
+	lsr	limit_wd, limit_wd, #3  /* Convert to Dwords.  */
+
+	/*
+	* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	* can be done in parallel across the entire word.
+	*/
+.Lloop_aligned:
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+.Lstart_realigned:
+	subs	limit_wd, limit_wd, #1
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2  /* Non-zero if differences found.  */
+	csinv	endloop, diff, xzr, pl  /* Last Dword or differences.*/
+	bics	has_nul, tmp1, tmp2 /* Non-zero if NUL terminator.  */
+	ccmp	endloop, #0, #0, eq
+	b.eq	.Lloop_aligned
+
+	/* Not reached the limit, must have found the end or a diff.  */
+	tbz	limit_wd, #63, .Lnot_limit
+
+	/* Limit % 8 == 0 => all bytes significant.  */
+	ands	limit, limit, #7
+	b.eq	.Lnot_limit
+
+	lsl	limit, limit, #3    /* Bytes -> bits.  */
+	mov	mask, #~0
+CPU_BE( lsr	mask, mask, limit )
+CPU_LE( lsl	mask, mask, limit )
+	bic	data1, data1, mask
+	bic	data2, data2, mask
+
+	/* Make sure that the NUL byte is marked in the syndrome.  */
+	orr	has_nul, has_nul, mask
+
+.Lnot_limit:
+	orr	syndrome, diff, has_nul
+	b	.Lcal_cmpresult
+
+.Lmutual_align:
+	/*
+	* Sources are mutually aligned, but are not currently at an
+	* alignment boundary.  Round down the addresses and then mask off
+	* the bytes that precede the start point.
+	* We also need to adjust the limit calculations, but without
+	* overflowing if the limit is near ULONG_MAX.
+	*/
+	bic	src1, src1, #7
+	bic	src2, src2, #7
+	ldr	data1, [src1], #8
+	neg	tmp3, tmp1, lsl #3  /* 64 - bits(bytes beyond align). */
+	ldr	data2, [src2], #8
+	mov	tmp2, #~0
+	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */
+	/* Big-endian.  Early bytes are at MSB.  */
+CPU_BE( lsl	tmp2, tmp2, tmp3 )	/* Shift (tmp3 & 63).  */
+	/* Little-endian.  Early bytes are at LSB.  */
+CPU_LE( lsr	tmp2, tmp2, tmp3 )	/* Shift (tmp3 & 63).  */
+
+	and	tmp3, limit_wd, #7
+	lsr	limit_wd, limit_wd, #3
+	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/
+	add	limit, limit, tmp1
+	add	tmp3, tmp3, tmp1
+	orr	data1, data1, tmp2
+	orr	data2, data2, tmp2
+	add	limit_wd, limit_wd, tmp3, lsr #3
+	b	.Lstart_realigned
+
+/* The src1 and src2 alignment offsets differ. */
+.Lmisaligned8:
+	cmp	limit, #8
+	b.lo	.Ltiny8proc /*limit < 8... */
+	/*
+	* Work out how many leading bytes to compare one byte at a time.
+	* After that, one of the two string pointers is 8-byte aligned.*/
+	and	tmp1, src1, #7
+	neg	tmp1, tmp1
+	add	tmp1, tmp1, #8
+	and	tmp2, src2, #7
+	neg	tmp2, tmp2
+	add	tmp2, tmp2, #8
+	subs	tmp3, tmp1, tmp2
+	csel	pos, tmp1, tmp2, hi /*Choose the maximum. */
+	/*
+	* Here, limit is not less than 8, so directly run .Ltinycmp
+	* without checking the limit.*/
+	sub	limit, limit, pos
+.Ltinycmp:
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	subs	pos, pos, #1
+	ccmp	data1w, #1, #0, ne  /* NZCV = 0b0000.  */
+	ccmp	data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
+	b.eq	.Ltinycmp
+	cbnz	pos, 1f /* pos != 0: stopped on a NUL or a mismatch. */
+	cmp	data1w, #1
+	ccmp	data1w, data2w, #0, cs
+	b.eq	.Lstart_align /* the last bytes were equal and not NUL. */
+1:
+	sub	result, data1, data2
+	ret
+
+.Lstart_align:
+	lsr	limit_wd, limit, #3
+	cbz	limit_wd, .Lremain8
+	/* If src1 is not yet aligned, step it back to its alignment boundary. */
+	ands	xzr, src1, #7
+	b.eq	.Lrecal_offset
+	add	src1, src1, tmp3	/* tmp3 (tmp1 - tmp2) is negative here. */
+	add	src2, src2, tmp3
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+
+	sub	limit, limit, tmp3
+	lsr	limit_wd, limit, #3
+	subs	limit_wd, limit_wd, #1
+
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2  /* Non-zero if differences found.  */
+	csinv	endloop, diff, xzr, ne/* limit_wd reached 0: finish the cmp */
+	bics	has_nul, tmp1, tmp2
+	ccmp	endloop, #0, #0, eq /* has_nul is zero: no NUL byte */
+	b.ne	.Lunequal_proc
+	/* How far is the current str2 from the alignment boundary? */
+	and	tmp3, tmp3, #7
+.Lrecal_offset:
+	neg	pos, tmp3
+.Lloopcmp_proc:
+	/*
+	* Split each eight-byte step into two parts. First, step src2 back
+	* to an alignment boundary, load eight bytes from that boundary and
+	* compare them with the corresponding bytes from src1.
+	* If all 8 bytes are equal, go on to the second part's comparison.
+	* Otherwise finish the comparison.
+	* This special handling is meant to guarantee that all accesses stay
+	* within the thread/task space and never read out of range.
+	*/
+	ldr	data1, [src1,pos]
+	ldr	data2, [src2,pos]
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	bics	has_nul, tmp1, tmp2 /* Non-zero if NUL terminator.  */
+	eor	diff, data1, data2  /* Non-zero if differences found.  */
+	csinv	endloop, diff, xzr, eq
+	cbnz	endloop, .Lunequal_proc
+
+	/* The second part. */
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+	subs	limit_wd, limit_wd, #1
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2  /* Non-zero if differences found.  */
+	csinv	endloop, diff, xzr, ne/* limit_wd reached 0: finish the cmp */
+	bics	has_nul, tmp1, tmp2
+	ccmp	endloop, #0, #0, eq /* has_nul is zero: no NUL byte */
+	b.eq	.Lloopcmp_proc
+
+.Lunequal_proc:
+	orr	syndrome, diff, has_nul
+	cbz	syndrome, .Lremain8
+.Lcal_cmpresult:
+	/*
+	* On little-endian, reverse the byte order to big-endian so that
+	* CLZ can count the leading zero bits of the syndrome.
+	*/
+CPU_LE( rev	syndrome, syndrome )
+CPU_LE( rev	data1, data1 )
+CPU_LE( rev	data2, data2 )
+	/*
+	* For big-endian we cannot use the trick with the syndrome value
+	* as carry-propagation can corrupt the upper bits if the trailing
+	* bytes in the string contain 0x01.
+	* However, if there is no NUL byte in the dword, we can generate
+	* the result directly.  We can't just subtract the bytes as the
+	* MSB might be significant.
+	*/
+CPU_BE( cbnz	has_nul, 1f )
+CPU_BE( cmp	data1, data2 )
+CPU_BE( cset	result, ne )
+CPU_BE( cneg	result, result, lo )
+CPU_BE( ret )
+CPU_BE( 1: )
+	/* Re-compute the NUL-byte detection, using a byte-reversed value.*/
+CPU_BE( rev	tmp3, data1 )
+CPU_BE( sub	tmp1, tmp3, zeroones )
+CPU_BE( orr	tmp2, tmp3, #REP8_7f )
+CPU_BE( bic	has_nul, tmp1, tmp2 )
+CPU_BE( rev	has_nul, has_nul )
+CPU_BE( orr	syndrome, diff, has_nul )
+	/*
+	* The MS-non-zero bit of the syndrome marks either the first bit
+	* that is different, or the top bit of the first zero byte.
+	* Shifting left now will bring the critical information into the
+	* top bits.
+	*/
+	clz	pos, syndrome
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/*
+	* But we need to zero-extend (char is unsigned) the value and then
+	* perform a signed 32-bit subtraction.
+	*/
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+
+.Lremain8:
+	/* Limit % 8 == 0 => everything was compared and is equal.  */
+	ands	limit, limit, #7
+	b.eq	.Lret0
+.Ltiny8proc:
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	subs	limit, limit, #1
+
+	ccmp	data1w, #1, #0, ne  /* NZCV = 0b0000.  */
+	ccmp	data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
+	b.eq	.Ltiny8proc
+	sub	result, data1, data2
+	ret
+
+.Lret0:
+	mov	result, #0
+	ret
+ENDPROC(strncmp)
diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S
new file mode 100644
index 000000000000..2ca665711bf2
--- /dev/null
+++ b/arch/arm64/lib/strnlen.S
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * determine the length of a fixed-size string
+ *
+ * Parameters:
+ *	x0 - const string pointer
+ *	x1 - maximal string length
+ * Returns:
+ *	x0 - the string length, capped at the maximal string length
+ */
+
+/* Arguments and results.  */
+srcin		.req	x0
+len		.req	x0
+limit		.req	x1
+
+/* Locals and temporaries.  */
+src		.req	x2
+data1		.req	x3
+data2		.req	x4
+data2a		.req	x5
+has_nul1	.req	x6
+has_nul2	.req	x7
+tmp1		.req	x8
+tmp2		.req	x9
+tmp3		.req	x10
+tmp4		.req	x11
+zeroones	.req	x12
+pos		.req	x13
+limit_wd	.req	x14
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ENTRY(strnlen)
+	cbz	limit, .Lhit_limit
+	mov	zeroones, #REP8_01
+	bic	src, srcin, #15
+	ands	tmp1, srcin, #15
+	b.ne	.Lmisaligned
+	/* Calculate the number of full and partial words -1.  */
+	sub	limit_wd, limit, #1 /* Limit != 0, so no underflow.  */
+	lsr	limit_wd, limit_wd, #4  /* Convert to Qwords.  */
+
+	/*
+	* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	* can be done in parallel across the entire word.
+	*/
+	/*
+	* The inner loop deals with two Dwords at a time.  This has a
+	* slightly higher start-up cost, but we should win quite quickly,
+	* especially on cores with a high number of issue slots per
+	* cycle, as we get much better parallelism out of the operations.
+	*/
+.Lloop:
+	ldp	data1, data2, [src], #16
+.Lrealigned:
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	subs	limit_wd, limit_wd, #1
+	orr	tmp1, has_nul1, has_nul2
+	ccmp	tmp1, #0, #0, pl    /* NZCV = 0000  */
+	b.eq	.Lloop
+
+	cbz	tmp1, .Lhit_limit   /* No null in final Qword.  */
+
+	/*
+	* We know there's a null in the final Qword. The easiest thing
+	* to do now is work out the length of the string and return
+	* MIN (len, limit).
+	*/
+	sub	len, src, srcin
+	cbz	has_nul1, .Lnul_in_data2
+CPU_BE( mov	data2, data1 )	/* prepare data to re-calculate the syndrome */
+
+	sub	len, len, #8
+	mov	has_nul2, has_nul1
+.Lnul_in_data2:
+	/*
+	* For big-endian, carry propagation (if the final byte in the
+	* string is 0x01) means we cannot use has_nul directly.  The
+	* easiest way to get the correct byte is to byte-swap the data
+	* and calculate the syndrome a second time.
+	*/
+CPU_BE( rev	data2, data2 )
+CPU_BE( sub	tmp1, data2, zeroones )
+CPU_BE( orr	tmp2, data2, #REP8_7f )
+CPU_BE( bic	has_nul2, tmp1, tmp2 )
+
+	sub	len, len, #8
+	rev	has_nul2, has_nul2
+	clz	pos, has_nul2
+	add	len, len, pos, lsr #3       /* Bits to bytes.  */
+	cmp	len, limit
+	csel	len, len, limit, ls     /* Return the lower value.  */
+	ret
+
+.Lmisaligned:
+	/*
+	* Deal with a partial first word.
+	* We're doing two things in parallel here;
+	* 1) Calculate the number of words (but avoiding overflow if
+	* limit is near ULONG_MAX) - to do this we need to work out
+	* limit + tmp1 - 1 as a 65-bit value before shifting it;
+	* 2) Load and mask the initial data words - we force the bytes
+	* before the ones we are interested in to 0xff - this ensures
+	* early bytes will not hit any zero detection.
+	*/
+	ldp	data1, data2, [src], #16
+
+	sub	limit_wd, limit, #1
+	and	tmp3, limit_wd, #15
+	lsr	limit_wd, limit_wd, #4
+
+	add	tmp3, tmp3, tmp1
+	add	limit_wd, limit_wd, tmp3, lsr #4
+
+	neg	tmp4, tmp1
+	lsl	tmp4, tmp4, #3  /* Bytes beyond alignment -> bits.  */
+
+	mov	tmp2, #~0
+	/* Big-endian.  Early bytes are at MSB.  */
+CPU_BE( lsl	tmp2, tmp2, tmp4 )	/* Shift (tmp4 & 63).  */
+	/* Little-endian.  Early bytes are at LSB.  */
+CPU_LE( lsr	tmp2, tmp2, tmp4 )	/* Shift (tmp4 & 63).  */
+
+	cmp	tmp1, #8
+
+	orr	data1, data1, tmp2
+	orr	data2a, data2, tmp2
+
+	csinv	data1, data1, xzr, le
+	csel	data2, data2, data2a, le
+	b	.Lrealigned
+
+.Lhit_limit:
+	mov	len, limit
+	ret
+ENDPROC(strnlen)
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index fda756875fa6..23663837acff 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -31,7 +31,7 @@
  *	Corrupted registers: x0-x7, x9-x11
  */
 __flush_dcache_all:
-	dsb	sy				// ensure ordering with previous memory accesses
+	dmb	sy				// ensure ordering with previous memory accesses
 	mrs	x0, clidr_el1			// read clidr
 	and	x3, x0, #0x7000000		// extract loc from clidr
 	lsr	x3, x3, #23			// left align loc bit field
@@ -128,7 +128,7 @@ USER(9f, dc	cvau, x4	)		// clean D line to PoU
 	add	x4, x4, x2
 	cmp	x4, x1
 	b.lo	1b
-	dsb	sy
+	dsb	ish
 
 	icache_line_size x2, x3
 	sub	x3, x2, #1
@@ -139,7 +139,7 @@ USER(9f, ic	ivau, x4	)		// invalidate I line PoU
 	cmp	x4, x1
 	b.lo	1b
 9:						// ignore any faulting cache operation
-	dsb	sy
+	dsb	ish
 	isb
 	ret
 ENDPROC(flush_icache_range)
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 2fc8258bab2d..023747bf4dd7 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -51,7 +51,11 @@ int pmd_huge(pmd_t pmd)
 
 int pud_huge(pud_t pud)
 {
+#ifndef __PAGETABLE_PMD_FOLDED
 	return !(pud_val(pud) & PUD_TABLE_BIT);
+#else
+	return 0;
+#endif
 }
 
 static __init int setup_hugepagesz(char *opt)
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index e085ee6ef4e2..c3c7b8f83b55 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -175,7 +175,7 @@ ENDPROC(cpu_do_switch_mm)
 ENTRY(__cpu_setup)
 	ic	iallu				// I+BTB cache invalidate
 	tlbi	vmalle1is			// invalidate I + D TLBs
-	dsb	sy
+	dsb	ish
 
 	mov	x0, #3 << 20
 	msr	cpacr_el1, x0			// Enable FP/ASIMD
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 90fd1195f276..3876c04feef9 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -36,7 +36,7 @@ void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
 
 static const struct desc_ptr no_idt = {};
-static int reboot_mode;
+static enum reboot_mode reboot_mode;
 enum reboot_type reboot_type = BOOT_ACPI;
 int reboot_force;
 
@@ -88,11 +88,11 @@ static int __init reboot_setup(char *str)
 
 		switch (*str) {
 		case 'w':
-			reboot_mode = 0x1234;
+			reboot_mode = REBOOT_WARM;
 			break;
 
 		case 'c':
-			reboot_mode = 0;
+			reboot_mode = REBOOT_COLD;
 			break;
 
 #ifdef CONFIG_SMP
@@ -552,6 +552,7 @@ static void native_machine_emergency_restart(void)
 	int i;
 	int attempt = 0;
 	int orig_reboot_type = reboot_type;
+	unsigned short mode;
 
 	if (reboot_emergency)
 		emergency_vmx_disable_all();
@@ -559,7 +560,8 @@ static void native_machine_emergency_restart(void)
 	tboot_shutdown(TB_SHUTDOWN_REBOOT);
 
 	/* Tell the BIOS if we want cold or warm reboot */
-	*((unsigned short *)__va(0x472)) = reboot_mode;
+	mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0;
+	*((unsigned short *)__va(0x472)) = mode;
 
 	for (;;) {
 		/* Could also try the reset bit in the Hammer NB */
@@ -601,7 +603,7 @@ static void native_machine_emergency_restart(void)
 
 		case BOOT_EFI:
 			if (efi_enabled(EFI_RUNTIME_SERVICES))
-				efi.reset_system(reboot_mode ?
+				efi.reset_system(reboot_mode == REBOOT_WARM ?
 						 EFI_RESET_WARM :
 						 EFI_RESET_COLD,
 						 EFI_SUCCESS, 0, NULL);
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index 842d7ba83101..9625ce7ed5f8 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -31,26 +31,20 @@ config CPU_IDLE_GOV_MENU
 config ARCH_NEEDS_CPU_IDLE_COUPLED
 	def_bool n
 
-config OF_IDLE_STATES
-        bool "Idle states DT support"
-	depends on ARM || ARM64
-	default n
-	help
-	 Allows the CPU idle framework to initialize CPU idle drivers
-	 state data by using DT provided nodes compliant with idle states
-	 device tree bindings.
+config DT_IDLE_STATES
+	bool
 
 if CPU_IDLE
 
+menu "ARM64 CPU Idle Drivers"
+depends on ARM64
+source "drivers/cpuidle/Kconfig.arm64"
+endmenu
+
 config CPU_IDLE_CALXEDA
 	bool "CPU Idle Driver for Calxeda processors"
 	depends on ARCH_HIGHBANK
 	help
 	  Select this to enable cpuidle on Calxeda processors.
 
-menu "ARM64 CPU Idle Drivers"
-depends on ARM64
-source "drivers/cpuidle/Kconfig.arm64"
-endmenu
-
 endif
diff --git a/drivers/cpuidle/Kconfig.arm64 b/drivers/cpuidle/Kconfig.arm64
index b83612c67e6d..d0a08ed1b2ee 100644
--- a/drivers/cpuidle/Kconfig.arm64
+++ b/drivers/cpuidle/Kconfig.arm64
@@ -4,10 +4,11 @@
 
 config ARM64_CPUIDLE
 	bool "Generic ARM64 CPU idle Driver"
-	select OF_IDLE_STATES
+	select ARM64_CPU_SUSPEND
+	select DT_IDLE_STATES
 	help
-	  Select this to enable generic cpuidle driver for ARM v8.
+	  Select this to enable generic cpuidle driver for ARM64.
 	  It provides a generic idle driver whose idle states are configured
 	  at run-time through DT nodes. The CPUidle suspend backend is
-	  initialized by the device tree parsing code on matching the entry
-	  method to the respective CPU operations.
+	  initialized by calling the CPU operations init idle hook
+	  provided by architecture code.
diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
index 2d97bcfecd00..0bd32cd03f0a 100644
--- a/drivers/cpuidle/Makefile
+++ b/drivers/cpuidle/Makefile
@@ -5,7 +5,7 @@
 obj-y += cpuidle.o driver.o governor.o sysfs.o governors/
 obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
 obj-$(CONFIG_BIG_LITTLE) += arm_big_little.o
-obj-$(CONFIG_OF_IDLE_STATES)		  += of_idle_states.o
+obj-$(CONFIG_DT_IDLE_STATES)		  += dt_idle_states.o
 
 obj-$(CONFIG_CPU_IDLE_CALXEDA) += cpuidle-calxeda.o
 obj-$(CONFIG_ARCH_KIRKWOOD) += cpuidle-kirkwood.o
diff --git a/drivers/cpuidle/cpuidle-arm64.c b/drivers/cpuidle/cpuidle-arm64.c
index 2cfde6ce3086..50997ea942fc 100644
--- a/drivers/cpuidle/cpuidle-arm64.c
+++ b/drivers/cpuidle/cpuidle-arm64.c
@@ -9,6 +9,8 @@
  * published by the Free Software Foundation.
  */
 
+#define pr_fmt(fmt) "CPUidle arm64: " fmt
+
 #include <linux/cpuidle.h>
 #include <linux/cpumask.h>
 #include <linux/cpu_pm.h>
@@ -16,50 +18,23 @@
 #include <linux/module.h>
 #include <linux/of.h>
 
-#include <asm/psci.h>
+#include <asm/cpuidle.h>
 #include <asm/suspend.h>
 
-#include "of_idle_states.h"
-
-typedef int (*suspend_init_fn)(struct cpuidle_driver *,
-			       struct device_node *[]);
-
-struct cpu_suspend_ops {
-	const char *id;
-	suspend_init_fn init_fn;
-};
-
-static const struct cpu_suspend_ops suspend_operations[] __initconst = {
-	{"arm,psci", psci_dt_register_idle_states},
-	{}
-};
-
-static __init const struct cpu_suspend_ops *get_suspend_ops(const char *str)
-{
-	int i;
-
-	if (!str)
-		return NULL;
-
-	for (i = 0; suspend_operations[i].id; i++)
-		if (!strcmp(suspend_operations[i].id, str))
-			return &suspend_operations[i];
-
-	return NULL;
-}
+#include "dt_idle_states.h"
 
 /*
- * arm_enter_idle_state - Programs CPU to enter the specified state
+ * arm64_enter_idle_state - Programs CPU to enter the specified state
  *
- * @dev: cpuidle device
- * @drv: cpuidle driver
- * @idx: state index
+ * dev: cpuidle device
+ * drv: cpuidle driver
+ * idx: state index
  *
  * Called from the CPUidle framework to program the device to the
  * specified target state selected by the governor.
  */
-static int arm_enter_idle_state(struct cpuidle_device *dev,
-				struct cpuidle_driver *drv, int idx)
+static int arm64_enter_idle_state(struct cpuidle_device *dev,
+				  struct cpuidle_driver *drv, int idx)
 {
 	int ret;
 
@@ -68,30 +43,47 @@ static int arm_enter_idle_state(struct cpuidle_device *dev,
 		return idx;
 	}
 
-	cpu_pm_enter();
-	/*
-	 * Pass idle state index to cpu_suspend which in turn will call
-	 * the CPU ops suspend protocol with idle index as a parameter.
-	 *
-	 * Some states would not require context to be saved and flushed
-	 * to DRAM, so calling cpu_suspend would not be stricly necessary.
-	 * When power domains specifications for ARM CPUs are finalized then
-	 * this code can be optimized to prevent saving registers if not
-	 * needed.
-	 */
-	ret = cpu_suspend(idx);
+	ret = cpu_pm_enter();
+	if (!ret) {
+		/*
+		 * Pass idle state index to cpu_suspend which in turn will
+		 * call the CPU ops suspend protocol with idle index as a
+		 * parameter.
+		 */
+		ret = cpu_suspend(idx);
 
-	cpu_pm_exit();
+		cpu_pm_exit();
+	}
 
 	return ret ? -1 : idx;
 }
 
-struct cpuidle_driver arm64_idle_driver = {
+static struct cpuidle_driver arm64_idle_driver = {
 	.name = "arm64_idle",
 	.owner = THIS_MODULE,
+	/*
+	 * State at index 0 is standby wfi and considered standard
+	 * on all ARM platforms. If in some platforms simple wfi
+	 * can't be used as "state 0", DT bindings must be implemented
+	 * to work around this issue and allow installing a special
+	 * handler for idle state index 0.
+	 */
+	.states[0] = {
+		.enter                  = arm64_enter_idle_state,
+		.exit_latency           = 1,
+		.target_residency       = 1,
+		.power_usage		= UINT_MAX,
+		.flags                  = CPUIDLE_FLAG_TIME_VALID,
+		.name                   = "WFI",
+		.desc                   = "ARM64 WFI",
+	}
 };
 
-static struct device_node *state_nodes[CPUIDLE_STATE_MAX] __initdata;
+static const struct of_device_id arm64_idle_state_match[] __initconst = {
+	{ .compatible = "arm,idle-state",
+	  .data = arm64_enter_idle_state },
+	{ },
+};
 
 /*
  * arm64_idle_init
@@ -102,58 +94,40 @@ static struct device_node *state_nodes[CPUIDLE_STATE_MAX] __initdata;
  */
 static int __init arm64_idle_init(void)
 {
-	int i, ret;
-	const char *entry_method;
-	struct device_node *idle_states_node;
-	const struct cpu_suspend_ops *suspend_init;
+	int cpu, ret;
 	struct cpuidle_driver *drv = &arm64_idle_driver;
 
-	idle_states_node = of_find_node_by_path("/cpus/idle-states");
-	if (!idle_states_node)
-		return -ENOENT;
-
-	if (of_property_read_string(idle_states_node, "entry-method",
-				    &entry_method)) {
-		pr_warn(" * %s missing entry-method property\n",
-			    idle_states_node->full_name);
-		of_node_put(idle_states_node);
-		return -EOPNOTSUPP;
-	}
-
-	suspend_init = get_suspend_ops(entry_method);
-	if (!suspend_init) {
-		pr_warn("Missing suspend initializer\n");
-		of_node_put(idle_states_node);
-		return -EOPNOTSUPP;
-	}
-
 	/*
-	 * State at index 0 is standby wfi and considered standard
-	 * on all ARM platforms. If in some platforms simple wfi
-	 * can't be used as "state 0", DT bindings must be implemented
-	 * to work around this issue and allow installing a special
-	 * handler for idle state index 0.
+	 * Initialize idle states data, starting at index 1.
+	 * This driver is DT only; if no DT idle states are detected (ret == 0)
+	 * let the driver initialization fail accordingly since there is no
+	 * reason to initialize the idle driver if only wfi is supported.
 	 */
-	drv->states[0].exit_latency = 1;
-	drv->states[0].target_residency = 1;
-	drv->states[0].flags = CPUIDLE_FLAG_TIME_VALID;
-	strncpy(drv->states[0].name, "ARM WFI", CPUIDLE_NAME_LEN);
-	strncpy(drv->states[0].desc, "ARM WFI", CPUIDLE_DESC_LEN);
+	ret = dt_init_idle_driver(drv, arm64_idle_state_match, 1);
+	if (ret <= 0) {
+		if (ret)
+			pr_err("failed to initialize idle states\n");
+		return ret ? : -ENODEV;
+	}
 
-	drv->cpumask = (struct cpumask *) cpu_possible_mask;
 	/*
-	 * Start at index 1, request idle state nodes to be filled
+	 * Call arch CPU operations in order to initialize
+	 * idle states suspend back-end specific data
 	 */
-	ret = of_init_idle_driver(drv, state_nodes, 1, true);
-	if (ret)
-		return ret;
-
-	if (suspend_init->init_fn(drv, state_nodes))
-		return -EOPNOTSUPP;
+	for_each_possible_cpu(cpu) {
+		ret = cpu_init_idle(cpu);
+		if (ret) {
+			pr_err("CPU %d failed to init idle CPU ops\n", cpu);
+			return ret;
+		}
+	}
 
-	for (i = 0; i < drv->state_count; i++)
-		drv->states[i].enter = arm_enter_idle_state;
+	ret = cpuidle_register(drv, NULL);
+	if (ret) {
+		pr_err("failed to register cpuidle driver\n");
+		return ret;
+	}
 
-	return cpuidle_register(drv, NULL);
+	return 0;
 }
 device_initcall(arm64_idle_init);
diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c
new file mode 100644
index 000000000000..52f4d11bbf3f
--- /dev/null
+++ b/drivers/cpuidle/dt_idle_states.c
@@ -0,0 +1,213 @@
+/*
+ * DT idle states parsing code.
+ *
+ * Copyright (C) 2014 ARM Ltd.
+ * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) "DT idle-states: " fmt
+
+#include <linux/cpuidle.h>
+#include <linux/cpumask.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+
+#include "dt_idle_states.h"
+
+static int init_state_node(struct cpuidle_state *idle_state,
+			   const struct of_device_id *matches,
+			   struct device_node *state_node)
+{
+	int err;
+	const struct of_device_id *match_id;
+
+	match_id = of_match_node(matches, state_node);
+	if (!match_id)
+		return -ENODEV;
+	/*
+	 * CPUidle drivers are expected to initialize the const void *data
+	 * pointer of the passed in struct of_device_id array to the idle
+	 * state enter function.
+	 */
+	idle_state->enter = match_id->data;
+
+	err = of_property_read_u32(state_node, "wakeup-latency-us",
+				   &idle_state->exit_latency);
+	if (err) {
+		u32 entry_latency, exit_latency;
+
+		err = of_property_read_u32(state_node, "entry-latency-us",
+					   &entry_latency);
+		if (err) {
+			pr_debug(" * %s missing entry-latency-us property\n",
+				 state_node->full_name);
+			return -EINVAL;
+		}
+
+		err = of_property_read_u32(state_node, "exit-latency-us",
+					   &exit_latency);
+		if (err) {
+			pr_debug(" * %s missing exit-latency-us property\n",
+				 state_node->full_name);
+			return -EINVAL;
+		}
+		/*
+		 * If wakeup-latency-us is missing, default to entry+exit
+		 * latencies as defined in idle states bindings
+		 */
+		idle_state->exit_latency = entry_latency + exit_latency;
+	}
+
+	err = of_property_read_u32(state_node, "min-residency-us",
+				   &idle_state->target_residency);
+	if (err) {
+		pr_debug(" * %s missing min-residency-us property\n",
+			     state_node->full_name);
+		return -EINVAL;
+	}
+
+	idle_state->flags = CPUIDLE_FLAG_TIME_VALID;
+	if (of_property_read_bool(state_node, "local-timer-stop"))
+		idle_state->flags |= CPUIDLE_FLAG_TIMER_STOP;
+	/*
+	 * TODO:
+	 *	replace with kstrdup and pointer assignment when name
+	 *	and desc become string pointers
+	 */
+	strncpy(idle_state->name, state_node->name, CPUIDLE_NAME_LEN - 1);
+	strncpy(idle_state->desc, state_node->name, CPUIDLE_DESC_LEN - 1);
+	return 0;
+}
+
+/*
+ * Check that the idle state is uniform across all CPUs in the CPUidle driver
+ * cpumask
+ */
+static bool idle_state_valid(struct device_node *state_node, unsigned int idx,
+			     const cpumask_t *cpumask)
+{
+	int cpu;
+	struct device_node *cpu_node, *curr_state_node;
+	bool valid = true;
+
+	/*
+	 * Compare idle state phandles for index idx on all CPUs in the
+	 * CPUidle driver cpumask. Start from next logical cpu following
+	 * cpumask_first(cpumask) since that's the CPU state_node was
+	 * retrieved from. If a mismatch is found bail out straight
+	 * away since we certainly hit a firmware misconfiguration.
+	 */
+	for (cpu = cpumask_next(cpumask_first(cpumask), cpumask);
+	     cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpumask)) {
+		cpu_node = of_cpu_device_node_get(cpu);
+		curr_state_node = of_parse_phandle(cpu_node, "cpu-idle-states",
+						   idx);
+		if (state_node != curr_state_node)
+			valid = false;
+
+		of_node_put(curr_state_node);
+		of_node_put(cpu_node);
+		if (!valid)
+			break;
+	}
+
+	return valid;
+}
+
+/**
+ * dt_init_idle_driver() - Parse the DT idle states and initialize the
+ *			   idle driver states array
+ * @drv:	  Pointer to CPU idle driver to be initialized
+ * @matches:	  Array of of_device_id match structures to search in for
+ *		  compatible idle state nodes. The data pointer for each valid
+ *		  struct of_device_id entry in the matches array must point to
+ *		  a function with the following signature, that corresponds to
+ *		  the CPUidle state enter function signature:
+ *
+ *		  int (*)(struct cpuidle_device *dev,
+ *			  struct cpuidle_driver *drv,
+ *			  int index);
+ *
+ * @start_idx:    First idle state index to be initialized
+ *
+ * If DT idle states are detected and are valid the state count and states
+ * array entries in the cpuidle driver are initialized accordingly starting
+ * from index start_idx.
+ *
+ * Return: number of valid DT idle states parsed, <0 on failure
+ */
+int dt_init_idle_driver(struct cpuidle_driver *drv,
+			const struct of_device_id *matches,
+			unsigned int start_idx)
+{
+	struct cpuidle_state *idle_state;
+	struct device_node *state_node, *cpu_node;
+	int i, err = 0;
+	const cpumask_t *cpumask;
+	unsigned int state_idx = start_idx;
+
+	if (state_idx >= CPUIDLE_STATE_MAX)
+		return -EINVAL;
+	/*
+	 * We get the idle states for the first logical cpu in the
+	 * driver mask (or cpu_possible_mask if the driver cpumask is not set)
+	 * and we check through idle_state_valid() if they are uniform
+	 * across CPUs, otherwise we hit a firmware misconfiguration.
+	 */
+	cpumask = drv->cpumask ? : cpu_possible_mask;
+	cpu_node = of_cpu_device_node_get(cpumask_first(cpumask));
+
+	for (i = 0; ; i++) {
+		state_node = of_parse_phandle(cpu_node, "cpu-idle-states", i);
+		if (!state_node)
+			break;
+
+		if (!idle_state_valid(state_node, i, cpumask)) {
+			pr_warn("%s idle state not valid, bailing out\n",
+				state_node->full_name);
+			err = -EINVAL;
+			break;
+		}
+
+		if (state_idx == CPUIDLE_STATE_MAX) {
+			pr_warn("State index reached static CPU idle driver states array size\n");
+			break;
+		}
+
+		idle_state = &drv->states[state_idx++];
+		err = init_state_node(idle_state, matches, state_node);
+		if (err) {
+			pr_err("Parsing idle state node %s failed with err %d\n",
+			       state_node->full_name, err);
+			err = -EINVAL;
+			break;
+		}
+		of_node_put(state_node);
+	}
+
+	of_node_put(state_node);
+	of_node_put(cpu_node);
+	if (err)
+		return err;
+	/*
+	 * Update the driver state count only if some valid DT idle states
+	 * were detected
+	 */
+	if (i)
+		drv->state_count = state_idx;
+
+	/*
+	 * Return the number of present and valid DT idle states, which can
+	 * also be 0 on platforms with missing DT idle states or legacy DT
+	 * configuration predating the DT idle states bindings.
+	 */
+	return i;
+}
+EXPORT_SYMBOL_GPL(dt_init_idle_driver);
diff --git a/drivers/cpuidle/dt_idle_states.h b/drivers/cpuidle/dt_idle_states.h
new file mode 100644
index 000000000000..4818134bc65b
--- /dev/null
+++ b/drivers/cpuidle/dt_idle_states.h
@@ -0,0 +1,7 @@
+#ifndef __DT_IDLE_STATES
+#define __DT_IDLE_STATES
+
+int dt_init_idle_driver(struct cpuidle_driver *drv,
+			const struct of_device_id *matches,
+			unsigned int start_idx);
+#endif
diff --git a/drivers/cpuidle/of_idle_states.c b/drivers/cpuidle/of_idle_states.c
deleted file mode 100644
index eceb1b4c4657..000000000000
--- a/drivers/cpuidle/of_idle_states.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * OF idle states parsing code.
- *
- * Copyright (C) 2014 ARM Ltd.
- * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/cpuidle.h>
-#include <linux/cpumask.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/list_sort.h>
-#include <linux/module.h>
-#include <linux/of.h>
-#include <linux/slab.h>
-
-#include "of_idle_states.h"
-
-struct state_elem {
-	struct list_head list;
-	struct device_node *node;
-	int val;
-};
-
-static struct list_head head __initdata = LIST_HEAD_INIT(head);
-
-static bool __init state_cpu_valid(struct device_node *state_node,
-				   struct device_node *cpu_node)
-{
-	int i = 0;
-	struct device_node *cpu_state;
-
-	while ((cpu_state = of_parse_phandle(cpu_node,
-					     "cpu-idle-states", i++))) {
-		if (cpu_state && state_node == cpu_state) {
-			of_node_put(cpu_state);
-			return true;
-		}
-		of_node_put(cpu_state);
-	}
-	return false;
-}
-
-static bool __init state_cpus_valid(const cpumask_t *cpus,
-				    struct device_node *state_node)
-{
-	int cpu;
-	struct device_node *cpu_node;
-
-	/*
-	 * Check if state is valid on driver cpumask cpus
-	 */
-	for_each_cpu(cpu, cpus) {
-		cpu_node = of_get_cpu_node(cpu, NULL);
-
-		if (!cpu_node) {
-			pr_err("Missing device node for CPU %d\n", cpu);
-			return false;
-		}
-
-		if (!state_cpu_valid(state_node, cpu_node))
-			return false;
-	}
-
-	return true;
-}
-
-static int __init state_cmp(void *priv, struct list_head *a,
-			    struct list_head *b)
-{
-	struct state_elem *ela, *elb;
-
-	ela = container_of(a, struct state_elem, list);
-	elb = container_of(b, struct state_elem, list);
-
-	return ela->val - elb->val;
-}
-
-static int __init add_state_node(cpumask_t *cpumask,
-				 struct device_node *state_node)
-{
-	struct state_elem *el;
-	u32 val;
-
-	pr_debug(" * %s...\n", state_node->full_name);
-
-	if (!state_cpus_valid(cpumask, state_node))
-		return -EINVAL;
-	/*
-	 * Parse just the value required to sort the states.
-	 */
-	if (of_property_read_u32(state_node, "min-residency-us",
-				 &val)) {
-		pr_debug(" * %s missing min-residency-us property\n",
-			 state_node->full_name);
-		return -EINVAL;
-	}
-
-	el = kmalloc(sizeof(*el), GFP_KERNEL);
-	if (!el) {
-		pr_err("%s failed to allocate memory\n", __func__);
-		return -ENOMEM;
-	}
-
-	el->node = state_node;
-	el->val = val;
-	list_add_tail(&el->list, &head);
-
-	return 0;
-}
-
-static void __init init_state_node(struct cpuidle_driver *drv,
-				   struct device_node *state_node,
-				   int *cnt)
-{
-	struct cpuidle_state *idle_state;
-
-	pr_debug(" * %s...\n", state_node->full_name);
-
-	idle_state = &drv->states[*cnt];
-
-	if (of_property_read_u32(state_node, "exit-latency-us",
-				 &idle_state->exit_latency)) {
-		pr_debug(" * %s missing exit-latency-us property\n",
-			     state_node->full_name);
-		return;
-	}
-
-	if (of_property_read_u32(state_node, "min-residency-us",
-				 &idle_state->target_residency)) {
-		pr_debug(" * %s missing min-residency-us property\n",
-			     state_node->full_name);
-		return;
-	}
-	/*
-	 * It is unknown to the idle driver if and when the tick_device
-	 * loses context when the CPU enters the idle states. To solve
-	 * this issue the tick device must be linked to a power domain
-	 * so that the idle driver can check on which states the device
-	 * loses its context. Current code takes the conservative choice
-	 * of defining the idle state as one where the tick device always
-	 * loses its context. On platforms where tick device never loses
-	 * its context (ie it is not a C3STOP device) this turns into
-	 * a nop. On platforms where the tick device does lose context in some
-	 * states, this code can be optimized, when power domain specifications
-	 * for ARM CPUs are finalized.
-	 */
-	idle_state->flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TIMER_STOP;
-
-	strncpy(idle_state->name, state_node->name, CPUIDLE_NAME_LEN);
-	strncpy(idle_state->desc, state_node->name, CPUIDLE_NAME_LEN);
-
-	(*cnt)++;
-}
-
-static int __init init_idle_states(struct cpuidle_driver *drv,
-				   struct device_node *state_nodes[],
-				   unsigned int start_idx, bool init_nodes)
-{
-	struct state_elem *el;
-	struct list_head *curr, *tmp;
-	unsigned int cnt = start_idx;
-
-	list_for_each_entry(el, &head, list) {
-		/*
-		 * Check if the init function has to fill the
-		 * state_nodes array on behalf of the CPUidle driver.
-		 */
-		if (init_nodes)
-			state_nodes[cnt] = el->node;
-		/*
-		 * cnt is updated on return if a state was added.
-		 */
-		init_state_node(drv, el->node, &cnt);
-
-		if (cnt == CPUIDLE_STATE_MAX) {
-			pr_warn("State index reached static CPU idle state limit\n");
-			break;
-		}
-	}
-
-	drv->state_count = cnt;
-
-	list_for_each_safe(curr, tmp, &head) {
-		list_del(curr);
-		kfree(container_of(curr, struct state_elem, list));
-	}
-
-	/*
-	 * If no idle states are detected, return an error and let the idle
-	 * driver initialization fail accordingly.
-	 */
-	return (cnt > start_idx) ? 0 : -ENODATA;
-}
-
-static void __init add_idle_states(struct cpuidle_driver *drv,
-				   struct device_node *idle_states)
-{
-	struct device_node *state_node;
-
-	for_each_child_of_node(idle_states, state_node) {
-		if ((!of_device_is_compatible(state_node, "arm,idle-state"))) {
-			pr_warn(" * %s: children of /cpus/idle-states must be \"arm,idle-state\" compatible\n",
-				     state_node->full_name);
-			continue;
-		}
-		/*
-		 * If memory allocation fails, better bail out.
-		 * Initialized nodes are freed at initialization
-		 * completion in of_init_idle_driver().
-		 */
-		if ((add_state_node(drv->cpumask, state_node) == -ENOMEM))
-			break;
-	}
-	/*
-	 * Sort the states list before initializing the CPUidle driver
-	 * states array.
-	 */
-	list_sort(NULL, &head, state_cmp);
-}
-
-/*
- * of_init_idle_driver - Parse the DT idle states and initialize the
- *			 idle driver states array
- *
- * @drv:	  Pointer to CPU idle driver to be initialized
- * @state_nodes:  Array of struct device_nodes to be initialized if
- *		  init_nodes == true. Must be sized CPUIDLE_STATE_MAX
- * @start_idx:    First idle state index to be initialized
- * @init_nodes:   Boolean to request device nodes initialization
- *
- * Returns:
- *	0 on success
- *	<0 on failure
- *
- *	On success the states array in the cpuidle driver contains
- *	initialized entries in the states array, starting from index start_idx.
- *	If init_nodes == true, on success the state_nodes array is initialized
- *	with idle state DT node pointers, starting from index start_idx,
- *	in a 1:1 relation with the idle driver states array.
- */
-int __init of_init_idle_driver(struct cpuidle_driver *drv,
-			       struct device_node *state_nodes[],
-			       unsigned int start_idx, bool init_nodes)
-{
-	struct device_node *idle_states_node;
-	int ret;
-
-	if (start_idx >= CPUIDLE_STATE_MAX) {
-		pr_warn("State index exceeds static CPU idle driver states array size\n");
-		return -EINVAL;
-	}
-
-	if (WARN(init_nodes && !state_nodes,
-		"Requested nodes stashing in an invalid nodes container\n"))
-		return -EINVAL;
-
-	idle_states_node = of_find_node_by_path("/cpus/idle-states");
-	if (!idle_states_node)
-		return -ENOENT;
-
-	add_idle_states(drv, idle_states_node);
-
-	ret = init_idle_states(drv, state_nodes, start_idx, init_nodes);
-
-	of_node_put(idle_states_node);
-
-	return ret;
-}
diff --git a/drivers/cpuidle/of_idle_states.h b/drivers/cpuidle/of_idle_states.h
deleted file mode 100644
index 049f94ff4428..000000000000
--- a/drivers/cpuidle/of_idle_states.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __OF_IDLE_STATES
-#define __OF_IDLE_STATES
-
-int __init of_init_idle_driver(struct cpuidle_driver *drv,
-			       struct device_node *state_nodes[],
-			       unsigned int start_idx,
-			       bool init_nodes);
-#endif
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 901b7435e890..192c783c2ec6 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -1,6 +1,7 @@
 #ifndef _LINUX_OF_DEVICE_H
 #define _LINUX_OF_DEVICE_H
 
+#include <linux/cpu.h>
 #include <linux/platform_device.h>
 #include <linux/of_platform.h> /* temporary until merge */
 
@@ -43,6 +44,15 @@ static inline void of_device_node_put(struct device *dev)
 	of_node_put(dev->of_node);
 }
 
+static inline struct device_node *of_cpu_device_node_get(int cpu)
+{
+	struct device *cpu_dev;
+	cpu_dev = get_cpu_device(cpu);
+	if (!cpu_dev)
+		return NULL;
+	return of_node_get(cpu_dev->of_node);
+}
+
 #else /* CONFIG_OF_DEVICE */
 
 static inline int of_driver_match_device(struct device *dev,
@@ -67,6 +77,11 @@ static inline const struct of_device_id *of_match_device(
 {
 	return NULL;
 }
+
+static inline struct device_node *of_cpu_device_node_get(int cpu)
+{
+	return NULL;
+}
 #endif /* CONFIG_OF_DEVICE */
 
 #endif /* _LINUX_OF_DEVICE_H */
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 23b36304cd88..2ca9ed7cfc9b 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -10,6 +10,11 @@
 #define SYS_HALT	0x0002	/* Notify of system halt */
 #define SYS_POWER_OFF	0x0003	/* Notify of system power off */
 
+enum reboot_mode {
+	REBOOT_COLD = 0,
+	REBOOT_WARM,
+};
+
 extern int register_reboot_notifier(struct notifier_block *);
 extern int unregister_reboot_notifier(struct notifier_block *);
author	Mark Brown <broonie@kernel.org>	2015-01-18 20:50:33 +0000
committer	Mark Brown <broonie@kernel.org>	2015-01-18 20:50:33 +0000
commit	99687559e46be79e7b0b8f57570928e341b24a6d (patch)
tree	81dfd6f7a0930769a1c801edc370da8ed5d48021
parent	041be7a254157109dec33e38ff9fe5c6fea60299 (diff)
parent	2eb736d6b425cb932b038fd555243b9b0e59c036 (diff)