Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/.gitignore  1
-rw-r--r--  arch/x86/kernel/Makefile  5
-rw-r--r--  arch/x86/kernel/Makefile_32  88
-rw-r--r--  arch/x86/kernel/alternative.c  450
-rw-r--r--  arch/x86/kernel/apic_32.c  1566
-rw-r--r--  arch/x86/kernel/apm_32.c  2403
-rw-r--r--  arch/x86/kernel/asm-offsets.c  5
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c  147
-rw-r--r--  arch/x86/kernel/bootflag.c  98
-rw-r--r--  arch/x86/kernel/cpuid.c  242
-rw-r--r--  arch/x86/kernel/crash_32.c  137
-rw-r--r--  arch/x86/kernel/crash_dump_32.c  74
-rw-r--r--  arch/x86/kernel/doublefault_32.c  70
-rw-r--r--  arch/x86/kernel/e820_32.c  944
-rw-r--r--  arch/x86/kernel/early_printk.c  2
-rw-r--r--  arch/x86/kernel/efi_32.c  712
-rw-r--r--  arch/x86/kernel/efi_stub_32.S  122
-rw-r--r--  arch/x86/kernel/entry_32.S  1112
-rw-r--r--  arch/x86/kernel/geode_32.c  155
-rw-r--r--  arch/x86/kernel/head_32.S  578
-rw-r--r--  arch/x86/kernel/hpet_32.c  553
-rw-r--r--  arch/x86/kernel/i386_ksyms_32.c  30
-rw-r--r--  arch/x86/kernel/i387_32.c  546
-rw-r--r--  arch/x86/kernel/i8237.c  72
-rw-r--r--  arch/x86/kernel/i8253_32.c  206
-rw-r--r--  arch/x86/kernel/i8259_32.c  420
-rw-r--r--  arch/x86/kernel/init_task_32.c  46
-rw-r--r--  arch/x86/kernel/io_apic_32.c  2847
-rw-r--r--  arch/x86/kernel/ioport_32.c  153
-rw-r--r--  arch/x86/kernel/irq_32.c  343
-rw-r--r--  arch/x86/kernel/kprobes_32.c  751
-rw-r--r--  arch/x86/kernel/ldt_32.c  250
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c  171
-rw-r--r--  arch/x86/kernel/mca_32.c  470
-rw-r--r--  arch/x86/kernel/microcode.c  850
-rw-r--r--  arch/x86/kernel/module_32.c  152
-rw-r--r--  arch/x86/kernel/mpparse_32.c  1132
-rw-r--r--  arch/x86/kernel/msr.c  224
-rw-r--r--  arch/x86/kernel/nmi_32.c  468
-rw-r--r--  arch/x86/kernel/numaq_32.c  89
-rw-r--r--  arch/x86/kernel/paravirt_32.c  392
-rw-r--r--  arch/x86/kernel/pci-dma_32.c  177
-rw-r--r--  arch/x86/kernel/pcspeaker.c  20
-rw-r--r--  arch/x86/kernel/process_32.c  951
-rw-r--r--  arch/x86/kernel/ptrace_32.c  723
-rw-r--r--  arch/x86/kernel/quirks.c  49
-rw-r--r--  arch/x86/kernel/reboot_32.c  413
-rw-r--r--  arch/x86/kernel/reboot_fixups_32.c  68
-rw-r--r--  arch/x86/kernel/relocate_kernel_32.S  252
-rw-r--r--  arch/x86/kernel/scx200_32.c  131
-rw-r--r--  arch/x86/kernel/setup_32.c  653
-rw-r--r--  arch/x86/kernel/sigframe_32.h  21
-rw-r--r--  arch/x86/kernel/signal_32.c  667
-rw-r--r--  arch/x86/kernel/smp_32.c  707
-rw-r--r--  arch/x86/kernel/smpboot_32.c  1322
-rw-r--r--  arch/x86/kernel/smpcommon_32.c  81
-rw-r--r--  arch/x86/kernel/srat_32.c  360
-rw-r--r--  arch/x86/kernel/summit_32.c  180
-rw-r--r--  arch/x86/kernel/sys_i386_32.c  265
-rw-r--r--  arch/x86/kernel/syscall_table_32.S  326
-rw-r--r--  arch/x86/kernel/sysenter_32.c  348
-rw-r--r--  arch/x86/kernel/time_32.c  236
-rw-r--r--  arch/x86/kernel/topology.c  77
-rw-r--r--  arch/x86/kernel/trampoline_32.S  85
-rw-r--r--  arch/x86/kernel/traps_32.c  1250
-rw-r--r--  arch/x86/kernel/tsc_32.c  413
-rw-r--r--  arch/x86/kernel/tsc_sync.c  1
-rw-r--r--  arch/x86/kernel/vm86_32.c  843
-rw-r--r--  arch/x86/kernel/vmi_32.c  981
-rw-r--r--  arch/x86/kernel/vmiclock_32.c  320
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S  5
-rw-r--r--  arch/x86/kernel/vmlinux_32.lds.S  213
-rw-r--r--  arch/x86/kernel/vsyscall-int80_32.S  53
-rw-r--r--  arch/x86/kernel/vsyscall-note_32.S  45
-rw-r--r--  arch/x86/kernel/vsyscall-sigreturn_32.S  143
-rw-r--r--  arch/x86/kernel/vsyscall-sysenter_32.S  122
-rw-r--r--  arch/x86/kernel/vsyscall_32.S  15
-rw-r--r--  arch/x86/kernel/vsyscall_32.lds.S  67
78 files changed, 31659 insertions, 0 deletions
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
new file mode 100644
index 000000000000..40836ad9079c
--- /dev/null
+++ b/arch/x86/kernel/.gitignore
@@ -0,0 +1 @@
+vsyscall.lds
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
new file mode 100644
index 000000000000..577d08f4b8bb
--- /dev/null
+++ b/arch/x86/kernel/Makefile
@@ -0,0 +1,5 @@
+ifeq ($(CONFIG_X86_32),y)
+include ${srctree}/arch/x86/kernel/Makefile_32
+else
+include ${srctree}/arch/x86_64/kernel/Makefile_64
+endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
new file mode 100644
index 000000000000..5096f486d389
--- /dev/null
+++ b/arch/x86/kernel/Makefile_32
@@ -0,0 +1,88 @@
+#
+# Makefile for the linux kernel.
+#
+
+extra-y := head_32.o init_task_32.o vmlinux.lds
+
+obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \
+ ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \
+ pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\
+ quirks.o i8237.o topology.o alternative.o i8253_32.o tsc_32.o
+
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += ../../x86/kernel/cpu/
+obj-y += ../../x86/kernel/acpi/
+obj-$(CONFIG_X86_BIOS_REBOOT) += reboot_32.o
+obj-$(CONFIG_MCA) += mca_32.o
+obj-$(CONFIG_X86_MSR) += msr.o
+obj-$(CONFIG_X86_CPUID) += cpuid.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+obj-$(CONFIG_APM) += apm_32.o
+obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o
+obj-$(CONFIG_SMP) += smpcommon_32.o
+obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o
+obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o
+obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o
+obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
+obj-$(CONFIG_KEXEC) += machine_kexec_32.o relocate_kernel_32.o crash_32.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump_32.o
+obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
+obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
+obj-$(CONFIG_KPROBES) += kprobes_32.o
+obj-$(CONFIG_MODULES) += module_32.o
+obj-y += sysenter_32.o vsyscall_32.o
+obj-$(CONFIG_ACPI_SRAT) += srat_32.o
+obj-$(CONFIG_EFI) += efi_32.o efi_stub_32.o
+obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
+obj-$(CONFIG_VM86) += vm86_32.o
+obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+obj-$(CONFIG_HPET_TIMER) += hpet_32.o
+obj-$(CONFIG_K8_NB) += k8.o
+obj-$(CONFIG_MGEODE_LX) += geode_32.o
+
+obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_PARAVIRT) += paravirt_32.o
+obj-y += pcspeaker.o
+
+obj-$(CONFIG_SCx200) += scx200_32.o
+
+# vsyscall_32.o contains the vsyscall DSO images as __initdata.
+# We must build both images before we can assemble it.
+# Note: kbuild does not track this dependency due to usage of .incbin
+$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so
+targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so)
+targets += vsyscall-note_32.o vsyscall_32.lds
+
+# The DSO images are built using a special linker script.
+quiet_cmd_syscall = SYSCALL $@
+ cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
+ -Wl,-T,$(filter-out FORCE,$^) -o $@
+
+export CPPFLAGS_vsyscall_32.lds += -P -C -U$(ARCH)
+
+vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \
+ $(call ld-option, -Wl$(comma)--hash-style=sysv)
+SYSCFLAGS_vsyscall-sysenter_32.so = $(vsyscall-flags)
+SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags)
+
+$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \
+$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \
+ $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE
+ $(call if_changed,syscall)
+
+# We also create a special relocatable object that should mirror the symbol
+# table and layout of the linked DSO. With ld -R we can then refer to
+# these symbols in the kernel code rather than hand-coded addresses.
+extra-y += vsyscall-syms.o
+$(obj)/built-in.o: $(obj)/vsyscall-syms.o
+$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o
+
+SYSCFLAGS_vsyscall-syms.o = -r
+$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
+ $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
+ $(call if_changed,syscall)
+
+k8-y += ../../x86_64/kernel/k8.o
+stacktrace-y += ../../x86_64/kernel/stacktrace.o
+
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
new file mode 100644
index 000000000000..bd72d94e713e
--- /dev/null
+++ b/arch/x86/kernel/alternative.c
@@ -0,0 +1,450 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/kprobes.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/alternative.h>
+#include <asm/sections.h>
+#include <asm/pgtable.h>
+#include <asm/mce.h>
+#include <asm/nmi.h>
+
+#define MAX_PATCH_LEN (255-1)
+
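+/*
+ * With CPU hotplug the lock-prefix fixups must stay switchable at
+ * runtime, so smp_alt_once defaults to off; "smp-alt-boot" opts back
+ * into patching once at boot. Without hotplug we always patch once.
+ */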
+#ifdef CONFIG_HOTPLUG_CPU
+static int smp_alt_once;
+
+static int __init bootonly(char *str)
+{
+ smp_alt_once = 1;
+ return 1;
+}
+__setup("smp-alt-boot", bootonly);
+#else
+#define smp_alt_once 1
+#endif
+
+static int debug_alternative;
+
+static int __init debug_alt(char *str)
+{
+ debug_alternative = 1;
+ return 1;
+}
+__setup("debug-alternative", debug_alt);
+
+static int noreplace_smp;
+
+static int __init setup_noreplace_smp(char *str)
+{
+ noreplace_smp = 1;
+ return 1;
+}
+__setup("noreplace-smp", setup_noreplace_smp);
+
+#ifdef CONFIG_PARAVIRT
+static int noreplace_paravirt = 0;
+
+static int __init setup_noreplace_paravirt(char *str)
+{
+ noreplace_paravirt = 1;
+ return 1;
+}
+__setup("noreplace-paravirt", setup_noreplace_paravirt);
+#endif
+
+#define DPRINTK(fmt, args...) if (debug_alternative) \
+ printk(KERN_DEBUG fmt, args)
+
+#ifdef GENERIC_NOP1
+/* Use inline assembly to define this because the nops are defined
+ as inline assembly strings in the include files and we cannot
+ get them easily into strings. */
+asm("\t.data\nintelnops: "
+ GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
+ GENERIC_NOP7 GENERIC_NOP8);
+extern unsigned char intelnops[];
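+/*
+ * intel_nops[n] points at an n-byte nop sequence: the 1..8 byte nops
+ * sit back to back in intelnops[], so the offsets below are the
+ * cumulative sums 0, 1, 1+2, 1+2+3, ...
+ */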
+static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
+ NULL,
+ intelnops,
+ intelnops + 1,
+ intelnops + 1 + 2,
+ intelnops + 1 + 2 + 3,
+ intelnops + 1 + 2 + 3 + 4,
+ intelnops + 1 + 2 + 3 + 4 + 5,
+ intelnops + 1 + 2 + 3 + 4 + 5 + 6,
+ intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+};
+#endif
+
+#ifdef K8_NOP1
+asm("\t.data\nk8nops: "
+ K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
+ K8_NOP7 K8_NOP8);
+extern unsigned char k8nops[];
+static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
+ NULL,
+ k8nops,
+ k8nops + 1,
+ k8nops + 1 + 2,
+ k8nops + 1 + 2 + 3,
+ k8nops + 1 + 2 + 3 + 4,
+ k8nops + 1 + 2 + 3 + 4 + 5,
+ k8nops + 1 + 2 + 3 + 4 + 5 + 6,
+ k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+};
+#endif
+
+#ifdef K7_NOP1
+asm("\t.data\nk7nops: "
+ K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
+ K7_NOP7 K7_NOP8);
+extern unsigned char k7nops[];
+static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
+ NULL,
+ k7nops,
+ k7nops + 1,
+ k7nops + 1 + 2,
+ k7nops + 1 + 2 + 3,
+ k7nops + 1 + 2 + 3 + 4,
+ k7nops + 1 + 2 + 3 + 4 + 5,
+ k7nops + 1 + 2 + 3 + 4 + 5 + 6,
+ k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+};
+#endif
+
+#ifdef CONFIG_X86_64
+
+extern char __vsyscall_0;
+static inline unsigned char** find_nop_table(void)
+{
+ return k8_nops;
+}
+
+#else /* CONFIG_X86_64 */
+
+static struct nop {
+ int cpuid;
+ unsigned char **noptable;
+} noptypes[] = {
+ { X86_FEATURE_K8, k8_nops },
+ { X86_FEATURE_K7, k7_nops },
+ { -1, NULL }
+};
+
+static unsigned char** find_nop_table(void)
+{
+ unsigned char **noptable = intel_nops;
+ int i;
+
+ for (i = 0; noptypes[i].cpuid >= 0; i++) {
+ if (boot_cpu_has(noptypes[i].cpuid)) {
+ noptable = noptypes[i].noptable;
+ break;
+ }
+ }
+ return noptable;
+}
+
+#endif /* CONFIG_X86_64 */
+
+/* Use this to add nops to a buffer, then text_poke the whole buffer. */
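+/* The longest available nops are used first: a 10-byte pad becomes
+ an 8-byte plus a 2-byte nop rather than ten 1-byte ones. */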
+static void add_nops(void *insns, unsigned int len)
+{
+ unsigned char **noptable = find_nop_table();
+
+ while (len > 0) {
+ unsigned int noplen = len;
+ if (noplen > ASM_NOP_MAX)
+ noplen = ASM_NOP_MAX;
+ memcpy(insns, noptable[noplen], noplen);
+ insns += noplen;
+ len -= noplen;
+ }
+}
+
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+extern u8 *__smp_locks[], *__smp_locks_end[];
+
+/* Replace instructions with better alternatives for this CPU type.
+ This runs before SMP is initialized to avoid SMP problems with
+ self-modifying code. This implies that asymmetric systems where
+ APs have fewer capabilities than the boot processor are not handled.
+ Tough. Make sure you disable such features by hand. */
+
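+/*
+ * Patch sites are emitted by the alternative() macros in
+ * <asm/alternative.h>, which record the original instruction, the
+ * replacement and the required CPU feature bit in a struct alt_instr.
+ * For example, the i386 memory barrier is declared roughly as
+ *
+ * alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
+ *
+ * so CPUs with SSE2 get a real mfence patched in at boot.
+ */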
+void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
+{
+ struct alt_instr *a;
+ char insnbuf[MAX_PATCH_LEN];
+
+ DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
+ for (a = start; a < end; a++) {
+ u8 *instr = a->instr;
+ BUG_ON(a->replacementlen > a->instrlen);
+ BUG_ON(a->instrlen > sizeof(insnbuf));
+ if (!boot_cpu_has(a->cpuid))
+ continue;
+#ifdef CONFIG_X86_64
+ /* vsyscall code is not mapped yet. resolve it manually. */
+ if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
+ instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
+ DPRINTK("%s: vsyscall fixup: %p => %p\n",
+ __FUNCTION__, a->instr, instr);
+ }
+#endif
+ memcpy(insnbuf, a->replacement, a->replacementlen);
+ add_nops(insnbuf + a->replacementlen,
+ a->instrlen - a->replacementlen);
+ text_poke(instr, insnbuf, a->instrlen);
+ }
+}
+
+#ifdef CONFIG_SMP
+
+static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+{
+ u8 **ptr;
+
+ for (ptr = start; ptr < end; ptr++) {
+ if (*ptr < text)
+ continue;
+ if (*ptr > text_end)
+ continue;
+ text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
+ }
+}
+
+static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+{
+ u8 **ptr;
+ char insn[1];
+
+ if (noreplace_smp)
+ return;
+
+ add_nops(insn, 1);
+ for (ptr = start; ptr < end; ptr++) {
+ if (*ptr < text)
+ continue;
+ if (*ptr > text_end)
+ continue;
+ text_poke(*ptr, insn, 1);
+ }
+}
+
+struct smp_alt_module {
+ /* module that owns these lock prefixes, NULL for the core kernel */
+ struct module *mod;
+ char *name;
+
+ /* ptrs to lock prefixes */
+ u8 **locks;
+ u8 **locks_end;
+
+ /* .text segment, needed to avoid patching init code ;) */
+ u8 *text;
+ u8 *text_end;
+
+ struct list_head next;
+};
+static LIST_HEAD(smp_alt_modules);
+static DEFINE_SPINLOCK(smp_alt);
+
+void alternatives_smp_module_add(struct module *mod, char *name,
+ void *locks, void *locks_end,
+ void *text, void *text_end)
+{
+ struct smp_alt_module *smp;
+ unsigned long flags;
+
+ if (noreplace_smp)
+ return;
+
+ if (smp_alt_once) {
+ if (boot_cpu_has(X86_FEATURE_UP))
+ alternatives_smp_unlock(locks, locks_end,
+ text, text_end);
+ return;
+ }
+
+ smp = kzalloc(sizeof(*smp), GFP_KERNEL);
+ if (NULL == smp)
+ return; /* we'll run the (safe but slow) SMP code then ... */
+
+ smp->mod = mod;
+ smp->name = name;
+ smp->locks = locks;
+ smp->locks_end = locks_end;
+ smp->text = text;
+ smp->text_end = text_end;
+ DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
+ __FUNCTION__, smp->locks, smp->locks_end,
+ smp->text, smp->text_end, smp->name);
+
+ spin_lock_irqsave(&smp_alt, flags);
+ list_add_tail(&smp->next, &smp_alt_modules);
+ if (boot_cpu_has(X86_FEATURE_UP))
+ alternatives_smp_unlock(smp->locks, smp->locks_end,
+ smp->text, smp->text_end);
+ spin_unlock_irqrestore(&smp_alt, flags);
+}
+
+void alternatives_smp_module_del(struct module *mod)
+{
+ struct smp_alt_module *item;
+ unsigned long flags;
+
+ if (smp_alt_once || noreplace_smp)
+ return;
+
+ spin_lock_irqsave(&smp_alt, flags);
+ list_for_each_entry(item, &smp_alt_modules, next) {
+ if (mod != item->mod)
+ continue;
+ list_del(&item->next);
+ spin_unlock_irqrestore(&smp_alt, flags);
+ DPRINTK("%s: %s\n", __FUNCTION__, item->name);
+ kfree(item);
+ return;
+ }
+ spin_unlock_irqrestore(&smp_alt, flags);
+}
+
+void alternatives_smp_switch(int smp)
+{
+ struct smp_alt_module *mod;
+ unsigned long flags;
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * A not yet fixed binutils section handling bug prevents
+ * alternatives-replacement from working reliably, so turn
+ * it off:
+ */
+ printk("lockdep: not fixing up alternatives.\n");
+ return;
+#endif
+
+ if (noreplace_smp || smp_alt_once)
+ return;
+ BUG_ON(!smp && (num_online_cpus() > 1));
+
+ spin_lock_irqsave(&smp_alt, flags);
+ if (smp) {
+ printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
+ clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
+ clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
+ list_for_each_entry(mod, &smp_alt_modules, next)
+ alternatives_smp_lock(mod->locks, mod->locks_end,
+ mod->text, mod->text_end);
+ } else {
+ printk(KERN_INFO "SMP alternatives: switching to UP code\n");
+ set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
+ set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
+ list_for_each_entry(mod, &smp_alt_modules, next)
+ alternatives_smp_unlock(mod->locks, mod->locks_end,
+ mod->text, mod->text_end);
+ }
+ spin_unlock_irqrestore(&smp_alt, flags);
+}
+
+#endif
+
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end)
+{
+ struct paravirt_patch_site *p;
+ char insnbuf[MAX_PATCH_LEN];
+
+ if (noreplace_paravirt)
+ return;
+
+ for (p = start; p < end; p++) {
+ unsigned int used;
+
+ BUG_ON(p->len > MAX_PATCH_LEN);
+ /* prep the buffer with the original instructions */
+ memcpy(insnbuf, p->instr, p->len);
+ used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf,
+ (unsigned long)p->instr, p->len);
+
+ BUG_ON(used > p->len);
+
+ /* Pad the rest with nops */
+ add_nops(insnbuf + used, p->len - used);
+ text_poke(p->instr, insnbuf, p->len);
+ }
+}
+extern struct paravirt_patch_site __start_parainstructions[],
+ __stop_parainstructions[];
+#endif /* CONFIG_PARAVIRT */
+
+void __init alternative_instructions(void)
+{
+ unsigned long flags;
+
+ /* The patching is not fully atomic, so try to avoid local interrupts
+ that might execute the to-be-patched code.
+ Other CPUs are not running. */
+ stop_nmi();
+#ifdef CONFIG_X86_MCE
+ stop_mce();
+#endif
+
+ local_irq_save(flags);
+ apply_alternatives(__alt_instructions, __alt_instructions_end);
+
+ /* switch to patch-once-at-boottime-only mode and free the
+ * tables in case we know the number of CPUs will never ever
+ * change */
+#ifdef CONFIG_HOTPLUG_CPU
+ if (num_possible_cpus() < 2)
+ smp_alt_once = 1;
+#endif
+
+#ifdef CONFIG_SMP
+ if (smp_alt_once) {
+ if (1 == num_possible_cpus()) {
+ printk(KERN_INFO "SMP alternatives: switching to UP code\n");
+ set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
+ set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
+ alternatives_smp_unlock(__smp_locks, __smp_locks_end,
+ _text, _etext);
+ }
+ free_init_pages("SMP alternatives",
+ (unsigned long)__smp_locks,
+ (unsigned long)__smp_locks_end);
+ } else {
+ alternatives_smp_module_add(NULL, "core kernel",
+ __smp_locks, __smp_locks_end,
+ _text, _etext);
+ alternatives_smp_switch(0);
+ }
+#endif
+ apply_paravirt(__parainstructions, __parainstructions_end);
+ local_irq_restore(flags);
+
+ restart_nmi();
+#ifdef CONFIG_X86_MCE
+ restart_mce();
+#endif
+}
+
+/*
+ * Warning:
+ * When you use this code to patch more than one byte of an instruction
+ * you need to make sure that other CPUs cannot execute this code in parallel.
+ * Also no thread may be preempted in the middle of these instructions.
+ * And on the local CPU you need to be protected against NMI or MCE handlers
+ * seeing an inconsistent instruction while you patch.
+ */
+void __kprobes text_poke(void *addr, unsigned char *opcode, int len)
+{
+ memcpy(addr, opcode, len);
+ sync_core();
+ /* Could also do a CLFLUSH here to speed up CPU recovery; but
+ that causes hangs on some VIA CPUs. */
+}
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
new file mode 100644
index 000000000000..3d67ae18d762
--- /dev/null
+++ b/arch/x86/kernel/apic_32.c
@@ -0,0 +1,1566 @@
+/*
+ * Local APIC handling, local APIC timers
+ *
+ * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
+ * thanks to Eric Gilmore
+ * and Rolf G. Tews
+ * for testing these extensively.
+ * Maciej W. Rozycki : Various updates and fixes.
+ * Mikael Pettersson : Power Management for UP-APIC.
+ * Pavel Machek and
+ * Mikael Pettersson : PM converted to driver model.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/interrupt.h>
+#include <linux/mc146818rtc.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+#include <linux/clockchips.h>
+#include <linux/acpi_pmtmr.h>
+#include <linux/module.h>
+#include <linux/dmi.h>
+
+#include <asm/atomic.h>
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/desc.h>
+#include <asm/arch_hooks.h>
+#include <asm/hpet.h>
+#include <asm/i8253.h>
+#include <asm/nmi.h>
+
+#include <mach_apic.h>
+#include <mach_apicdef.h>
+#include <mach_ipi.h>
+
+#include "io_ports.h"
+
+/*
+ * Sanity check
+ */
+#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F
+# error SPURIOUS_APIC_VECTOR definition error
+#endif
+
+/*
+ * Knob to control our willingness to enable the local APIC.
+ *
+ * -1=force-disable, +1=force-enable
+ */
+static int enable_local_apic __initdata = 0;
+
+/* Local APIC timer verification ok */
+static int local_apic_timer_verify_ok;
+/* Disable the local APIC timer from the kernel command line, via DMI quirk,
+ or using the CPU MSR check */
+int local_apic_timer_disabled;
+/* Local APIC timer works in C2 */
+int local_apic_timer_c2_ok;
+EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
+
+/*
+ * Debug level, exported for io_apic.c
+ */
+int apic_verbosity;
+
+static unsigned int calibration_result;
+
+static int lapic_next_event(unsigned long delta,
+ struct clock_event_device *evt);
+static void lapic_timer_setup(enum clock_event_mode mode,
+ struct clock_event_device *evt);
+static void lapic_timer_broadcast(cpumask_t mask);
+static void apic_pm_activate(void);
+
+/*
+ * The local apic timer can be used for any function which is CPU local.
+ */
+static struct clock_event_device lapic_clockevent = {
+ .name = "lapic",
+ .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
+ | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
+ .shift = 32,
+ .set_mode = lapic_timer_setup,
+ .set_next_event = lapic_next_event,
+ .broadcast = lapic_timer_broadcast,
+ .rating = 100,
+ .irq = -1,
+};
+static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
+
+/* Local APIC was disabled by the BIOS and enabled by the kernel */
+static int enabled_via_apicbase;
+
+/*
+ * Get the LAPIC version
+ */
+static inline int lapic_get_version(void)
+{
+ return GET_APIC_VERSION(apic_read(APIC_LVR));
+}
+
+/*
+ * Check if the APIC is integrated or a separate chip
+ */
+static inline int lapic_is_integrated(void)
+{
+ return APIC_INTEGRATED(lapic_get_version());
+}
+
+/*
+ * Check whether this is a modern or a first-generation APIC
+ */
+static int modern_apic(void)
+{
+ /* AMD systems use old APIC versions, so check the CPU */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+ boot_cpu_data.x86 >= 0xf)
+ return 1;
+ return lapic_get_version() >= 0x14;
+}
+
+void apic_wait_icr_idle(void)
+{
+ while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
+ cpu_relax();
+}
+
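+/*
+ * Bounded variant of apic_wait_icr_idle(): poll up to 1000 times with
+ * a 100us delay (~100ms total) and return the last busy status to the
+ * caller instead of spinning forever.
+ */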
+unsigned long safe_apic_wait_icr_idle(void)
+{
+ unsigned long send_status;
+ int timeout;
+
+ timeout = 0;
+ do {
+ send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+ if (!send_status)
+ break;
+ udelay(100);
+ } while (timeout++ < 1000);
+
+ return send_status;
+}
+
+/**
+ * enable_NMI_through_LVT0 - enable NMI through local vector table 0
+ */
+void enable_NMI_through_LVT0(void *dummy)
+{
+ unsigned int v = APIC_DM_NMI;
+
+ /* Level triggered for 82489DX */
+ if (!lapic_is_integrated())
+ v |= APIC_LVT_LEVEL_TRIGGER;
+ apic_write_around(APIC_LVT0, v);
+}
+
+/**
+ * get_physical_broadcast - Get number of physical broadcast IDs
+ */
+int get_physical_broadcast(void)
+{
+ return modern_apic() ? 0xff : 0xf;
+}
+
+/**
+ * lapic_get_maxlvt - get the maximum number of local vector table entries
+ */
+int lapic_get_maxlvt(void)
+{
+ unsigned int v = apic_read(APIC_LVR);
+
+ /* 82489DXs do not report # of LVT entries. */
+ return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
+}
+
+/*
+ * Local APIC timer
+ */
+
+/* Clock divisor is set to 16 */
+#define APIC_DIVISOR 16
+
+/*
+ * This function sets up the local APIC timer, with a timeout of
+ * 'clocks' APIC bus clocks. During calibration we actually call
+ * this function twice on the boot CPU, once with a bogus timeout
+ * value, the second time for real. The other (non-calibrating) CPUs
+ * call this function only once, with the real, calibrated value.
+ *
+ * We do reads before writes even if unnecessary, to get around the
+ * P5 APIC double write bug.
+ */
+static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
+{
+ unsigned int lvtt_value, tmp_value;
+
+ lvtt_value = LOCAL_TIMER_VECTOR;
+ if (!oneshot)
+ lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+ if (!lapic_is_integrated())
+ lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+
+ if (!irqen)
+ lvtt_value |= APIC_LVT_MASKED;
+
+ apic_write_around(APIC_LVTT, lvtt_value);
+
+ /*
+ * Divide PICLK by 16
+ */
+ tmp_value = apic_read(APIC_TDCR);
+ apic_write_around(APIC_TDCR, (tmp_value
+ & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
+ | APIC_TDR_DIV_16);
+
+ if (!oneshot)
+ apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
+}
+
+/*
+ * Program the next event, relative to now
+ */
+static int lapic_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ apic_write_around(APIC_TMICT, delta);
+ return 0;
+}
+
+/*
+ * Setup the lapic timer in periodic or oneshot mode
+ */
+static void lapic_timer_setup(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ unsigned long flags;
+ unsigned int v;
+
+ /* Lapic used for broadcast ? */
+ if (!local_apic_timer_verify_ok)
+ return;
+
+ local_irq_save(flags);
+
+ switch (mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ case CLOCK_EVT_MODE_ONESHOT:
+ __setup_APIC_LVTT(calibration_result,
+ mode != CLOCK_EVT_MODE_PERIODIC, 1);
+ break;
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ v = apic_read(APIC_LVTT);
+ v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write_around(APIC_LVTT, v);
+ break;
+ case CLOCK_EVT_MODE_RESUME:
+ /* Nothing to do here */
+ break;
+ }
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Local APIC timer broadcast function
+ */
+static void lapic_timer_broadcast(cpumask_t mask)
+{
+#ifdef CONFIG_SMP
+ send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+#endif
+}
+
+/*
+ * Setup the local APIC timer for this CPU. Copy the initialized values
+ * of the boot CPU and register the clock event in the framework.
+ */
+static void __devinit setup_APIC_timer(void)
+{
+ struct clock_event_device *levt = &__get_cpu_var(lapic_events);
+
+ memcpy(levt, &lapic_clockevent, sizeof(*levt));
+ levt->cpumask = cpumask_of_cpu(smp_processor_id());
+
+ clockevents_register_device(levt);
+}
+
+/*
+ * In this function we calibrate the APIC bus clocks to the external timer.
+ *
+ * We want to do the calibration only once, since we want the local timer
+ * irqs to be synchronous. CPUs connected to the same APIC bus have the very
+ * same bus frequency.
+ *
+ * This was previously done by reading the PIT/HPET and waiting for a wrap
+ * around to find out that a tick has elapsed. I have a box where the PIT
+ * readout is broken, so it never gets out of the wait loop again. This was
+ * also reported by others.
+ *
+ * Monitoring the jiffies value is inaccurate and the clockevents
+ * infrastructure allows us to do a simple substitution of the interrupt
+ * handler.
+ *
+ * The calibration routine also uses the pm_timer when possible, as the PIT
+ * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
+ * back to normal later in the boot process).
+ */
+
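+/* LAPIC_CAL_LOOPS ticks of the calibration clock event span 100ms of
+ wall time independent of HZ, e.g. HZ=250 gives 25 ticks of 4ms each. */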
+#define LAPIC_CAL_LOOPS (HZ/10)
+
+static __initdata int lapic_cal_loops = -1;
+static __initdata long lapic_cal_t1, lapic_cal_t2;
+static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
+static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
+static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
+
+/*
+ * Temporary interrupt handler.
+ */
+static void __init lapic_cal_handler(struct clock_event_device *dev)
+{
+ unsigned long long tsc = 0;
+ long tapic = apic_read(APIC_TMCCT);
+ unsigned long pm = acpi_pm_read_early();
+
+ if (cpu_has_tsc)
+ rdtscll(tsc);
+
+ switch (lapic_cal_loops++) {
+ case 0:
+ lapic_cal_t1 = tapic;
+ lapic_cal_tsc1 = tsc;
+ lapic_cal_pm1 = pm;
+ lapic_cal_j1 = jiffies;
+ break;
+
+ case LAPIC_CAL_LOOPS:
+ lapic_cal_t2 = tapic;
+ lapic_cal_tsc2 = tsc;
+ if (pm < lapic_cal_pm1)
+ pm += ACPI_PM_OVRRUN;
+ lapic_cal_pm2 = pm;
+ lapic_cal_j2 = jiffies;
+ break;
+ }
+}
+
+/*
+ * Setup the boot APIC
+ *
+ * Calibrate and verify the result.
+ */
+void __init setup_boot_APIC_clock(void)
+{
+ struct clock_event_device *levt = &__get_cpu_var(lapic_events);
+ const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
+ const long pm_thresh = pm_100ms/100;
+ void (*real_handler)(struct clock_event_device *dev);
+ unsigned long deltaj;
+ long delta, deltapm;
+ int pm_referenced = 0;
+
+ /*
+ * The local apic timer can be disabled via the kernel
+ * commandline or from the CPU detection code. Register the lapic
+ * timer as a dummy clock event source on SMP systems, so the
+ * broadcast mechanism is used. On UP systems simply ignore it.
+ */
+ if (local_apic_timer_disabled) {
+ /* No broadcast on UP ! */
+ if (num_possible_cpus() > 1)
+ setup_APIC_timer();
+ return;
+ }
+
+ apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+ "calibrating APIC timer ...\n");
+
+ local_irq_disable();
+
+ /* Replace the global interrupt handler */
+ real_handler = global_clock_event->event_handler;
+ global_clock_event->event_handler = lapic_cal_handler;
+
+ /*
+ * Setup the APIC counter to 1e9. There is no way the lapic
+ * can underflow in the 100ms detection time frame
+ */
+ __setup_APIC_LVTT(1000000000, 0, 0);
+
+ /* Let the interrupts run */
+ local_irq_enable();
+
+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
+ cpu_relax();
+
+ local_irq_disable();
+
+ /* Restore the real event handler */
+ global_clock_event->event_handler = real_handler;
+
+ /* Build delta t1-t2 as apic timer counts down */
+ delta = lapic_cal_t1 - lapic_cal_t2;
+ apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
+
+ /* Check, if the PM timer is available */
+ deltapm = lapic_cal_pm2 - lapic_cal_pm1;
+ apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
+
+ if (deltapm) {
+ unsigned long mult;
+ u64 res;
+
+ mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
+
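+ /* accept the PM timer reading if it is within +-1% (pm_thresh)
+ of the expected 100ms */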
+ if (deltapm > (pm_100ms - pm_thresh) &&
+ deltapm < (pm_100ms + pm_thresh)) {
+ apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
+ } else {
+ res = (((u64) deltapm) * mult) >> 22;
+ do_div(res, 1000000);
+ printk(KERN_WARNING "APIC calibration not consistent "
+ "with PM Timer: %ldms instead of 100ms\n",
+ (long)res);
+ /* Correct the lapic counter value */
+ res = (((u64) delta ) * pm_100ms);
+ do_div(res, deltapm);
+ printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
+ "%lu (%ld)\n", (unsigned long) res, delta);
+ delta = (long) res;
+ }
+ pm_referenced = 1;
+ }
+
+ /* Calculate the scaled math multiplication factor */
+ lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32);
+ lapic_clockevent.max_delta_ns =
+ clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+ lapic_clockevent.min_delta_ns =
+ clockevent_delta2ns(0xF, &lapic_clockevent);
+
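+ /*
+ * delta was accumulated with the divide-by-16 timer over
+ * LAPIC_CAL_LOOPS jiffies, so scaling by APIC_DIVISOR and
+ * dividing by the loop count yields bus clocks per jiffy:
+ * e.g. a 200 MHz bus at HZ=1000 calibrates to about 200000.
+ */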
+ calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
+
+ apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
+ apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult);
+ apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
+ calibration_result);
+
+ if (cpu_has_tsc) {
+ delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
+ apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
+ "%ld.%04ld MHz.\n",
+ (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ),
+ (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+ }
+
+ apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
+ "%u.%04u MHz.\n",
+ calibration_result / (1000000 / HZ),
+ calibration_result % (1000000 / HZ));
+
+ local_apic_timer_verify_ok = 1;
+
+ /* We trust the pm timer based calibration */
+ if (!pm_referenced) {
+ apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
+
+ /*
+ * Setup the apic timer manually
+ */
+ levt->event_handler = lapic_cal_handler;
+ lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
+ lapic_cal_loops = -1;
+
+ /* Let the interrupts run */
+ local_irq_enable();
+
+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
+ cpu_relax();
+
+ local_irq_disable();
+
+ /* Stop the lapic timer */
+ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
+
+ local_irq_enable();
+
+ /* Jiffies delta */
+ deltaj = lapic_cal_j2 - lapic_cal_j1;
+ apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
+
+ /* Check, if the jiffies result is consistent */
+ if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
+ apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
+ else
+ local_apic_timer_verify_ok = 0;
+ } else
+ local_irq_enable();
+
+ if (!local_apic_timer_verify_ok) {
+ printk(KERN_WARNING
+ "APIC timer disabled due to verification failure.\n");
+ /* No broadcast on UP ! */
+ if (num_possible_cpus() == 1)
+ return;
+ } else {
+ /*
+ * If nmi_watchdog is set to IO_APIC, we need the
+ * PIT/HPET going. Otherwise register lapic as a dummy
+ * device.
+ */
+ if (nmi_watchdog != NMI_IO_APIC)
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+ else
+ printk(KERN_WARNING "APIC timer registered as dummy,"
+ " due to nmi_watchdog=1!\n");
+ }
+
+ /* Setup the lapic or request the broadcast */
+ setup_APIC_timer();
+}
+
+void __devinit setup_secondary_APIC_clock(void)
+{
+ setup_APIC_timer();
+}
+
+/*
+ * The guts of the apic timer interrupt
+ */
+static void local_apic_timer_interrupt(void)
+{
+ int cpu = smp_processor_id();
+ struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
+
+ /*
+ * Normally we should not be here until the LAPIC has been initialized,
+ * but in some cases, like kdump, it's possible that there is a pending
+ * LAPIC timer interrupt from the previous kernel's context which is
+ * delivered in the new kernel the moment interrupts are enabled.
+ *
+ * Interrupts are enabled early and the LAPIC is set up much later,
+ * hence it's possible that when we get here evt->event_handler is
+ * NULL. Check for event_handler being NULL and discard the interrupt
+ * as spurious.
+ */
+ if (!evt->event_handler) {
+ printk(KERN_WARNING
+ "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+ /* Switch it off */
+ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
+ return;
+ }
+
+ per_cpu(irq_stat, cpu).apic_timer_irqs++;
+
+ evt->event_handler(evt);
+}
+
+/*
+ * Local APIC timer interrupt. This is the most natural way for doing
+ * local interrupts, but local timer interrupts can be emulated by
+ * broadcast interrupts too. [in case the hw doesn't support APIC timers]
+ *
+ * [ if a single-CPU system runs an SMP kernel then we call the local
+ * interrupt as well. Thus we cannot inline the local irq ... ]
+ */
+
+void fastcall smp_apic_timer_interrupt(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /*
+ * NOTE! We'd better ACK the irq immediately,
+ * because timer handling can be slow.
+ */
+ ack_APIC_irq();
+ /*
+ * update_process_times() expects us to have done irq_enter().
+ * Besides, if we don't, timer interrupts ignore the global
+ * interrupt lock, which is the WrongThing (tm) to do.
+ */
+ irq_enter();
+ local_apic_timer_interrupt();
+ irq_exit();
+
+ set_irq_regs(old_regs);
+}
+
+int setup_profiling_timer(unsigned int multiplier)
+{
+ return -EINVAL;
+}
+
+/*
+ * Local APIC start and shutdown
+ */
+
+/**
+ * clear_local_APIC - shutdown the local APIC
+ *
+ * This is called when a CPU is disabled and before rebooting, so the state of
+ * the local APIC has no dangling leftovers. Also used to clean out any BIOS
+ * leftovers during boot.
+ */
+void clear_local_APIC(void)
+{
+ int maxlvt = lapic_get_maxlvt();
+ unsigned long v;
+
+ /*
+ * Masking an LVT entry can trigger a local APIC error
+ * if the vector is zero. Mask LVTERR first to prevent this.
+ */
+ if (maxlvt >= 3) {
+ v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
+ apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
+ }
+ /*
+ * Careful: we have to set masks only first to deassert
+ * any level-triggered sources.
+ */
+ v = apic_read(APIC_LVTT);
+ apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
+ v = apic_read(APIC_LVT0);
+ apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+ v = apic_read(APIC_LVT1);
+ apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
+ if (maxlvt >= 4) {
+ v = apic_read(APIC_LVTPC);
+ apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
+ }
+
+ /* let's not touch this if we didn't frob it */
+#ifdef CONFIG_X86_MCE_P4THERMAL
+ if (maxlvt >= 5) {
+ v = apic_read(APIC_LVTTHMR);
+ apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
+ }
+#endif
+ /*
+ * Clean APIC state for other OSs:
+ */
+ apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+ apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
+ if (maxlvt >= 3)
+ apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
+ if (maxlvt >= 4)
+ apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
+
+#ifdef CONFIG_X86_MCE_P4THERMAL
+ if (maxlvt >= 5)
+ apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
+#endif
+ /* Integrated APIC (!82489DX) ? */
+ if (lapic_is_integrated()) {
+ if (maxlvt > 3)
+ /* Clear ESR due to Pentium errata 3AP and 11AP */
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
+}
+
+/**
+ * disable_local_APIC - clear and disable the local APIC
+ */
+void disable_local_APIC(void)
+{
+ unsigned long value;
+
+ clear_local_APIC();
+
+ /*
+ * Disable APIC (implies clearing of registers
+ * for 82489DX!).
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_SPIV_APIC_ENABLED;
+ apic_write_around(APIC_SPIV, value);
+
+ /*
+ * When LAPIC was disabled by the BIOS and enabled by the kernel,
+ * restore the disabled state.
+ */
+ if (enabled_via_apicbase) {
+ unsigned int l, h;
+
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_ENABLE;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ }
+}
+
+/*
+ * If Linux enabled the LAPIC against the BIOS default, disable it again
+ * before re-entering the BIOS on shutdown. Otherwise the BIOS may get
+ * confused and not power off. Additionally clear all LVT entries before
+ * disable_local_APIC for the case where Linux didn't enable the LAPIC.
+ */
+void lapic_shutdown(void)
+{
+ unsigned long flags;
+
+ if (!cpu_has_apic)
+ return;
+
+ local_irq_save(flags);
+ clear_local_APIC();
+
+ if (enabled_via_apicbase)
+ disable_local_APIC();
+
+ local_irq_restore(flags);
+}
+
+/*
+ * This is to verify that we're looking at a real local APIC.
+ * Check these against your board if the CPUs aren't getting
+ * started, for no apparent reason.
+ */
+int __init verify_local_APIC(void)
+{
+ unsigned int reg0, reg1;
+
+ /*
+ * The version register is read-only in a real APIC.
+ */
+ reg0 = apic_read(APIC_LVR);
+ apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
+ apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
+ reg1 = apic_read(APIC_LVR);
+ apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
+
+ /*
+ * The two version reads above should print the same
+ * numbers. If the second one is different, then we
+ * poke at a non-APIC.
+ */
+ if (reg1 != reg0)
+ return 0;
+
+ /*
+ * Check if the version looks reasonable.
+ */
+ reg1 = GET_APIC_VERSION(reg0);
+ if (reg1 == 0x00 || reg1 == 0xff)
+ return 0;
+ reg1 = lapic_get_maxlvt();
+ if (reg1 < 0x02 || reg1 == 0xff)
+ return 0;
+
+ /*
+ * The ID register is read/write in a real APIC.
+ */
+ reg0 = apic_read(APIC_ID);
+ apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
+
+ /*
+ * The next two are just to see if we have sane values.
+ * They're only really relevant if we're in Virtual Wire
+ * compatibility mode, but most boxes are these days.
+ */
+ reg0 = apic_read(APIC_LVT0);
+ apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
+ reg1 = apic_read(APIC_LVT1);
+ apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
+
+ return 1;
+}
+
+/**
+ * sync_Arb_IDs - synchronize APIC bus arbitration IDs
+ */
+void __init sync_Arb_IDs(void)
+{
+ /*
+ * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1.
+ * Not needed on AMD.
+ */
+ if (modern_apic())
+ return;
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+
+ apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
+ apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
+ | APIC_DM_INIT);
+}
+
+/*
+ * An initial setup of the virtual wire mode.
+ */
+void __init init_bsp_APIC(void)
+{
+ unsigned long value;
+
+ /*
+ * Don't do the setup now if we have an SMP BIOS as the
+ * through-I/O-APIC virtual wire mode might be active.
+ */
+ if (smp_found_config || !cpu_has_apic)
+ return;
+
+ /*
+ * Do not trust the local APIC being empty at bootup.
+ */
+ clear_local_APIC();
+
+ /*
+ * Enable APIC.
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+
+ /* This bit is reserved on P4/Xeon and should be cleared */
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ (boot_cpu_data.x86 == 15))
+ value &= ~APIC_SPIV_FOCUS_DISABLED;
+ else
+ value |= APIC_SPIV_FOCUS_DISABLED;
+ value |= SPURIOUS_APIC_VECTOR;
+ apic_write_around(APIC_SPIV, value);
+
+ /*
+ * Set up the virtual wire mode.
+ */
+ apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+ value = APIC_DM_NMI;
+ if (!lapic_is_integrated()) /* 82489DX */
+ value |= APIC_LVT_LEVEL_TRIGGER;
+ apic_write_around(APIC_LVT1, value);
+}
+
+/**
+ * setup_local_APIC - setup the local APIC
+ */
+void __devinit setup_local_APIC(void)
+{
+ unsigned long oldvalue, value, maxlvt, integrated;
+ int i, j;
+
+ /* Pound the ESR really hard over the head with a big hammer - mbligh */
+ if (esr_disable) {
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ }
+
+ integrated = lapic_is_integrated();
+
+ /*
+ * Double-check whether this APIC is really registered.
+ */
+ if (!apic_id_registered())
+ BUG();
+
+ /*
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+ init_apic_ldr();
+
+ /*
+ * Set Task Priority to 'accept all'. We never change this
+ * later on.
+ */
+ value = apic_read(APIC_TASKPRI);
+ value &= ~APIC_TPRI_MASK;
+ apic_write_around(APIC_TASKPRI, value);
+
+ /*
+ * After a crash, we no longer service the interrupts and a pending
+ * interrupt from previous kernel might still have ISR bit set.
+ *
+ * Most probably by now the CPU has serviced that pending interrupt and
+ * it might not have done the ack_APIC_irq() because it thought the
+ * interrupt came from i8259 as ExtInt. The LAPIC did not get an EOI so
+ * it does not clear the ISR bit and the CPU thinks it has already
+ * serviced the interrupt. Hence a vector might get locked. It was
+ * noticed for the timer irq (vector 0x31). Issue an extra EOI to clear
+ * the ISR.
+ */
+ for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+ value = apic_read(APIC_ISR + i*0x10);
+ for (j = 31; j >= 0; j--) {
+ if (value & (1<<j))
+ ack_APIC_irq();
+ }
+ }
+
+ /*
+ * Now that we are all set up, enable the APIC
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ /*
+ * Enable APIC
+ */
+ value |= APIC_SPIV_APIC_ENABLED;
+
+ /*
+ * Some unknown Intel IO/APIC (or APIC) errata is biting us with
+ * certain networking cards. If high frequency interrupts are
+ * happening on a particular IOAPIC pin, plus the IOAPIC routing
+ * entry is masked/unmasked at a high rate as well then sooner or
+ * later IOAPIC line gets 'stuck', no more interrupts are received
+ * from the device. If focus CPU is disabled then the hang goes
+ * away, oh well :-(
+ *
+ * [ This bug can be reproduced easily with a level-triggered
+ * PCI Ne2000 networking cards and PII/PIII processors, dual
+ * BX chipset. ]
+ */
+ /*
+ * Actually disabling the focus CPU check just makes the hang less
+ * frequent as it makes the interrupt distribution model more
+ * like LRU than MRU (the short-term load is more even across CPUs).
+ * See also the comment in end_level_ioapic_irq(). --macro
+ */
+
+ /* Enable focus processor (bit==0) */
+ value &= ~APIC_SPIV_FOCUS_DISABLED;
+
+ /*
+ * Set spurious IRQ vector
+ */
+ value |= SPURIOUS_APIC_VECTOR;
+ apic_write_around(APIC_SPIV, value);
+
+ /*
+ * Set up LVT0, LVT1:
+ *
+ * set up through-local-APIC on the BP's LINT0. This is not
+ * strictly necessary in pure symmetric-IO mode, but sometimes
+ * we delegate interrupts to the 8259A.
+ */
+ /*
+ * TODO: set up through-local-APIC from through-I/O-APIC? --macro
+ */
+ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
+ if (!smp_processor_id() && (pic_mode || !value)) {
+ value = APIC_DM_EXTINT;
+ apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
+ smp_processor_id());
+ } else {
+ value = APIC_DM_EXTINT | APIC_LVT_MASKED;
+ apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
+ smp_processor_id());
+ }
+ apic_write_around(APIC_LVT0, value);
+
+ /*
+ * only the BP should see the LINT1 NMI signal, obviously.
+ */
+ if (!smp_processor_id())
+ value = APIC_DM_NMI;
+ else
+ value = APIC_DM_NMI | APIC_LVT_MASKED;
+ if (!integrated) /* 82489DX */
+ value |= APIC_LVT_LEVEL_TRIGGER;
+ apic_write_around(APIC_LVT1, value);
+
+ if (integrated && !esr_disable) { /* !82489DX */
+ maxlvt = lapic_get_maxlvt();
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ oldvalue = apic_read(APIC_ESR);
+
+ /* enables sending errors */
+ value = ERROR_APIC_VECTOR;
+ apic_write_around(APIC_LVTERR, value);
+ /*
+ * spec says clear errors after enabling vector.
+ */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+ value = apic_read(APIC_ESR);
+ if (value != oldvalue)
+ apic_printk(APIC_VERBOSE, "ESR value before enabling "
+ "vector: 0x%08lx after: 0x%08lx\n",
+ oldvalue, value);
+ } else {
+ if (esr_disable)
+ /*
+ * Something untraceable is creating bad interrupts on
+ * secondary quads ... for the moment, just leave the
+ * ESR disabled - we can't do anything useful with the
+ * errors anyway - mbligh
+ */
+ printk(KERN_INFO "Leaving ESR disabled.\n");
+ else
+ printk(KERN_INFO "No ESR for 82489DX.\n");
+ }
+
+ /* Disable the local apic timer */
+ value = apic_read(APIC_LVTT);
+ value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write_around(APIC_LVTT, value);
+
+ setup_apic_nmi_watchdog(NULL);
+ apic_pm_activate();
+}
+
+/*
+ * Detect and initialize APIC
+ */
+static int __init detect_init_APIC(void)
+{
+ u32 h, l, features;
+
+ /* Disabled by kernel option? */
+ if (enable_local_apic < 0)
+ return -1;
+
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
+ (boot_cpu_data.x86 == 15))
+ break;
+ goto no_apic;
+ case X86_VENDOR_INTEL:
+ if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
+ (boot_cpu_data.x86 == 5 && cpu_has_apic))
+ break;
+ goto no_apic;
+ default:
+ goto no_apic;
+ }
+
+ if (!cpu_has_apic) {
+ /*
+ * Over-ride BIOS and try to enable the local APIC only if
+ * "lapic" specified.
+ */
+ if (enable_local_apic <= 0) {
+ printk(KERN_INFO "Local APIC disabled by BIOS -- "
+ "you can enable it with \"lapic\"\n");
+ return -1;
+ }
+ /*
+ * Some BIOSes disable the local APIC in the APIC_BASE
+ * MSR. This can only be done in software for Intel P6 or later
+ * and AMD K7 (Model > 1) or later.
+ */
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+ printk(KERN_INFO
+ "Local APIC disabled by BIOS -- reenabling.\n");
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ enabled_via_apicbase = 1;
+ }
+ }
+ /*
+ * The APIC feature bit should now be enabled
+ * in `cpuid'
+ */
+ features = cpuid_edx(1);
+ if (!(features & (1 << X86_FEATURE_APIC))) {
+ printk(KERN_WARNING "Could not enable APIC!\n");
+ return -1;
+ }
+ set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+ /* The BIOS may have set up the APIC at some other address */
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (l & MSR_IA32_APICBASE_ENABLE)
+ mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+
+ if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED)
+ nmi_watchdog = NMI_LOCAL_APIC;
+
+ printk(KERN_INFO "Found and enabled local APIC!\n");
+
+ apic_pm_activate();
+
+ return 0;
+
+no_apic:
+ printk(KERN_INFO "No local APIC present or hardware disabled\n");
+ return -1;
+}
+
+/**
+ * init_apic_mappings - initialize APIC mappings
+ */
+void __init init_apic_mappings(void)
+{
+ unsigned long apic_phys;
+
+ /*
+ * If no local APIC can be found then set up a fake all
+ * zeroes page to simulate the local APIC and another
+ * one for the IO-APIC.
+ */
+ if (!smp_found_config && detect_init_APIC()) {
+ apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
+ apic_phys = __pa(apic_phys);
+ } else
+ apic_phys = mp_lapic_addr;
+
+ set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+ printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
+ apic_phys);
+
+ /*
+ * Fetch the APIC ID of the BSP in case we have a
+ * default configuration (or the MP table is broken).
+ */
+ if (boot_cpu_physical_apicid == -1U)
+ boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+
+#ifdef CONFIG_X86_IO_APIC
+ {
+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+ int i;
+
+ for (i = 0; i < nr_ioapics; i++) {
+ if (smp_found_config) {
+ ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+ if (!ioapic_phys) {
+ printk(KERN_ERR
+ "WARNING: bogus zero IO-APIC "
+ "address found in MPTABLE, "
+ "disabling IO/APIC support!\n");
+ smp_found_config = 0;
+ skip_ioapic_setup = 1;
+ goto fake_ioapic_page;
+ }
+ } else {
+fake_ioapic_page:
+ ioapic_phys = (unsigned long)
+ alloc_bootmem_pages(PAGE_SIZE);
+ ioapic_phys = __pa(ioapic_phys);
+ }
+ set_fixmap_nocache(idx, ioapic_phys);
+ printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx), ioapic_phys);
+ idx++;
+ }
+ }
+#endif
+}
+
+/*
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ */
+int __init APIC_init_uniprocessor(void)
+{
+ if (enable_local_apic < 0)
+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+
+ if (!smp_found_config && !cpu_has_apic)
+ return -1;
+
+ /*
+ * Complain if the BIOS pretends there is one.
+ */
+ if (!cpu_has_apic &&
+ APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
+ boot_cpu_physical_apicid);
+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+ return -1;
+ }
+
+ verify_local_APIC();
+
+ connect_bsp_APIC();
+
+ /*
+ * Hack: In case of kdump, after a crash, kernel might be booting
+ * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
+ * might be zero if read from MP tables. Get it from LAPIC.
+ */
+#ifdef CONFIG_CRASH_DUMP
+ boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+#endif
+ phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+
+ setup_local_APIC();
+
+#ifdef CONFIG_X86_IO_APIC
+ if (smp_found_config)
+ if (!skip_ioapic_setup && nr_ioapics)
+ setup_IO_APIC();
+#endif
+ setup_boot_clock();
+
+ return 0;
+}
+
+/*
+ * APIC command line parameters
+ */
+static int __init parse_lapic(char *arg)
+{
+ enable_local_apic = 1;
+ return 0;
+}
+early_param("lapic", parse_lapic);
+
+static int __init parse_nolapic(char *arg)
+{
+ enable_local_apic = -1;
+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+ return 0;
+}
+early_param("nolapic", parse_nolapic);
+
+static int __init parse_disable_lapic_timer(char *arg)
+{
+ local_apic_timer_disabled = 1;
+ return 0;
+}
+early_param("nolapic_timer", parse_disable_lapic_timer);
+
+static int __init parse_lapic_timer_c2_ok(char *arg)
+{
+ local_apic_timer_c2_ok = 1;
+ return 0;
+}
+early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
+
+static int __init apic_set_verbosity(char *str)
+{
+ if (strcmp("debug", str) == 0)
+ apic_verbosity = APIC_DEBUG;
+ else if (strcmp("verbose", str) == 0)
+ apic_verbosity = APIC_VERBOSE;
+ return 1;
+}
+
+__setup("apic=", apic_set_verbosity);
+
+
+/*
+ * Local APIC interrupts
+ */
+
+/*
+ * This interrupt should _never_ happen with our APIC/SMP architecture
+ */
+void smp_spurious_interrupt(struct pt_regs *regs)
+{
+ unsigned long v;
+
+ irq_enter();
+ /*
+ * Check if this really is a spurious interrupt and ACK it
+ * if it is a vectored one. Just in case...
+ * Spurious interrupts should not be ACKed.
+ */
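+ /*
+ * The ISR is a bank of eight 32-bit registers spaced 0x10 apart:
+ * (vector & ~0x1f) >> 1 equals (vector / 32) * 0x10 and selects
+ * the register, bit (vector & 0x1f) is the flag within it.
+ */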
+ v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
+ if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
+ ack_APIC_irq();
+
+ /* see sw-dev-man vol 3, chapter 7.4.13.5 */
+ printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
+ "should never happen.\n", smp_processor_id());
+ irq_exit();
+}
+
+/*
+ * This interrupt should never happen with our APIC/SMP architecture
+ */
+void smp_error_interrupt(struct pt_regs *regs)
+{
+ unsigned long v, v1;
+
+ irq_enter();
+ /* First tickle the hardware, only then report what went on. -- REW */
+ v = apic_read(APIC_ESR);
+ apic_write(APIC_ESR, 0);
+ v1 = apic_read(APIC_ESR);
+ ack_APIC_irq();
+ atomic_inc(&irq_err_count);
+
+ /* Here is what the APIC error bits mean:
+ 0: Send CS error
+ 1: Receive CS error
+ 2: Send accept error
+ 3: Receive accept error
+ 4: Reserved
+ 5: Send illegal vector
+ 6: Received illegal vector
+ 7: Illegal register address
+ */
+ printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
+ smp_processor_id(), v, v1);
+ irq_exit();
+}
+
+/*
+ * Initialize APIC interrupts
+ */
+void __init apic_intr_init(void)
+{
+#ifdef CONFIG_SMP
+ smp_intr_init();
+#endif
+ /* self generated IPI for local APIC timer */
+ set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+ /* IPI vectors for APIC spurious and error interrupts */
+ set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+ set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+ /* thermal monitor LVT interrupt */
+#ifdef CONFIG_X86_MCE_P4THERMAL
+ set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+}
+
+/**
+ * connect_bsp_APIC - attach the APIC to the interrupt system
+ */
+void __init connect_bsp_APIC(void)
+{
+ if (pic_mode) {
+ /*
+ * Do not trust the local APIC being empty at bootup.
+ */
+ clear_local_APIC();
+ /*
+ * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
+ * local APIC to INT and NMI lines.
+ */
+ apic_printk(APIC_VERBOSE, "leaving PIC mode, "
+ "enabling APIC mode.\n");
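+ /*
+ * IMCR access: write selector 0x70 to port 0x22, then 0x01 to
+ * data port 0x23 (0x00 = PIC mode, 0x01 = route INT/NMI through
+ * the local APIC).
+ */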
+ outb(0x70, 0x22);
+ outb(0x01, 0x23);
+ }
+ enable_apic_mode();
+}
+
+/**
+ * disconnect_bsp_APIC - detach the APIC from the interrupt system
+ * @virt_wire_setup: indicates whether virtual wire mode is selected
+ *
+ * Virtual wire mode is necessary to deliver legacy interrupts even when the
+ * APIC is disabled.
+ */
+void disconnect_bsp_APIC(int virt_wire_setup)
+{
+ if (pic_mode) {
+ /*
+ * Put the board back into PIC mode (has an effect only on
+ * certain older boards). Note that APIC interrupts, including
+ * IPIs, won't work beyond this point! The only exception are
+ * INIT IPIs.
+ */
+ apic_printk(APIC_VERBOSE, "disabling APIC mode, "
+ "entering PIC mode.\n");
+ outb(0x70, 0x22);
+ outb(0x00, 0x23);
+ } else {
+ /* Go back to Virtual Wire compatibility mode */
+ unsigned long value;
+
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write_around(APIC_SPIV, value);
+
+ if (!virt_wire_setup) {
+ /*
+ * For LVT0 make it edge triggered, active high,
+ * external and enabled
+ */
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+ apic_write_around(APIC_LVT0, value);
+ } else {
+ /* Disable LVT0 */
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+ }
+
+ /*
+ * For LVT1 make it edge triggered, active high, nmi and
+ * enabled
+ */
+ value = apic_read(APIC_LVT1);
+ value &= ~(
+ APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write_around(APIC_LVT1, value);
+ }
+}
+
+/*
+ * Power management
+ */
+#ifdef CONFIG_PM
+
+static struct {
+ int active;
+ /* r/w apic fields */
+ unsigned int apic_id;
+ unsigned int apic_taskpri;
+ unsigned int apic_ldr;
+ unsigned int apic_dfr;
+ unsigned int apic_spiv;
+ unsigned int apic_lvtt;
+ unsigned int apic_lvtpc;
+ unsigned int apic_lvt0;
+ unsigned int apic_lvt1;
+ unsigned int apic_lvterr;
+ unsigned int apic_tmict;
+ unsigned int apic_tdcr;
+ unsigned int apic_thmr;
+} apic_pm_state;
+
+static int lapic_suspend(struct sys_device *dev, pm_message_t state)
+{
+ unsigned long flags;
+ int maxlvt;
+
+ if (!apic_pm_state.active)
+ return 0;
+
+ maxlvt = lapic_get_maxlvt();
+
+ apic_pm_state.apic_id = apic_read(APIC_ID);
+ apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
+ apic_pm_state.apic_ldr = apic_read(APIC_LDR);
+ apic_pm_state.apic_dfr = apic_read(APIC_DFR);
+ apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
+ apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
+ if (maxlvt >= 4)
+ apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+ apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
+ apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
+ apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
+ apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
+ apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+ if (maxlvt >= 5)
+ apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#endif
+
+ local_irq_save(flags);
+ disable_local_APIC();
+ local_irq_restore(flags);
+ return 0;
+}
+
+static int lapic_resume(struct sys_device *dev)
+{
+ unsigned int l, h;
+ unsigned long flags;
+ int maxlvt;
+
+ if (!apic_pm_state.active)
+ return 0;
+
+ maxlvt = lapic_get_maxlvt();
+
+ local_irq_save(flags);
+
+ /*
+ * Make sure the APICBASE points to the right address
+ *
+ * FIXME! This will be wrong if we ever support suspend on
+ * SMP! We'll need to do this as part of the CPU restore!
+ */
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+
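+	/*
+	 * Mask the error LVT first so that restoring the remaining
+	 * registers cannot raise spurious error interrupts; the saved
+	 * LVTERR value is written back at the end.
+	 */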
+ apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
+ apic_write(APIC_ID, apic_pm_state.apic_id);
+ apic_write(APIC_DFR, apic_pm_state.apic_dfr);
+ apic_write(APIC_LDR, apic_pm_state.apic_ldr);
+ apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
+ apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
+ apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
+ apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+ if (maxlvt >= 5)
+ apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+ if (maxlvt >= 4)
+ apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+ apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
+ apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
+ apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ local_irq_restore(flags);
+ return 0;
+}
+
+/*
+ * This device has no shutdown method - fully functioning local APICs
+ * are needed on every CPU up until machine_halt/restart/poweroff.
+ */
+
+static struct sysdev_class lapic_sysclass = {
+ set_kset_name("lapic"),
+ .resume = lapic_resume,
+ .suspend = lapic_suspend,
+};
+
+static struct sys_device device_lapic = {
+ .id = 0,
+ .cls = &lapic_sysclass,
+};
+
+static void __devinit apic_pm_activate(void)
+{
+ apic_pm_state.active = 1;
+}
+
+static int __init init_lapic_sysfs(void)
+{
+ int error;
+
+ if (!cpu_has_apic)
+ return 0;
+ /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
+
+ error = sysdev_class_register(&lapic_sysclass);
+ if (!error)
+ error = sysdev_register(&device_lapic);
+ return error;
+}
+device_initcall(init_lapic_sysfs);
+
+#else /* CONFIG_PM */
+
+static void apic_pm_activate(void) { }
+
+#endif /* CONFIG_PM */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
new file mode 100644
index 000000000000..f02a8aca826b
--- /dev/null
+++ b/arch/x86/kernel/apm_32.c
@@ -0,0 +1,2403 @@
+/* -*- linux-c -*-
+ * APM BIOS driver for Linux
+ * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au)
+ *
+ * Initial development of this driver was funded by NEC Australia P/L
+ * and NEC Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * October 1995, Rik Faith (faith@cs.unc.edu):
+ * Minor enhancements and updates (to the patch set) for 1.3.x
+ * Documentation
+ * January 1996, Rik Faith (faith@cs.unc.edu):
+ * Make /proc/apm easy to format (bump driver version)
+ * March 1996, Rik Faith (faith@cs.unc.edu):
+ * Prohibit APM BIOS calls unless apm_enabled.
+ * (Thanks to Ulrich Windl <Ulrich.Windl@rz.uni-regensburg.de>)
+ * April 1996, Stephen Rothwell (sfr@canb.auug.org.au)
+ * Version 1.0 and 1.1
+ * May 1996, Version 1.2
+ * Feb 1998, Version 1.3
+ * Feb 1998, Version 1.4
+ * Aug 1998, Version 1.5
+ * Sep 1998, Version 1.6
+ * Nov 1998, Version 1.7
+ * Jan 1999, Version 1.8
+ * Jan 1999, Version 1.9
+ * Oct 1999, Version 1.10
+ * Nov 1999, Version 1.11
+ * Jan 2000, Version 1.12
+ * Feb 2000, Version 1.13
+ * Nov 2000, Version 1.14
+ * Oct 2001, Version 1.15
+ * Jan 2002, Version 1.16
+ * Oct 2002, Version 1.16ac
+ *
+ * History:
+ * 0.6b: first version in official kernel, Linux 1.3.46
+ * 0.7: changed /proc/apm format, Linux 1.3.58
+ * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59
+ * 0.9: only call bios if bios is present, Linux 1.3.72
+ * 1.0: use fixed device number, consolidate /proc/apm into this file,
+ * Linux 1.3.85
+ * 1.1: support user-space standby and suspend, power off after system
+ * halted, Linux 1.3.98
+ * 1.2: When resetting RTC after resume, take care so that the time
+ *	is only incorrect by 30-60 ms (vs. 1 s previously) (Gabor J. Toth
+ * <jtoth@princeton.edu>); improve interaction between
+ * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4
+ * 1.2a: Simple change to stop mysterious bug reports with SMP; also added
+ * levels to the printk calls. APM is not defined for SMP machines.
+ *	The new replacement for it is, but Linux doesn't yet support this.
+ * Alan Cox Linux 2.1.55
+ * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's
+ * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by
+ * Dean Gaudet <dgaudet@arctic.org>.
+ * C. Scott Ananian <cananian@alumni.princeton.edu> Linux 2.1.87
+ * 1.5: Fix segment register reloading (in case of bad segments saved
+ * across BIOS call).
+ * Stephen Rothwell
+ * 1.6: Cope with compiler/assembler differences.
+ * Only try to turn off the first display device.
+ * Fix OOPS at power off with no APM BIOS by Jan Echternach
+ * <echter@informatik.uni-rostock.de>
+ * Stephen Rothwell
+ * 1.7: Modify driver's cached copy of the disabled/disengaged flags
+ * to reflect current state of APM BIOS.
+ * Chris Rankin <rankinc@bellsouth.net>
+ * Reset interrupt 0 timer to 100Hz after suspend
+ * Chad Miller <cmiller@surfsouth.com>
+ * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE
+ * Richard Gooch <rgooch@atnf.csiro.au>
+ * Allow boot time disabling of APM
+ * Make boot messages far less verbose by default
+ * Make asm safer
+ * Stephen Rothwell
+ * 1.8: Add CONFIG_APM_RTC_IS_GMT
+ * Richard Gooch <rgooch@atnf.csiro.au>
+ * change APM_NOINTS to CONFIG_APM_ALLOW_INTS
+ * remove dependency on CONFIG_PROC_FS
+ * Stephen Rothwell
+ * 1.9: Fix small typo. <laslo@wodip.opole.pl>
+ * Try to cope with BIOS's that need to have all display
+ * devices blanked and not just the first one.
+ * Ross Paterson <ross@soi.city.ac.uk>
+ *	Fix segment limit setting; it has always been wrong as
+ * the segments needed to have byte granularity.
+ * Mark a few things __init.
+ * Add hack to allow power off of SMP systems by popular request.
+ * Use CONFIG_SMP instead of __SMP__
+ * Ignore BOUNCES for three seconds.
+ * Stephen Rothwell
+ * 1.10: Fix for Thinkpad return code.
+ * Merge 2.2 and 2.3 drivers.
+ * Remove APM dependencies in arch/i386/kernel/process.c
+ * Remove APM dependencies in drivers/char/sysrq.c
+ * Reset time across standby.
+ *	Allow more initialisation on SMP.
+ * Remove CONFIG_APM_POWER_OFF and make it boot time
+ * configurable (default on).
+ * Make debug only a boot time parameter (remove APM_DEBUG).
+ * Try to blank all devices on any error.
+ * 1.11: Remove APM dependencies in drivers/char/console.c
+ * Check nr_running to detect if we are idle (from
+ * Borislav Deianov <borislav@lix.polytechnique.fr>)
+ * Fix for bioses that don't zero the top part of the
+ * entrypoint offset (Mario Sitta <sitta@al.unipmn.it>)
+ * (reported by Panos Katsaloulis <teras@writeme.com>).
+ * Real mode power off patch (Walter Hofmann
+ * <Walter.Hofmann@physik.stud.uni-erlangen.de>).
+ * 1.12: Remove CONFIG_SMP as the compiler will optimize
+ * the code away anyway (smp_num_cpus == 1 in UP)
+ * noted by Artur Skawina <skawina@geocities.com>.
+ * Make power off under SMP work again.
+ * Fix thinko with initial engaging of BIOS.
+ * Make sure power off only happens on CPU 0
+ * (Paul "Rusty" Russell <rusty@rustcorp.com.au>).
+ * Do error notification to user mode if BIOS calls fail.
+ * Move entrypoint offset fix to ...boot/setup.S
+ * where it belongs (Cosmos <gis88564@cis.nctu.edu.tw>).
+ * Remove smp-power-off. SMP users must now specify
+ * "apm=power-off" on the kernel command line. Suggested
+ * by Jim Avera <jima@hal.com>, modified by Alan Cox
+ * <alan@lxorguk.ukuu.org.uk>.
+ * Register the /proc/apm entry even on SMP so that
+ * scripts that check for it before doing power off
+ * work (Jim Avera <jima@hal.com>).
+ * 1.13: Changes for new pm_ interfaces (Andy Henroid
+ * <andy_henroid@yahoo.com>).
+ * Modularize the code.
+ * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS
+ * is now the way life works).
+ * Fix thinko in suspend() (wrong return).
+ * Notify drivers on critical suspend.
+ * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz>
+ * modified by sfr).
+ * Disable interrupts while we are suspended (Andy Henroid
+ * <andy_henroid@yahoo.com> fixed by sfr).
+ * Make power off work on SMP again (Tony Hoyle
+ * <tmh@magenta-logic.com> and <zlatko@iskon.hr>) modified by sfr.
+ * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore
+ * interval is now configurable.
+ * 1.14: Make connection version persist across module unload/load.
+ * Enable and engage power management earlier.
+ * Disengage power management on module unload.
+ * Changed to use the sysrq-register hack for registering the
+ * power off function called by magic sysrq based upon discussions
+ * in irc://irc.openprojects.net/#kernelnewbies
+ * (Crutcher Dunnavant <crutcher+kernel@datastacks.com>).
+ * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable.
+ * (Arjan van de Ven <arjanv@redhat.com>) modified by sfr.
+ * Work around byte swap bug in one of the Vaio's BIOS's
+ * (Marc Boucher <marc@mbsi.ca>).
+ * Exposed the disable flag to dmi so that we can handle known
+ * broken APM (Alan Cox <alan@redhat.com>).
+ * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin
+ * calling it - instead idle. (Alan Cox <alan@redhat.com>)
+ * If an APM idle fails log it and idle sensibly
+ * 1.15: Don't queue events to clients who open the device O_WRONLY.
+ * Don't expect replies from clients who open the device O_RDONLY.
+ * (Idea from Thomas Hood)
+ * Minor waitqueue cleanups. (John Fremlin <chief@bandits.org>)
+ * 1.16: Fix idle calling. (Andreas Steinmetz <ast@domdv.de> et al.)
+ * Notify listeners of standby or suspend events before notifying
+ * drivers. Return EBUSY to ioctl() if suspend is rejected.
+ * (Russell King <rmk@arm.linux.org.uk> and Thomas Hood)
+ * Ignore first resume after we generate our own resume event
+ * after a suspend (Thomas Hood)
+ * Daemonize now gets rid of our controlling terminal (sfr).
+ * CONFIG_APM_CPU_IDLE now just affects the default value of
+ * idle_threshold (sfr).
+ * Change name of kernel apm daemon (as it no longer idles) (sfr).
+ * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we
+ *	make _all_ APM calls on CPU #0. Fix unsafe sign bug.
+ *	TODO: determine if it's "boot CPU" or "CPU0" we want to lock to.
+ *
+ * APM 1.1 Reference:
+ *
+ * Intel Corporation, Microsoft Corporation. Advanced Power Management
+ * (APM) BIOS Interface Specification, Revision 1.1, September 1993.
+ * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
+ *
+ * [This document is available free from Intel by calling 800.628.8686 (fax
+ * 916.356.6100) or 800.548.4725; or via anonymous ftp from
+ * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also
+ * available from Microsoft by calling 206.882.8080.]
+ *
+ * APM 1.2 Reference:
+ * Intel Corporation, Microsoft Corporation. Advanced Power Management
+ * (APM) BIOS Interface Specification, Revision 1.2, February 1996.
+ *
+ * [This document is available from Microsoft at:
+ * http://www.microsoft.com/whdc/archive/amp_12.mspx]
+ */
+
+#include <linux/module.h>
+
+#include <linux/poll.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/timer.h>
+#include <linux/fcntl.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/miscdevice.h>
+#include <linux/apm_bios.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/sched.h>
+#include <linux/pm.h>
+#include <linux/pm_legacy.h>
+#include <linux/capability.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/freezer.h>
+#include <linux/smp.h>
+#include <linux/dmi.h>
+#include <linux/suspend.h>
+#include <linux/kthread.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/desc.h>
+#include <asm/i8253.h>
+#include <asm/paravirt.h>
+#include <asm/reboot.h>
+
+#include "io_ports.h"
+
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+extern int (*console_blank_hook)(int);
+#endif
+
+/*
+ * The apm_bios device is one of the misc char devices.
+ * This is its minor number.
+ */
+#define APM_MINOR_DEV 134
+
+/*
+ * See Documentation/Config.help for the configuration options.
+ *
+ * Various options can be changed at boot time as follows:
+ * (We allow underscores for compatibility with the modules code)
+ * apm=on/off enable/disable APM
+ * [no-]allow[-_]ints allow interrupts during BIOS calls
+ * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call
+ * [no-]realmode[-_]power[-_]off switch to real mode before
+ * powering off
+ * [no-]debug log some debugging messages
+ * [no-]power[-_]off power off on shutdown
+ * [no-]smp Use apm even on an SMP box
+ * bounce[-_]interval=<n> number of ticks to ignore suspend
+ * bounces
+ * idle[-_]threshold=<n> System idle percentage above which to
+ * make APM BIOS idle calls. Set it to
+ * 100 to disable.
+ * idle[-_]period=<n> Period (in 1/100s of a second) over
+ * which the idle percentage is
+ * calculated.
+ */
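+
+/*
+ * An illustrative command line (values picked arbitrarily):
+ *
+ *	apm=debug,idle-threshold=90,idle-period=50
+ *
+ * enables debug messages, makes BIOS idle calls when the system is more
+ * than 90% idle, and averages the idle estimate over half a second.
+ */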
+
+/* KNOWN PROBLEM MACHINES:
+ *
+ * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant
+ * [Confirmed by TI representative]
+ * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification
+ * [Confirmed by BIOS disassembly]
+ * [This may work now ...]
+ * P: Toshiba 1950S: battery life information only gets updated after resume
+ * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking
+ * broken in BIOS [Reported by Garst R. Reese <reese@isn.net>]
+ * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP
+ * Neale Banks <neale@lowendale.com.au> December 2000
+ *
+ * Legend: U = unusable with APM patches
+ * P = partially usable with APM patches
+ */
+
+/*
+ * Define as 1 to make the driver always call the APM BIOS busy
+ * routine even if the clock was not reported as slowed by the
+ * idle routine. Otherwise, define as 0.
+ */
+#define ALWAYS_CALL_BUSY 1
+
+/*
+ * Define to make the APM BIOS calls zero all data segment registers (so
+ * that an incorrect BIOS implementation will cause a kernel panic if it
+ * tries to write to arbitrary memory).
+ */
+#define APM_ZERO_SEGS
+
+#include "apm.h"
+
+/*
+ * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
+ * This was patched by Chad Miller <cmiller@surfsouth.com>; original code by
+ * David Chen <chen@ctpa04.mit.edu>
+ */
+#undef INIT_TIMER_AFTER_SUSPEND
+
+#ifdef INIT_TIMER_AFTER_SUSPEND
+#include <linux/timex.h>
+#include <asm/io.h>
+#include <linux/delay.h>
+#endif
+
+/*
+ * Need to poll the APM BIOS every second
+ */
+#define APM_CHECK_TIMEOUT (HZ)
+
+/*
+ * Ignore suspend events for this amount of time after a resume
+ */
+#define DEFAULT_BOUNCE_INTERVAL (3 * HZ)
+
+/*
+ * Maximum number of events stored
+ */
+#define APM_MAX_EVENTS 20
+
+/*
+ * The per-file APM data
+ */
+struct apm_user {
+ int magic;
+ struct apm_user * next;
+ unsigned int suser: 1;
+ unsigned int writer: 1;
+ unsigned int reader: 1;
+ unsigned int suspend_wait: 1;
+ int suspend_result;
+ int suspends_pending;
+ int standbys_pending;
+ int suspends_read;
+ int standbys_read;
+ int event_head;
+ int event_tail;
+ apm_event_t events[APM_MAX_EVENTS];
+};
+
+/*
+ * The magic number in apm_user
+ */
+#define APM_BIOS_MAGIC 0x4101
+
+/*
+ * idle percentage above which bios idle calls are done
+ */
+#ifdef CONFIG_APM_CPU_IDLE
+#define DEFAULT_IDLE_THRESHOLD 95
+#else
+#define DEFAULT_IDLE_THRESHOLD 100
+#endif
+#define DEFAULT_IDLE_PERIOD (100 / 3)
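+/* i.e. roughly a third of a second, in units of 1/100 s */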
+
+/*
+ * Local variables
+ */
+static struct {
+ unsigned long offset;
+ unsigned short segment;
+} apm_bios_entry;
+static int clock_slowed;
+static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
+static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
+static int set_pm_idle;
+static int suspends_pending;
+static int standbys_pending;
+static int ignore_sys_suspend;
+static int ignore_normal_resume;
+static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
+
+static int debug __read_mostly;
+static int smp __read_mostly;
+static int apm_disabled = -1;
+#ifdef CONFIG_SMP
+static int power_off;
+#else
+static int power_off = 1;
+#endif
+#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
+static int realmode_power_off = 1;
+#else
+static int realmode_power_off;
+#endif
+#ifdef CONFIG_APM_ALLOW_INTS
+static int allow_ints = 1;
+#else
+static int allow_ints;
+#endif
+static int broken_psr;
+
+static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
+static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
+static struct apm_user * user_list;
+static DEFINE_SPINLOCK(user_list_lock);
+static const struct desc_struct bad_bios_desc = { 0, 0x00409200 };
+
+static const char driver_version[] = "1.16ac"; /* no spaces */
+
+static struct task_struct *kapmd_task;
+
+/*
+ * APM event names taken from the APM 1.2 specification. These are
+ * the message codes that the BIOS uses to tell us about events
+ */
+static const char * const apm_event_name[] = {
+ "system standby",
+ "system suspend",
+ "normal resume",
+ "critical resume",
+ "low battery",
+ "power status change",
+ "update time",
+ "critical suspend",
+ "user standby",
+ "user suspend",
+ "system standby resume",
+ "capabilities change"
+};
+#define NR_APM_EVENT_NAME ARRAY_SIZE(apm_event_name)
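+
+/* APM event codes start at 1, so event n is named apm_event_name[n - 1] */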
+
+typedef struct lookup_t {
+ int key;
+ char * msg;
+} lookup_t;
+
+/*
+ * The BIOS returns a set of standard error codes in AX when the
+ * carry flag is set.
+ */
+
+static const lookup_t error_table[] = {
+/* N/A { APM_SUCCESS, "Operation succeeded" }, */
+ { APM_DISABLED, "Power management disabled" },
+ { APM_CONNECTED, "Real mode interface already connected" },
+ { APM_NOT_CONNECTED, "Interface not connected" },
+ { APM_16_CONNECTED, "16 bit interface already connected" },
+/* N/A { APM_16_UNSUPPORTED, "16 bit interface not supported" }, */
+ { APM_32_CONNECTED, "32 bit interface already connected" },
+ { APM_32_UNSUPPORTED, "32 bit interface not supported" },
+ { APM_BAD_DEVICE, "Unrecognized device ID" },
+ { APM_BAD_PARAM, "Parameter out of range" },
+ { APM_NOT_ENGAGED, "Interface not engaged" },
+ { APM_BAD_FUNCTION, "Function not supported" },
+ { APM_RESUME_DISABLED, "Resume timer disabled" },
+ { APM_BAD_STATE, "Unable to enter requested state" },
+/* N/A { APM_NO_EVENTS, "No events pending" }, */
+ { APM_NO_ERROR, "BIOS did not set a return code" },
+ { APM_NOT_PRESENT, "No APM present" }
+};
+#define ERROR_COUNT ARRAY_SIZE(error_table)
+
+/**
+ * apm_error - display an APM error
+ * @str: information string
+ * @err: APM BIOS return code
+ *
+ * Write a meaningful log entry to the kernel log in the event of
+ * an APM error.
+ */
+
+static void apm_error(char *str, int err)
+{
+ int i;
+
+	for (i = 0; i < ERROR_COUNT; i++)
+		if (error_table[i].key == err)
+			break;
+ if (i < ERROR_COUNT)
+ printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
+ else
+ printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
+ str, err);
+}
+
+/*
+ * Lock APM functionality to physical CPU 0
+ */
+
+#ifdef CONFIG_SMP
+
+static cpumask_t apm_save_cpus(void)
+{
+ cpumask_t x = current->cpus_allowed;
+ /* Some bioses don't like being called from CPU != 0 */
+ set_cpus_allowed(current, cpumask_of_cpu(0));
+ BUG_ON(smp_processor_id() != 0);
+ return x;
+}
+
+static inline void apm_restore_cpus(cpumask_t mask)
+{
+ set_cpus_allowed(current, mask);
+}
+
+#else
+
+/*
+ * No CPU lockdown needed on a uniprocessor
+ */
+
+#define apm_save_cpus() (current->cpus_allowed)
+#define apm_restore_cpus(x) (void)(x)
+
+#endif
+
+/*
+ * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and
+ * apm_info.allow_ints, we are being really paranoid here! Not only
+ * are interrupts disabled, but all the segment registers (except SS)
+ * are saved and zeroed; this means that if the BIOS tries to reference
+ * any data without explicitly loading the segment registers, the kernel
+ * will fault immediately rather than have some unforeseen circumstances
+ * for the rest of the kernel. And it will be very obvious! :-) Doing
+ * this depends on CS referring to the same physical memory as DS so that
+ * DS can be zeroed before the call. Unfortunately, we can't do anything
+ * about the stack segment/pointer. Also, we tell the compiler that
+ * everything could change.
+ *
+ * Also, we KNOW that for the non-error case of apm_bios_call, there
+ * is no useful data returned in the low-order 8 bits of eax.
+ */
+
+static inline unsigned long __apm_irq_save(void)
+{
+ unsigned long flags;
+ local_save_flags(flags);
+ if (apm_info.allow_ints) {
+ if (irqs_disabled_flags(flags))
+ local_irq_enable();
+ } else
+ local_irq_disable();
+
+ return flags;
+}
+
+#define apm_irq_save(flags) \
+ do { flags = __apm_irq_save(); } while (0)
+
+static inline void apm_irq_restore(unsigned long flags)
+{
+ if (irqs_disabled_flags(flags))
+ local_irq_disable();
+ else if (irqs_disabled())
+ local_irq_enable();
+}
+
+#ifdef APM_ZERO_SEGS
+# define APM_DECL_SEGS \
+ unsigned int saved_fs; unsigned int saved_gs;
+# define APM_DO_SAVE_SEGS \
+ savesegment(fs, saved_fs); savesegment(gs, saved_gs)
+# define APM_DO_RESTORE_SEGS \
+ loadsegment(fs, saved_fs); loadsegment(gs, saved_gs)
+#else
+# define APM_DECL_SEGS
+# define APM_DO_SAVE_SEGS
+# define APM_DO_RESTORE_SEGS
+#endif
+
+/**
+ * apm_bios_call - Make an APM BIOS 32bit call
+ * @func: APM function to execute
+ * @ebx_in: EBX register for call entry
+ * @ecx_in: ECX register for call entry
+ * @eax: EAX register return
+ * @ebx: EBX register return
+ * @ecx: ECX register return
+ * @edx: EDX register return
+ * @esi: ESI register return
+ *
+ * Make an APM call using the 32bit protected mode interface. The
+ * caller is responsible for knowing if APM BIOS is configured and
+ * enabled. This call can disable interrupts for a long period of
+ * time on some laptops. The return value is in AH and the carry
+ * flag is loaded into AL. If there is an error, then the error
+ * code is returned in AH (bits 8-15 of eax) and this function
+ * returns non-zero.
+ */
+
+static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
+{
+ APM_DECL_SEGS
+ unsigned long flags;
+ cpumask_t cpus;
+ int cpu;
+ struct desc_struct save_desc_40;
+ struct desc_struct *gdt;
+
+ cpus = apm_save_cpus();
+
+ cpu = get_cpu();
+ gdt = get_cpu_gdt_table(cpu);
+ save_desc_40 = gdt[0x40 / 8];
+ gdt[0x40 / 8] = bad_bios_desc;
+
+ apm_irq_save(flags);
+ APM_DO_SAVE_SEGS;
+ apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi);
+ APM_DO_RESTORE_SEGS;
+ apm_irq_restore(flags);
+ gdt[0x40 / 8] = save_desc_40;
+ put_cpu();
+ apm_restore_cpus(cpus);
+
+ return *eax & 0xff;
+}
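+
+/*
+ * A minimal usage sketch (mirroring apm_get_event() below): on failure
+ * the BIOS error code comes back in AH, i.e. bits 8-15 of eax:
+ *
+ *	if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx,
+ *			  &dummy, &dummy))
+ *		return (eax >> 8) & 0xff;
+ */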
+
+/**
+ * apm_bios_call_simple - make a simple APM BIOS 32bit call
+ * @func: APM function to invoke
+ * @ebx_in: EBX register value for BIOS call
+ * @ecx_in: ECX register value for BIOS call
+ * @eax: EAX register on return from the BIOS call
+ *
+ * Make a BIOS call that returns one value only, or just status.
+ * If there is an error, then the error code is returned in AH
+ * (bits 8-15 of eax) and this function returns non-zero. This is
+ * used for simpler BIOS operations. This call may hold interrupts
+ * off for a long time on some laptops.
+ */
+
+static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
+{
+ u8 error;
+ APM_DECL_SEGS
+ unsigned long flags;
+ cpumask_t cpus;
+ int cpu;
+ struct desc_struct save_desc_40;
+ struct desc_struct *gdt;
+
+ cpus = apm_save_cpus();
+
+ cpu = get_cpu();
+ gdt = get_cpu_gdt_table(cpu);
+ save_desc_40 = gdt[0x40 / 8];
+ gdt[0x40 / 8] = bad_bios_desc;
+
+ apm_irq_save(flags);
+ APM_DO_SAVE_SEGS;
+ error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax);
+ APM_DO_RESTORE_SEGS;
+ apm_irq_restore(flags);
+ gdt[0x40 / 8] = save_desc_40;
+ put_cpu();
+ apm_restore_cpus(cpus);
+ return error;
+}
+
+/**
+ * apm_driver_version - APM driver version
+ * @val: loaded with the APM version on return
+ *
+ * Retrieve the APM version supported by the BIOS. This is only
+ * supported for APM 1.1 or higher. An error indicates APM 1.0 is
+ * probably present.
+ *
+ * On entry val should point to a value indicating the APM driver
+ * version with the high byte being the major and the low byte the
+ * minor number, both in BCD.
+ *
+ * On return it will hold the BIOS revision supported in the
+ * same format.
+ */
+
+static int apm_driver_version(u_short *val)
+{
+ u32 eax;
+
+ if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax))
+ return (eax >> 8) & 0xff;
+ *val = eax;
+ return APM_SUCCESS;
+}
+
+/**
+ * apm_get_event - get an APM event from the BIOS
+ * @event: pointer to the event
+ * @info: point to the event information
+ *
+ * The APM BIOS provides a polled interface for event
+ * reporting. The BIOS expects to be polled at least every second
+ * when events are pending. When a message is found the caller should
+ * poll until no more messages are present. However, this causes
+ * problems on some laptops where a suspend event notification is
+ * not cleared until it is acknowledged.
+ *
+ * Additional information is returned in the info pointer, provided
+ * that APM 1.2 is in use. If no messages are pending the value 0x80
+ * is returned (No power management events pending).
+ */
+
+static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
+{
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 dummy;
+
+ if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx,
+ &dummy, &dummy))
+ return (eax >> 8) & 0xff;
+ *event = ebx;
+ if (apm_info.connection_version < 0x0102)
+ *info = ~0; /* indicate info not valid */
+ else
+ *info = ecx;
+ return APM_SUCCESS;
+}
+
+/**
+ * set_power_state - set the power management state
+ * @what: which items to transition
+ * @state: state to transition to
+ *
+ * Request an APM change of state for one or more system devices. The
+ * processor state must be transitioned last of all. what holds the
+ * class of device in the upper byte and the device number (0xFF for
+ * all) for the object to be transitioned.
+ *
+ * The state holds the state to transition to, which may in fact
+ * be an acceptance of a BIOS requested state change.
+ */
+
+static int set_power_state(u_short what, u_short state)
+{
+ u32 eax;
+
+ if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax))
+ return (eax >> 8) & 0xff;
+ return APM_SUCCESS;
+}
+
+/**
+ * set_system_power_state - set system wide power state
+ * @state: which state to enter
+ *
+ * Transition the entire system into a new APM power state.
+ */
+
+static int set_system_power_state(u_short state)
+{
+ return set_power_state(APM_DEVICE_ALL, state);
+}
+
+/**
+ * apm_do_idle - perform power saving
+ *
+ * This function notifies the BIOS that the processor is (in the view
+ * of the OS) idle. It returns -1 in the event that the BIOS refuses
+ * to handle the idle request. On a success the function returns 1
+ * if the BIOS did clock slowing or 0 otherwise.
+ */
+
+static int apm_do_idle(void)
+{
+ u32 eax;
+ u8 ret = 0;
+ int idled = 0;
+ int polling;
+
+ polling = !!(current_thread_info()->status & TS_POLLING);
+ if (polling) {
+ current_thread_info()->status &= ~TS_POLLING;
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
+ }
+ if (!need_resched()) {
+ idled = 1;
+ ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax);
+ }
+ if (polling)
+ current_thread_info()->status |= TS_POLLING;
+
+ if (!idled)
+ return 0;
+
+ if (ret) {
+ static unsigned long t;
+
+ /* This always fails on some SMP boards running UP kernels.
+ * Only report the failure the first 5 times.
+ */
+		if (++t < 5) {
+			printk(KERN_DEBUG "apm_do_idle failed (%d)\n",
+				(eax >> 8) & 0xff);
+			t = jiffies;
+		}
+ return -1;
+ }
+ clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0;
+ return clock_slowed;
+}
+
+/**
+ * apm_do_busy - inform the BIOS the CPU is busy
+ *
+ * Request that the BIOS brings the CPU back to full performance.
+ */
+
+static void apm_do_busy(void)
+{
+ u32 dummy;
+
+ if (clock_slowed || ALWAYS_CALL_BUSY) {
+ (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy);
+ clock_slowed = 0;
+ }
+}
+
+/*
+ * If no process has really been interested in
+ * the CPU for some time, we want to call BIOS
+ * power management - we probably want
+ * to conserve power.
+ */
+#define IDLE_CALC_LIMIT (HZ * 100)
+#define IDLE_LEAKY_MAX 16
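+
+/*
+ * The "leaky bucket" in apm_cpu_idle() below tolerates up to
+ * IDLE_LEAKY_MAX consecutive BIOS idle calls that return within the
+ * same jiffy before falling back to the default idle routine.
+ */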
+
+static void (*original_pm_idle)(void) __read_mostly;
+
+/**
+ * apm_cpu_idle - cpu idling for APM capable Linux
+ *
+ * This is the idling function the kernel executes when APM is available. It
+ * tries to do BIOS power management based on the average system idle time.
+ * Furthermore it calls the system default idle routine.
+ */
+
+static void apm_cpu_idle(void)
+{
+ static int use_apm_idle; /* = 0 */
+ static unsigned int last_jiffies; /* = 0 */
+ static unsigned int last_stime; /* = 0 */
+
+ int apm_idle_done = 0;
+ unsigned int jiffies_since_last_check = jiffies - last_jiffies;
+ unsigned int bucket;
+
+recalc:
+ if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
+ use_apm_idle = 0;
+ last_jiffies = jiffies;
+ last_stime = current->stime;
+ } else if (jiffies_since_last_check > idle_period) {
+ unsigned int idle_percentage;
+
+ idle_percentage = current->stime - last_stime;
+ idle_percentage *= 100;
+ idle_percentage /= jiffies_since_last_check;
+ use_apm_idle = (idle_percentage > idle_threshold);
+ if (apm_info.forbid_idle)
+ use_apm_idle = 0;
+ last_jiffies = jiffies;
+ last_stime = current->stime;
+ }
+
+ bucket = IDLE_LEAKY_MAX;
+
+ while (!need_resched()) {
+ if (use_apm_idle) {
+ unsigned int t;
+
+ t = jiffies;
+ switch (apm_do_idle()) {
+ case 0: apm_idle_done = 1;
+ if (t != jiffies) {
+ if (bucket) {
+ bucket = IDLE_LEAKY_MAX;
+ continue;
+ }
+ } else if (bucket) {
+ bucket--;
+ continue;
+ }
+ break;
+ case 1: apm_idle_done = 1;
+ break;
+ default: /* BIOS refused */
+ break;
+ }
+ }
+ if (original_pm_idle)
+ original_pm_idle();
+ else
+ default_idle();
+ jiffies_since_last_check = jiffies - last_jiffies;
+ if (jiffies_since_last_check > idle_period)
+ goto recalc;
+ }
+
+ if (apm_idle_done)
+ apm_do_busy();
+}
+
+/**
+ * apm_power_off - ask the BIOS to power off
+ *
+ * Handle the power off sequence. This is the one piece of code we
+ * will execute even on SMP machines. In order to deal with BIOS
+ * bugs we support real mode APM BIOS power off calls. We also make
+ * the SMP call on CPU0 as some systems will only honour this call
+ * on their first cpu.
+ */
+
+static void apm_power_off(void)
+{
+ unsigned char po_bios_call[] = {
+ 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
+ 0x8e, 0xd0, /* movw ax,ss */
+ 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
+ 0xb8, 0x07, 0x53, /* movw $0x5307,ax */
+ 0xbb, 0x01, 0x00, /* movw $0x0001,bx */
+ 0xb9, 0x03, 0x00, /* movw $0x0003,cx */
+ 0xcd, 0x15 /* int $0x15 */
+ };
+
+ /* Some bioses don't like being called from CPU != 0 */
+	if (apm_info.realmode_power_off) {
+		(void)apm_save_cpus();
+		machine_real_restart(po_bios_call, sizeof(po_bios_call));
+	} else
+		(void) set_system_power_state(APM_STATE_OFF);
+}
+
+#ifdef CONFIG_APM_DO_ENABLE
+
+/**
+ * apm_enable_power_management - enable BIOS APM power management
+ * @enable: enable yes/no
+ *
+ * Enable or disable the APM BIOS power services.
+ */
+
+static int apm_enable_power_management(int enable)
+{
+ u32 eax;
+
+ if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
+ return APM_NOT_ENGAGED;
+ if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
+ enable, &eax))
+ return (eax >> 8) & 0xff;
+ if (enable)
+ apm_info.bios.flags &= ~APM_BIOS_DISABLED;
+ else
+ apm_info.bios.flags |= APM_BIOS_DISABLED;
+ return APM_SUCCESS;
+}
+#endif
+
+/**
+ * apm_get_power_status - get current power state
+ * @status: returned status
+ * @bat: battery info
+ * @life: estimated life
+ *
+ * Obtain the current power status from the APM BIOS. We return a
+ * status which gives the rough battery status, and current power
+ * source. The bat value returned gives an estimate as a percentage
+ * of life and a status value for the battery. The estimated life,
+ * if reported, is a lifetime in seconds/minutes at current power
+ * consumption.
+ */
+
+static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
+{
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+ u32 dummy;
+
+ if (apm_info.get_power_status_broken)
+ return APM_32_UNSUPPORTED;
+ if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0,
+ &eax, &ebx, &ecx, &edx, &dummy))
+ return (eax >> 8) & 0xff;
+ *status = ebx;
+ *bat = ecx;
+ if (apm_info.get_power_status_swabinminutes) {
+ *life = swab16((u16)edx);
+ *life |= 0x8000;
+ } else
+ *life = edx;
+ return APM_SUCCESS;
+}
+
+#if 0
+static int apm_get_battery_status(u_short which, u_short *status,
+ u_short *bat, u_short *life, u_short *nbat)
+{
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+ u32 esi;
+
+ if (apm_info.connection_version < 0x0102) {
+ /* pretend we only have one battery. */
+ if (which != 1)
+ return APM_BAD_DEVICE;
+ *nbat = 1;
+ return apm_get_power_status(status, bat, life);
+ }
+
+ if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
+ &ebx, &ecx, &edx, &esi))
+ return (eax >> 8) & 0xff;
+ *status = ebx;
+ *bat = ecx;
+ *life = edx;
+ *nbat = esi;
+ return APM_SUCCESS;
+}
+#endif
+
+/**
+ * apm_engage_power_management - enable PM on a device
+ * @device: identity of device
+ * @enable: on/off
+ *
+ * Activate or deactivate power management on either a specific device
+ * or the entire system (%APM_DEVICE_ALL).
+ */
+
+static int apm_engage_power_management(u_short device, int enable)
+{
+ u32 eax;
+
+ if ((enable == 0) && (device == APM_DEVICE_ALL)
+ && (apm_info.bios.flags & APM_BIOS_DISABLED))
+ return APM_DISABLED;
+ if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax))
+ return (eax >> 8) & 0xff;
+ if (device == APM_DEVICE_ALL) {
+ if (enable)
+ apm_info.bios.flags &= ~APM_BIOS_DISENGAGED;
+ else
+ apm_info.bios.flags |= APM_BIOS_DISENGAGED;
+ }
+ return APM_SUCCESS;
+}
+
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+
+/**
+ * apm_console_blank - blank the display
+ * @blank: on/off
+ *
+ * Attempt to blank the console, firstly by blanking just video device
+ * zero, and if that fails (some BIOSes don't support it) then it blanks
+ * all video devices. Typically the BIOS will do laptop backlight and
+ * monitor powerdown for us.
+ */
+
+static int apm_console_blank(int blank)
+{
+ int error = APM_NOT_ENGAGED; /* silence gcc */
+ int i;
+ u_short state;
+ static const u_short dev[3] = { 0x100, 0x1FF, 0x101 };
+
+ state = blank ? APM_STATE_STANDBY : APM_STATE_READY;
+
+ for (i = 0; i < ARRAY_SIZE(dev); i++) {
+ error = set_power_state(dev[i], state);
+
+ if ((error == APM_SUCCESS) || (error == APM_NO_ERROR))
+ return 1;
+
+ if (error == APM_NOT_ENGAGED)
+ break;
+ }
+
+ if (error == APM_NOT_ENGAGED) {
+ static int tried;
+ int eng_error;
+ if (tried++ == 0) {
+ eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1);
+ if (eng_error) {
+ apm_error("set display", error);
+ apm_error("engage interface", eng_error);
+ return 0;
+ } else
+ return apm_console_blank(blank);
+ }
+ }
+ apm_error("set display", error);
+ return 0;
+}
+#endif
+
+static int queue_empty(struct apm_user *as)
+{
+ return as->event_head == as->event_tail;
+}
+
+static apm_event_t get_queued_event(struct apm_user *as)
+{
+ if (++as->event_tail >= APM_MAX_EVENTS)
+ as->event_tail = 0;
+ return as->events[as->event_tail];
+}
+
+static void queue_event(apm_event_t event, struct apm_user *sender)
+{
+ struct apm_user * as;
+
+ spin_lock(&user_list_lock);
+ if (user_list == NULL)
+ goto out;
+ for (as = user_list; as != NULL; as = as->next) {
+ if ((as == sender) || (!as->reader))
+ continue;
+ if (++as->event_head >= APM_MAX_EVENTS)
+ as->event_head = 0;
+
+ if (as->event_head == as->event_tail) {
+ static int notified;
+
+ if (notified++ == 0)
+ printk(KERN_ERR "apm: an event queue overflowed\n");
+ if (++as->event_tail >= APM_MAX_EVENTS)
+ as->event_tail = 0;
+ }
+ as->events[as->event_head] = event;
+ if ((!as->suser) || (!as->writer))
+ continue;
+ switch (event) {
+ case APM_SYS_SUSPEND:
+ case APM_USER_SUSPEND:
+ as->suspends_pending++;
+ suspends_pending++;
+ break;
+
+ case APM_SYS_STANDBY:
+ case APM_USER_STANDBY:
+ as->standbys_pending++;
+ standbys_pending++;
+ break;
+ }
+ }
+ wake_up_interruptible(&apm_waitqueue);
+out:
+ spin_unlock(&user_list_lock);
+}
+
+static void reinit_timer(void)
+{
+#ifdef INIT_TIMER_AFTER_SUSPEND
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8253_lock, flags);
+ /* set the clock to HZ */
+ outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
+ udelay(10);
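+	/* LATCH is the channel 0 divisor that yields HZ ticks per second */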
+ outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
+ udelay(10);
+ outb(LATCH >> 8, PIT_CH0); /* MSB */
+ udelay(10);
+ spin_unlock_irqrestore(&i8253_lock, flags);
+#endif
+}
+
+static int suspend(int vetoable)
+{
+ int err;
+ struct apm_user *as;
+
+ if (pm_send_all(PM_SUSPEND, (void *)3)) {
+ /* Vetoed */
+ if (vetoable) {
+ if (apm_info.connection_version > 0x100)
+ set_system_power_state(APM_STATE_REJECT);
+ err = -EBUSY;
+ ignore_sys_suspend = 0;
+ printk(KERN_WARNING "apm: suspend was vetoed.\n");
+ goto out;
+ }
+ printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n");
+ }
+
+ device_suspend(PMSG_SUSPEND);
+ local_irq_disable();
+ device_power_down(PMSG_SUSPEND);
+
+ local_irq_enable();
+
+ save_processor_state();
+ err = set_system_power_state(APM_STATE_SUSPEND);
+ ignore_normal_resume = 1;
+ restore_processor_state();
+
+ local_irq_disable();
+ reinit_timer();
+
+ if (err == APM_NO_ERROR)
+ err = APM_SUCCESS;
+ if (err != APM_SUCCESS)
+ apm_error("suspend", err);
+ err = (err == APM_SUCCESS) ? 0 : -EIO;
+ device_power_up();
+ local_irq_enable();
+ device_resume();
+ pm_send_all(PM_RESUME, (void *)0);
+ queue_event(APM_NORMAL_RESUME, NULL);
+ out:
+ spin_lock(&user_list_lock);
+ for (as = user_list; as != NULL; as = as->next) {
+ as->suspend_wait = 0;
+ as->suspend_result = err;
+ }
+ spin_unlock(&user_list_lock);
+ wake_up_interruptible(&apm_suspend_waitqueue);
+ return err;
+}
+
+static void standby(void)
+{
+ int err;
+
+ local_irq_disable();
+ device_power_down(PMSG_SUSPEND);
+ local_irq_enable();
+
+ err = set_system_power_state(APM_STATE_STANDBY);
+ if ((err != APM_SUCCESS) && (err != APM_NO_ERROR))
+ apm_error("standby", err);
+
+ local_irq_disable();
+ device_power_up();
+ local_irq_enable();
+}
+
+static apm_event_t get_event(void)
+{
+ int error;
+ apm_event_t event = APM_NO_EVENTS; /* silence gcc */
+ apm_eventinfo_t info;
+
+ static int notified;
+
+ /* we don't use the eventinfo */
+ error = apm_get_event(&event, &info);
+ if (error == APM_SUCCESS)
+ return event;
+
+ if ((error != APM_NO_EVENTS) && (notified++ == 0))
+ apm_error("get_event", error);
+
+ return 0;
+}
+
+static void check_events(void)
+{
+ apm_event_t event;
+ static unsigned long last_resume;
+ static int ignore_bounce;
+
+ while ((event = get_event()) != 0) {
+ if (debug) {
+ if (event <= NR_APM_EVENT_NAME)
+ printk(KERN_DEBUG "apm: received %s notify\n",
+ apm_event_name[event - 1]);
+ else
+ printk(KERN_DEBUG "apm: received unknown "
+ "event 0x%02x\n", event);
+ }
+ if (ignore_bounce
+ && ((jiffies - last_resume) > bounce_interval))
+ ignore_bounce = 0;
+
+ switch (event) {
+ case APM_SYS_STANDBY:
+ case APM_USER_STANDBY:
+ queue_event(event, NULL);
+ if (standbys_pending <= 0)
+ standby();
+ break;
+
+ case APM_USER_SUSPEND:
+#ifdef CONFIG_APM_IGNORE_USER_SUSPEND
+ if (apm_info.connection_version > 0x100)
+ set_system_power_state(APM_STATE_REJECT);
+ break;
+#endif
+ case APM_SYS_SUSPEND:
+ if (ignore_bounce) {
+ if (apm_info.connection_version > 0x100)
+ set_system_power_state(APM_STATE_REJECT);
+ break;
+ }
+ /*
+ * If we are already processing a SUSPEND,
+ * then further SUSPEND events from the BIOS
+ * will be ignored. We also return here to
+ * cope with the fact that the Thinkpads keep
+ * sending a SUSPEND event until something else
+ * happens!
+ */
+ if (ignore_sys_suspend)
+ return;
+ ignore_sys_suspend = 1;
+ queue_event(event, NULL);
+ if (suspends_pending <= 0)
+ (void) suspend(1);
+ break;
+
+ case APM_NORMAL_RESUME:
+ case APM_CRITICAL_RESUME:
+ case APM_STANDBY_RESUME:
+ ignore_sys_suspend = 0;
+ last_resume = jiffies;
+ ignore_bounce = 1;
+ if ((event != APM_NORMAL_RESUME)
+ || (ignore_normal_resume == 0)) {
+ device_resume();
+ pm_send_all(PM_RESUME, (void *)0);
+ queue_event(event, NULL);
+ }
+ ignore_normal_resume = 0;
+ break;
+
+ case APM_CAPABILITY_CHANGE:
+ case APM_LOW_BATTERY:
+ case APM_POWER_STATUS_CHANGE:
+ queue_event(event, NULL);
+ /* If needed, notify drivers here */
+ break;
+
+ case APM_UPDATE_TIME:
+ break;
+
+ case APM_CRITICAL_SUSPEND:
+ /*
+ * We are not allowed to reject a critical suspend.
+ */
+ (void) suspend(0);
+ break;
+ }
+ }
+}
+
+static void apm_event_handler(void)
+{
+ static int pending_count = 4;
+ int err;
+
+ if ((standbys_pending > 0) || (suspends_pending > 0)) {
+ if ((apm_info.connection_version > 0x100) &&
+ (pending_count-- <= 0)) {
+ pending_count = 4;
+ if (debug)
+ printk(KERN_DEBUG "apm: setting state busy\n");
+ err = set_system_power_state(APM_STATE_BUSY);
+ if (err)
+ apm_error("busy", err);
+ }
+ } else
+ pending_count = 4;
+ check_events();
+}
+
+/*
+ * This is the APM thread main loop.
+ */
+
+static void apm_mainloop(void)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(&apm_waitqueue, &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+ for (;;) {
+ schedule_timeout(APM_CHECK_TIMEOUT);
+ if (kthread_should_stop())
+ break;
+ /*
+ * Ok, check all events, check for idle (and mark us sleeping
+ * so as not to count towards the load average)..
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ apm_event_handler();
+ }
+ remove_wait_queue(&apm_waitqueue, &wait);
+}
+
+static int check_apm_user(struct apm_user *as, const char *func)
+{
+ if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) {
+ printk(KERN_ERR "apm: %s passed bad filp\n", func);
+ return 1;
+ }
+ return 0;
+}
+
+static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
+{
+ struct apm_user * as;
+ int i;
+ apm_event_t event;
+
+ as = fp->private_data;
+ if (check_apm_user(as, "read"))
+ return -EIO;
+ if ((int)count < sizeof(apm_event_t))
+ return -EINVAL;
+ if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK))
+ return -EAGAIN;
+ wait_event_interruptible(apm_waitqueue, !queue_empty(as));
+ i = count;
+ while ((i >= sizeof(event)) && !queue_empty(as)) {
+ event = get_queued_event(as);
+ if (copy_to_user(buf, &event, sizeof(event))) {
+ if (i < count)
+ break;
+ return -EFAULT;
+ }
+ switch (event) {
+ case APM_SYS_SUSPEND:
+ case APM_USER_SUSPEND:
+ as->suspends_read++;
+ break;
+
+ case APM_SYS_STANDBY:
+ case APM_USER_STANDBY:
+ as->standbys_read++;
+ break;
+ }
+ buf += sizeof(event);
+ i -= sizeof(event);
+ }
+ if (i < count)
+ return count - i;
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ return 0;
+}
+
+static unsigned int do_poll(struct file *fp, poll_table * wait)
+{
+ struct apm_user * as;
+
+ as = fp->private_data;
+ if (check_apm_user(as, "poll"))
+ return 0;
+ poll_wait(fp, &apm_waitqueue, wait);
+ if (!queue_empty(as))
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
+static int do_ioctl(struct inode * inode, struct file *filp,
+ u_int cmd, u_long arg)
+{
+ struct apm_user * as;
+
+ as = filp->private_data;
+ if (check_apm_user(as, "ioctl"))
+ return -EIO;
+ if ((!as->suser) || (!as->writer))
+ return -EPERM;
+ switch (cmd) {
+ case APM_IOC_STANDBY:
+ if (as->standbys_read > 0) {
+ as->standbys_read--;
+ as->standbys_pending--;
+ standbys_pending--;
+ } else
+ queue_event(APM_USER_STANDBY, as);
+ if (standbys_pending <= 0)
+ standby();
+ break;
+ case APM_IOC_SUSPEND:
+ if (as->suspends_read > 0) {
+ as->suspends_read--;
+ as->suspends_pending--;
+ suspends_pending--;
+ } else
+ queue_event(APM_USER_SUSPEND, as);
+ if (suspends_pending <= 0) {
+ return suspend(1);
+ } else {
+ as->suspend_wait = 1;
+ wait_event_interruptible(apm_suspend_waitqueue,
+ as->suspend_wait == 0);
+ return as->suspend_result;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int do_release(struct inode * inode, struct file * filp)
+{
+ struct apm_user * as;
+
+ as = filp->private_data;
+ if (check_apm_user(as, "release"))
+ return 0;
+ filp->private_data = NULL;
+ if (as->standbys_pending > 0) {
+ standbys_pending -= as->standbys_pending;
+ if (standbys_pending <= 0)
+ standby();
+ }
+ if (as->suspends_pending > 0) {
+ suspends_pending -= as->suspends_pending;
+ if (suspends_pending <= 0)
+ (void) suspend(1);
+ }
+ spin_lock(&user_list_lock);
+ if (user_list == as)
+ user_list = as->next;
+ else {
+ struct apm_user * as1;
+
+ for (as1 = user_list;
+ (as1 != NULL) && (as1->next != as);
+ as1 = as1->next)
+ ;
+ if (as1 == NULL)
+ printk(KERN_ERR "apm: filp not in user list\n");
+ else
+ as1->next = as->next;
+ }
+ spin_unlock(&user_list_lock);
+ kfree(as);
+ return 0;
+}
+
+static int do_open(struct inode * inode, struct file * filp)
+{
+ struct apm_user * as;
+
+ as = kmalloc(sizeof(*as), GFP_KERNEL);
+ if (as == NULL) {
+		printk(KERN_ERR "apm: cannot allocate struct of size %zu bytes\n",
+			sizeof(*as));
+ return -ENOMEM;
+ }
+ as->magic = APM_BIOS_MAGIC;
+ as->event_tail = as->event_head = 0;
+ as->suspends_pending = as->standbys_pending = 0;
+ as->suspends_read = as->standbys_read = 0;
+ /*
+ * XXX - this is a tiny bit broken, when we consider BSD
+ * process accounting. If the device is opened by root, we
+ * instantly flag that we used superuser privs. Who knows,
+ * we might close the device immediately without doing a
+ * privileged operation -- cevans
+ */
+ as->suser = capable(CAP_SYS_ADMIN);
+ as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE;
+ as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ;
+ spin_lock(&user_list_lock);
+ as->next = user_list;
+ user_list = as;
+ spin_unlock(&user_list_lock);
+ filp->private_data = as;
+ return 0;
+}
+
+static int proc_apm_show(struct seq_file *m, void *v)
+{
+ unsigned short bx;
+ unsigned short cx;
+ unsigned short dx;
+ int error;
+ unsigned short ac_line_status = 0xff;
+ unsigned short battery_status = 0xff;
+ unsigned short battery_flag = 0xff;
+ int percentage = -1;
+ int time_units = -1;
+ char *units = "?";
+
+ if ((num_online_cpus() == 1) &&
+ !(error = apm_get_power_status(&bx, &cx, &dx))) {
+ ac_line_status = (bx >> 8) & 0xff;
+ battery_status = bx & 0xff;
+ if ((cx & 0xff) != 0xff)
+ percentage = cx & 0xff;
+
+ if (apm_info.connection_version > 0x100) {
+ battery_flag = (cx >> 8) & 0xff;
+ if (dx != 0xffff) {
+ units = (dx & 0x8000) ? "min" : "sec";
+ time_units = dx & 0x7fff;
+ }
+ }
+ }
+ /* Arguments, with symbols from linux/apm_bios.h. Information is
+ from the Get Power Status (0x0a) call unless otherwise noted.
+
+ 0) Linux driver version (this will change if format changes)
+ 1) APM BIOS Version. Usually 1.0, 1.1 or 1.2.
+ 2) APM flags from APM Installation Check (0x00):
+ bit 0: APM_16_BIT_SUPPORT
+ bit 1: APM_32_BIT_SUPPORT
+ bit 2: APM_IDLE_SLOWS_CLOCK
+ bit 3: APM_BIOS_DISABLED
+ bit 4: APM_BIOS_DISENGAGED
+ 3) AC line status
+ 0x00: Off-line
+ 0x01: On-line
+ 0x02: On backup power (BIOS >= 1.1 only)
+ 0xff: Unknown
+ 4) Battery status
+ 0x00: High
+ 0x01: Low
+ 0x02: Critical
+ 0x03: Charging
+ 0x04: Selected battery not present (BIOS >= 1.2 only)
+ 0xff: Unknown
+ 5) Battery flag
+ bit 0: High
+ bit 1: Low
+ bit 2: Critical
+ bit 3: Charging
+ bit 7: No system battery
+ 0xff: Unknown
+ 6) Remaining battery life (percentage of charge):
+ 0-100: valid
+ -1: Unknown
+ 7) Remaining battery life (time units):
+ Number of remaining minutes or seconds
+ -1: Unknown
+ 8) min = minutes; sec = seconds */
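+
+	/*
+	 * An illustrative (made-up) output line:
+	 *
+	 *	1.16ac 1.2 0x03 0x01 0x03 0x09 95% 177 min
+	 */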
+
+ seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
+ driver_version,
+ (apm_info.bios.version >> 8) & 0xff,
+ apm_info.bios.version & 0xff,
+ apm_info.bios.flags,
+ ac_line_status,
+ battery_status,
+ battery_flag,
+ percentage,
+ time_units,
+ units);
+ return 0;
+}
+
+static int proc_apm_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_apm_show, NULL);
+}
+
+static const struct file_operations apm_file_ops = {
+ .owner = THIS_MODULE,
+ .open = proc_apm_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int apm(void *unused)
+{
+ unsigned short bx;
+ unsigned short cx;
+ unsigned short dx;
+ int error;
+ char * power_stat;
+ char * bat_stat;
+
+#ifdef CONFIG_SMP
+ /* 2002/08/01 - WT
+ * This is to avoid random crashes at boot time during initialization
+ * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D.
+ * Some bioses don't like being called from CPU != 0.
+ * Method suggested by Ingo Molnar.
+ */
+ set_cpus_allowed(current, cpumask_of_cpu(0));
+ BUG_ON(smp_processor_id() != 0);
+#endif
+
+ if (apm_info.connection_version == 0) {
+ apm_info.connection_version = apm_info.bios.version;
+ if (apm_info.connection_version > 0x100) {
+ /*
+ * We only support BIOSs up to version 1.2
+ */
+ if (apm_info.connection_version > 0x0102)
+ apm_info.connection_version = 0x0102;
+ error = apm_driver_version(&apm_info.connection_version);
+ if (error != APM_SUCCESS) {
+ apm_error("driver version", error);
+ /* Fall back to an APM 1.0 connection. */
+ apm_info.connection_version = 0x100;
+ }
+ }
+ }
+
+ if (debug)
+ printk(KERN_INFO "apm: Connection version %d.%d\n",
+ (apm_info.connection_version >> 8) & 0xff,
+ apm_info.connection_version & 0xff);
+
+#ifdef CONFIG_APM_DO_ENABLE
+ if (apm_info.bios.flags & APM_BIOS_DISABLED) {
+ /*
+ * This call causes my NEC UltraLite Versa 33/C to hang if it
+ * is booted with PM disabled but not in the docking station.
+ * Unfortunate ...
+ */
+ error = apm_enable_power_management(1);
+ if (error) {
+ apm_error("enable power management", error);
+ return -1;
+ }
+ }
+#endif
+
+ if ((apm_info.bios.flags & APM_BIOS_DISENGAGED)
+ && (apm_info.connection_version > 0x0100)) {
+ error = apm_engage_power_management(APM_DEVICE_ALL, 1);
+ if (error) {
+ apm_error("engage power management", error);
+ return -1;
+ }
+ }
+
+	if (debug && (num_online_cpus() == 1 || smp)) {
+ error = apm_get_power_status(&bx, &cx, &dx);
+ if (error)
+ printk(KERN_INFO "apm: power status not available\n");
+ else {
+ switch ((bx >> 8) & 0xff) {
+ case 0: power_stat = "off line"; break;
+ case 1: power_stat = "on line"; break;
+ case 2: power_stat = "on backup power"; break;
+ default: power_stat = "unknown"; break;
+ }
+ switch (bx & 0xff) {
+ case 0: bat_stat = "high"; break;
+ case 1: bat_stat = "low"; break;
+ case 2: bat_stat = "critical"; break;
+ case 3: bat_stat = "charging"; break;
+ default: bat_stat = "unknown"; break;
+ }
+ printk(KERN_INFO
+ "apm: AC %s, battery status %s, battery life ",
+ power_stat, bat_stat);
+ if ((cx & 0xff) == 0xff)
+ printk("unknown\n");
+ else
+ printk("%d%%\n", cx & 0xff);
+ if (apm_info.connection_version > 0x100) {
+ printk(KERN_INFO
+ "apm: battery flag 0x%02x, battery life ",
+ (cx >> 8) & 0xff);
+ if (dx == 0xffff)
+ printk("unknown\n");
+ else
+ printk("%d %s\n", dx & 0x7fff,
+ (dx & 0x8000) ?
+ "minutes" : "seconds");
+ }
+ }
+ }
+
+ /* Install our power off handler.. */
+ if (power_off)
+ pm_power_off = apm_power_off;
+
+ if (num_online_cpus() == 1 || smp) {
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+ console_blank_hook = apm_console_blank;
+#endif
+ apm_mainloop();
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+ console_blank_hook = NULL;
+#endif
+ }
+
+ return 0;
+}
+
+#ifndef MODULE
+static int __init apm_setup(char *str)
+{
+ int invert;
+
+ while ((str != NULL) && (*str != '\0')) {
+ if (strncmp(str, "off", 3) == 0)
+ apm_disabled = 1;
+ if (strncmp(str, "on", 2) == 0)
+ apm_disabled = 0;
+ if ((strncmp(str, "bounce-interval=", 16) == 0) ||
+ (strncmp(str, "bounce_interval=", 16) == 0))
+ bounce_interval = simple_strtol(str + 16, NULL, 0);
+ if ((strncmp(str, "idle-threshold=", 15) == 0) ||
+ (strncmp(str, "idle_threshold=", 15) == 0))
+ idle_threshold = simple_strtol(str + 15, NULL, 0);
+ if ((strncmp(str, "idle-period=", 12) == 0) ||
+ (strncmp(str, "idle_period=", 12) == 0))
+ idle_period = simple_strtol(str + 12, NULL, 0);
+ invert = (strncmp(str, "no-", 3) == 0) ||
+ (strncmp(str, "no_", 3) == 0);
+ if (invert)
+ str += 3;
+ if (strncmp(str, "debug", 5) == 0)
+ debug = !invert;
+ if ((strncmp(str, "power-off", 9) == 0) ||
+ (strncmp(str, "power_off", 9) == 0))
+ power_off = !invert;
+		if (strncmp(str, "smp", 3) == 0) {
+			smp = !invert;
+			idle_threshold = 100;
+		}
+ if ((strncmp(str, "allow-ints", 10) == 0) ||
+ (strncmp(str, "allow_ints", 10) == 0))
+ apm_info.allow_ints = !invert;
+ if ((strncmp(str, "broken-psr", 10) == 0) ||
+ (strncmp(str, "broken_psr", 10) == 0))
+ apm_info.get_power_status_broken = !invert;
+ if ((strncmp(str, "realmode-power-off", 18) == 0) ||
+ (strncmp(str, "realmode_power_off", 18) == 0))
+ apm_info.realmode_power_off = !invert;
+ str = strchr(str, ',');
+ if (str != NULL)
+ str += strspn(str, ", \t");
+ }
+ return 1;
+}
+
+__setup("apm=", apm_setup);
+#endif
+
+static const struct file_operations apm_bios_fops = {
+ .owner = THIS_MODULE,
+ .read = do_read,
+ .poll = do_poll,
+ .ioctl = do_ioctl,
+ .open = do_open,
+ .release = do_release,
+};
+
+static struct miscdevice apm_device = {
+ APM_MINOR_DEV,
+ "apm_bios",
+ &apm_bios_fops
+};
+
+
+/* Simple "print if true" callback */
+static int __init print_if_true(struct dmi_system_id *d)
+{
+ printk("%s\n", d->ident);
+ return 0;
+}
+
+/*
+ * Some Bioses enable the PS/2 mouse (touchpad) at resume, even if it was
+ * disabled before the suspend. Linux used to get terribly confused by that.
+ */
+static int __init broken_ps2_resume(struct dmi_system_id *d)
+{
+ printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident);
+ return 0;
+}
+
+/* Some bioses have a broken protected mode poweroff and need to use realmode */
+static int __init set_realmode_power_off(struct dmi_system_id *d)
+{
+ if (apm_info.realmode_power_off == 0) {
+ apm_info.realmode_power_off = 1;
+ printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident);
+ }
+ return 0;
+}
+
+/* Some laptops require interrupts to be enabled during APM calls */
+static int __init set_apm_ints(struct dmi_system_id *d)
+{
+ if (apm_info.allow_ints == 0) {
+ apm_info.allow_ints = 1;
+ printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident);
+ }
+ return 0;
+}
+
+/* Some APM bioses corrupt memory or just plain do not work */
+static int __init apm_is_horked(struct dmi_system_id *d)
+{
+ if (apm_info.disabled == 0) {
+ apm_info.disabled = 1;
+ printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
+ }
+ return 0;
+}
+
+static int __init apm_is_horked_d850md(struct dmi_system_id *d)
+{
+ if (apm_info.disabled == 0) {
+ apm_info.disabled = 1;
+ printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
+		printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n");
+		printk(KERN_INFO "download from support.intel.com\n");
+ }
+ return 0;
+}
+
+/* Some APM BIOSes hang on APM idle calls */
+static int __init apm_likes_to_melt(struct dmi_system_id *d)
+{
+ if (apm_info.forbid_idle == 0) {
+ apm_info.forbid_idle = 1;
+ printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident);
+ }
+ return 0;
+}
+
+/*
+ * Check for clue-free BIOS implementations that use
+ * the following QA technique:
+ *
+ * [ Write BIOS Code ]<------
+ * | ^
+ * < Does it Compile >----N--
+ * |Y ^
+ * < Does it Boot Win98 >-N--
+ * |Y
+ * [Ship It]
+ *
+ * Phoenix A04 08/24/2000 is known bad (Dell Inspiron 5000e)
+ * Phoenix A07 09/29/2000 is known good (Dell Inspiron 5000)
+ */
+static int __init broken_apm_power(struct dmi_system_id *d)
+{
+ apm_info.get_power_status_broken = 1;
+ printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n");
+ return 0;
+}
+
+/*
+ * This BIOS swaps the bytes of the APM minute reporting value (many Sony
+ * laptops have this problem).
+ */
+static int __init swab_apm_power_in_minutes(struct dmi_system_id *d)
+{
+ apm_info.get_power_status_swabinminutes = 1;
+ printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n");
+ return 0;
+}
+
+static struct dmi_system_id __initdata apm_dmi_table[] = {
+ {
+ print_if_true,
+ KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.",
+ { DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), },
+ },
+ { /* Handle problems with APM on the C600 */
+ broken_ps2_resume, "Dell Latitude C600",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), },
+ },
+ { /* Allow interrupts during suspend on Dell Latitude laptops*/
+ set_apm_ints, "Dell Latitude",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), }
+ },
+ { /* APM crashes */
+ apm_is_horked, "Dell Inspiron 2500",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
+	  DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+	  DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
+ },
+ { /* Allow interrupts during suspend on Dell Inspiron laptops*/
+ set_apm_ints, "Dell Inspiron", {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), },
+ },
+ { /* Handle problems with APM on Inspiron 5000e */
+ broken_apm_power, "Dell Inspiron 5000e",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A04"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), },
+ },
+ { /* Handle problems with APM on Inspiron 2500 */
+ broken_apm_power, "Dell Inspiron 2500",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A12"),
+ DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Dell Dimension 4100",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"),
+	  DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+	  DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
+ },
+ { /* Allow interrupts during suspend on Compaq Laptops*/
+ set_apm_ints, "Compaq 12XL125",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+	  DMI_MATCH(DMI_BIOS_VERSION, "4.06"), },
+ },
+ { /* Allow interrupts during APM or the clock goes slow */
+ set_apm_ints, "ASUSTeK",
+ { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), },
+ },
+ { /* APM blows on shutdown */
+ apm_is_horked, "ABIT KX7-333[R]",
+ { DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"),
+ DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Trigem Delhi3",
+ { DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Fujitsu-Siemens",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"),
+ DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), },
+ },
+ { /* APM crashes */
+ apm_is_horked_d850md, "Intel D850MD",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+ DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Intel D810EMO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+ DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Dell XPS-Z",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+ DMI_MATCH(DMI_BIOS_VERSION, "A11"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Sharp PC-PJ/AX",
+ { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"),
+	  DMI_MATCH(DMI_BIOS_VENDOR, "SystemSoft"),
+	  DMI_MATCH(DMI_BIOS_VERSION, "Version R2.08"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Dell Inspiron 2500",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
+	  DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+	  DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
+ },
+ { /* APM idle hangs */
+ apm_likes_to_melt, "Jabil AMD",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+ DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), },
+ },
+ { /* APM idle hangs */
+ apm_likes_to_melt, "AMI Bios",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+ DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0206H"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-N505VX */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"),
+ DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-XG29 */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"),
+ DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z600NE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"),
+ DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z600NE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"),
+ DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z505LS */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"),
+ DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z505LS */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"),
+ DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-F104K */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), },
+ },
+
+ { /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"),
+ DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-C1VE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"),
+ DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-C1VE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"),
+ DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), },
+ },
+ { /* broken PM poweroff bios */
+ set_realmode_power_off, "Award Software v4.60 PGMA",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."),
+ DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"),
+ DMI_MATCH(DMI_BIOS_DATE, "134526184"), },
+ },
+
+ /* Generic per vendor APM settings */
+
+ { /* Allow interrupts during suspend on IBM laptops */
+ set_apm_ints, "IBM",
+ { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), },
+ },
+
+ { }
+};
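
dmi_check_system() consumes the table above by comparing each DMI_MATCH() substring against the strings the BIOS publishes in its DMI/SMBIOS area, and fires the entry's callback when every field matches. Below is a simplified stand-alone sketch of that walk; dmi_get_string(), the two-slot field layout, and say_hello() are illustrative assumptions, not the kernel's real implementation:

#include <stdio.h>
#include <string.h>

/* Illustrative stand-ins for the kernel's DMI structures. */
struct dmi_strmatch { int slot; const char *substr; };
struct dmi_id {
	int (*callback)(const struct dmi_id *);
	const char *ident;
	struct dmi_strmatch matches[4];
};

/* Hypothetical: returns the BIOS-provided string for a DMI field slot. */
static const char *dmi_get_string(int slot)
{
	static const char *fields[] = { "Dell Computer Corporation",
					"Latitude C510" };
	return fields[slot];
}

/* Every non-empty match must be a substring of the BIOS string. */
static int dmi_entry_matches(const struct dmi_id *d)
{
	int i;

	for (i = 0; i < 4 && d->matches[i].substr; i++)
		if (!strstr(dmi_get_string(d->matches[i].slot),
			    d->matches[i].substr))
			return 0;
	return 1;
}

static int say_hello(const struct dmi_id *d)
{
	printf("%s machine detected.\n", d->ident);
	return 0;
}

int main(void)
{
	struct dmi_id table[] = {
		{ say_hello, "Dell Latitude",
		  { { 0, "Dell Computer Corporation" }, { 1, "Latitude" } } },
		{ NULL }	/* terminator, like the { } above */
	};
	const struct dmi_id *d;

	for (d = table; d->callback; d++)	/* the dmi_check_system() walk */
		if (dmi_entry_matches(d))
			d->callback(d);
	return 0;
}
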
+
+/*
+ * Just start the APM thread. We do NOT want to do APM BIOS
+ * calls from anything but the APM thread, if for no other reason
+ * than the fact that we don't trust the APM BIOS. This way,
+ * most common APM BIOS problems that lead to protection errors
+ * etc. will at least be contained to some degree...
+ *
+ * In short, if something bad happens, at least we have a choice
+ * of just killing the apm thread...
+ */
+static int __init apm_init(void)
+{
+ struct proc_dir_entry *apm_proc;
+ struct desc_struct *gdt;
+ int err;
+
+ dmi_check_system(apm_dmi_table);
+
+ if (apm_info.bios.version == 0 || paravirt_enabled()) {
+ printk(KERN_INFO "apm: BIOS not found.\n");
+ return -ENODEV;
+ }
+ printk(KERN_INFO
+ "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
+ ((apm_info.bios.version >> 8) & 0xff),
+ (apm_info.bios.version & 0xff),
+ apm_info.bios.flags,
+ driver_version);
+ if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) {
+ printk(KERN_INFO "apm: no 32 bit BIOS support\n");
+ return -ENODEV;
+ }
+
+ if (allow_ints)
+ apm_info.allow_ints = 1;
+ if (broken_psr)
+ apm_info.get_power_status_broken = 1;
+ if (realmode_power_off)
+ apm_info.realmode_power_off = 1;
+ /* User can override, but default is to trust DMI */
+ if (apm_disabled != -1)
+ apm_info.disabled = apm_disabled;
+
+ /*
+ * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1
+ * but is reportedly a 1.0 BIOS.
+ */
+ if (apm_info.bios.version == 0x001)
+ apm_info.bios.version = 0x100;
+
+ /* BIOS < 1.2 doesn't set cseg_16_len */
+ if (apm_info.bios.version < 0x102)
+ apm_info.bios.cseg_16_len = 0; /* 64k */
+
+ if (debug) {
+ printk(KERN_INFO "apm: entry %x:%x cseg16 %x dseg %x",
+ apm_info.bios.cseg, apm_info.bios.offset,
+ apm_info.bios.cseg_16, apm_info.bios.dseg);
+ if (apm_info.bios.version > 0x100)
+ printk(" cseg len %x, dseg len %x",
+ apm_info.bios.cseg_len,
+ apm_info.bios.dseg_len);
+ if (apm_info.bios.version > 0x101)
+ printk(" cseg16 len %x", apm_info.bios.cseg_16_len);
+ printk("\n");
+ }
+
+ if (apm_info.disabled) {
+ printk(KERN_NOTICE "apm: disabled on user request.\n");
+ return -ENODEV;
+ }
+ if ((num_online_cpus() > 1) && !power_off && !smp) {
+ printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n");
+ apm_info.disabled = 1;
+ return -ENODEV;
+ }
+ if (PM_IS_ACTIVE()) {
+ printk(KERN_NOTICE "apm: overridden by ACPI.\n");
+ apm_info.disabled = 1;
+ return -ENODEV;
+ }
+#ifdef CONFIG_PM_LEGACY
+ pm_active = 1;
+#endif
+
+ /*
+ * Set up a segment that references the real mode segment 0x40
+ * that extends up to the end of page zero (that we have reserved).
+	 * This is for buggy BIOSes that refer to (real-mode) segment 0x40
+ * even though they are called in protected mode.
+ */
+ set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
+ _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
+
+ /*
+ * Set up the long jump entry point to the APM BIOS, which is called
+ * from inline assembly.
+ */
+ apm_bios_entry.offset = apm_info.bios.offset;
+ apm_bios_entry.segment = APM_CS;
+
+ /*
+ * The APM 1.1 BIOS is supposed to provide limit information that it
+ * recognizes. Many machines do this correctly, but many others do
+ * not restrict themselves to their claimed limit. When this happens,
+ * they will cause a segmentation violation in the kernel at boot time.
+	 * Most BIOSes, however, will respect a 64k limit, so we use that.
+ *
+ * Note we only set APM segments on CPU zero, since we pin the APM
+ * code to that CPU.
+ */
+ gdt = get_cpu_gdt_table(0);
+ set_base(gdt[APM_CS >> 3],
+ __va((unsigned long)apm_info.bios.cseg << 4));
+ set_base(gdt[APM_CS_16 >> 3],
+ __va((unsigned long)apm_info.bios.cseg_16 << 4));
+ set_base(gdt[APM_DS >> 3],
+ __va((unsigned long)apm_info.bios.dseg << 4));
+
+ apm_proc = create_proc_entry("apm", 0, NULL);
+ if (apm_proc)
+ apm_proc->proc_fops = &apm_file_ops;
+
+ kapmd_task = kthread_create(apm, NULL, "kapmd");
+ if (IS_ERR(kapmd_task)) {
+ printk(KERN_ERR "apm: disabled - Unable to start kernel "
+ "thread.\n");
+ err = PTR_ERR(kapmd_task);
+ kapmd_task = NULL;
+ remove_proc_entry("apm", NULL);
+ return err;
+ }
+ wake_up_process(kapmd_task);
+
+	if (num_online_cpus() > 1 && !smp) {
+ printk(KERN_NOTICE
+ "apm: disabled - APM is not SMP safe (power off active).\n");
+ return 0;
+ }
+
+ /*
+ * Note we don't actually care if the misc_device cannot be registered.
+	 * This driver can do its job without it, even if userspace can't
+	 * control it. Just log the error.
+ */
+ if (misc_register(&apm_device))
+ printk(KERN_WARNING "apm: Could not register misc device.\n");
+
+ if (HZ != 100)
+ idle_period = (idle_period * HZ) / 100;
+ if (idle_threshold < 100) {
+ original_pm_idle = pm_idle;
+ pm_idle = apm_cpu_idle;
+ set_pm_idle = 1;
+ }
+
+ return 0;
+}
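
The segment setup in apm_init() above leans on the real-mode rule that a segment value names a 16-byte paragraph, so its physical base is the segment shifted left by four. The bad_bios_desc limit of 4095 - (0x40 << 4) then lets that segment reach exactly to the end of page zero. A two-line check of the arithmetic (plain C, no kernel macros):

#include <stdio.h>

int main(void)
{
	unsigned long seg = 0x40;			/* BIOS data area segment */
	unsigned long base = seg << 4;			/* paragraph rule: 0x400  */
	unsigned long limit = 4095 - (seg << 4);	/* 0xbff                  */

	/* last addressable byte is base + limit = 0xfff, end of page zero */
	printf("base=%#lx limit=%#lx top=%#lx\n", base, limit, base + limit);
	return 0;
}
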
+
+static void __exit apm_exit(void)
+{
+ int error;
+
+ if (set_pm_idle) {
+ pm_idle = original_pm_idle;
+ /*
+ * We are about to unload the current idle thread pm callback
+		 * (pm_idle). Wait for all processors to update cached/local
+ * copies of pm_idle before proceeding.
+ */
+ cpu_idle_wait();
+ }
+ if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
+ && (apm_info.connection_version > 0x0100)) {
+ error = apm_engage_power_management(APM_DEVICE_ALL, 0);
+ if (error)
+ apm_error("disengage power management", error);
+ }
+ misc_deregister(&apm_device);
+ remove_proc_entry("apm", NULL);
+ if (power_off)
+ pm_power_off = NULL;
+ if (kapmd_task) {
+ kthread_stop(kapmd_task);
+ kapmd_task = NULL;
+ }
+#ifdef CONFIG_PM_LEGACY
+ pm_active = 0;
+#endif
+}
+
+module_init(apm_init);
+module_exit(apm_exit);
+
+MODULE_AUTHOR("Stephen Rothwell");
+MODULE_DESCRIPTION("Advanced Power Management");
+MODULE_LICENSE("GPL");
+module_param(debug, bool, 0644);
+MODULE_PARM_DESC(debug, "Enable debug mode");
+module_param(power_off, bool, 0444);
+MODULE_PARM_DESC(power_off, "Enable power off");
+module_param(bounce_interval, int, 0444);
+MODULE_PARM_DESC(bounce_interval,
+ "Set the number of ticks to ignore suspend bounces");
+module_param(allow_ints, bool, 0444);
+MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls");
+module_param(broken_psr, bool, 0444);
+MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call");
+module_param(realmode_power_off, bool, 0444);
+MODULE_PARM_DESC(realmode_power_off,
+ "Switch to real mode before powering off");
+module_param(idle_threshold, int, 0444);
+MODULE_PARM_DESC(idle_threshold,
+ "System idle percentage above which to make APM BIOS idle calls");
+module_param(idle_period, int, 0444);
+MODULE_PARM_DESC(idle_period,
+	"Period (in sec/100) over which to calculate the idle percentage");
+module_param(smp, bool, 0444);
+MODULE_PARM_DESC(smp,
+ "Set this to enable APM use on an SMP platform. Use with caution on older systems");
+MODULE_ALIAS_MISCDEV(APM_MINOR_DEV);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
new file mode 100644
index 000000000000..cfa82c899f47
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets.c
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "asm-offsets_32.c"
+#else
+# include "asm-offsets_64.c"
+#endif
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
new file mode 100644
index 000000000000..8029742c0fc1
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -0,0 +1,147 @@
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed
+ * to extract and format the required data.
+ */
+
+#include <linux/crypto.h>
+#include <linux/sched.h>
+#include <linux/signal.h>
+#include <linux/personality.h>
+#include <linux/suspend.h>
+#include <asm/ucontext.h>
+#include "sigframe_32.h"
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+#include <asm/thread_info.h>
+#include <asm/elf.h>
+
+#include <xen/interface/xen.h>
+
+#ifdef CONFIG_LGUEST_GUEST
+#include <linux/lguest.h>
+#include "../../../drivers/lguest/lg.h"
+#endif
+
+#define DEFINE(sym, val) \
+ asm volatile("\n->" #sym " %0 " #val : : "i" (val))
+
+#define BLANK() asm volatile("\n->" : : )
+
+#define OFFSET(sym, str, mem) \
+ DEFINE(sym, offsetof(struct str, mem));
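
The DEFINE() trick above only ever goes through "gcc -S": the "i" constraint makes the compiler print each offsetof() result as a literal operand in its assembly output, and the build's sed pass harvests the "->" marker lines into asm-offsets.h as #define constants usable from .S files. A minimal compilable illustration (the demo struct and symbol name are hypothetical; never assemble the output, the marker lines are not valid assembler input):

#include <stddef.h>

/* Compile with "gcc -S" ONLY and inspect the .s file; the "->" marker
 * lines are meant to be harvested by sed, not fed to the assembler.
 */
struct pt_regs_demo { long ebx, ecx, edx, esi, edi, ebp, eax; };

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

void foo(void)
{
	/* emits "->PT_EAX_demo $24 offsetof(...)" on i386 ($48 on x86-64) */
	DEFINE(PT_EAX_demo, offsetof(struct pt_regs_demo, eax));
}
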
+
+/* workaround for a warning with -Wmissing-prototypes */
+void foo(void);
+
+void foo(void)
+{
+ OFFSET(SIGCONTEXT_eax, sigcontext, eax);
+ OFFSET(SIGCONTEXT_ebx, sigcontext, ebx);
+ OFFSET(SIGCONTEXT_ecx, sigcontext, ecx);
+ OFFSET(SIGCONTEXT_edx, sigcontext, edx);
+ OFFSET(SIGCONTEXT_esi, sigcontext, esi);
+ OFFSET(SIGCONTEXT_edi, sigcontext, edi);
+ OFFSET(SIGCONTEXT_ebp, sigcontext, ebp);
+ OFFSET(SIGCONTEXT_esp, sigcontext, esp);
+ OFFSET(SIGCONTEXT_eip, sigcontext, eip);
+ BLANK();
+
+ OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
+ OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
+ OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
+ OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
+ OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math);
+ OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
+ OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
+ OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
+ BLANK();
+
+ OFFSET(TI_task, thread_info, task);
+ OFFSET(TI_exec_domain, thread_info, exec_domain);
+ OFFSET(TI_flags, thread_info, flags);
+ OFFSET(TI_status, thread_info, status);
+ OFFSET(TI_preempt_count, thread_info, preempt_count);
+ OFFSET(TI_addr_limit, thread_info, addr_limit);
+ OFFSET(TI_restart_block, thread_info, restart_block);
+ OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+ OFFSET(TI_cpu, thread_info, cpu);
+ BLANK();
+
+ OFFSET(GDS_size, Xgt_desc_struct, size);
+ OFFSET(GDS_address, Xgt_desc_struct, address);
+ OFFSET(GDS_pad, Xgt_desc_struct, pad);
+ BLANK();
+
+ OFFSET(PT_EBX, pt_regs, ebx);
+ OFFSET(PT_ECX, pt_regs, ecx);
+ OFFSET(PT_EDX, pt_regs, edx);
+ OFFSET(PT_ESI, pt_regs, esi);
+ OFFSET(PT_EDI, pt_regs, edi);
+ OFFSET(PT_EBP, pt_regs, ebp);
+ OFFSET(PT_EAX, pt_regs, eax);
+ OFFSET(PT_DS, pt_regs, xds);
+ OFFSET(PT_ES, pt_regs, xes);
+ OFFSET(PT_FS, pt_regs, xfs);
+ OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
+ OFFSET(PT_EIP, pt_regs, eip);
+ OFFSET(PT_CS, pt_regs, xcs);
+ OFFSET(PT_EFLAGS, pt_regs, eflags);
+ OFFSET(PT_OLDESP, pt_regs, esp);
+ OFFSET(PT_OLDSS, pt_regs, xss);
+ BLANK();
+
+ OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
+ OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
+ BLANK();
+
+ OFFSET(pbe_address, pbe, address);
+ OFFSET(pbe_orig_address, pbe, orig_address);
+ OFFSET(pbe_next, pbe, next);
+
+ /* Offset from the sysenter stack to tss.esp0 */
+ DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) -
+ sizeof(struct tss_struct));
+
+ DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
+ DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
+ DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
+ DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
+ DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
+
+ DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
+
+ OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+
+#ifdef CONFIG_PARAVIRT
+ BLANK();
+ OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
+ OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
+ OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
+ OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
+ OFFSET(PARAVIRT_iret, paravirt_ops, iret);
+ OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+#endif
+
+#ifdef CONFIG_XEN
+ BLANK();
+ OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+ OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#endif
+
+#ifdef CONFIG_LGUEST_GUEST
+ BLANK();
+ OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+ OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
+ OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
+ OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
+ OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
+	OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages, state.guest_gdt_desc);
+	OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages, state.guest_idt_desc);
+ OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
+ OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
+ OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
+ OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
+#endif
+}
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
new file mode 100644
index 000000000000..0b9860530a6b
--- /dev/null
+++ b/arch/x86/kernel/bootflag.c
@@ -0,0 +1,98 @@
+/*
+ * Implement 'Simple Boot Flag Specification 2.0'
+ */
+
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/acpi.h>
+#include <asm/io.h>
+
+#include <linux/mc146818rtc.h>
+
+
+#define SBF_RESERVED (0x78)
+#define SBF_PNPOS (1<<0)
+#define SBF_BOOTING (1<<1)
+#define SBF_DIAG (1<<2)
+#define SBF_PARITY (1<<7)
+
+
+int sbf_port __initdata = -1; /* set via acpi_boot_init() */
+
+
+static int __init parity(u8 v)
+{
+ int x = 0;
+ int i;
+
+ for(i=0;i<8;i++)
+ {
+ x^=(v&1);
+ v>>=1;
+ }
+ return x;
+}
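
parity() above XOR-folds the eight bits, returning 1 for an odd number of set bits; sbf_write() then forces the whole byte to odd parity, as the Simple Boot Flag spec requires, by setting bit 7 (SBF_PARITY) whenever the low bits come out even. The same fold, checkable stand-alone:

#include <stdio.h>

static int parity(unsigned char v)
{
	int x = 0, i;

	for (i = 0; i < 8; i++) {	/* XOR-fold all eight bits */
		x ^= (v & 1);
		v >>= 1;
	}
	return x;			/* 1 = odd number of set bits */
}

int main(void)
{
	unsigned char v = 0x03;		/* two bits set: even parity */

	if (!parity(v))
		v |= 0x80;		/* SBF_PARITY: force odd parity */
	printf("v=%#x parity=%d\n", v, parity(v));	/* v=0x83 parity=1 */
	return 0;
}
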
+
+static void __init sbf_write(u8 v)
+{
+ unsigned long flags;
+ if(sbf_port != -1)
+ {
+ v &= ~SBF_PARITY;
+ if(!parity(v))
+ v|=SBF_PARITY;
+
+ printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v);
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ CMOS_WRITE(v, sbf_port);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ }
+}
+
+static u8 __init sbf_read(void)
+{
+ u8 v;
+ unsigned long flags;
+ if(sbf_port == -1)
+ return 0;
+ spin_lock_irqsave(&rtc_lock, flags);
+ v = CMOS_READ(sbf_port);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return v;
+}
+
+static int __init sbf_value_valid(u8 v)
+{
+ if(v&SBF_RESERVED) /* Reserved bits */
+ return 0;
+ if(!parity(v))
+ return 0;
+ return 1;
+}
+
+static int __init sbf_init(void)
+{
+ u8 v;
+ if(sbf_port == -1)
+ return 0;
+ v = sbf_read();
+ if(!sbf_value_valid(v))
+ printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v);
+
+ v &= ~SBF_RESERVED;
+ v &= ~SBF_BOOTING;
+ v &= ~SBF_DIAG;
+#if defined(CONFIG_ISAPNP)
+ v |= SBF_PNPOS;
+#endif
+ sbf_write(v);
+ return 0;
+}
+
+module_init(sbf_init);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
new file mode 100644
index 000000000000..5c2faa10e9fa
--- /dev/null
+++ b/arch/x86/kernel/cpuid.c
@@ -0,0 +1,242 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2000 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ * USA; either version 2 of the License, or (at your option) any later
+ * version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * cpuid.c
+ *
+ * x86 CPUID access device
+ *
+ * This device is accessed by lseek() to the appropriate CPUID level
+ * and then read in chunks of 16 bytes. A larger size means multiple
+ * reads of consecutive levels.
+ *
+ * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on
+ * an SMP box will direct the access to CPU %d.
+ */
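
A minimal user-space consumer of the interface just described, assuming the /dev/cpu/0/cpuid node exists and CPU 0 is online; pread() supplies the lseek()-then-read() pattern in one call:

#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>

int main(void)
{
	uint32_t regs[4];	/* eax, ebx, ecx, edx for the level read */
	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* The file offset selects the CPUID level; reads are 16 bytes. */
	if (pread(fd, regs, sizeof(regs), 0) != sizeof(regs)) {
		perror("pread");
		close(fd);
		return 1;
	}
	printf("level 0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
	       regs[0], regs[1], regs[2], regs[3]);
	close(fd);
	return 0;
}
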
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/smp.h>
+#include <linux/major.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+static struct class *cpuid_class;
+
+#ifdef CONFIG_SMP
+
+struct cpuid_command {
+ u32 reg;
+ u32 *data;
+};
+
+static void cpuid_smp_cpuid(void *cmd_block)
+{
+ struct cpuid_command *cmd = (struct cpuid_command *)cmd_block;
+
+ cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
+ &cmd->data[3]);
+}
+
+static inline void do_cpuid(int cpu, u32 reg, u32 * data)
+{
+ struct cpuid_command cmd;
+
+ preempt_disable();
+ if (cpu == smp_processor_id()) {
+ cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
+ } else {
+ cmd.reg = reg;
+ cmd.data = data;
+
+ smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
+ }
+ preempt_enable();
+}
+#else /* ! CONFIG_SMP */
+
+static inline void do_cpuid(int cpu, u32 reg, u32 * data)
+{
+ cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
+}
+
+#endif /* ! CONFIG_SMP */
+
+static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
+{
+ loff_t ret;
+
+ lock_kernel();
+
+ switch (orig) {
+ case 0:
+ file->f_pos = offset;
+ ret = file->f_pos;
+ break;
+ case 1:
+ file->f_pos += offset;
+ ret = file->f_pos;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ unlock_kernel();
+ return ret;
+}
+
+static ssize_t cpuid_read(struct file *file, char __user *buf,
+ size_t count, loff_t * ppos)
+{
+ char __user *tmp = buf;
+ u32 data[4];
+ u32 reg = *ppos;
+ int cpu = iminor(file->f_path.dentry->d_inode);
+
+ if (count % 16)
+ return -EINVAL; /* Invalid chunk size */
+
+ for (; count; count -= 16) {
+ do_cpuid(cpu, reg, data);
+ if (copy_to_user(tmp, &data, 16))
+ return -EFAULT;
+ tmp += 16;
+		*ppos = ++reg;	/* position past the level just read */
+ }
+
+ return tmp - buf;
+}
+
+static int cpuid_open(struct inode *inode, struct file *file)
+{
+ unsigned int cpu = iminor(file->f_path.dentry->d_inode);
+	struct cpuinfo_x86 *c = &cpu_data[cpu];
+
+ if (cpu >= NR_CPUS || !cpu_online(cpu))
+ return -ENXIO; /* No such CPU */
+ if (c->cpuid_level < 0)
+ return -EIO; /* CPUID not supported */
+
+ return 0;
+}
+
+/*
+ * File operations we support
+ */
+static const struct file_operations cpuid_fops = {
+ .owner = THIS_MODULE,
+ .llseek = cpuid_seek,
+ .read = cpuid_read,
+ .open = cpuid_open,
+};
+
+static int cpuid_device_create(int i)
+{
+ int err = 0;
+ struct device *dev;
+
+	dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i),
+			    "cpu%d", i);
+ if (IS_ERR(dev))
+ err = PTR_ERR(dev);
+ return err;
+}
+
+static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ cpuid_device_create(cpu);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
+{
+ .notifier_call = cpuid_class_cpu_callback,
+};
+
+static int __init cpuid_init(void)
+{
+	int i, err = 0;
+
+ if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) {
+ printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
+ CPUID_MAJOR);
+ err = -EBUSY;
+ goto out;
+ }
+ cpuid_class = class_create(THIS_MODULE, "cpuid");
+ if (IS_ERR(cpuid_class)) {
+ err = PTR_ERR(cpuid_class);
+ goto out_chrdev;
+ }
+ for_each_online_cpu(i) {
+ err = cpuid_device_create(i);
+ if (err != 0)
+ goto out_class;
+ }
+ register_hotcpu_notifier(&cpuid_class_cpu_notifier);
+
+ err = 0;
+ goto out;
+
+out_class:
+	for_each_online_cpu(i) {
+ device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i));
+ }
+ class_destroy(cpuid_class);
+out_chrdev:
+ unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
+out:
+ return err;
+}
+
+static void __exit cpuid_exit(void)
+{
+ int cpu = 0;
+
+ for_each_online_cpu(cpu)
+ device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
+ class_destroy(cpuid_class);
+ unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
+ unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
+}
+
+module_init(cpuid_init);
+module_exit(cpuid_exit);
+
+MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
+MODULE_DESCRIPTION("x86 generic CPUID driver");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/crash_32.c b/arch/x86/kernel/crash_32.c
new file mode 100644
index 000000000000..53589d1b1a05
--- /dev/null
+++ b/arch/x86/kernel/crash_32.c
@@ -0,0 +1,137 @@
+/*
+ * Architecture specific (i386) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <linux/kdebug.h>
+#include <asm/smp.h>
+
+#include <mach_ipi.h>
+
+
+/* This keeps track of which cpu is the crashing one. */
+static int crashing_cpu;
+
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
+static atomic_t waiting_for_crash_ipi;
+
+static int crash_nmi_callback(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct pt_regs *regs;
+ struct pt_regs fixed_regs;
+ int cpu;
+
+ if (val != DIE_NMI_IPI)
+ return NOTIFY_OK;
+
+ regs = ((struct die_args *)data)->regs;
+ cpu = raw_smp_processor_id();
+
+	/* Don't do anything if this handler is invoked on the crashing cpu.
+	 * Otherwise, the system will completely hang. The crashing cpu can
+	 * get an NMI if the system was initially booted with the
+	 * nmi_watchdog parameter.
+	 */
+ if (cpu == crashing_cpu)
+ return NOTIFY_STOP;
+ local_irq_disable();
+
+ if (!user_mode_vm(regs)) {
+ crash_fixup_ss_esp(&fixed_regs, regs);
+ regs = &fixed_regs;
+ }
+ crash_save_cpu(regs, cpu);
+ disable_local_APIC();
+ atomic_dec(&waiting_for_crash_ipi);
+ /* Assume hlt works */
+ halt();
+ for (;;)
+ cpu_relax();
+
+ return 1;
+}
+
+static void smp_send_nmi_allbutself(void)
+{
+ cpumask_t mask = cpu_online_map;
+ cpu_clear(safe_smp_processor_id(), mask);
+ if (!cpus_empty(mask))
+ send_IPI_mask(mask, NMI_VECTOR);
+}
+
+static struct notifier_block crash_nmi_nb = {
+ .notifier_call = crash_nmi_callback,
+};
+
+static void nmi_shootdown_cpus(void)
+{
+ unsigned long msecs;
+
+ atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+ /* Would it be better to replace the trap vector here? */
+ if (register_die_notifier(&crash_nmi_nb))
+ return; /* return what? */
+ /* Ensure the new callback function is set before sending
+ * out the NMI
+ */
+ wmb();
+
+ smp_send_nmi_allbutself();
+
+ msecs = 1000; /* Wait at most a second for the other cpus to stop */
+ while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+ mdelay(1);
+ msecs--;
+ }
+
+ /* Leave the nmi callback set */
+ disable_local_APIC();
+}
+#else
+static void nmi_shootdown_cpus(void)
+{
+ /* There are no cpus to shootdown */
+}
+#endif
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+ /* This function is only called after the system
+ * has panicked or is otherwise in a critical state.
+ * The minimum amount of code to allow a kexec'd kernel
+ * to run successfully needs to happen here.
+ *
+ * In practice this means shooting down the other cpus in
+ * an SMP system.
+ */
+ /* The kernel is broken so disable interrupts */
+ local_irq_disable();
+
+ /* Make a note of crashing cpu. Will be used in NMI callback.*/
+ crashing_cpu = safe_smp_processor_id();
+ nmi_shootdown_cpus();
+ lapic_shutdown();
+#if defined(CONFIG_X86_IO_APIC)
+ disable_IO_APIC();
+#endif
+ crash_save_cpu(regs, safe_smp_processor_id());
+}
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
new file mode 100644
index 000000000000..3f532df488bc
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -0,0 +1,74 @@
+/*
+ * arch/x86/kernel/crash_dump_32.c - Memory-preserving reboot related code.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ * Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/crash_dump.h>
+
+#include <asm/uaccess.h>
+
+static void *kdump_buf_page;
+
+/**
+ * copy_oldmem_page - copy one page from "oldmem"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem". For this page, there is no pte mapped
+ * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ *
+ * Calling copy_to_user() in atomic context is not desirable. Hence we
+ * first copy the data to a pre-allocated kernel page and then copy it to
+ * user space in non-atomic context.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+ size_t csize, unsigned long offset, int userbuf)
+{
+ void *vaddr;
+
+ if (!csize)
+ return 0;
+
+ vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
+
+ if (!userbuf) {
+ memcpy(buf, (vaddr + offset), csize);
+ kunmap_atomic(vaddr, KM_PTE0);
+ } else {
+ if (!kdump_buf_page) {
+ printk(KERN_WARNING "Kdump: Kdump buffer page not"
+ " allocated\n");
+ return -EFAULT;
+ }
+ copy_page(kdump_buf_page, vaddr);
+ kunmap_atomic(vaddr, KM_PTE0);
+ if (copy_to_user(buf, (kdump_buf_page + offset), csize))
+ return -EFAULT;
+ }
+
+ return csize;
+}
+
+static int __init kdump_buf_page_init(void)
+{
+ int ret = 0;
+
+ kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!kdump_buf_page) {
+ printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
+ " page\n");
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+arch_initcall(kdump_buf_page_init);
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
new file mode 100644
index 000000000000..40978af630e7
--- /dev/null
+++ b/arch/x86/kernel/doublefault_32.c
@@ -0,0 +1,70 @@
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+
+#define DOUBLEFAULT_STACKSIZE (1024)
+static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
+#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
+
+#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
+
+static void doublefault_fn(void)
+{
+ struct Xgt_desc_struct gdt_desc = {0, 0};
+ unsigned long gdt, tss;
+
+ store_gdt(&gdt_desc);
+ gdt = gdt_desc.address;
+
+ printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+
+ if (ptr_ok(gdt)) {
+ gdt += GDT_ENTRY_TSS << 3;
+ tss = *(u16 *)(gdt+2);
+ tss += *(u8 *)(gdt+4) << 16;
+ tss += *(u8 *)(gdt+7) << 24;
+ printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
+
+ if (ptr_ok(tss)) {
+ struct i386_hw_tss *t = (struct i386_hw_tss *)tss;
+
+ printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp);
+
+ printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
+ t->eax, t->ebx, t->ecx, t->edx);
+ printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
+ t->esi, t->edi);
+ }
+ }
+
+ for (;;)
+ cpu_relax();
+}
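
The three additions above reassemble the 32-bit TSS base from its scattered fields in the 8-byte GDT descriptor: bytes 2-3 carry base bits 0-15, byte 4 bits 16-23, and byte 7 bits 24-31. The same extraction over a fabricated descriptor:

#include <stdio.h>
#include <stdint.h>

/* Rebuild a descriptor's 32-bit base the way doublefault_fn() does. */
static uint32_t desc_base(const uint8_t *d)
{
	uint32_t base;

	base  = *(const uint16_t *)(d + 2);	/* base bits 0..15  */
	base += (uint32_t)d[4] << 16;		/* base bits 16..23 */
	base += (uint32_t)d[7] << 24;		/* base bits 24..31 */
	return base;
}

int main(void)
{
	/* Fabricated descriptor with base 0xc1234567 scattered in place. */
	uint8_t desc[8] = { 0, 0, 0x67, 0x45, 0x23, 0, 0, 0xc1 };

	printf("base=%#x\n", desc_base(desc));	/* prints base=0xc1234567 */
	return 0;
}
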
+
+struct tss_struct doublefault_tss __cacheline_aligned = {
+ .x86_tss = {
+ .esp0 = STACK_START,
+ .ss0 = __KERNEL_DS,
+ .ldt = 0,
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+
+ .eip = (unsigned long) doublefault_fn,
+ /* 0x2 bit is always set */
+ .eflags = X86_EFLAGS_SF | 0x2,
+ .esp = STACK_START,
+ .es = __USER_DS,
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+ .ds = __USER_DS,
+ .fs = __KERNEL_PERCPU,
+
+ .__cr3 = __pa(swapper_pg_dir)
+ }
+};
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
new file mode 100644
index 000000000000..3c86b979a40a
--- /dev/null
+++ b/arch/x86/kernel/e820_32.c
@@ -0,0 +1,944 @@
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/efi.h>
+#include <linux/pfn.h>
+#include <linux/uaccess.h>
+#include <linux/suspend.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_EFI
+int efi_enabled = 0;
+EXPORT_SYMBOL(efi_enabled);
+#endif
+
+struct e820map e820;
+struct change_member {
+ struct e820entry *pbios; /* pointer to original bios entry */
+ unsigned long long addr; /* address for this change point */
+};
+static struct change_member change_point_list[2*E820MAX] __initdata;
+static struct change_member *change_point[2*E820MAX] __initdata;
+static struct e820entry *overlap_list[E820MAX] __initdata;
+static struct e820entry new_bios[E820MAX] __initdata;
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0x10000000;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
+extern int user_defined_memmap;
+struct resource data_resource = {
+ .name = "Kernel data",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+struct resource code_resource = {
+ .name = "Kernel code",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource system_rom_resource = {
+ .name = "System ROM",
+ .start = 0xf0000,
+ .end = 0xfffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource extension_rom_resource = {
+ .name = "Extension ROM",
+ .start = 0xe0000,
+ .end = 0xeffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource adapter_rom_resources[] = { {
+ .name = "Adapter ROM",
+ .start = 0xc8000,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+} };
+
+static struct resource video_rom_resource = {
+ .name = "Video ROM",
+ .start = 0xc0000,
+ .end = 0xc7fff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource video_ram_resource = {
+ .name = "Video RAM area",
+ .start = 0xa0000,
+ .end = 0xbffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource standard_io_resources[] = { {
+ .name = "dma1",
+ .start = 0x0000,
+ .end = 0x001f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "pic1",
+ .start = 0x0020,
+ .end = 0x0021,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "timer0",
+ .start = 0x0040,
+ .end = 0x0043,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "timer1",
+ .start = 0x0050,
+ .end = 0x0053,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "keyboard",
+ .start = 0x0060,
+ .end = 0x006f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "dma page reg",
+ .start = 0x0080,
+ .end = 0x008f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "pic2",
+ .start = 0x00a0,
+ .end = 0x00a1,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "dma2",
+ .start = 0x00c0,
+ .end = 0x00df,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "fpu",
+ .start = 0x00f0,
+ .end = 0x00ff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+} };
+
+#define ROMSIGNATURE 0xaa55
+
+static int __init romsignature(const unsigned char *rom)
+{
+ const unsigned short * const ptr = (const unsigned short *)rom;
+ unsigned short sig;
+
+ return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
+}
+
+static int __init romchecksum(const unsigned char *rom, unsigned long length)
+{
+ unsigned char sum, c;
+
+ for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+ sum += c;
+ return !length && !sum;
+}
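
romchecksum() implements the classic option-ROM rule: every byte of the ROM, over the length claimed in its header, must sum to zero modulo 256. A stand-alone check against a fabricated one-unit (512-byte) ROM whose last byte is adjusted to zero the sum:

#include <stdio.h>

/* Option-ROM rule: all bytes over the stated length sum to 0 mod 256. */
static int romchecksum(const unsigned char *rom, unsigned long length)
{
	unsigned char sum = 0;

	while (length--)
		sum += *rom++;
	return sum == 0;
}

int main(void)
{
	unsigned char rom[512] = { 0x55, 0xaa, 0x01 };	/* length: 1 * 512 */
	unsigned char sum = 0;
	int i;

	for (i = 0; i < 511; i++)	/* sum everything but the fix-up byte */
		sum += rom[i];
	rom[511] = (unsigned char)-sum;	/* force the total to 0 mod 256 */

	printf("signature ok: %d, checksum ok: %d\n",
	       (rom[0] | rom[1] << 8) == 0xaa55, romchecksum(rom, 512));
	return 0;
}
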
+
+static void __init probe_roms(void)
+{
+ const unsigned char *rom;
+ unsigned long start, length, upper;
+ unsigned char c;
+ int i;
+
+ /* video rom */
+ upper = adapter_rom_resources[0].start;
+ for (start = video_rom_resource.start; start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ video_rom_resource.start = start;
+
+ if (probe_kernel_address(rom + 2, c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* if checksum okay, trust length byte */
+ if (length && romchecksum(rom, length))
+ video_rom_resource.end = start + length - 1;
+
+ request_resource(&iomem_resource, &video_rom_resource);
+ break;
+ }
+
+ start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+ if (start < upper)
+ start = upper;
+
+ /* system rom */
+ request_resource(&iomem_resource, &system_rom_resource);
+ upper = system_rom_resource.start;
+
+ /* check for extension rom (ignore length byte!) */
+ rom = isa_bus_to_virt(extension_rom_resource.start);
+ if (romsignature(rom)) {
+ length = extension_rom_resource.end - extension_rom_resource.start + 1;
+ if (romchecksum(rom, length)) {
+ request_resource(&iomem_resource, &extension_rom_resource);
+ upper = extension_rom_resource.start;
+ }
+ }
+
+ /* check for adapter roms on 2k boundaries */
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ if (probe_kernel_address(rom + 2, c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* but accept any length that fits if checksum okay */
+ if (!length || start + length > upper || !romchecksum(rom, length))
+ continue;
+
+ adapter_rom_resources[i].start = start;
+ adapter_rom_resources[i].end = start + length - 1;
+ request_resource(&iomem_resource, &adapter_rom_resources[i]);
+
+ start = adapter_rom_resources[i++].end & ~2047UL;
+ }
+}
+
+/*
+ * Request address space for all standard RAM and ROM resources
+ * and also for regions reported as reserved by the e820.
+ */
+static void __init
+legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
+{
+ int i;
+
+ probe_roms();
+ for (i = 0; i < e820.nr_map; i++) {
+ struct resource *res;
+#ifndef CONFIG_RESOURCES_64BIT
+ if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
+ continue;
+#endif
+ res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
+ switch (e820.map[i].type) {
+ case E820_RAM: res->name = "System RAM"; break;
+ case E820_ACPI: res->name = "ACPI Tables"; break;
+ case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
+ default: res->name = "reserved";
+ }
+ res->start = e820.map[i].addr;
+ res->end = res->start + e820.map[i].size - 1;
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ if (request_resource(&iomem_resource, res)) {
+ kfree(res);
+ continue;
+ }
+ if (e820.map[i].type == E820_RAM) {
+ /*
+ * We don't know which RAM region contains kernel data,
+ * so we try it repeatedly and let the resource manager
+ * test it.
+ */
+ request_resource(res, code_resource);
+ request_resource(res, data_resource);
+#ifdef CONFIG_KEXEC
+ request_resource(res, &crashk_res);
+#endif
+ }
+ }
+}
+
+/*
+ * Request address space for all standard resources
+ *
+ * This is called just before pcibios_init(), which is also a
+ * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
+ */
+static int __init request_standard_resources(void)
+{
+ int i;
+
+ printk("Setting up standard PCI resources\n");
+ if (efi_enabled)
+ efi_initialize_iomem_resources(&code_resource, &data_resource);
+ else
+ legacy_init_iomem_resources(&code_resource, &data_resource);
+
+ /* EFI systems may still have VGA */
+ request_resource(&iomem_resource, &video_ram_resource);
+
+ /* request I/O space for devices used on all i[345]86 PCs */
+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+ request_resource(&ioport_resource, &standard_io_resources[i]);
+ return 0;
+}
+
+subsys_initcall(request_standard_resources);
+
+#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
+/**
+ * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
+ * correspond to e820 RAM areas and mark the corresponding pages as nosave for
+ * hibernation.
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(void)
+{
+ int i;
+ unsigned long pfn;
+
+ pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
+ for (i = 1; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (pfn < PFN_UP(ei->addr))
+ register_nosave_region(pfn, PFN_UP(ei->addr));
+
+ pfn = PFN_DOWN(ei->addr + ei->size);
+ if (ei->type != E820_RAM)
+ register_nosave_region(PFN_UP(ei->addr), pfn);
+
+ if (pfn >= max_low_pfn)
+ break;
+ }
+}
+#endif
+
+void __init add_memory_region(unsigned long long start,
+ unsigned long long size, int type)
+{
+ int x;
+
+ if (!efi_enabled) {
+ x = e820.nr_map;
+
+ if (x == E820MAX) {
+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+ return;
+ }
+
+ e820.map[x].addr = start;
+ e820.map[x].size = size;
+ e820.map[x].type = type;
+ e820.nr_map++;
+ }
+} /* add_memory_region */
+
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries. The following
+ * replaces the original e820 map with a new one, removing overlaps.
+ *
+ */
+int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
+{
+ struct change_member *change_tmp;
+ unsigned long current_type, last_type;
+ unsigned long long last_addr;
+ int chgidx, still_changing;
+ int overlap_entries;
+ int new_bios_entry;
+ int old_nr, new_nr, chg_nr;
+ int i;
+
+ /*
+ Visually we're performing the following (1,2,3,4 = memory types)...
+
+ Sample memory map (w/overlaps):
+ ____22__________________
+ ______________________4_
+ ____1111________________
+ _44_____________________
+ 11111111________________
+ ____________________33__
+ ___________44___________
+ __________33333_________
+ ______________22________
+ ___________________2222_
+ _________111111111______
+ _____________________11_
+ _________________4______
+
+ Sanitized equivalent (no overlap):
+ 1_______________________
+ _44_____________________
+ ___1____________________
+ ____22__________________
+ ______11________________
+ _________1______________
+ __________3_____________
+ ___________44___________
+ _____________33_________
+ _______________2________
+ ________________1_______
+ _________________4______
+ ___________________2____
+ ____________________33__
+ ______________________4_
+ */
+ /* if there's only one memory region, don't bother */
+ if (*pnr_map < 2) {
+ return -1;
+ }
+
+ old_nr = *pnr_map;
+
+ /* bail out if we find any unreasonable addresses in bios map */
+ for (i=0; i<old_nr; i++)
+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
+ return -1;
+ }
+
+ /* create pointers for initial change-point information (for sorting) */
+ for (i=0; i < 2*old_nr; i++)
+ change_point[i] = &change_point_list[i];
+
+ /* record all known change-points (starting and ending addresses),
+ omitting those that are for empty memory regions */
+ chgidx = 0;
+ for (i=0; i < old_nr; i++) {
+ if (biosmap[i].size != 0) {
+ change_point[chgidx]->addr = biosmap[i].addr;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ }
+ }
+ chg_nr = chgidx; /* true number of change-points */
+
+ /* sort change-point list by memory addresses (low -> high) */
+ still_changing = 1;
+ while (still_changing) {
+ still_changing = 0;
+ for (i=1; i < chg_nr; i++) {
+ /* if <current_addr> > <last_addr>, swap */
+ /* or, if current=<start_addr> & last=<end_addr>, swap */
+ if ((change_point[i]->addr < change_point[i-1]->addr) ||
+ ((change_point[i]->addr == change_point[i-1]->addr) &&
+ (change_point[i]->addr == change_point[i]->pbios->addr) &&
+ (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
+ )
+ {
+ change_tmp = change_point[i];
+ change_point[i] = change_point[i-1];
+ change_point[i-1] = change_tmp;
+ still_changing=1;
+ }
+ }
+ }
+
+ /* create a new bios memory map, removing overlaps */
+ overlap_entries=0; /* number of entries in the overlap table */
+ new_bios_entry=0; /* index for creating new bios map entries */
+ last_type = 0; /* start with undefined memory type */
+ last_addr = 0; /* start with 0 as last starting address */
+ /* loop through change-points, determining affect on the new bios map */
+ for (chgidx=0; chgidx < chg_nr; chgidx++)
+ {
+ /* keep track of all overlapping bios entries */
+ if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
+ {
+ /* add map entry to overlap list (> 1 entry implies an overlap) */
+ overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
+ }
+ else
+ {
+ /* remove entry from list (order independent, so swap with last) */
+ for (i=0; i<overlap_entries; i++)
+ {
+ if (overlap_list[i] == change_point[chgidx]->pbios)
+ overlap_list[i] = overlap_list[overlap_entries-1];
+ }
+ overlap_entries--;
+ }
+ /* if there are overlapping entries, decide which "type" to use */
+ /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
+ current_type = 0;
+ for (i=0; i<overlap_entries; i++)
+ if (overlap_list[i]->type > current_type)
+ current_type = overlap_list[i]->type;
+ /* continue building up new bios map based on this information */
+ if (current_type != last_type) {
+ if (last_type != 0) {
+ new_bios[new_bios_entry].size =
+ change_point[chgidx]->addr - last_addr;
+ /* move forward only if the new size was non-zero */
+ if (new_bios[new_bios_entry].size != 0)
+ if (++new_bios_entry >= E820MAX)
+ break; /* no more space left for new bios entries */
+ }
+ if (current_type != 0) {
+ new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
+ new_bios[new_bios_entry].type = current_type;
+ last_addr=change_point[chgidx]->addr;
+ }
+ last_type = current_type;
+ }
+ }
+ new_nr = new_bios_entry; /* retain count for new bios entries */
+
+ /* copy new bios mapping into original location */
+ memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
+ *pnr_map = new_nr;
+
+ return 0;
+}
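
The change-point sweep in sanitize_e820_map() can be tried in miniature: turn every region into a start event and an end event, sort the events, and at each point let the highest-valued type among the currently open regions win. The sketch below is a compressed re-creation of the idea over two overlapping regions, not the kernel's data structures, and it omits the tie-breaking the kernel's sort does for coincident start/end addresses:

#include <stdio.h>
#include <stdlib.h>

struct region { unsigned long long addr, size; unsigned type; };
struct point { unsigned long long addr; const struct region *r; };

static int cmp(const void *a, const void *b)
{
	unsigned long long x = ((const struct point *)a)->addr;
	unsigned long long y = ((const struct point *)b)->addr;

	return (x > y) - (x < y);
}

int main(void)
{
	/* usable RAM overlapped by a reserved chunk, as in the diagram */
	struct region map[] = {
		{ 0x00000, 0x9f000, 1 },	/* type 1: usable   */
		{ 0x80000, 0x40000, 2 },	/* type 2: reserved */
	};
	struct point pts[4];
	const struct region *active[2];
	unsigned long long last_addr = 0;
	unsigned last_type = 0;
	int i, j, nactive = 0;

	for (i = 0; i < 2; i++) {	/* one start + one end event each */
		pts[2 * i].addr = map[i].addr;
		pts[2 * i].r = &map[i];
		pts[2 * i + 1].addr = map[i].addr + map[i].size;
		pts[2 * i + 1].r = &map[i];
	}
	qsort(pts, 4, sizeof(pts[0]), cmp);

	for (i = 0; i < 4; i++) {
		unsigned cur = 0;

		if (pts[i].addr == pts[i].r->addr) {	/* start event */
			active[nactive++] = pts[i].r;
		} else {				/* end event   */
			for (j = 0; j < nactive; j++)
				if (active[j] == pts[i].r)
					active[j] = active[--nactive];
		}
		for (j = 0; j < nactive; j++)	/* highest type wins */
			if (active[j]->type > cur)
				cur = active[j]->type;
		if (cur != last_type) {
			if (last_type)
				printf("%#llx-%#llx type %u\n",
				       last_addr, pts[i].addr, last_type);
			last_addr = pts[i].addr;
			last_type = cur;
		}
	}
	return 0;	/* prints 0-0x80000 type 1, 0x80000-0xc0000 type 2 */
}
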
+
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory. If we aren't, we'll fake a memory map.
+ *
+ * We check to see that the memory map contains at least 2 elements
+ * before we'll use it, because the detection code in setup.S may
+ * not be perfect and almost every PC known to man has two memory
+ * regions: one from 0 to 640k, and one from 1mb up. (The IBM
+ * thinkpad 560x, for example, does not cooperate with the memory
+ * detection code.)
+ */
+int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
+{
+ /* Only one memory region (or negative)? Ignore it */
+ if (nr_map < 2)
+ return -1;
+
+ do {
+ unsigned long long start = biosmap->addr;
+ unsigned long long size = biosmap->size;
+ unsigned long long end = start + size;
+ unsigned long type = biosmap->type;
+
+ /* Overflow in 64 bits? Ignore the memory map. */
+ if (start > end)
+ return -1;
+
+ /*
+ * Some BIOSes claim RAM in the 640k - 1M region.
+ * Not right. Fix it up.
+ */
+ if (type == E820_RAM) {
+ if (start < 0x100000ULL && end > 0xA0000ULL) {
+ if (start < 0xA0000ULL)
+ add_memory_region(start, 0xA0000ULL-start, type);
+ if (end <= 0x100000ULL)
+ continue;
+ start = 0x100000ULL;
+ size = end - start;
+ }
+ }
+ add_memory_region(start, size, type);
+ } while (biosmap++,--nr_map);
+ return 0;
+}
+
+/*
+ * Callback for efi_memmap_walk().
+ */
+static int __init
+efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
+{
+ unsigned long *max_pfn = arg, pfn;
+
+ if (start < end) {
+ pfn = PFN_UP(end -1);
+ if (pfn > *max_pfn)
+ *max_pfn = pfn;
+ }
+ return 0;
+}
+
+static int __init
+efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
+{
+ memory_present(0, PFN_UP(start), PFN_DOWN(end));
+ return 0;
+}
+
+/*
+ * Find the highest page frame number we have available
+ */
+void __init find_max_pfn(void)
+{
+ int i;
+
+ max_pfn = 0;
+ if (efi_enabled) {
+ efi_memmap_walk(efi_find_max_pfn, &max_pfn);
+ efi_memmap_walk(efi_memory_present_wrapper, NULL);
+ return;
+ }
+
+ for (i = 0; i < e820.nr_map; i++) {
+ unsigned long start, end;
+ /* RAM? */
+ if (e820.map[i].type != E820_RAM)
+ continue;
+ start = PFN_UP(e820.map[i].addr);
+ end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
+ if (start >= end)
+ continue;
+ if (end > max_pfn)
+ max_pfn = end;
+ memory_present(0, start, end);
+ }
+}
+
+/*
+ * Free all available memory for boot time allocation. Used
+ * as a callback function by efi_memory_walk()
+ */
+
+static int __init
+free_available_memory(unsigned long start, unsigned long end, void *arg)
+{
+ /* check max_low_pfn */
+ if (start >= (max_low_pfn << PAGE_SHIFT))
+ return 0;
+ if (end >= (max_low_pfn << PAGE_SHIFT))
+ end = max_low_pfn << PAGE_SHIFT;
+ if (start < end)
+ free_bootmem(start, end - start);
+
+ return 0;
+}
+/*
+ * Register fully available low RAM pages with the bootmem allocator.
+ */
+void __init register_bootmem_low_pages(unsigned long max_low_pfn)
+{
+ int i;
+
+ if (efi_enabled) {
+ efi_memmap_walk(free_available_memory, NULL);
+ return;
+ }
+ for (i = 0; i < e820.nr_map; i++) {
+ unsigned long curr_pfn, last_pfn, size;
+ /*
+ * Reserve usable low memory
+ */
+ if (e820.map[i].type != E820_RAM)
+ continue;
+ /*
+ * We are rounding up the start address of usable memory:
+ */
+ curr_pfn = PFN_UP(e820.map[i].addr);
+ if (curr_pfn >= max_low_pfn)
+ continue;
+ /*
+ * ... and at the end of the usable range downwards:
+ */
+ last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
+
+ if (last_pfn > max_low_pfn)
+ last_pfn = max_low_pfn;
+
+ /*
+ * .. finally, did all the rounding and playing
+ * around just make the area go away?
+ */
+ if (last_pfn <= curr_pfn)
+ continue;
+
+ size = last_pfn - curr_pfn;
+ free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
+ }
+}
+
+void __init e820_register_memory(void)
+{
+ unsigned long gapstart, gapsize, round;
+ unsigned long long last;
+ int i;
+
+ /*
+	 * Search for the biggest gap in the low 32 bits of the e820
+ * memory space.
+ */
+ last = 0x100000000ull;
+ gapstart = 0x10000000;
+ gapsize = 0x400000;
+ i = e820.nr_map;
+ while (--i >= 0) {
+ unsigned long long start = e820.map[i].addr;
+ unsigned long long end = start + e820.map[i].size;
+
+ /*
+ * Since "last" is at most 4GB, we know we'll
+ * fit in 32 bits if this condition is true
+ */
+ if (last > end) {
+ unsigned long gap = last - end;
+
+ if (gap > gapsize) {
+ gapsize = gap;
+ gapstart = end;
+ }
+ }
+ if (start < last)
+ last = start;
+ }
+
+ /*
+ * See how much we want to round up: start off with
+ * rounding to the next 1MB area.
+ */
+ round = 0x100000;
+ while ((gapsize >> 4) > round)
+ round += round;
+ /* Fun with two's complement */
+ pci_mem_start = (gapstart + round) & -round;
+
+ printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
+ pci_mem_start, gapstart, gapsize);
+}
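
The "fun with two's complement" line works because -round, for a power-of-two round, is a mask whose low log2(round) bits are clear: adding round and then masking lands on the next round-aligned address above gapstart. A quick check:

#include <stdio.h>

int main(void)
{
	unsigned long gapstart = 0x3fe12345;	/* arbitrary gap start */
	unsigned long round = 0x100000;		/* 1MB granularity     */

	/* -round == ~(round - 1): a mask clearing the low 20 bits */
	printf("%#lx -> %#lx\n", gapstart, (gapstart + round) & -round);
	return 0;	/* prints 0x3fe12345 -> 0x3ff00000 */
}
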
+
+void __init print_memory_map(char *who)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ printk(" %s: %016Lx - %016Lx ", who,
+ e820.map[i].addr,
+ e820.map[i].addr + e820.map[i].size);
+ switch (e820.map[i].type) {
+ case E820_RAM: printk("(usable)\n");
+ break;
+ case E820_RESERVED:
+ printk("(reserved)\n");
+ break;
+ case E820_ACPI:
+ printk("(ACPI data)\n");
+ break;
+ case E820_NVS:
+ printk("(ACPI NVS)\n");
+ break;
+ default: printk("type %u\n", e820.map[i].type);
+ break;
+ }
+ }
+}
+
+static __init __always_inline void efi_limit_regions(unsigned long long size)
+{
+ unsigned long long current_addr = 0;
+ efi_memory_desc_t *md, *next_md;
+ void *p, *p1;
+ int i, j;
+
+ j = 0;
+ p1 = memmap.map;
+ for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
+ md = p;
+ next_md = p1;
+ current_addr = md->phys_addr +
+ PFN_PHYS(md->num_pages);
+ if (is_available_memory(md)) {
+ if (md->phys_addr >= size)
+ continue;
+ memcpy(next_md, md, memmap.desc_size);
+ if (current_addr >= size) {
+ next_md->num_pages -=
+ PFN_UP(current_addr-size);
+ }
+ p1 += memmap.desc_size;
+ next_md = p1;
+ j++;
+ } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
+ EFI_MEMORY_RUNTIME) {
+ /* In order to make runtime services
+ * available we have to include runtime
+ * memory regions in the memory map */
+ memcpy(next_md, md, memmap.desc_size);
+ p1 += memmap.desc_size;
+ next_md = p1;
+ j++;
+ }
+ }
+ memmap.nr_map = j;
+ memmap.map_end = memmap.map +
+ (memmap.nr_map * memmap.desc_size);
+}
+
+void __init limit_regions(unsigned long long size)
+{
+ unsigned long long current_addr;
+ int i;
+
+ print_memory_map("limit_regions start");
+ if (efi_enabled) {
+ efi_limit_regions(size);
+ return;
+ }
+ for (i = 0; i < e820.nr_map; i++) {
+ current_addr = e820.map[i].addr + e820.map[i].size;
+ if (current_addr < size)
+ continue;
+
+ if (e820.map[i].type != E820_RAM)
+ continue;
+
+ if (e820.map[i].addr >= size) {
+ /*
+ * This region starts past the end of the
+ * requested size, skip it completely.
+ */
+ e820.nr_map = i;
+ } else {
+ e820.nr_map = i + 1;
+ e820.map[i].size -= current_addr - size;
+ }
+ print_memory_map("limit_regions endfor");
+ return;
+ }
+ print_memory_map("limit_regions endfunc");
+}
+
+/*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type.
+ */
+int
+e820_any_mapped(u64 start, u64 end, unsigned type)
+{
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ const struct e820entry *ei = &e820.map[i];
+ if (type && ei->type != type)
+ continue;
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(e820_any_mapped);
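+
+/*
+ * Example (hypothetical values): e820_any_mapped(0xa0000, 0x100000, 0)
+ * returns 1 if any e820 entry of any type (type == 0 matches all types)
+ * touches the legacy 640K-1M hole.
+ */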
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ *
+ * Note: this function only works correctly if the e820 table is sorted and
+ * non-overlapping, which is the case.
+ */
+int __init
+e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
+{
+ u64 start = s;
+ u64 end = e;
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ if (type && ei->type != type)
+ continue;
+ /* does this e820 entry overlap (part of) the <start,end> range? */
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+ /* if the region covers the beginning of <start,end>, move
+ * start to the end of the region, since the range is covered
+ * up to there
+ */
+ if (ei->addr <= start)
+ start = ei->addr + ei->size;
+ /* if start is now at or beyond end, we're done, full
+ * coverage */
+ if (start >= end)
+ return 1; /* we're done */
+ }
+ return 0;
+}
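+
+/*
+ * Example (hypothetical values): e820_all_mapped(0, 0x100000, E820_RAM)
+ * returns 1 only if the entire first megabyte is covered by E820_RAM
+ * entries with no holes.
+ */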
+
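+/*
+ * Example command lines (illustrative):
+ * memmap=exactmap memmap=640K@0 memmap=1023M@1M
+ * discards the BIOS map and describes RAM explicitly;
+ * memmap=64M
+ * trims the BIOS-provided map to the first 64MB;
+ * memmap=4M$0x1c000000
+ * marks 4MB at 448MB as reserved.
+ */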
+static int __init parse_memmap(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp(arg, "exactmap") == 0) {
+#ifdef CONFIG_CRASH_DUMP
+ /* If we are doing a crash dump, we
+ * still need to know the real mem
+ * size before original memory map is
+ * reset.
+ */
+ find_max_pfn();
+ saved_max_pfn = max_pfn;
+#endif
+ e820.nr_map = 0;
+ user_defined_memmap = 1;
+ } else {
+ /* If the user specifies memory size, we
+ * limit the BIOS-provided memory map to
+ * that size. exactmap can be used to specify
+ * the exact map. mem=number can be used to
+ * trim the existing memory map.
+ */
+ unsigned long long start_at, mem_size;
+
+ mem_size = memparse(arg, &arg);
+ if (*arg == '@') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_RAM);
+ } else if (*arg == '#') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_ACPI);
+ } else if (*arg == '$') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_RESERVED);
+ } else {
+ limit_regions(mem_size);
+ user_defined_memmap = 1;
+ }
+ }
+ return 0;
+}
+early_param("memmap", parse_memmap);
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
new file mode 100644
index 000000000000..92f812ba275c
--- /dev/null
+++ b/arch/x86/kernel/early_printk.c
@@ -0,0 +1,2 @@
+
+#include "../../x86_64/kernel/early_printk.c"
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
new file mode 100644
index 000000000000..2452c6fbe992
--- /dev/null
+++ b/arch/x86/kernel/efi_32.c
@@ -0,0 +1,712 @@
+/*
+ * Extensible Firmware Interface
+ *
+ * Based on Extensible Firmware Interface Specification version 1.0
+ *
+ * Copyright (C) 1999 VA Linux Systems
+ * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
+ * Copyright (C) 1999-2002 Hewlett-Packard Co.
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ * Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * All EFI Runtime Services are not implemented yet as EFI only
+ * supports physical mode addressing on SoftSDV. This is to be fixed
+ * in a future version. --drummond 1999-07-20
+ *
+ * Implemented EFI runtime services and virtual mode calls. --davidm
+ *
+ * Goutham Rao: <goutham.rao@intel.com>
+ * Skip non-WB memory and ignore empty memory ranges.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/kexec.h>
+
+#include <asm/setup.h>
+#include <asm/io.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
+
+#define EFI_DEBUG 0
+#define PFX "EFI: "
+
+extern efi_status_t asmlinkage efi_call_phys(void *, ...);
+
+struct efi efi;
+EXPORT_SYMBOL(efi);
+static struct efi efi_phys;
+struct efi_memory_map memmap;
+
+/*
+ * We need the early boot_ioremap mapping mechanism before the normal
+ * ioremap is available.
+ */
+extern void *boot_ioremap(unsigned long, unsigned long);
+
+/*
+ * To call an EFI runtime service in physical addressing mode we need a
+ * prolog/epilog around the invocation: disable interrupts, claim the
+ * EFI runtime service handler exclusively, and duplicate the kernel
+ * mapping into low memory (0 - 3G).
+ */
+
+static unsigned long efi_rt_eflags;
+static DEFINE_SPINLOCK(efi_rt_lock);
+static pgd_t efi_bak_pg_dir_pointer[2];
+
+static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
+{
+ unsigned long cr4;
+ unsigned long temp;
+ struct Xgt_desc_struct gdt_descr;
+
+ spin_lock(&efi_rt_lock);
+ local_irq_save(efi_rt_eflags);
+
+ /*
+ * If I don't have PSE, I should just duplicate two entries in page
+ * directory. If I have PSE, I just need to duplicate one entry in
+ * page directory.
+ */
+ cr4 = read_cr4();
+
+ if (cr4 & X86_CR4_PSE) {
+ efi_bak_pg_dir_pointer[0].pgd =
+ swapper_pg_dir[pgd_index(0)].pgd;
+ swapper_pg_dir[0].pgd =
+ swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
+ } else {
+ efi_bak_pg_dir_pointer[0].pgd =
+ swapper_pg_dir[pgd_index(0)].pgd;
+ efi_bak_pg_dir_pointer[1].pgd =
+ swapper_pg_dir[pgd_index(0x400000)].pgd;
+ swapper_pg_dir[pgd_index(0)].pgd =
+ swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
+ temp = PAGE_OFFSET + 0x400000;
+ swapper_pg_dir[pgd_index(0x400000)].pgd =
+ swapper_pg_dir[pgd_index(temp)].pgd;
+ }
+
+ /*
+ * Flush the TLB so the temporary low-memory mapping takes effect.
+ */
+ local_flush_tlb();
+
+ gdt_descr.address = __pa(get_cpu_gdt_table(0));
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+}
+
+static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
+{
+ unsigned long cr4;
+ struct Xgt_desc_struct gdt_descr;
+
+ gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+
+ cr4 = read_cr4();
+
+ if (cr4 & X86_CR4_PSE) {
+ swapper_pg_dir[pgd_index(0)].pgd =
+ efi_bak_pg_dir_pointer[0].pgd;
+ } else {
+ swapper_pg_dir[pgd_index(0)].pgd =
+ efi_bak_pg_dir_pointer[0].pgd;
+ swapper_pg_dir[pgd_index(0x400000)].pgd =
+ efi_bak_pg_dir_pointer[1].pgd;
+ }
+
+ /*
+ * Flush the TLB so the restored page table takes effect.
+ */
+ local_flush_tlb();
+
+ local_irq_restore(efi_rt_eflags);
+ spin_unlock(&efi_rt_lock);
+}
+
+static efi_status_t
+phys_efi_set_virtual_address_map(unsigned long memory_map_size,
+ unsigned long descriptor_size,
+ u32 descriptor_version,
+ efi_memory_desc_t *virtual_map)
+{
+ efi_status_t status;
+
+ efi_call_phys_prelog();
+ status = efi_call_phys(efi_phys.set_virtual_address_map,
+ memory_map_size, descriptor_size,
+ descriptor_version, virtual_map);
+ efi_call_phys_epilog();
+ return status;
+}
+
+static efi_status_t
+phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
+{
+ efi_status_t status;
+
+ efi_call_phys_prelog();
+ status = efi_call_phys(efi_phys.get_time, tm, tc);
+ efi_call_phys_epilog();
+ return status;
+}
+
+inline int efi_set_rtc_mmss(unsigned long nowtime)
+{
+ int real_seconds, real_minutes;
+ efi_status_t status;
+ efi_time_t eft;
+ efi_time_cap_t cap;
+
+ spin_lock(&efi_rt_lock);
+ status = efi.get_time(&eft, &cap);
+ spin_unlock(&efi_rt_lock);
+ if (status != EFI_SUCCESS) {
+ printk(KERN_ERR "efitime: can't read time!\n");
+ return -1;
+ }
+ real_seconds = nowtime % 60;
+ real_minutes = nowtime / 60;
+
+ if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
+ real_minutes += 30;
+ real_minutes %= 60;
+
+ eft.minute = real_minutes;
+ eft.second = real_seconds;
+
+ /* write the adjusted wall time back to the EFI RTC */
+ spin_lock(&efi_rt_lock);
+ status = efi.set_time(&eft);
+ spin_unlock(&efi_rt_lock);
+ if (status != EFI_SUCCESS) {
+ printk(KERN_ERR "efitime: can't write time!\n");
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * This is used during kernel init before runtime services have been
+ * remapped, and also during suspend, so we may need to call it in
+ * either physical or virtual mode.
+ */
+inline unsigned long efi_get_time(void)
+{
+ efi_status_t status;
+ efi_time_t eft;
+ efi_time_cap_t cap;
+
+ if (efi.get_time) {
+ /* if we are in virtual mode use remapped function */
+ status = efi.get_time(&eft, &cap);
+ } else {
+ /* we are in physical mode */
+ status = phys_efi_get_time(&eft, &cap);
+ }
+
+ if (status != EFI_SUCCESS)
+ printk("Oops: efitime: can't read time status: 0x%lx\n",status);
+
+ return mktime(eft.year, eft.month, eft.day, eft.hour,
+ eft.minute, eft.second);
+}
+
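+/*
+ * A region is usable by the kernel only if it is write-back cacheable;
+ * loader and boot-services regions can be reclaimed once boot services
+ * have been exited, so they count as available as well.
+ */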
+int is_available_memory(efi_memory_desc_t *md)
+{
+ if (!(md->attribute & EFI_MEMORY_WB))
+ return 0;
+
+ switch (md->type) {
+ case EFI_LOADER_CODE:
+ case EFI_LOADER_DATA:
+ case EFI_BOOT_SERVICES_CODE:
+ case EFI_BOOT_SERVICES_DATA:
+ case EFI_CONVENTIONAL_MEMORY:
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * We need to map the EFI memory map again after paging_init().
+ */
+void __init efi_map_memmap(void)
+{
+ memmap.map = NULL;
+
+ memmap.map = bt_ioremap((unsigned long) memmap.phys_map,
+ (memmap.nr_map * memmap.desc_size));
+ if (memmap.map == NULL)
+ printk(KERN_ERR PFX "Could not remap the EFI memmap!\n");
+
+ memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
+}
+
+#if EFI_DEBUG
+static void __init print_efi_memmap(void)
+{
+ efi_memory_desc_t *md;
+ void *p;
+ int i;
+
+ for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
+ md = p;
+ printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, "
+ "range=[0x%016llx-0x%016llx) (%lluMB)\n",
+ i, md->type, md->attribute, md->phys_addr,
+ md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
+ (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
+ }
+}
+#endif /* EFI_DEBUG */
+
+/*
+ * Walks the EFI memory map and calls CALLBACK once for each EFI
+ * memory descriptor that has memory that is available for kernel use.
+ */
+void efi_memmap_walk(efi_freemem_callback_t callback, void *arg)
+{
+ int prev_valid = 0;
+ struct range {
+ unsigned long start;
+ unsigned long end;
+ } uninitialized_var(prev), curr;
+ efi_memory_desc_t *md;
+ unsigned long start, end;
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ md = p;
+
+ if ((md->num_pages == 0) || (!is_available_memory(md)))
+ continue;
+
+ curr.start = md->phys_addr;
+ curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT);
+
+ if (!prev_valid) {
+ prev = curr;
+ prev_valid = 1;
+ } else {
+ if (curr.start < prev.start)
+ printk(KERN_INFO PFX "Unordered memory map\n");
+ if (prev.end == curr.start)
+ prev.end = curr.end;
+ else {
+ start =
+ (unsigned long) (PAGE_ALIGN(prev.start));
+ end = (unsigned long) (prev.end & PAGE_MASK);
+ if ((end > start)
+ && (*callback) (start, end, arg) < 0)
+ return;
+ prev = curr;
+ }
+ }
+ }
+ if (prev_valid) {
+ start = (unsigned long) PAGE_ALIGN(prev.start);
+ end = (unsigned long) (prev.end & PAGE_MASK);
+ if (end > start)
+ (*callback) (start, end, arg);
+ }
+}
+
+void __init efi_init(void)
+{
+ efi_config_table_t *config_tables;
+ efi_runtime_services_t *runtime;
+ efi_char16_t *c16;
+ char vendor[100] = "unknown";
+ unsigned long num_config_tables;
+ int i = 0;
+
+ memset(&efi, 0, sizeof(efi));
+ memset(&efi_phys, 0, sizeof(efi_phys));
+
+ efi_phys.systab = EFI_SYSTAB;
+ memmap.phys_map = EFI_MEMMAP;
+ memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE;
+ memmap.desc_version = EFI_MEMDESC_VERSION;
+ memmap.desc_size = EFI_MEMDESC_SIZE;
+
+ efi.systab = (efi_system_table_t *)
+ boot_ioremap((unsigned long) efi_phys.systab,
+ sizeof(efi_system_table_t));
+ /*
+ * Verify the EFI Table
+ */
+ if (efi.systab == NULL)
+ panic(PFX "Couldn't map the EFI system table!\n");
+ if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
+ printk(KERN_ERR PFX "EFI system table signature incorrect\n");
+ if ((efi.systab->hdr.revision >> 16) == 0)
+ printk(KERN_ERR PFX "Warning: EFI system table version "
+ "%d.%02d, expected 1.00 or greater\n",
+ efi.systab->hdr.revision >> 16,
+ efi.systab->hdr.revision & 0xffff);
+
+ /*
+ * Grab some details from the system table
+ */
+ num_config_tables = efi.systab->nr_tables;
+ config_tables = (efi_config_table_t *)efi.systab->tables;
+ runtime = efi.systab->runtime;
+
+ /*
+ * Show what we know for posterity
+ */
+ c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor,
+ sizeof(vendor) * sizeof(efi_char16_t));
+ if (c16) {
+ for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i)
+ vendor[i] = *c16++;
+ vendor[i] = '\0';
+ } else
+ printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
+
+ printk(KERN_INFO PFX "EFI v%u.%02u by %s\n",
+ efi.systab->hdr.revision >> 16,
+ efi.systab->hdr.revision & 0xffff, vendor);
+
+ /*
+ * Let's see what config tables the firmware passed to us.
+ */
+ config_tables = (efi_config_table_t *)
+ boot_ioremap((unsigned long) config_tables,
+ num_config_tables * sizeof(efi_config_table_t));
+
+ if (config_tables == NULL)
+ printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n");
+
+ efi.mps = EFI_INVALID_TABLE_ADDR;
+ efi.acpi = EFI_INVALID_TABLE_ADDR;
+ efi.acpi20 = EFI_INVALID_TABLE_ADDR;
+ efi.smbios = EFI_INVALID_TABLE_ADDR;
+ efi.sal_systab = EFI_INVALID_TABLE_ADDR;
+ efi.boot_info = EFI_INVALID_TABLE_ADDR;
+ efi.hcdp = EFI_INVALID_TABLE_ADDR;
+ efi.uga = EFI_INVALID_TABLE_ADDR;
+
+ for (i = 0; i < num_config_tables; i++) {
+ if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
+ efi.mps = config_tables[i].table;
+ printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table);
+ } else
+ if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
+ efi.acpi20 = config_tables[i].table;
+ printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table);
+ } else
+ if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
+ efi.acpi = config_tables[i].table;
+ printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table);
+ } else
+ if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
+ efi.smbios = config_tables[i].table;
+ printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table);
+ } else
+ if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
+ efi.hcdp = config_tables[i].table;
+ printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table);
+ } else
+ if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) {
+ efi.uga = config_tables[i].table;
+ printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table);
+ }
+ }
+ printk("\n");
+
+ /*
+ * Check out the runtime services table. We need to map
+ * the runtime services table so that we can grab the physical
+ * address of several of the EFI runtime functions, needed to
+ * set the firmware into virtual mode.
+ */
+
+ runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long)
+ runtime,
+ sizeof(efi_runtime_services_t));
+ if (runtime != NULL) {
+ /*
+ * We will only need *early* access to the following
+ * two EFI runtime services before set_virtual_address_map
+ * is invoked.
+ */
+ efi_phys.get_time = (efi_get_time_t *) runtime->get_time;
+ efi_phys.set_virtual_address_map =
+ (efi_set_virtual_address_map_t *)
+ runtime->set_virtual_address_map;
+ } else
+ printk(KERN_ERR PFX "Could not map the runtime service table!\n");
+
+ /* Map the EFI memory map for use until paging_init() */
+ memmap.map = boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE);
+ if (memmap.map == NULL)
+ printk(KERN_ERR PFX "Could not map the EFI memory map!\n");
+
+ memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
+
+#if EFI_DEBUG
+ print_efi_memmap();
+#endif
+}
+
+static inline void __init check_range_for_systab(efi_memory_desc_t *md)
+{
+ if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) &&
+ ((unsigned long)efi_phys.systab < md->phys_addr +
+ ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) {
+ unsigned long addr;
+
+ addr = md->virt_addr - md->phys_addr +
+ (unsigned long)efi_phys.systab;
+ efi.systab = (efi_system_table_t *)addr;
+ }
+}
+
+/*
+ * Wrap all the virtual calls in a way that forces the parameters on the stack.
+ */
+
+#define efi_call_virt(f, args...) \
+ ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
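+
+/*
+ * For example, efi_call_virt(get_time, tm, tc) expands to a call through
+ * efi.systab->runtime->get_time cast with regparm(0), so both arguments
+ * are passed on the stack rather than in registers, as EFI expects.
+ */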
+
+static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
+{
+ return efi_call_virt(get_time, tm, tc);
+}
+
+static efi_status_t virt_efi_set_time(efi_time_t *tm)
+{
+ return efi_call_virt(set_time, tm);
+}
+
+static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
+ efi_bool_t *pending,
+ efi_time_t *tm)
+{
+ return efi_call_virt(get_wakeup_time, enabled, pending, tm);
+}
+
+static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled,
+ efi_time_t *tm)
+{
+ return efi_call_virt(set_wakeup_time, enabled, tm);
+}
+
+static efi_status_t virt_efi_get_variable(efi_char16_t *name,
+ efi_guid_t *vendor, u32 *attr,
+ unsigned long *data_size, void *data)
+{
+ return efi_call_virt(get_variable, name, vendor, attr, data_size, data);
+}
+
+static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
+ efi_char16_t *name,
+ efi_guid_t *vendor)
+{
+ return efi_call_virt(get_next_variable, name_size, name, vendor);
+}
+
+static efi_status_t virt_efi_set_variable(efi_char16_t *name,
+ efi_guid_t *vendor,
+ unsigned long attr,
+ unsigned long data_size, void *data)
+{
+ return efi_call_virt(set_variable, name, vendor, attr, data_size, data);
+}
+
+static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
+{
+ return efi_call_virt(get_next_high_mono_count, count);
+}
+
+static void virt_efi_reset_system(int reset_type, efi_status_t status,
+ unsigned long data_size,
+ efi_char16_t *data)
+{
+ efi_call_virt(reset_system, reset_type, status, data_size, data);
+}
+
+/*
+ * This function will switch the EFI runtime services to virtual mode.
+ * Essentially, look through the EFI memmap and map every region that
+ * has the runtime attribute bit set in its memory descriptor and update
+ * that memory descriptor with the virtual address obtained from ioremap().
+ * This enables the runtime services to be called without having to
+ * thunk back into physical mode for every invocation.
+ */
+
+void __init efi_enter_virtual_mode(void)
+{
+ efi_memory_desc_t *md;
+ efi_status_t status;
+ void *p;
+
+ efi.systab = NULL;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ md = p;
+
+ if (!(md->attribute & EFI_MEMORY_RUNTIME))
+ continue;
+
+ md->virt_addr = (unsigned long)ioremap(md->phys_addr,
+ md->num_pages << EFI_PAGE_SHIFT);
+ if (!(unsigned long)md->virt_addr) {
+ printk(KERN_ERR PFX "ioremap of 0x%lX failed\n",
+ (unsigned long)md->phys_addr);
+ }
+ /* update the virtual address of the EFI system table */
+ check_range_for_systab(md);
+ }
+
+ BUG_ON(!efi.systab);
+
+ status = phys_efi_set_virtual_address_map(
+ memmap.desc_size * memmap.nr_map,
+ memmap.desc_size,
+ memmap.desc_version,
+ memmap.phys_map);
+
+ if (status != EFI_SUCCESS) {
+ printk(KERN_ALERT "You are screwed! "
+ "Unable to switch EFI into virtual mode "
+ "(status=%lx)\n", status);
+ panic("EFI call to SetVirtualAddressMap() failed!");
+ }
+
+ /*
+ * Now that EFI is in virtual mode, update the function
+ * pointers in the runtime service table to the new virtual addresses.
+ */
+
+ efi.get_time = virt_efi_get_time;
+ efi.set_time = virt_efi_set_time;
+ efi.get_wakeup_time = virt_efi_get_wakeup_time;
+ efi.set_wakeup_time = virt_efi_set_wakeup_time;
+ efi.get_variable = virt_efi_get_variable;
+ efi.get_next_variable = virt_efi_get_next_variable;
+ efi.set_variable = virt_efi_set_variable;
+ efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
+ efi.reset_system = virt_efi_reset_system;
+}
+
+void __init
+efi_initialize_iomem_resources(struct resource *code_resource,
+ struct resource *data_resource)
+{
+ struct resource *res;
+ efi_memory_desc_t *md;
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ md = p;
+
+ if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >
+ 0x100000000ULL)
+ continue;
+ res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
+ if (!res)
+ break;
+ switch (md->type) {
+ case EFI_RESERVED_TYPE:
+ res->name = "Reserved Memory";
+ break;
+ case EFI_LOADER_CODE:
+ res->name = "Loader Code";
+ break;
+ case EFI_LOADER_DATA:
+ res->name = "Loader Data";
+ break;
+ case EFI_BOOT_SERVICES_DATA:
+ res->name = "BootServices Data";
+ break;
+ case EFI_BOOT_SERVICES_CODE:
+ res->name = "BootServices Code";
+ break;
+ case EFI_RUNTIME_SERVICES_CODE:
+ res->name = "Runtime Service Code";
+ break;
+ case EFI_RUNTIME_SERVICES_DATA:
+ res->name = "Runtime Service Data";
+ break;
+ case EFI_CONVENTIONAL_MEMORY:
+ res->name = "Conventional Memory";
+ break;
+ case EFI_UNUSABLE_MEMORY:
+ res->name = "Unusable Memory";
+ break;
+ case EFI_ACPI_RECLAIM_MEMORY:
+ res->name = "ACPI Reclaim";
+ break;
+ case EFI_ACPI_MEMORY_NVS:
+ res->name = "ACPI NVS";
+ break;
+ case EFI_MEMORY_MAPPED_IO:
+ res->name = "Memory Mapped IO";
+ break;
+ case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
+ res->name = "Memory Mapped IO Port Space";
+ break;
+ default:
+ res->name = "Reserved";
+ break;
+ }
+ res->start = md->phys_addr;
+ res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1);
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ if (request_resource(&iomem_resource, res) < 0)
+ printk(KERN_ERR PFX "Failed to allocate res %s : "
+ "0x%llx-0x%llx\n", res->name,
+ (unsigned long long)res->start,
+ (unsigned long long)res->end);
+ /*
+ * We don't know which region contains kernel data so we try
+ * it repeatedly and let the resource manager test it.
+ */
+ if (md->type == EFI_CONVENTIONAL_MEMORY) {
+ request_resource(res, code_resource);
+ request_resource(res, data_resource);
+#ifdef CONFIG_KEXEC
+ request_resource(res, &crashk_res);
+#endif
+ }
+ }
+}
+
+/*
+ * Convenience functions to obtain memory types and attributes
+ */
+
+u32 efi_mem_type(unsigned long phys_addr)
+{
+ efi_memory_desc_t *md;
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ md = p;
+ if ((md->phys_addr <= phys_addr) && (phys_addr <
+ (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT))))
+ return md->type;
+ }
+ return 0;
+}
+
+u64 efi_mem_attributes(unsigned long phys_addr)
+{
+ efi_memory_desc_t *md;
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ md = p;
+ if ((md->phys_addr <= phys_addr) && (phys_addr <
+ (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT))))
+ return md->attribute;
+ }
+ return 0;
+}
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S
new file mode 100644
index 000000000000..ef00bb77d7e4
--- /dev/null
+++ b/arch/x86/kernel/efi_stub_32.S
@@ -0,0 +1,122 @@
+/*
+ * EFI call stub for IA32.
+ *
+ * This stub allows us to make EFI calls in physical mode with interrupts
+ * turned off.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+/*
+ * efi_call_phys(void *, ...) is a function with variable parameters.
+ * All the callers of this function assure that all the parameters are 4-bytes.
+ */
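+
+/*
+ * For example (as used in efi_32.c):
+ * status = efi_call_phys(efi_phys.get_time, tm, tc);
+ * passes the physical-mode function pointer first, followed by the
+ * service's own arguments.
+ */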
+
+/*
+ * In the gcc calling convention, EBX, ESP, EBP, ESI and EDI are all
+ * callee-saved. We save all of them at the beginning of this function
+ * and restore them at the end, no matter how many we actually use,
+ * because we cannot assume that the EFI runtime service functions
+ * comply with the gcc calling convention.
+ */
+
+.text
+ENTRY(efi_call_phys)
+ /*
+ * 0. This function can only be called from the Linux kernel, so CS
+ * has been set to 0x0010, and DS and SS to 0x0018. In EFI the values
+ * of these registers are the same, and the corresponding GDT entries
+ * are identical, so we do nothing about the segment registers and
+ * the GDT beyond switching the GDT base register in the prolog and
+ * epilog.
+ */
+
+ /*
+ * 1. We are currently running with EIP = <physical address> +
+ * PAGE_OFFSET. To switch smoothly from virtual mode to flat mode,
+ * the mapping of low virtual memory is created in the prolog and
+ * torn down again in the epilog.
+ */
+ movl $1f, %edx
+ subl $__PAGE_OFFSET, %edx
+ jmp *%edx
+1:
+
+ /*
+ * 2. Now on the top of stack is the return
+ * address in the caller of efi_call_phys(), then parameter 1,
+ * parameter 2, ..., param n. To make things easy, we save the return
+ * address of efi_call_phys in a global variable.
+ */
+ popl %edx
+ movl %edx, saved_return_addr
+ /* get the function pointer into ECX*/
+ popl %ecx
+ movl %ecx, efi_rt_function_ptr
+ movl $2f, %edx
+ subl $__PAGE_OFFSET, %edx
+ pushl %edx
+
+ /*
+ * 3. Clear PG bit in %CR0.
+ */
+ movl %cr0, %edx
+ andl $0x7fffffff, %edx
+ movl %edx, %cr0
+ jmp 1f
+1:
+
+ /*
+ * 4. Adjust stack pointer.
+ */
+ subl $__PAGE_OFFSET, %esp
+
+ /*
+ * 5. Call the physical function.
+ */
+ jmp *%ecx
+
+2:
+ /*
+ * 6. After EFI runtime service returns, control will return to
+ * following instruction. We'd better readjust stack pointer first.
+ */
+ addl $__PAGE_OFFSET, %esp
+
+ /*
+ * 7. Restore PG bit
+ */
+ movl %cr0, %edx
+ orl $0x80000000, %edx
+ movl %edx, %cr0
+ jmp 1f
+1:
+ /*
+ * 8. Now restore the virtual mode from flat mode by
+ * adding EIP with PAGE_OFFSET.
+ */
+ movl $1f, %edx
+ jmp *%edx
+1:
+
+ /*
+ * 9. Balance the stack. EAX contains the return value, so we must
+ * not clobber it.
+ */
+ leal efi_rt_function_ptr, %edx
+ movl (%edx), %ecx
+ pushl %ecx
+
+ /*
+ * 10. Push the saved return address onto the stack and return.
+ */
+ leal saved_return_addr, %edx
+ movl (%edx), %ecx
+ pushl %ecx
+ ret
+.previous
+
+.data
+saved_return_addr:
+ .long 0
+efi_rt_function_ptr:
+ .long 0
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
new file mode 100644
index 000000000000..290b7bc82da3
--- /dev/null
+++ b/arch/x86/kernel/entry_32.S
@@ -0,0 +1,1112 @@
+/*
+ * linux/arch/i386/entry.S
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+/*
+ * entry.S contains the system-call and fault low-level handling routines.
+ * This also contains the timer-interrupt handler, as well as all interrupts
+ * and faults that can result in a task-switch.
+ *
+ * NOTE: This code handles signal-recognition, which happens every time
+ * after a timer-interrupt and after each system call.
+ *
+ * I changed all the .align's to 4 (16 byte alignment), as that's faster
+ * on a 486.
+ *
+ * Stack layout in 'syscall_exit':
+ * ptrace needs to have all regs on the stack.
+ * if the order here is changed, it needs to be
+ * updated in fork.c:copy_process, signal.c:do_signal,
+ * ptrace.c and ptrace.h
+ *
+ * 0(%esp) - %ebx
+ * 4(%esp) - %ecx
+ * 8(%esp) - %edx
+ * C(%esp) - %esi
+ * 10(%esp) - %edi
+ * 14(%esp) - %ebp
+ * 18(%esp) - %eax
+ * 1C(%esp) - %ds
+ * 20(%esp) - %es
+ * 24(%esp) - %fs
+ * 28(%esp) - orig_eax
+ * 2C(%esp) - %eip
+ * 30(%esp) - %cs
+ * 34(%esp) - %eflags
+ * 38(%esp) - %oldesp
+ * 3C(%esp) - %oldss
+ *
+ * "current" is in register %ebx during any slow entries.
+ */
+
+#include <linux/linkage.h>
+#include <asm/thread_info.h>
+#include <asm/irqflags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+#include <asm/smp.h>
+#include <asm/page.h>
+#include <asm/desc.h>
+#include <asm/percpu.h>
+#include <asm/dwarf2.h>
+#include "irq_vectors.h"
+
+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization. The following will never clobber any registers:
+ * INTERRUPT_RETURN (aka. "iret")
+ * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */
+
+#define nr_syscalls ((syscall_table_size)/4)
+
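+/*
+ * EFLAGS bit masks: CF is bit 0, TF (trap) bit 8, IF (interrupt enable)
+ * bit 9, DF (direction) bit 10, NT (nested task) bit 14 and VM
+ * (virtual-8086 mode) bit 17.
+ */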
+CF_MASK = 0x00000001
+TF_MASK = 0x00000100
+IF_MASK = 0x00000200
+DF_MASK = 0x00000400
+NT_MASK = 0x00004000
+VM_MASK = 0x00020000
+
+#ifdef CONFIG_PREEMPT
+#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
+#else
+#define preempt_stop(clobbers)
+#define resume_kernel restore_nocheck
+#endif
+
+.macro TRACE_IRQS_IRET
+#ifdef CONFIG_TRACE_IRQFLAGS
+ testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
+ jz 1f
+ TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+#ifdef CONFIG_VM86
+#define resume_userspace_sig check_userspace
+#else
+#define resume_userspace_sig resume_userspace
+#endif
+
+#define SAVE_ALL \
+ cld; \
+ pushl %fs; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ /*CFI_REL_OFFSET fs, 0;*/\
+ pushl %es; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ /*CFI_REL_OFFSET es, 0;*/\
+ pushl %ds; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ /*CFI_REL_OFFSET ds, 0;*/\
+ pushl %eax; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET eax, 0;\
+ pushl %ebp; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET ebp, 0;\
+ pushl %edi; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET edi, 0;\
+ pushl %esi; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET esi, 0;\
+ pushl %edx; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET edx, 0;\
+ pushl %ecx; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET ecx, 0;\
+ pushl %ebx; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET ebx, 0;\
+ movl $(__USER_DS), %edx; \
+ movl %edx, %ds; \
+ movl %edx, %es; \
+ movl $(__KERNEL_PERCPU), %edx; \
+ movl %edx, %fs
+
+#define RESTORE_INT_REGS \
+ popl %ebx; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE ebx;\
+ popl %ecx; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE ecx;\
+ popl %edx; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE edx;\
+ popl %esi; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE esi;\
+ popl %edi; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE edi;\
+ popl %ebp; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE ebp;\
+ popl %eax; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE eax
+
+#define RESTORE_REGS \
+ RESTORE_INT_REGS; \
+1: popl %ds; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ /*CFI_RESTORE ds;*/\
+2: popl %es; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ /*CFI_RESTORE es;*/\
+3: popl %fs; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ /*CFI_RESTORE fs;*/\
+.pushsection .fixup,"ax"; \
+4: movl $0,(%esp); \
+ jmp 1b; \
+5: movl $0,(%esp); \
+ jmp 2b; \
+6: movl $0,(%esp); \
+ jmp 3b; \
+.section __ex_table,"a";\
+ .align 4; \
+ .long 1b,4b; \
+ .long 2b,5b; \
+ .long 3b,6b; \
+.popsection
+
+#define RING0_INT_FRAME \
+ CFI_STARTPROC simple;\
+ CFI_SIGNAL_FRAME;\
+ CFI_DEF_CFA esp, 3*4;\
+ /*CFI_OFFSET cs, -2*4;*/\
+ CFI_OFFSET eip, -3*4
+
+#define RING0_EC_FRAME \
+ CFI_STARTPROC simple;\
+ CFI_SIGNAL_FRAME;\
+ CFI_DEF_CFA esp, 4*4;\
+ /*CFI_OFFSET cs, -2*4;*/\
+ CFI_OFFSET eip, -3*4
+
+#define RING0_PTREGS_FRAME \
+ CFI_STARTPROC simple;\
+ CFI_SIGNAL_FRAME;\
+ CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
+ /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
+ CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
+ /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
+ /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
+ CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
+ CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
+ CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
+ CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
+ CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
+ CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
+ CFI_OFFSET ebx, PT_EBX-PT_OLDESP
+
+ENTRY(ret_from_fork)
+ CFI_STARTPROC
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ call schedule_tail
+ GET_THREAD_INFO(%ebp)
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ pushl $0x0202 # Reset kernel eflags
+ CFI_ADJUST_CFA_OFFSET 4
+ popfl
+ CFI_ADJUST_CFA_OFFSET -4
+ jmp syscall_exit
+ CFI_ENDPROC
+END(ret_from_fork)
+
+/*
+ * Return to user mode is not as complex as all this looks,
+ * but we want the default path for a system call return to
+ * go as quickly as possible which is why some of this is
+ * less clear than it otherwise should be.
+ */
+
+ # userspace resumption stub bypassing syscall exit tracing
+ ALIGN
+ RING0_PTREGS_FRAME
+ret_from_exception:
+ preempt_stop(CLBR_ANY)
+ret_from_intr:
+ GET_THREAD_INFO(%ebp)
+check_userspace:
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
+ movb PT_CS(%esp), %al
+ andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
+ cmpl $USER_RPL, %eax
+ jb resume_kernel # not returning to v8086 or userspace
+
+ENTRY(resume_userspace)
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
+ movl TI_flags(%ebp), %ecx
+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
+ # int/exception return?
+ jne work_pending
+ jmp restore_all
+END(ret_from_exception)
+
+#ifdef CONFIG_PREEMPT
+ENTRY(resume_kernel)
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
+ jnz restore_nocheck
+need_resched:
+ movl TI_flags(%ebp), %ecx # need_resched set ?
+ testb $_TIF_NEED_RESCHED, %cl
+ jz restore_all
+ testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
+ jz restore_all
+ call preempt_schedule_irq
+ jmp need_resched
+END(resume_kernel)
+#endif
+ CFI_ENDPROC
+
+/* SYSENTER_RETURN points to after the "sysenter" instruction in
+ the vsyscall page. See vsyscall-sysenter.S, which defines the symbol. */
+
+ # sysenter call handler stub
+ENTRY(sysenter_entry)
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, 0
+ CFI_REGISTER esp, ebp
+ movl TSS_sysenter_esp0(%esp),%esp
+sysenter_past_esp:
+ /*
+ * No need to follow this irqs on/off section: the syscall
+ * disabled irqs and here we enable it straight after entry:
+ */
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ pushl $(__USER_DS)
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ss, 0*/
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esp, 0
+ pushfl
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $(__USER_CS)
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET cs, 0*/
+ /*
+ * Push current_thread_info()->sysenter_return to the stack.
+ * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
+ * pushed above; +8 corresponds to copy_thread's esp0 setting.
+ */
+ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eip, 0
+
+/*
+ * Load the potential sixth argument from user stack.
+ * Careful about security.
+ */
+ cmpl $__PAGE_OFFSET-3,%ebp
+ jae syscall_fault
+1: movl (%ebp),%ebp
+.section __ex_table,"a"
+ .align 4
+ .long 1b,syscall_fault
+.previous
+
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ GET_THREAD_INFO(%ebp)
+
+ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
+ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+ jnz syscall_trace_entry
+ cmpl $(nr_syscalls), %eax
+ jae syscall_badsys
+ call *sys_call_table(,%eax,4)
+ movl %eax,PT_EAX(%esp)
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ testw $_TIF_ALLWORK_MASK, %cx
+ jne syscall_exit_work
+/* if something modifies registers it must also disable sysexit */
+ movl PT_EIP(%esp), %edx
+ movl PT_OLDESP(%esp), %ecx
+ xorl %ebp,%ebp
+ TRACE_IRQS_ON
+1: mov PT_FS(%esp), %fs
+ ENABLE_INTERRUPTS_SYSEXIT
+ CFI_ENDPROC
+.pushsection .fixup,"ax"
+2: movl $0,PT_FS(%esp)
+ jmp 1b
+.section __ex_table,"a"
+ .align 4
+ .long 1b,2b
+.popsection
+ENDPROC(sysenter_entry)
+
+ # system call handler stub
+ENTRY(system_call)
+ RING0_INT_FRAME # can't unwind into user space anyway
+ pushl %eax # save orig_eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ GET_THREAD_INFO(%ebp)
+ # system call tracing in operation / emulation
+ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
+ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+ jnz syscall_trace_entry
+ cmpl $(nr_syscalls), %eax
+ jae syscall_badsys
+syscall_call:
+ call *sys_call_table(,%eax,4)
+ movl %eax,PT_EAX(%esp) # store the return value
+syscall_exit:
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
+ TRACE_IRQS_OFF
+ testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
+ jz no_singlestep
+ orl $_TIF_SINGLESTEP,TI_flags(%ebp)
+no_singlestep:
+ movl TI_flags(%ebp), %ecx
+ testw $_TIF_ALLWORK_MASK, %cx # current->work
+ jne syscall_exit_work
+
+restore_all:
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
+ # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+ # are returning to the kernel.
+ # See comments in process.c:copy_thread() for details.
+ movb PT_OLDSS(%esp), %ah
+ movb PT_CS(%esp), %al
+ andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
+ CFI_REMEMBER_STATE
+ je ldt_ss # returning to user-space with LDT SS
+restore_nocheck:
+ TRACE_IRQS_IRET
+restore_nocheck_notrace:
+ RESTORE_REGS
+ addl $4, %esp # skip orig_eax/error_code
+ CFI_ADJUST_CFA_OFFSET -4
+1: INTERRUPT_RETURN
+.section .fixup,"ax"
+iret_exc:
+ pushl $0 # no error code
+ pushl $do_iret_error
+ jmp error_code
+.previous
+.section __ex_table,"a"
+ .align 4
+ .long 1b,iret_exc
+.previous
+
+ CFI_RESTORE_STATE
+ldt_ss:
+ larl PT_OLDSS(%esp), %eax
+ jnz restore_nocheck
+ testl $0x00400000, %eax # returning to 32bit stack?
+ jnz restore_nocheck # all right, normal return
+
+#ifdef CONFIG_PARAVIRT
+ /*
+ * The kernel can't run on a non-flat stack if paravirt mode
+ * is active. Rather than try to fixup the high bits of
+ * ESP, bypass this code entirely. This may break DOSemu
+ * and/or Wine support in a paravirt VM, although the option
+ * is still available to implement the setting of the high
+ * 16-bits in the INTERRUPT_RETURN paravirt-op.
+ */
+ cmpl $0, paravirt_ops+PARAVIRT_enabled
+ jne restore_nocheck
+#endif
+
+ /* If returning to userspace with 16bit stack,
+ * try to fix the higher word of ESP, as the CPU
+ * won't restore it.
+ * This is an "official" bug of all the x86-compatible
+ * CPUs, which we can try to work around to make
+ * dosemu and wine happy. */
+ movl PT_OLDESP(%esp), %eax
+ movl %esp, %edx
+ call patch_espfix_desc
+ pushl $__ESPFIX_SS
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ DISABLE_INTERRUPTS(CLBR_EAX)
+ TRACE_IRQS_OFF
+ lss (%esp), %esp
+ CFI_ADJUST_CFA_OFFSET -8
+ jmp restore_nocheck
+ CFI_ENDPROC
+ENDPROC(system_call)
+
+ # perform work that needs to be done immediately before resumption
+ ALIGN
+ RING0_PTREGS_FRAME # can't unwind into user space anyway
+work_pending:
+ testb $_TIF_NEED_RESCHED, %cl
+ jz work_notifysig
+work_resched:
+ call schedule
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
+ # than syscall tracing?
+ jz restore_all
+ testb $_TIF_NEED_RESCHED, %cl
+ jnz work_resched
+
+work_notifysig: # deal with pending signals and
+ # notify-resume requests
+#ifdef CONFIG_VM86
+ testl $VM_MASK, PT_EFLAGS(%esp)
+ movl %esp, %eax
+ jne work_notifysig_v86 # returning to kernel-space or
+ # vm86-space
+ xorl %edx, %edx
+ call do_notify_resume
+ jmp resume_userspace_sig
+
+ ALIGN
+work_notifysig_v86:
+ pushl %ecx # save ti_flags for do_notify_resume
+ CFI_ADJUST_CFA_OFFSET 4
+ call save_v86_state # %eax contains pt_regs pointer
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ movl %eax, %esp
+#else
+ movl %esp, %eax
+#endif
+ xorl %edx, %edx
+ call do_notify_resume
+ jmp resume_userspace_sig
+END(work_pending)
+
+ # perform syscall exit tracing
+ ALIGN
+syscall_trace_entry:
+ movl $-ENOSYS,PT_EAX(%esp)
+ movl %esp, %eax
+ xorl %edx,%edx
+ call do_syscall_trace
+ cmpl $0, %eax
+ jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
+ # so must skip actual syscall
+ movl PT_ORIG_EAX(%esp), %eax
+ cmpl $(nr_syscalls), %eax
+ jnae syscall_call
+ jmp syscall_exit
+END(syscall_trace_entry)
+
+ # perform syscall exit tracing
+ ALIGN
+syscall_exit_work:
+ testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
+ jz work_pending
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
+ # schedule() instead
+ movl %esp, %eax
+ movl $1, %edx
+ call do_syscall_trace
+ jmp resume_userspace
+END(syscall_exit_work)
+ CFI_ENDPROC
+
+ RING0_INT_FRAME # can't unwind into user space anyway
+syscall_fault:
+ pushl %eax # save orig_eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ GET_THREAD_INFO(%ebp)
+ movl $-EFAULT,PT_EAX(%esp)
+ jmp resume_userspace
+END(syscall_fault)
+
+syscall_badsys:
+ movl $-ENOSYS,PT_EAX(%esp)
+ jmp resume_userspace
+END(syscall_badsys)
+ CFI_ENDPROC
+
+#define FIXUP_ESPFIX_STACK \
+ /* since we are on the wrong stack, we can't do this in C :( */ \
+ PER_CPU(gdt_page, %ebx); \
+ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
+ addl %esp, %eax; \
+ pushl $__KERNEL_DS; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ pushl %eax; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ lss (%esp), %esp; \
+ CFI_ADJUST_CFA_OFFSET -8;
+#define UNWIND_ESPFIX_STACK \
+ movl %ss, %eax; \
+ /* see if on espfix stack */ \
+ cmpw $__ESPFIX_SS, %ax; \
+ jne 27f; \
+ movl $__KERNEL_DS, %eax; \
+ movl %eax, %ds; \
+ movl %eax, %es; \
+ /* switch to normal stack */ \
+ FIXUP_ESPFIX_STACK; \
+27:;
+
+/*
+ * Build the entry stubs and pointer table with
+ * some assembler magic.
+ */
+.data
+ENTRY(interrupt)
+.text
+
+ENTRY(irq_entries_start)
+ RING0_INT_FRAME
+vector=0
+.rept NR_IRQS
+ ALIGN
+ .if vector
+ CFI_ADJUST_CFA_OFFSET -4
+ .endif
+1: pushl $~(vector)
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp common_interrupt
+ .previous
+ .long 1b
+ .text
+vector=vector+1
+.endr
+END(irq_entries_start)
+
+.previous
+END(interrupt)
+.previous
+
+/*
+ * the CPU automatically disables interrupts when executing an IRQ vector,
+ * so IRQ-flags tracing has to follow that:
+ */
+ ALIGN
+common_interrupt:
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ movl %esp,%eax
+ call do_IRQ
+ jmp ret_from_intr
+ENDPROC(common_interrupt)
+ CFI_ENDPROC
+
+#define BUILD_INTERRUPT(name, nr) \
+ENTRY(name) \
+ RING0_INT_FRAME; \
+ pushl $~(nr); \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ SAVE_ALL; \
+ TRACE_IRQS_OFF \
+ movl %esp,%eax; \
+ call smp_##name; \
+ jmp ret_from_intr; \
+ CFI_ENDPROC; \
+ENDPROC(name)
+
+/* The include is where all of the SMP etc. interrupts come from */
+#include "entry_arch.h"
+
+KPROBE_ENTRY(page_fault)
+ RING0_EC_FRAME
+ pushl $do_page_fault
+ CFI_ADJUST_CFA_OFFSET 4
+ ALIGN
+error_code:
+ /* the function address is in %fs's slot on the stack */
+ pushl %es
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET es, 0*/
+ pushl %ds
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ds, 0*/
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eax, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+ pushl %edi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edi, 0
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ cld
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0*/
+ movl $(__KERNEL_PERCPU), %ecx
+ movl %ecx, %fs
+ UNWIND_ESPFIX_STACK
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_REGISTER es, ecx*/
+ movl PT_FS(%esp), %edi # get the function address
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+ mov %ecx, PT_FS(%esp)
+ /*CFI_REL_OFFSET fs, ES*/
+ movl $(__USER_DS), %ecx
+ movl %ecx, %ds
+ movl %ecx, %es
+ movl %esp,%eax # pt_regs pointer
+ call *%edi
+ jmp ret_from_exception
+ CFI_ENDPROC
+KPROBE_END(page_fault)
+
+ENTRY(coprocessor_error)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_coprocessor_error
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(coprocessor_error)
+
+ENTRY(simd_coprocessor_error)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_simd_coprocessor_error
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(simd_coprocessor_error)
+
+ENTRY(device_not_available)
+ RING0_INT_FRAME
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ GET_CR0_INTO_EAX
+ testl $0x4, %eax # EM (math emulation bit)
+ jne device_not_available_emulate
+ preempt_stop(CLBR_ANY)
+ call math_state_restore
+ jmp ret_from_exception
+device_not_available_emulate:
+ pushl $0 # temporary storage for ORIG_EIP
+ CFI_ADJUST_CFA_OFFSET 4
+ call math_emulate
+ addl $4, %esp
+ CFI_ADJUST_CFA_OFFSET -4
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(device_not_available)
+
+/*
+ * Debug traps and NMI can happen at the one SYSENTER instruction
+ * that sets up the real kernel stack. Check here, since we can't
+ * allow the wrong stack to be used.
+ *
+ * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
+ * already pushed 3 words if it hits on the sysenter instruction:
+ * eflags, cs and eip.
+ *
+ * We just load the right stack, and push the three (known) values
+ * by hand onto the new stack - while updating the return eip past
+ * the instruction that would have done it for sysenter.
+ */
+#define FIX_STACK(offset, ok, label) \
+ cmpw $__KERNEL_CS,4(%esp); \
+ jne ok; \
+label: \
+ movl TSS_sysenter_esp0+offset(%esp),%esp; \
+ CFI_DEF_CFA esp, 0; \
+ CFI_UNDEFINED eip; \
+ pushfl; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ pushl $__KERNEL_CS; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ pushl $sysenter_past_esp; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ CFI_REL_OFFSET eip, 0
+
+KPROBE_ENTRY(debug)
+ RING0_INT_FRAME
+ cmpl $sysenter_entry,(%esp)
+ jne debug_stack_correct
+ FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
+debug_stack_correct:
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ xorl %edx,%edx # error code 0
+ movl %esp,%eax # pt_regs pointer
+ call do_debug
+ jmp ret_from_exception
+ CFI_ENDPROC
+KPROBE_END(debug)
+
+/*
+ * NMI is doubly nasty. It can happen _while_ we're handling
+ * a debug fault, and the debug fault hasn't yet been able to
+ * clear up the stack. So we first check whether we got an
+ * NMI on the sysenter entry path, but after that we need to
+ * check whether we got an NMI on the debug path where the debug
+ * fault happened on the sysenter path.
+ */
+KPROBE_ENTRY(nmi)
+ RING0_INT_FRAME
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl %ss, %eax
+ cmpw $__ESPFIX_SS, %ax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ je nmi_espfix_stack
+ cmpl $sysenter_entry,(%esp)
+ je nmi_stack_fixup
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl %esp,%eax
+ /* Do not access memory above the end of our stack page,
+ * it might not exist.
+ */
+ andl $(THREAD_SIZE-1),%eax
+ cmpl $(THREAD_SIZE-20),%eax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ jae nmi_stack_correct
+ cmpl $sysenter_entry,12(%esp)
+ je nmi_debug_stack_check
+nmi_stack_correct:
+ /* We have a RING0_INT_FRAME here */
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_nmi
+ jmp restore_nocheck_notrace
+ CFI_ENDPROC
+
+nmi_stack_fixup:
+ RING0_INT_FRAME
+ FIX_STACK(12,nmi_stack_correct, 1)
+ jmp nmi_stack_correct
+
+nmi_debug_stack_check:
+ /* We have a RING0_INT_FRAME here */
+ cmpw $__KERNEL_CS,16(%esp)
+ jne nmi_stack_correct
+ cmpl $debug,(%esp)
+ jb nmi_stack_correct
+ cmpl $debug_esp_fix_insn,(%esp)
+ ja nmi_stack_correct
+ FIX_STACK(24,nmi_stack_correct, 1)
+ jmp nmi_stack_correct
+
+nmi_espfix_stack:
+ /* We have a RING0_INT_FRAME here.
+ *
+ * Create the ss:esp pointer that we will lss back through below.
+ */
+ pushl %ss
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %esp
+ CFI_ADJUST_CFA_OFFSET 4
+ addw $4, (%esp)
+ /* copy the iret frame of 12 bytes */
+ .rept 3
+ pushl 16(%esp)
+ CFI_ADJUST_CFA_OFFSET 4
+ .endr
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ FIXUP_ESPFIX_STACK # %eax == %esp
+ xorl %edx,%edx # zero error code
+ call do_nmi
+ RESTORE_REGS
+ lss 12+4(%esp), %esp # back to espfix stack
+ CFI_ADJUST_CFA_OFFSET -24
+1: INTERRUPT_RETURN
+ CFI_ENDPROC
+.section __ex_table,"a"
+ .align 4
+ .long 1b,iret_exc
+.previous
+KPROBE_END(nmi)
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_iret)
+1: iret
+.section __ex_table,"a"
+ .align 4
+ .long 1b,iret_exc
+.previous
+END(native_iret)
+
+ENTRY(native_irq_enable_sysexit)
+ sti
+ sysexit
+END(native_irq_enable_sysexit)
+#endif
+
+KPROBE_ENTRY(int3)
+ RING0_INT_FRAME
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_int3
+ jmp ret_from_exception
+ CFI_ENDPROC
+KPROBE_END(int3)
+
+ENTRY(overflow)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_overflow
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(overflow)
+
+ENTRY(bounds)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_bounds
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(bounds)
+
+ENTRY(invalid_op)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_invalid_op
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(invalid_op)
+
+ENTRY(coprocessor_segment_overrun)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_coprocessor_segment_overrun
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(coprocessor_segment_overrun)
+
+ENTRY(invalid_TSS)
+ RING0_EC_FRAME
+ pushl $do_invalid_TSS
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(invalid_TSS)
+
+ENTRY(segment_not_present)
+ RING0_EC_FRAME
+ pushl $do_segment_not_present
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(segment_not_present)
+
+ENTRY(stack_segment)
+ RING0_EC_FRAME
+ pushl $do_stack_segment
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(stack_segment)
+
+KPROBE_ENTRY(general_protection)
+ RING0_EC_FRAME
+ pushl $do_general_protection
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+KPROBE_END(general_protection)
+
+ENTRY(alignment_check)
+ RING0_EC_FRAME
+ pushl $do_alignment_check
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(alignment_check)
+
+ENTRY(divide_error)
+ RING0_INT_FRAME
+ pushl $0 # no error code
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_divide_error
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(divide_error)
+
+#ifdef CONFIG_X86_MCE
+ENTRY(machine_check)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl machine_check_vector
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(machine_check)
+#endif
+
+ENTRY(spurious_interrupt_bug)
+ RING0_INT_FRAME
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_spurious_interrupt_bug
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(spurious_interrupt_bug)
+
+ENTRY(kernel_thread_helper)
+ pushl $0 # fake return address for unwinder
+ CFI_STARTPROC
+ movl %edx,%eax
+ push %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ call *%ebx
+ push %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ call do_exit
+ CFI_ENDPROC
+ENDPROC(kernel_thread_helper)
+
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+ CFI_STARTPROC
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+
+ /* Check to see if we got the event in the critical
+ region in xen_iret_direct, after we've reenabled
+ events and checked for pending events. This simulates
+ iret instruction's behaviour where it delivers a
+ pending interrupt when enabling interrupts. */
+ movl PT_EIP(%esp),%eax
+ cmpl $xen_iret_start_crit,%eax
+ jb 1f
+ cmpl $xen_iret_end_crit,%eax
+ jae 1f
+
+ call xen_iret_crit_fixup
+
+1: mov %esp, %eax
+ call xen_evtchn_do_upcall
+ jmp ret_from_intr
+ CFI_ENDPROC
+ENDPROC(xen_hypervisor_callback)
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+# 1. Fault while reloading DS, ES, FS or GS
+# 2. Fault while executing IRET
+# Category 1 we fix up by reattempting the load, and zeroing the segment
+# register if the load fails.
+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by maintaining a status value in EAX.
+ENTRY(xen_failsafe_callback)
+ CFI_STARTPROC
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl $1,%eax
+1: mov 4(%esp),%ds
+2: mov 8(%esp),%es
+3: mov 12(%esp),%fs
+4: mov 16(%esp),%gs
+ testl %eax,%eax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ lea 16(%esp),%esp
+ CFI_ADJUST_CFA_OFFSET -16
+ jz 5f
+ addl $16,%esp
+ jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
+5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ jmp ret_from_exception
+ CFI_ENDPROC
+
+.section .fixup,"ax"
+6: xorl %eax,%eax
+ movl %eax,4(%esp)
+ jmp 1b
+7: xorl %eax,%eax
+ movl %eax,8(%esp)
+ jmp 2b
+8: xorl %eax,%eax
+ movl %eax,12(%esp)
+ jmp 3b
+9: xorl %eax,%eax
+ movl %eax,16(%esp)
+ jmp 4b
+.previous
+.section __ex_table,"a"
+ .align 4
+ .long 1b,6b
+ .long 2b,7b
+ .long 3b,8b
+ .long 4b,9b
+.previous
+ENDPROC(xen_failsafe_callback)
+
+#endif /* CONFIG_XEN */
+
+.section .rodata,"a"
+#include "syscall_table_32.S"
+
+syscall_table_size=(.-sys_call_table)
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
new file mode 100644
index 000000000000..41e8aec4c61d
--- /dev/null
+++ b/arch/x86/kernel/geode_32.c
@@ -0,0 +1,155 @@
+/*
+ * AMD Geode southbridge support code
+ * Copyright (C) 2006, Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ioport.h>
+#include <linux/io.h>
+#include <asm/msr.h>
+#include <asm/geode.h>
+
+static struct {
+ char *name;
+ u32 msr;
+ int size;
+ u32 base;
+} lbars[] = {
+ { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
+ { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
+ { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
+ { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
+};
+
+static void __init init_lbars(void)
+{
+ u32 lo, hi;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lbars); i++) {
+ rdmsr(lbars[i].msr, lo, hi);
+ if (hi & 0x01)
+ lbars[i].base = lo & 0x0000ffff;
+
+ if (lbars[i].base == 0)
+ printk(KERN_ERR "geode: Couldn't initialize '%s'\n",
+ lbars[i].name);
+ }
+}
+
+int geode_get_dev_base(unsigned int dev)
+{
+ BUG_ON(dev >= ARRAY_SIZE(lbars));
+ return lbars[dev].base;
+}
+EXPORT_SYMBOL_GPL(geode_get_dev_base);
+
+/* === GPIO API === */
+
+void geode_gpio_set(unsigned int gpio, unsigned int reg)
+{
+ u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
+
+ if (!base)
+ return;
+
+ if (gpio < 16)
+ outl(1 << gpio, base + reg);
+ else
+ outl(1 << (gpio - 16), base + 0x80 + reg);
+}
+EXPORT_SYMBOL_GPL(geode_gpio_set);
+
+void geode_gpio_clear(unsigned int gpio, unsigned int reg)
+{
+ u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
+
+ if (!base)
+ return;
+
+ if (gpio < 16)
+ outl(1 << (gpio + 16), base + reg);
+ else
+ outl(1 << gpio, base + 0x80 + reg);
+}
+EXPORT_SYMBOL_GPL(geode_gpio_clear);
+
+int geode_gpio_isset(unsigned int gpio, unsigned int reg)
+{
+ u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
+
+ if (!base)
+ return 0;
+
+ if (gpio < 16)
+ return (inl(base + reg) & (1 << gpio)) ? 1 : 0;
+ else
+ return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0;
+}
+EXPORT_SYMBOL_GPL(geode_gpio_isset);
+
+void geode_gpio_set_irq(unsigned int group, unsigned int irq)
+{
+ u32 lo, hi;
+
+ if (group > 7 || irq > 15)
+ return;
+
+ rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
+
+ lo &= ~(0xF << (group * 4));
+ lo |= (irq & 0xF) << (group * 4);
+
+ wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
+}
+EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
+
+void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
+{
+ u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
+ u32 offset, shift, val;
+
+ if (gpio >= 24)
+ offset = GPIO_MAP_W;
+ else if (gpio >= 16)
+ offset = GPIO_MAP_Z;
+ else if (gpio >= 8)
+ offset = GPIO_MAP_Y;
+ else
+ offset = GPIO_MAP_X;
+
+ shift = (gpio % 8) * 4;
+
+ val = inl(base + offset);
+
+ /* Clear whatever was there before */
+ val &= ~(0xF << shift);
+
+ /* And set the new value */
+
+ val |= ((pair & 7) << shift);
+
+ /* Set the PME bit if this is a PME event */
+
+ if (pme)
+ val |= (1 << (shift + 3));
+
+ outl(val, base + offset);
+}
+EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
+
+static int __init geode_southbridge_init(void)
+{
+ if (!is_geode())
+ return -ENODEV;
+
+ init_lbars();
+ return 0;
+}
+
+postcore_initcall(geode_southbridge_init);
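
A hedged usage sketch for the GPIO API above (not part of the patch). It
assumes the GPIO_OUTPUT_ENABLE and GPIO_OUTPUT_VAL register offsets from
<asm/geode.h>; because each register has separate set/clear halves, every
write below is atomic and no read-modify-write cycle is needed:

	#include <asm/geode.h>

	/* Drive an example pin as an output; the pin number is illustrative. */
	static void example_drive_gpio(unsigned int pin)
	{
		geode_gpio_set(pin, GPIO_OUTPUT_ENABLE);	/* make it an output */
		geode_gpio_set(pin, GPIO_OUTPUT_VAL);		/* drive it high */
		geode_gpio_clear(pin, GPIO_OUTPUT_VAL);		/* drive it low */
	}
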
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
new file mode 100644
index 000000000000..9150ca9b5f80
--- /dev/null
+++ b/arch/x86/kernel/head_32.S
@@ -0,0 +1,578 @@
+/*
+ * linux/arch/i386/kernel/head.S -- the 32-bit startup code.
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Enhanced CPU detection and feature setting code by Mike Jagdis
+ * and Martin Mares, November 1997.
+ */
+
+.text
+#include <linux/threads.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+#include <asm/asm-offsets.h>
+#include <asm/setup.h>
+
+/*
+ * References to members of the new_cpu_data structure.
+ */
+
+#define X86 new_cpu_data+CPUINFO_x86
+#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
+#define X86_MODEL new_cpu_data+CPUINFO_x86_model
+#define X86_MASK new_cpu_data+CPUINFO_x86_mask
+#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
+#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
+#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
+#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
+
+/*
+ * This is how much memory *in addition to the memory covered up to
+ * and including _end* we need mapped initially.
+ * We need:
+ * - one bit for each possible page, but only in low memory, which means
+ * 2^32/4096/8 = 128K worst case (4G/4G split.)
+ * - enough space to map all low memory, which means
+ * (2^32/4096) / 1024 pages (worst case, non PAE)
+ * (2^32/4096) / 512 + 4 pages (worst case for PAE)
+ * - a few pages for allocator use before the kernel pagetable has
+ * been set up
+ *
+ * Modulo rounding, each megabyte assigned here requires a kilobyte of
+ * memory, which is currently unreclaimed.
+ *
+ * This should be a multiple of a page.
+ */
+LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
+
+#if PTRS_PER_PMD > 1
+PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
+#else
+PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
+#endif
+BOOTBITMAP_SIZE = LOW_PAGES / 8
+ALLOCATOR_SLOP = 4
+
+INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
+
+/*
+ * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
+ * %esi points to the real-mode code as a 32-bit pointer.
+ * CS and DS must be 4 GB flat segments, but we don't depend on
+ * any particular GDT layout, because we load our own as soon as we
+ * can.
+ */
+.section .text.head,"ax",@progbits
+ENTRY(startup_32)
+
+/*
+ * Set segments to known values.
+ */
+ cld
+ lgdt boot_gdt_descr - __PAGE_OFFSET
+ movl $(__BOOT_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
+ movl %eax,%fs
+ movl %eax,%gs
+
+/*
+ * Clear BSS first so that there are no surprises...
+ * No need to cld as DF is already clear from cld above...
+ */
+ xorl %eax,%eax
+ movl $__bss_start - __PAGE_OFFSET,%edi
+ movl $__bss_stop - __PAGE_OFFSET,%ecx
+ subl %edi,%ecx
+ shrl $2,%ecx
+ rep ; stosl
+/*
+ * Copy bootup parameters out of the way.
+ * Note: %esi still has the pointer to the real-mode data.
+ * With kexec as the boot loader, the parameter segment might be loaded
+ * beyond the kernel image and might not even be addressable by the early
+ * boot page tables (the kexec-on-panic case). Hence copy out the
+ * parameters before initializing the page tables.
+ */
+ movl $(boot_params - __PAGE_OFFSET),%edi
+ movl $(PARAM_SIZE/4),%ecx
+ cld
+ rep
+ movsl
+ movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi
+ andl %esi,%esi
+ jnz 2f # New command line protocol
+ cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
+ jne 1f
+ movzwl OLD_CL_OFFSET,%esi
+ addl $(OLD_CL_BASE_ADDR),%esi
+2:
+ movl $(boot_command_line - __PAGE_OFFSET),%edi
+ movl $(COMMAND_LINE_SIZE/4),%ecx
+ rep
+ movsl
+1:
+
+/*
+ * Initialize page tables. This creates a PDE and a set of page
+ * tables, which are located immediately beyond _end. The variable
+ * init_pg_tables_end is set up to point to the first "safe" location.
+ * Mappings are created both at virtual address 0 (identity mapping)
+ * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
+ *
+ * Warning: don't use %esi or the stack in this code. However, %esp
+ * can be used as a GPR if you really need it...
+ */
+page_pde_offset = (__PAGE_OFFSET >> 20);
+
+ movl $(pg0 - __PAGE_OFFSET), %edi
+ movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
+ movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
+10:
+ leal 0x007(%edi),%ecx /* Create PDE entry */
+ movl %ecx,(%edx) /* Store identity PDE entry */
+ movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
+ addl $4,%edx
+ movl $1024, %ecx
+11:
+ stosl
+ addl $0x1000,%eax
+ loop 11b
+ /* End condition: we must map up to and including INIT_MAP_BEYOND_END */
+ /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
+ leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
+ cmpl %ebp,%eax
+ jb 10b
+ movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
+
+ xorl %ebx,%ebx /* This is the boot CPU (BSP) */
+ jmp 3f
+/*
+ * Non-boot CPU entry point; entered from trampoline.S
+ * We can't lgdt here, because lgdt itself uses a data segment, but
+ * we know the trampoline has already loaded the boot_gdt for us.
+ *
+ * If CPU hotplug is not supported then this code can go in the init
+ * section, which will be freed later.
+ */
+
+#ifndef CONFIG_HOTPLUG_CPU
+.section .init.text,"ax",@progbits
+#endif
+
+ /* Do an early initialization of the fixmap area */
+ movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
+ movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax
+ addl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
+ movl %eax, 4092(%edx)
+
+#ifdef CONFIG_SMP
+ENTRY(startup_32_smp)
+ cld
+ movl $(__BOOT_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
+ movl %eax,%fs
+ movl %eax,%gs
+
+/*
+ * New page tables may be in 4Mbyte page mode and may
+ * be using the global pages.
+ *
+ * NOTE! If we are on a 486 we may have no cr4 at all!
+ * So we do not try to touch it unless we really have
+ * some bits in it to set. This won't work if the BSP
+ * implements cr4 but this AP does not -- very unlikely
+ * but be warned! The same applies to the pse feature
+ * if not equally supported. --macro
+ *
+ * NOTE! We have to correct for the fact that we're
+ * not yet offset PAGE_OFFSET..
+ */
+#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
+ movl cr4_bits,%edx
+ andl %edx,%edx
+ jz 6f
+ movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
+ orl %edx,%eax
+ movl %eax,%cr4
+
+ btl $5, %eax # check if PAE is enabled
+ jnc 6f
+
+ /* Check if extended functions are implemented */
+ movl $0x80000000, %eax
+ cpuid
+ cmpl $0x80000000, %eax
+ jbe 6f
+ mov $0x80000001, %eax
+ cpuid
+ /* Execute Disable bit supported? */
+ btl $20, %edx
+ jnc 6f
+
+ /* Setup EFER (Extended Feature Enable Register) */
+ movl $0xc0000080, %ecx
+ rdmsr
+
+ btsl $11, %eax
+ /* Make changes effective */
+ wrmsr
+
+6:
+ /* This is a secondary processor (AP) */
+ xorl %ebx,%ebx
+ incl %ebx
+
+#endif /* CONFIG_SMP */
+3:
+
+/*
+ * Enable paging
+ */
+ movl $swapper_pg_dir-__PAGE_OFFSET,%eax
+ movl %eax,%cr3 /* set the page table pointer.. */
+ movl %cr0,%eax
+ orl $0x80000000,%eax
+ movl %eax,%cr0 /* ..and set paging (PG) bit */
+ ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
+1:
+ /* Set up the stack pointer */
+ lss stack_start,%esp
+
+/*
+ * Initialize eflags. Some BIOSes leave bits like NT set. This would
+ * confuse the debugger if this code is traced.
+ * XXX - best to initialize before switching to protected mode.
+ */
+ pushl $0
+ popfl
+
+#ifdef CONFIG_SMP
+ andl %ebx,%ebx
+ jz 1f /* Initial CPU cleans BSS */
+ jmp checkCPUtype
+1:
+#endif /* CONFIG_SMP */
+
+/*
+ * start system 32-bit setup. We need to re-do some of the things done
+ * in 16-bit mode for the "real" operations.
+ */
+ call setup_idt
+
+checkCPUtype:
+
+ movl $-1,X86_CPUID # -1 for no CPUID initially
+
+/* check if it is 486 or 386. */
+/*
+ * XXX - this does a lot of unnecessary setup. Alignment checks don't
+ * apply at our cpl of 0 and the stack ought to be aligned already, and
+ * we don't need to preserve eflags.
+ */
+
+ movb $3,X86 # at least 386
+ pushfl # push EFLAGS
+ popl %eax # get EFLAGS
+ movl %eax,%ecx # save original EFLAGS
+ xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
+ pushl %eax # copy to EFLAGS
+ popfl # set EFLAGS
+ pushfl # get new EFLAGS
+ popl %eax # put it in eax
+ xorl %ecx,%eax # change in flags
+ pushl %ecx # restore original EFLAGS
+ popfl
+ testl $0x40000,%eax # check if AC bit changed
+ je is386
+
+ movb $4,X86 # at least 486
+ testl $0x200000,%eax # check if ID bit changed
+ je is486
+
+ /* get vendor info */
+ xorl %eax,%eax # call CPUID with 0 -> return vendor ID
+ cpuid
+ movl %eax,X86_CPUID # save CPUID level
+ movl %ebx,X86_VENDOR_ID # lo 4 chars
+ movl %edx,X86_VENDOR_ID+4 # next 4 chars
+ movl %ecx,X86_VENDOR_ID+8 # last 4 chars
+
+ orl %eax,%eax # do we have processor info as well?
+ je is486
+
+ movl $1,%eax # Use the CPUID instruction to get CPU type
+ cpuid
+ movb %al,%cl # save reg for future use
+ andb $0x0f,%ah # mask processor family
+ movb %ah,X86
+ andb $0xf0,%al # mask model
+ shrb $4,%al
+ movb %al,X86_MODEL
+ andb $0x0f,%cl # mask the mask (stepping) revision
+ movb %cl,X86_MASK
+ movl %edx,X86_CAPABILITY
+
+is486: movl $0x50022,%ecx # set AM, WP, NE and MP
+ jmp 2f
+
+is386: movl $2,%ecx # set MP
+2: movl %cr0,%eax
+ andl $0x80000011,%eax # Save PG,PE,ET
+ orl %ecx,%eax
+ movl %eax,%cr0
+
+ call check_x87
+ lgdt early_gdt_descr
+ lidt idt_descr
+ ljmp $(__KERNEL_CS),$1f
+1: movl $(__KERNEL_DS),%eax # reload all the segment registers
+ movl %eax,%ss # after changing gdt.
+ movl %eax,%fs # gets reset once there's real percpu
+
+ movl $(__USER_DS),%eax # DS/ES contains default USER segment
+ movl %eax,%ds
+ movl %eax,%es
+
+ xorl %eax,%eax # Clear GS and LDT
+ movl %eax,%gs
+ lldt %ax
+
+ cld # gcc2 wants the direction flag cleared at all times
+ pushl $0 # fake return address for unwinder
+#ifdef CONFIG_SMP
+ movb ready, %cl
+ movb $1, ready
+ cmpb $0,%cl # the first CPU calls start_kernel
+ je 1f
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax,%fs # set this cpu's percpu
+ jmp initialize_secondary # all other CPUs call initialize_secondary
+1:
+#endif /* CONFIG_SMP */
+ jmp start_kernel
+
+/*
+ * We depend on ET to be correct. This checks for 287/387.
+ */
+check_x87:
+ movb $0,X86_HARD_MATH
+ clts
+ fninit
+ fstsw %ax
+ cmpb $0,%al
+ je 1f
+ movl %cr0,%eax /* no coprocessor: have to set bits */
+ xorl $4,%eax /* set EM */
+ movl %eax,%cr0
+ ret
+ ALIGN
+1: movb $1,X86_HARD_MATH
+ .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
+ ret
+
+/*
+ * setup_idt
+ *
+ * Sets up an IDT with 256 interrupt-gate entries, all pointing to
+ * ignore_int. It doesn't actually load the IDT - that can be done
+ * only after paging has been enabled and the kernel moved to
+ * PAGE_OFFSET. Interrupts are enabled elsewhere, when we can be
+ * relatively sure everything is ok.
+ *
+ * Warning: %esi is live across this function.
+ */
+setup_idt:
+ lea ignore_int,%edx
+ movl $(__KERNEL_CS << 16),%eax
+ movw %dx,%ax /* selector = 0x0010 = cs */
+ movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
+
+ lea idt_table,%edi
+ mov $256,%ecx
+rp_sidt:
+ movl %eax,(%edi)
+ movl %edx,4(%edi)
+ addl $8,%edi
+ dec %ecx
+ jne rp_sidt
+
+.macro set_early_handler handler,trapno
+ lea \handler,%edx
+ movl $(__KERNEL_CS << 16),%eax
+ movw %dx,%ax
+ movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
+ lea idt_table,%edi
+ movl %eax,8*\trapno(%edi)
+ movl %edx,8*\trapno+4(%edi)
+.endm
+
+ set_early_handler handler=early_divide_err,trapno=0
+ set_early_handler handler=early_illegal_opcode,trapno=6
+ set_early_handler handler=early_protection_fault,trapno=13
+ set_early_handler handler=early_page_fault,trapno=14
+
+ ret
+
+early_divide_err:
+ xor %edx,%edx
+ pushl $0 /* fake errcode */
+ jmp early_fault
+
+early_illegal_opcode:
+ movl $6,%edx
+ pushl $0 /* fake errcode */
+ jmp early_fault
+
+early_protection_fault:
+ movl $13,%edx
+ jmp early_fault
+
+early_page_fault:
+ movl $14,%edx
+ jmp early_fault
+
+early_fault:
+ cld
+#ifdef CONFIG_PRINTK
+ movl $(__KERNEL_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
+ cmpl $2,early_recursion_flag
+ je hlt_loop
+ incl early_recursion_flag
+ movl %cr2,%eax
+ pushl %eax
+ pushl %edx /* trapno */
+ pushl $fault_msg
+#ifdef CONFIG_EARLY_PRINTK
+ call early_printk
+#else
+ call printk
+#endif
+#endif
+hlt_loop:
+ hlt
+ jmp hlt_loop
+
+/* This is the default interrupt "handler" :-) */
+ ALIGN
+ignore_int:
+ cld
+#ifdef CONFIG_PRINTK
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ pushl %es
+ pushl %ds
+ movl $(__KERNEL_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
+ cmpl $2,early_recursion_flag
+ je hlt_loop
+ incl early_recursion_flag
+ pushl 16(%esp)
+ pushl 24(%esp)
+ pushl 32(%esp)
+ pushl 40(%esp)
+ pushl $int_msg
+#ifdef CONFIG_EARLY_PRINTK
+ call early_printk
+#else
+ call printk
+#endif
+ addl $(5*4),%esp
+ popl %ds
+ popl %es
+ popl %edx
+ popl %ecx
+ popl %eax
+#endif
+ iret
+
+.section .text
+/*
+ * Real beginning of normal "text" segment
+ */
+ENTRY(stext)
+ENTRY(_stext)
+
+/*
+ * BSS section
+ */
+.section ".bss.page_aligned","wa"
+ .align PAGE_SIZE_asm
+ENTRY(swapper_pg_dir)
+ .fill 1024,4,0
+ENTRY(swapper_pg_pmd)
+ .fill 1024,4,0
+ENTRY(empty_zero_page)
+ .fill 4096,1,0
+
+/*
+ * This starts the data section.
+ */
+.data
+ENTRY(stack_start)
+ .long init_thread_union+THREAD_SIZE
+ .long __BOOT_DS
+
+ready: .byte 0
+
+early_recursion_flag:
+ .long 0
+
+int_msg:
+ .asciz "Unknown interrupt or fault at EIP %p %p %p\n"
+
+fault_msg:
+ .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
+ .asciz "Stack: %p %p %p %p %p %p %p %p\n"
+
+#include "../../x86/xen/xen-head.S"
+
+/*
+ * The IDT and GDT 'descriptors' are strange 48-bit objects
+ * only used by the lidt and lgdt instructions. They are not
+ * like usual segment descriptors - they consist of a 16-bit
+ * segment size and a 32-bit linear address value:
+ */
+
+.globl boot_gdt_descr
+.globl idt_descr
+
+ ALIGN
+# early boot GDT descriptor (must use 1:1 address mapping)
+ .word 0 # 32 bit align gdt_desc.address
+boot_gdt_descr:
+ .word __BOOT_DS+7
+ .long boot_gdt - __PAGE_OFFSET
+
+ .word 0 # 32-bit align idt_desc.address
+idt_descr:
+ .word IDT_ENTRIES*8-1 # idt contains 256 entries
+ .long idt_table
+
+# boot GDT descriptor (later on used by CPU#0):
+ .word 0 # 32 bit align gdt_desc.address
+ENTRY(early_gdt_descr)
+ .word GDT_ENTRIES*8-1
+ .long per_cpu__gdt_page /* Overwritten for secondary CPUs */
+
+/*
+ * The boot_gdt must mirror the equivalent in setup.S and is
+ * used only for booting.
+ */
+ .align L1_CACHE_BYTES
+ENTRY(boot_gdt)
+ .fill GDT_ENTRY_BOOT_CS,8,0
+ .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
+ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
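
The 48-bit 'descriptor' layout documented near the end of head_32.S maps
onto a packed two-member C struct. A sketch with illustrative names (the
kernel keeps its own equivalent in <asm/desc.h>):

	struct pseudo_descriptor {
		unsigned short size;	/* table limit: size in bytes, minus 1 */
		unsigned long address;	/* 32-bit linear address of the table */
	} __attribute__((packed));	/* lgdt/lidt consume exactly 6 bytes */
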
diff --git a/arch/x86/kernel/hpet_32.c b/arch/x86/kernel/hpet_32.c
new file mode 100644
index 000000000000..533d4932bc79
--- /dev/null
+++ b/arch/x86/kernel/hpet_32.c
@@ -0,0 +1,553 @@
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/errno.h>
+#include <linux/hpet.h>
+#include <linux/init.h>
+#include <linux/sysdev.h>
+#include <linux/pm.h>
+#include <linux/delay.h>
+
+#include <asm/hpet.h>
+#include <asm/io.h>
+
+extern struct clock_event_device *global_clock_event;
+
+#define HPET_MASK CLOCKSOURCE_MASK(32)
+#define HPET_SHIFT 22
+
+/* FSEC = 10^-15 sec, NSEC = 10^-9 sec */
+#define FSEC_PER_NSEC 1000000
+
+/*
+ * HPET address is set in acpi/boot.c, when an ACPI entry exists
+ */
+unsigned long hpet_address;
+static void __iomem *hpet_virt_address;
+
+static inline unsigned long hpet_readl(unsigned long a)
+{
+ return readl(hpet_virt_address + a);
+}
+
+static inline void hpet_writel(unsigned long d, unsigned long a)
+{
+ writel(d, hpet_virt_address + a);
+}
+
+/*
+ * HPET command line enable / disable
+ */
+static int boot_hpet_disable;
+
+static int __init hpet_setup(char* str)
+{
+ if (str) {
+ if (!strncmp("disable", str, 7))
+ boot_hpet_disable = 1;
+ }
+ return 1;
+}
+__setup("hpet=", hpet_setup);
+
+static inline int is_hpet_capable(void)
+{
+ return (!boot_hpet_disable && hpet_address);
+}
+
+/*
+ * HPET timer interrupt enable / disable
+ */
+static int hpet_legacy_int_enabled;
+
+/**
+ * is_hpet_enabled - check whether the hpet timer interrupt is enabled
+ */
+int is_hpet_enabled(void)
+{
+ return is_hpet_capable() && hpet_legacy_int_enabled;
+}
+
+/*
+ * When the hpet driver (/dev/hpet) is enabled, we need to reserve
+ * timer 0 and timer 1 in case of RTC emulation.
+ */
+#ifdef CONFIG_HPET
+static void hpet_reserve_platform_timers(unsigned long id)
+{
+ struct hpet __iomem *hpet = hpet_virt_address;
+ struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
+ unsigned int nrtimers, i;
+ struct hpet_data hd;
+
+ nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+
+ memset(&hd, 0, sizeof (hd));
+ hd.hd_phys_address = hpet_address;
+ hd.hd_address = hpet_virt_address;
+ hd.hd_nirqs = nrtimers;
+ hd.hd_flags = HPET_DATA_PLATFORM;
+ hpet_reserve_timer(&hd, 0);
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+ hpet_reserve_timer(&hd, 1);
+#endif
+
+ hd.hd_irq[0] = HPET_LEGACY_8254;
+ hd.hd_irq[1] = HPET_LEGACY_RTC;
+
+ for (i = 2; i < nrtimers; timer++, i++)
+ hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
+ Tn_INT_ROUTE_CNF_SHIFT;
+
+ hpet_alloc(&hd);
+
+}
+#else
+static void hpet_reserve_platform_timers(unsigned long id) { }
+#endif
+
+/*
+ * Common hpet info
+ */
+static unsigned long hpet_period;
+
+static void hpet_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt);
+static int hpet_next_event(unsigned long delta,
+ struct clock_event_device *evt);
+
+/*
+ * The hpet clock event device
+ */
+static struct clock_event_device hpet_clockevent = {
+ .name = "hpet",
+ .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+ .set_mode = hpet_set_mode,
+ .set_next_event = hpet_next_event,
+ .shift = 32,
+ .irq = 0,
+};
+
+static void hpet_start_counter(void)
+{
+ unsigned long cfg = hpet_readl(HPET_CFG);
+
+ cfg &= ~HPET_CFG_ENABLE;
+ hpet_writel(cfg, HPET_CFG);
+ hpet_writel(0, HPET_COUNTER);
+ hpet_writel(0, HPET_COUNTER + 4);
+ cfg |= HPET_CFG_ENABLE;
+ hpet_writel(cfg, HPET_CFG);
+}
+
+static void hpet_enable_int(void)
+{
+ unsigned long cfg = hpet_readl(HPET_CFG);
+
+ cfg |= HPET_CFG_LEGACY;
+ hpet_writel(cfg, HPET_CFG);
+ hpet_legacy_int_enabled = 1;
+}
+
+static void hpet_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ unsigned long cfg, cmp, now;
+ uint64_t delta;
+
+ switch(mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
+ delta >>= hpet_clockevent.shift;
+ now = hpet_readl(HPET_COUNTER);
+ cmp = now + (unsigned long) delta;
+ cfg = hpet_readl(HPET_T0_CFG);
+ cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
+ HPET_TN_SETVAL | HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_T0_CFG);
+ /*
+ * The first write after writing TN_SETVAL to the
+ * config register sets the counter value, the second
+ * write sets the period.
+ */
+ hpet_writel(cmp, HPET_T0_CMP);
+ udelay(1);
+ hpet_writel((unsigned long) delta, HPET_T0_CMP);
+ break;
+
+ case CLOCK_EVT_MODE_ONESHOT:
+ cfg = hpet_readl(HPET_T0_CFG);
+ cfg &= ~HPET_TN_PERIODIC;
+ cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_T0_CFG);
+ break;
+
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ cfg = hpet_readl(HPET_T0_CFG);
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_T0_CFG);
+ break;
+
+ case CLOCK_EVT_MODE_RESUME:
+ hpet_enable_int();
+ break;
+ }
+}
+
+static int hpet_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ unsigned long cnt;
+
+ cnt = hpet_readl(HPET_COUNTER);
+ cnt += delta;
+ hpet_writel(cnt, HPET_T0_CMP);
+
+ return ((long)(hpet_readl(HPET_COUNTER) - cnt) > 0) ? -ETIME : 0;
+}
+
+/*
+ * Clock source related code
+ */
+static cycle_t read_hpet(void)
+{
+ return (cycle_t)hpet_readl(HPET_COUNTER);
+}
+
+static struct clocksource clocksource_hpet = {
+ .name = "hpet",
+ .rating = 250,
+ .read = read_hpet,
+ .mask = HPET_MASK,
+ .shift = HPET_SHIFT,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .resume = hpet_start_counter,
+};
+
+/*
+ * Try to setup the HPET timer
+ */
+int __init hpet_enable(void)
+{
+ unsigned long id;
+ uint64_t hpet_freq;
+ u64 tmp, start, now;
+ cycle_t t1;
+
+ if (!is_hpet_capable())
+ return 0;
+
+ hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+
+ /*
+ * Read the period and check for a sane value:
+ */
+ hpet_period = hpet_readl(HPET_PERIOD);
+ if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
+ goto out_nohpet;
+
+ /*
+ * The period is a femtosecond value. We need to calculate the
+ * scaled math multiplication factor for the nanosecond to hpet
+ * tick conversion.
+ */
+ hpet_freq = 1000000000000000ULL;
+ do_div(hpet_freq, hpet_period);
+ hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
+ NSEC_PER_SEC, 32);
+ /* Calculate the min / max delta */
+ hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
+ &hpet_clockevent);
+ hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30,
+ &hpet_clockevent);
+
+ /*
+ * Read the HPET ID register to retrieve the IRQ routing
+ * information and the number of channels
+ */
+ id = hpet_readl(HPET_ID);
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+ /*
+ * The legacy routing mode needs at least two channels, tick timer
+ * and the rtc emulation channel.
+ */
+ if (!(id & HPET_ID_NUMBER))
+ goto out_nohpet;
+#endif
+
+ /* Start the counter */
+ hpet_start_counter();
+
+ /* Verify whether hpet counter works */
+ t1 = read_hpet();
+ rdtscll(start);
+
+ /*
+ * We don't know the TSC frequency yet, but waiting for
+ * 200000 TSC cycles is safe:
+ * 4 GHz == 50us
+ * 1 GHz == 200us
+ */
+ do {
+ rep_nop();
+ rdtscll(now);
+ } while ((now - start) < 200000UL);
+
+ if (t1 == read_hpet()) {
+ printk(KERN_WARNING
+ "HPET counter not counting. HPET disabled\n");
+ goto out_nohpet;
+ }
+
+ /* Initialize and register HPET clocksource
+ *
+ * hpet period is in femtoseconds per cycle,
+ * so we need to convert this to ns/cyc units
+ * approximated by mult/2^shift
+ *
+ * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
+ * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
+ * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
+ * (fsec/cyc << shift)/1000000 = mult
+ * (hpet_period << shift)/FSEC_PER_NSEC = mult
+ */
+ tmp = (u64)hpet_period << HPET_SHIFT;
+ do_div(tmp, FSEC_PER_NSEC);
+ clocksource_hpet.mult = (u32)tmp;
+
+ clocksource_register(&clocksource_hpet);
+
+ if (id & HPET_ID_LEGSUP) {
+ hpet_enable_int();
+ hpet_reserve_platform_timers(id);
+ /*
+ * Start hpet with the boot cpu mask and make it
+ * global after the IO_APIC has been initialized.
+ */
+ hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+ clockevents_register_device(&hpet_clockevent);
+ global_clock_event = &hpet_clockevent;
+ return 1;
+ }
+ return 0;
+
+out_nohpet:
+ iounmap(hpet_virt_address);
+ hpet_virt_address = NULL;
+ boot_hpet_disable = 1;
+ return 0;
+}
+
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+
+/* HPET in LegacyReplacement mode eats up the RTC interrupt line. When HPET
+ * is enabled, we support RTC interrupt functionality in software.
+ * RTC has 3 kinds of interrupts:
+ * 1) Update Interrupt - generate an interrupt every second, when the RTC
+ *    clock is updated
+ * 2) Alarm Interrupt - generate an interrupt at a specific time of day
+ * 3) Periodic Interrupt - generate periodic interrupts, with frequencies
+ *    2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
+ * (1) and (2) above are implemented using polling at a frequency of
+ * 64 Hz (DEFAULT_RTC_INT_FREQ). The exact frequency is a tradeoff between
+ * accuracy and interrupt overhead.
+ * For (3), we use interrupts at 64 Hz or the user-specified periodic
+ * frequency, whichever is higher.
+ */
+#include <linux/mc146818rtc.h>
+#include <linux/rtc.h>
+
+#define DEFAULT_RTC_INT_FREQ 64
+#define DEFAULT_RTC_SHIFT 6
+#define RTC_NUM_INTS 1
+
+static unsigned long hpet_rtc_flags;
+static unsigned long hpet_prev_update_sec;
+static struct rtc_time hpet_alarm_time;
+static unsigned long hpet_pie_count;
+static unsigned long hpet_t1_cmp;
+static unsigned long hpet_default_delta;
+static unsigned long hpet_pie_delta;
+static unsigned long hpet_pie_limit;
+
+/*
+ * Timer 1 for RTC emulation. We use one-shot mode, as periodic mode
+ * is not supported by all HPET implementations for timer 1.
+ *
+ * hpet_rtc_timer_init() is called when the rtc is initialized.
+ */
+int hpet_rtc_timer_init(void)
+{
+ unsigned long cfg, cnt, delta, flags;
+
+ if (!is_hpet_enabled())
+ return 0;
+
+ if (!hpet_default_delta) {
+ uint64_t clc;
+
+ clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+ clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
+ hpet_default_delta = (unsigned long) clc;
+ }
+
+ if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
+ delta = hpet_default_delta;
+ else
+ delta = hpet_pie_delta;
+
+ local_irq_save(flags);
+
+ cnt = delta + hpet_readl(HPET_COUNTER);
+ hpet_writel(cnt, HPET_T1_CMP);
+ hpet_t1_cmp = cnt;
+
+ cfg = hpet_readl(HPET_T1_CFG);
+ cfg &= ~HPET_TN_PERIODIC;
+ cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_T1_CFG);
+
+ local_irq_restore(flags);
+
+ return 1;
+}
+
+/*
+ * The functions below are called from rtc driver.
+ * Return 0 if HPET is not being used.
+ * Otherwise do the necessary changes and return 1.
+ */
+int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
+{
+ if (!is_hpet_enabled())
+ return 0;
+
+ hpet_rtc_flags &= ~bit_mask;
+ return 1;
+}
+
+int hpet_set_rtc_irq_bit(unsigned long bit_mask)
+{
+ unsigned long oldbits = hpet_rtc_flags;
+
+ if (!is_hpet_enabled())
+ return 0;
+
+ hpet_rtc_flags |= bit_mask;
+
+ if (!oldbits)
+ hpet_rtc_timer_init();
+
+ return 1;
+}
+
+int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
+ unsigned char sec)
+{
+ if (!is_hpet_enabled())
+ return 0;
+
+ hpet_alarm_time.tm_hour = hrs;
+ hpet_alarm_time.tm_min = min;
+ hpet_alarm_time.tm_sec = sec;
+
+ return 1;
+}
+
+int hpet_set_periodic_freq(unsigned long freq)
+{
+ uint64_t clc;
+
+ if (!is_hpet_enabled())
+ return 0;
+
+ if (freq <= DEFAULT_RTC_INT_FREQ)
+ hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
+ else {
+ clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+ do_div(clc, freq);
+ clc >>= hpet_clockevent.shift;
+ hpet_pie_delta = (unsigned long) clc;
+ }
+ return 1;
+}
+
+int hpet_rtc_dropped_irq(void)
+{
+ return is_hpet_enabled();
+}
+
+static void hpet_rtc_timer_reinit(void)
+{
+ unsigned long cfg, delta;
+ int lost_ints = -1;
+
+ if (unlikely(!hpet_rtc_flags)) {
+ cfg = hpet_readl(HPET_T1_CFG);
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_T1_CFG);
+ return;
+ }
+
+ if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
+ delta = hpet_default_delta;
+ else
+ delta = hpet_pie_delta;
+
+ /*
+ * Increment the comparator value until we are ahead of the
+ * current count.
+ */
+ do {
+ hpet_t1_cmp += delta;
+ hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+ lost_ints++;
+ } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0);
+
+ if (lost_ints) {
+ if (hpet_rtc_flags & RTC_PIE)
+ hpet_pie_count += lost_ints;
+ if (printk_ratelimit())
+ printk(KERN_WARNING "rtc: lost %d interrupts\n",
+ lost_ints);
+ }
+}
+
+irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
+{
+ struct rtc_time curr_time;
+ unsigned long rtc_int_flag = 0;
+
+ hpet_rtc_timer_reinit();
+
+ if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
+ rtc_get_rtc_time(&curr_time);
+
+ if (hpet_rtc_flags & RTC_UIE &&
+ curr_time.tm_sec != hpet_prev_update_sec) {
+ rtc_int_flag = RTC_UF;
+ hpet_prev_update_sec = curr_time.tm_sec;
+ }
+
+ if (hpet_rtc_flags & RTC_PIE &&
+ ++hpet_pie_count >= hpet_pie_limit) {
+ rtc_int_flag |= RTC_PF;
+ hpet_pie_count = 0;
+ }
+
+ if (hpet_rtc_flags & RTC_AIE &&
+ (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
+ (curr_time.tm_min == hpet_alarm_time.tm_min) &&
+ (curr_time.tm_hour == hpet_alarm_time.tm_hour))
+ rtc_int_flag |= RTC_AF;
+
+ if (rtc_int_flag) {
+ rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
+ rtc_interrupt(rtc_int_flag, dev_id);
+ }
+ return IRQ_HANDLED;
+}
+#endif
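
A standalone, hedged rework of the mult/shift arithmetic in hpet_enable()
above. The period value is only an example (a 14.318 MHz HPET); everything
else follows the (hpet_period << shift) / FSEC_PER_NSEC formula from the
comment block:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t hpet_period = 69841279;	/* fs per tick, ~14.318 MHz */
		unsigned int shift = 22;		/* HPET_SHIFT */
		uint32_t mult = (uint32_t)((hpet_period << shift) / 1000000);

		/* ns = (ticks * mult) >> shift; 1000 ticks is ~69841 ns */
		printf("mult=%u, 1000 ticks -> %llu ns\n", mult,
		       (unsigned long long)((1000ULL * mult) >> shift));
		return 0;
	}
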
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
new file mode 100644
index 000000000000..e3d4b73bfdb0
--- /dev/null
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -0,0 +1,30 @@
+#include <linux/module.h>
+#include <asm/checksum.h>
+#include <asm/desc.h>
+
+EXPORT_SYMBOL(__down_failed);
+EXPORT_SYMBOL(__down_failed_interruptible);
+EXPORT_SYMBOL(__down_failed_trylock);
+EXPORT_SYMBOL(__up_wakeup);
+/* Networking helper routines. */
+EXPORT_SYMBOL(csum_partial_copy_generic);
+
+EXPORT_SYMBOL(__get_user_1);
+EXPORT_SYMBOL(__get_user_2);
+EXPORT_SYMBOL(__get_user_4);
+
+EXPORT_SYMBOL(__put_user_1);
+EXPORT_SYMBOL(__put_user_2);
+EXPORT_SYMBOL(__put_user_4);
+EXPORT_SYMBOL(__put_user_8);
+
+EXPORT_SYMBOL(strstr);
+
+#ifdef CONFIG_SMP
+extern void FASTCALL(__write_lock_failed(rwlock_t *rw));
+extern void FASTCALL(__read_lock_failed(rwlock_t *rw));
+EXPORT_SYMBOL(__write_lock_failed);
+EXPORT_SYMBOL(__read_lock_failed);
+#endif
+
+EXPORT_SYMBOL(csum_partial);
diff --git a/arch/x86/kernel/i387_32.c b/arch/x86/kernel/i387_32.c
new file mode 100644
index 000000000000..665847281ed2
--- /dev/null
+++ b/arch/x86/kernel/i387_32.c
@@ -0,0 +1,546 @@
+/*
+ * linux/arch/i386/kernel/i387.c
+ *
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/math_emu.h>
+#include <asm/sigcontext.h>
+#include <asm/user.h>
+#include <asm/ptrace.h>
+#include <asm/uaccess.h>
+
+#ifdef CONFIG_MATH_EMULATION
+#define HAVE_HWFP (boot_cpu_data.hard_math)
+#else
+#define HAVE_HWFP 1
+#endif
+
+static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff;
+
+void mxcsr_feature_mask_init(void)
+{
+ unsigned long mask = 0;
+ clts();
+ if (cpu_has_fxsr) {
+ memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
+ asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
+ mask = current->thread.i387.fxsave.mxcsr_mask;
+ if (mask == 0) mask = 0x0000ffbf;
+ }
+ mxcsr_feature_mask &= mask;
+ stts();
+}
+
+/*
+ * The _current_ task is using the FPU for the first time,
+ * so initialize it and set the mxcsr to its default value
+ * at reset if we support XMM instructions, and then
+ * remember that the current task has used the FPU.
+ */
+void init_fpu(struct task_struct *tsk)
+{
+ if (cpu_has_fxsr) {
+ memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
+ tsk->thread.i387.fxsave.cwd = 0x37f;
+ if (cpu_has_xmm)
+ tsk->thread.i387.fxsave.mxcsr = 0x1f80;
+ } else {
+ memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct));
+ tsk->thread.i387.fsave.cwd = 0xffff037fu;
+ tsk->thread.i387.fsave.swd = 0xffff0000u;
+ tsk->thread.i387.fsave.twd = 0xffffffffu;
+ tsk->thread.i387.fsave.fos = 0xffff0000u;
+ }
+ /* only the device not available exception or ptrace can call init_fpu */
+ set_stopped_child_used_math(tsk);
+}
+
+/*
+ * FPU lazy state save handling.
+ */
+
+void kernel_fpu_begin(void)
+{
+ struct thread_info *thread = current_thread_info();
+
+ preempt_disable();
+ if (thread->status & TS_USEDFPU) {
+ __save_init_fpu(thread->task);
+ return;
+ }
+ clts();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+
+/*
+ * FPU tag word conversions.
+ */
+
+static inline unsigned short twd_i387_to_fxsr( unsigned short twd )
+{
+ unsigned int tmp; /* to avoid 16 bit prefixes in the code */
+
+ /* Transform each pair of bits into 01 (valid) or 00 (empty) */
+ tmp = ~twd;
+ tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
+ /* and move the valid bits to the lower byte. */
+ tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
+ tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
+ tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
+ return tmp;
+}
+
+static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave )
+{
+ struct _fpxreg *st = NULL;
+ unsigned long tos = (fxsave->swd >> 11) & 7;
+ unsigned long twd = (unsigned long) fxsave->twd;
+ unsigned long tag;
+ unsigned long ret = 0xffff0000u;
+ int i;
+
+#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16)
+
+ for ( i = 0 ; i < 8 ; i++ ) {
+ if ( twd & 0x1 ) {
+ st = FPREG_ADDR( fxsave, (i - tos) & 7 );
+
+ switch ( st->exponent & 0x7fff ) {
+ case 0x7fff:
+ tag = 2; /* Special */
+ break;
+ case 0x0000:
+ if ( !st->significand[0] &&
+ !st->significand[1] &&
+ !st->significand[2] &&
+ !st->significand[3] ) {
+ tag = 1; /* Zero */
+ } else {
+ tag = 2; /* Special */
+ }
+ break;
+ default:
+ if ( st->significand[3] & 0x8000 ) {
+ tag = 0; /* Valid */
+ } else {
+ tag = 2; /* Special */
+ }
+ break;
+ }
+ } else {
+ tag = 3; /* Empty */
+ }
+ ret |= (tag << (2 * i));
+ twd = twd >> 1;
+ }
+ return ret;
+}
+
+/*
+ * FPU state interaction.
+ */
+
+unsigned short get_fpu_cwd( struct task_struct *tsk )
+{
+ if ( cpu_has_fxsr ) {
+ return tsk->thread.i387.fxsave.cwd;
+ } else {
+ return (unsigned short)tsk->thread.i387.fsave.cwd;
+ }
+}
+
+unsigned short get_fpu_swd( struct task_struct *tsk )
+{
+ if ( cpu_has_fxsr ) {
+ return tsk->thread.i387.fxsave.swd;
+ } else {
+ return (unsigned short)tsk->thread.i387.fsave.swd;
+ }
+}
+
+#if 0
+unsigned short get_fpu_twd( struct task_struct *tsk )
+{
+ if ( cpu_has_fxsr ) {
+ return tsk->thread.i387.fxsave.twd;
+ } else {
+ return (unsigned short)tsk->thread.i387.fsave.twd;
+ }
+}
+#endif /* 0 */
+
+unsigned short get_fpu_mxcsr( struct task_struct *tsk )
+{
+ if ( cpu_has_xmm ) {
+ return tsk->thread.i387.fxsave.mxcsr;
+ } else {
+ return 0x1f80;
+ }
+}
+
+#if 0
+
+void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd )
+{
+ if ( cpu_has_fxsr ) {
+ tsk->thread.i387.fxsave.cwd = cwd;
+ } else {
+ tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u);
+ }
+}
+
+void set_fpu_swd( struct task_struct *tsk, unsigned short swd )
+{
+ if ( cpu_has_fxsr ) {
+ tsk->thread.i387.fxsave.swd = swd;
+ } else {
+ tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u);
+ }
+}
+
+void set_fpu_twd( struct task_struct *tsk, unsigned short twd )
+{
+ if ( cpu_has_fxsr ) {
+ tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd);
+ } else {
+ tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u);
+ }
+}
+
+#endif /* 0 */
+
+/*
+ * FXSR floating point environment conversions.
+ */
+
+static int convert_fxsr_to_user( struct _fpstate __user *buf,
+ struct i387_fxsave_struct *fxsave )
+{
+ unsigned long env[7];
+ struct _fpreg __user *to;
+ struct _fpxreg *from;
+ int i;
+
+ env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul;
+ env[1] = (unsigned long)fxsave->swd | 0xffff0000ul;
+ env[2] = twd_fxsr_to_i387(fxsave);
+ env[3] = fxsave->fip;
+ env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16);
+ env[5] = fxsave->foo;
+ env[6] = fxsave->fos;
+
+ if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) )
+ return 1;
+
+ to = &buf->_st[0];
+ from = (struct _fpxreg *) &fxsave->st_space[0];
+ for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
+ unsigned long __user *t = (unsigned long __user *)to;
+ unsigned long *f = (unsigned long *)from;
+
+ if (__put_user(*f, t) ||
+ __put_user(*(f + 1), t + 1) ||
+ __put_user(from->exponent, &to->exponent))
+ return 1;
+ }
+ return 0;
+}
+
+static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave,
+ struct _fpstate __user *buf )
+{
+ unsigned long env[7];
+ struct _fpxreg *to;
+ struct _fpreg __user *from;
+ int i;
+
+ if ( __copy_from_user( env, buf, 7 * sizeof(long) ) )
+ return 1;
+
+ fxsave->cwd = (unsigned short)(env[0] & 0xffff);
+ fxsave->swd = (unsigned short)(env[1] & 0xffff);
+ fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff));
+ fxsave->fip = env[3];
+ fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16);
+ fxsave->fcs = (env[4] & 0xffff);
+ fxsave->foo = env[5];
+ fxsave->fos = env[6];
+
+ to = (struct _fpxreg *) &fxsave->st_space[0];
+ from = &buf->_st[0];
+ for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
+ unsigned long *t = (unsigned long *)to;
+ unsigned long __user *f = (unsigned long __user *)from;
+
+ if (__get_user(*t, f) ||
+ __get_user(*(t + 1), f + 1) ||
+ __get_user(to->exponent, &from->exponent))
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Signal frame handlers.
+ */
+
+static inline int save_i387_fsave( struct _fpstate __user *buf )
+{
+ struct task_struct *tsk = current;
+
+ unlazy_fpu( tsk );
+ tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
+ if ( __copy_to_user( buf, &tsk->thread.i387.fsave,
+ sizeof(struct i387_fsave_struct) ) )
+ return -1;
+ return 1;
+}
+
+static int save_i387_fxsave( struct _fpstate __user *buf )
+{
+ struct task_struct *tsk = current;
+ int err = 0;
+
+ unlazy_fpu( tsk );
+
+ if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) )
+ return -1;
+
+ err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status );
+ err |= __put_user( X86_FXSR_MAGIC, &buf->magic );
+ if ( err )
+ return -1;
+
+ if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
+ sizeof(struct i387_fxsave_struct) ) )
+ return -1;
+ return 1;
+}
+
+int save_i387( struct _fpstate __user *buf )
+{
+ if ( !used_math() )
+ return 0;
+
+ /* This will cause a "finit" to be triggered by the next
+ * attempted FPU operation by the 'current' process.
+ */
+ clear_used_math();
+
+ if ( HAVE_HWFP ) {
+ if ( cpu_has_fxsr ) {
+ return save_i387_fxsave( buf );
+ } else {
+ return save_i387_fsave( buf );
+ }
+ } else {
+ return save_i387_soft( &current->thread.i387.soft, buf );
+ }
+}
+
+static inline int restore_i387_fsave( struct _fpstate __user *buf )
+{
+ struct task_struct *tsk = current;
+ clear_fpu( tsk );
+ return __copy_from_user( &tsk->thread.i387.fsave, buf,
+ sizeof(struct i387_fsave_struct) );
+}
+
+static int restore_i387_fxsave( struct _fpstate __user *buf )
+{
+ int err;
+ struct task_struct *tsk = current;
+ clear_fpu( tsk );
+ err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
+ sizeof(struct i387_fxsave_struct) );
+ /* mxcsr reserved bits must be masked to zero for security reasons */
+ tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
+ return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf );
+}
+
+int restore_i387( struct _fpstate __user *buf )
+{
+ int err;
+
+ if ( HAVE_HWFP ) {
+ if ( cpu_has_fxsr ) {
+ err = restore_i387_fxsave( buf );
+ } else {
+ err = restore_i387_fsave( buf );
+ }
+ } else {
+ err = restore_i387_soft( &current->thread.i387.soft, buf );
+ }
+ set_used_math();
+ return err;
+}
+
+/*
+ * ptrace request handlers.
+ */
+
+static inline int get_fpregs_fsave( struct user_i387_struct __user *buf,
+ struct task_struct *tsk )
+{
+ return __copy_to_user( buf, &tsk->thread.i387.fsave,
+ sizeof(struct user_i387_struct) );
+}
+
+static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf,
+ struct task_struct *tsk )
+{
+ return convert_fxsr_to_user( (struct _fpstate __user *)buf,
+ &tsk->thread.i387.fxsave );
+}
+
+int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk )
+{
+ if ( HAVE_HWFP ) {
+ if ( cpu_has_fxsr ) {
+ return get_fpregs_fxsave( buf, tsk );
+ } else {
+ return get_fpregs_fsave( buf, tsk );
+ }
+ } else {
+ return save_i387_soft( &tsk->thread.i387.soft,
+ (struct _fpstate __user *)buf );
+ }
+}
+
+static inline int set_fpregs_fsave( struct task_struct *tsk,
+ struct user_i387_struct __user *buf )
+{
+ return __copy_from_user( &tsk->thread.i387.fsave, buf,
+ sizeof(struct user_i387_struct) );
+}
+
+static inline int set_fpregs_fxsave( struct task_struct *tsk,
+ struct user_i387_struct __user *buf )
+{
+ return convert_fxsr_from_user( &tsk->thread.i387.fxsave,
+ (struct _fpstate __user *)buf );
+}
+
+int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf )
+{
+ if ( HAVE_HWFP ) {
+ if ( cpu_has_fxsr ) {
+ return set_fpregs_fxsave( tsk, buf );
+ } else {
+ return set_fpregs_fsave( tsk, buf );
+ }
+ } else {
+ return restore_i387_soft( &tsk->thread.i387.soft,
+ (struct _fpstate __user *)buf );
+ }
+}
+
+int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk )
+{
+ if ( cpu_has_fxsr ) {
+ if (__copy_to_user( buf, &tsk->thread.i387.fxsave,
+ sizeof(struct user_fxsr_struct) ))
+ return -EFAULT;
+ return 0;
+ } else {
+ return -EIO;
+ }
+}
+
+int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf )
+{
+ int ret = 0;
+
+ if ( cpu_has_fxsr ) {
+ if (__copy_from_user( &tsk->thread.i387.fxsave, buf,
+ sizeof(struct user_fxsr_struct) ))
+ ret = -EFAULT;
+ /* mxcsr reserved bits must be masked to zero for security reasons */
+ tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
+ } else {
+ ret = -EIO;
+ }
+ return ret;
+}
+
+/*
+ * FPU state for core dumps.
+ */
+
+static inline void copy_fpu_fsave( struct task_struct *tsk,
+ struct user_i387_struct *fpu )
+{
+ memcpy( fpu, &tsk->thread.i387.fsave,
+ sizeof(struct user_i387_struct) );
+}
+
+static inline void copy_fpu_fxsave( struct task_struct *tsk,
+ struct user_i387_struct *fpu )
+{
+ unsigned short *to;
+ unsigned short *from;
+ int i;
+
+ memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) );
+
+ to = (unsigned short *)&fpu->st_space[0];
+ from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0];
+ for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) {
+ memcpy( to, from, 5 * sizeof(unsigned short) );
+ }
+}
+
+int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
+{
+ int fpvalid;
+ struct task_struct *tsk = current;
+
+ fpvalid = !!used_math();
+ if ( fpvalid ) {
+ unlazy_fpu( tsk );
+ if ( cpu_has_fxsr ) {
+ copy_fpu_fxsave( tsk, fpu );
+ } else {
+ copy_fpu_fsave( tsk, fpu );
+ }
+ }
+
+ return fpvalid;
+}
+EXPORT_SYMBOL(dump_fpu);
+
+int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
+{
+ int fpvalid = !!tsk_used_math(tsk);
+
+ if (fpvalid) {
+ if (tsk == current)
+ unlazy_fpu(tsk);
+ if (cpu_has_fxsr)
+ copy_fpu_fxsave(tsk, fpu);
+ else
+ copy_fpu_fsave(tsk, fpu);
+ }
+ return fpvalid;
+}
+
+int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu)
+{
+ int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr;
+
+ if (fpvalid) {
+ if (tsk == current)
+ unlazy_fpu(tsk);
+ memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu));
+ }
+ return fpvalid;
+}
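
A standalone demo of the twd_i387_to_fxsr() bit trick above: each 2-bit
i387 tag (00 valid, 01 zero, 10 special, 11 empty) collapses to a single
FXSR tag bit, 1 for "register in use" and 0 for "empty". The sample tag
word is made up for illustration:

	#include <stdio.h>

	static unsigned short twd_i387_to_fxsr(unsigned short twd)
	{
		unsigned int tmp = ~twd;

		tmp = (tmp | (tmp >> 1)) & 0x5555;	/* 0V0V0V0V0V0V0V0V */
		tmp = (tmp | (tmp >> 1)) & 0x3333;	/* 00VV00VV00VV00VV */
		tmp = (tmp | (tmp >> 2)) & 0x0f0f;	/* 0000VVVV0000VVVV */
		tmp = (tmp | (tmp >> 4)) & 0x00ff;	/* 00000000VVVVVVVV */
		return tmp;
	}

	int main(void)
	{
		/* st0 valid (00), st1 zero (01), st2 special (10), rest empty */
		unsigned short twd = 0xffe4;

		printf("fxsr tags: 0x%02x\n", twd_i387_to_fxsr(twd)); /* 0x07 */
		return 0;
	}
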
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
new file mode 100644
index 000000000000..6f508e8d7c57
--- /dev/null
+++ b/arch/x86/kernel/i8237.c
@@ -0,0 +1,72 @@
+/*
+ * i8237.c: 8237A DMA controller suspend functions.
+ *
+ * Written by Pierre Ossman, 2005.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/sysdev.h>
+
+#include <asm/dma.h>
+
+/*
+ * This module just handles suspend/resume issues with the
+ * 8237A DMA controller (used for ISA and LPC).
+ * Allocation is handled in kernel/dma.c and normal usage is
+ * in asm/dma.h.
+ */
+
+static int i8237A_resume(struct sys_device *dev)
+{
+ unsigned long flags;
+ int i;
+
+ flags = claim_dma_lock();
+
+ dma_outb(DMA1_RESET_REG, 0);
+ dma_outb(DMA2_RESET_REG, 0);
+
+ for (i = 0; i < 8; i++) {
+ set_dma_addr(i, 0x000000);
+ /* DMA count is a bit weird so this is not 0 */
+ set_dma_count(i, 1);
+ }
+
+ /* Enable cascade DMA or channel 0-3 won't work */
+ enable_dma(4);
+
+ release_dma_lock(flags);
+
+ return 0;
+}
+
+static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
+{
+ return 0;
+}
+
+static struct sysdev_class i8237_sysdev_class = {
+ set_kset_name("i8237"),
+ .suspend = i8237A_suspend,
+ .resume = i8237A_resume,
+};
+
+static struct sys_device device_i8237A = {
+ .id = 0,
+ .cls = &i8237_sysdev_class,
+};
+
+static int __init i8237A_init_sysfs(void)
+{
+ int error = sysdev_class_register(&i8237_sysdev_class);
+ if (!error)
+ error = sysdev_register(&device_i8237A);
+ return error;
+}
+
+device_initcall(i8237A_init_sysfs);
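
For context, a hedged sketch of the normal ISA DMA programming sequence that
the resume path above has to recreate. The helpers are the standard ones from
<asm/dma.h>; the channel number and transfer direction are illustrative only:

	#include <linux/errno.h>
	#include <asm/dma.h>

	static int example_start_isa_dma(unsigned long bus_addr, unsigned int len)
	{
		unsigned long flags;

		if (request_dma(3, "example"))		/* reserve channel 3 */
			return -EBUSY;

		flags = claim_dma_lock();		/* 8237A registers are shared */
		disable_dma(3);
		clear_dma_ff(3);			/* reset the LSB/MSB flip-flop */
		set_dma_mode(3, DMA_MODE_READ);		/* device -> memory */
		set_dma_addr(3, bus_addr);
		set_dma_count(3, len);
		enable_dma(3);
		release_dma_lock(flags);

		return 0;
	}
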
diff --git a/arch/x86/kernel/i8253_32.c b/arch/x86/kernel/i8253_32.c
new file mode 100644
index 000000000000..6d839f2f1b1a
--- /dev/null
+++ b/arch/x86/kernel/i8253_32.c
@@ -0,0 +1,206 @@
+/*
+ * i8253.c 8253/PIT functions
+ *
+ */
+#include <linux/clockchips.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+
+#include <asm/smp.h>
+#include <asm/delay.h>
+#include <asm/i8253.h>
+#include <asm/io.h>
+#include <asm/timer.h>
+
+DEFINE_SPINLOCK(i8253_lock);
+EXPORT_SYMBOL(i8253_lock);
+
+/*
+ * The HPET replaces the PIT when enabled, so we need to know which of
+ * the two timers is in use.
+ */
+struct clock_event_device *global_clock_event;
+
+/*
+ * Initialize the PIT timer.
+ *
+ * This is also called after resume to bring the PIT into operation again.
+ */
+static void init_pit_timer(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8253_lock, flags);
+
+ switch(mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ /* binary, mode 2, LSB/MSB, ch 0 */
+ outb_p(0x34, PIT_MODE);
+ outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
+ outb(LATCH >> 8, PIT_CH0); /* MSB */
+ break;
+
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ case CLOCK_EVT_MODE_UNUSED:
+ if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
+ evt->mode == CLOCK_EVT_MODE_ONESHOT) {
+ outb_p(0x30, PIT_MODE);
+ outb_p(0, PIT_CH0);
+ outb_p(0, PIT_CH0);
+ }
+ break;
+
+ case CLOCK_EVT_MODE_ONESHOT:
+ /* One shot setup */
+ outb_p(0x38, PIT_MODE);
+ break;
+
+ case CLOCK_EVT_MODE_RESUME:
+ /* Nothing to do here */
+ break;
+ }
+ spin_unlock_irqrestore(&i8253_lock, flags);
+}
+
+/*
+ * Program the next event in oneshot mode
+ *
+ * Delta is given in PIT ticks
+ */
+static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8253_lock, flags);
+ outb_p(delta & 0xff, PIT_CH0); /* LSB */
+ outb(delta >> 8, PIT_CH0); /* MSB */
+ spin_unlock_irqrestore(&i8253_lock, flags);
+
+ return 0;
+}
+
+/*
+ * On UP the PIT can serve all of the possible timer functions. On SMP
+ * systems it can only be used for the global tick.
+ *
+ * The profiling and update capabilities are switched off once the local apic
+ * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
+ * !using_apic_timer decisions in do_timer_interrupt_hook()
+ */
+struct clock_event_device pit_clockevent = {
+ .name = "pit",
+ .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+ .set_mode = init_pit_timer,
+ .set_next_event = pit_next_event,
+ .shift = 32,
+ .irq = 0,
+};
+
+/*
+ * Initialize the conversion factor and the min/max deltas of the clock event
+ * structure and register the clock event source with the framework.
+ */
+void __init setup_pit_timer(void)
+{
+ /*
+ * Start pit with the boot cpu mask and make it global after the
+ * IO_APIC has been initialized.
+ */
+ pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+ pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32);
+ pit_clockevent.max_delta_ns =
+ clockevent_delta2ns(0x7FFF, &pit_clockevent);
+ pit_clockevent.min_delta_ns =
+ clockevent_delta2ns(0xF, &pit_clockevent);
+ clockevents_register_device(&pit_clockevent);
+ global_clock_event = &pit_clockevent;
+}
+
+/*
+ * Since the PIT overflows every tick, it's not very useful
+ * to read it by itself. So use jiffies to emulate a free
+ * running counter:
+ */
+static cycle_t pit_read(void)
+{
+ unsigned long flags;
+ int count;
+ u32 jifs;
+ static int old_count;
+ static u32 old_jifs;
+
+ spin_lock_irqsave(&i8253_lock, flags);
+ /*
+ * Although our caller may have the read side of xtime_lock,
+ * this is now a seqlock, and we are cheating in this routine
+ * by having side effects on state that we cannot undo if
+ * there is a collision on the seqlock and our caller has to
+ * retry. (Namely, old_jifs and old_count.) So we must treat
+ * jiffies as volatile despite the lock. We read jiffies
+ * before latching the timer count to guarantee that although
+ * the jiffies value might be older than the count (that is,
+ * the counter may underflow between the last point where
+ * jiffies was incremented and the point where we latch the
+ * count), it cannot be newer.
+ */
+ jifs = jiffies;
+ outb_p(0x00, PIT_MODE); /* latch the count ASAP */
+ count = inb_p(PIT_CH0); /* read the latched count */
+ count |= inb_p(PIT_CH0) << 8;
+
+ /* VIA686a test code... reset the latch if count > max + 1 */
+ if (count > LATCH) {
+ outb_p(0x34, PIT_MODE);
+ outb_p(LATCH & 0xff, PIT_CH0);
+ outb(LATCH >> 8, PIT_CH0);
+ count = LATCH - 1;
+ }
+
+ /*
+ * It's possible for count to appear to go the wrong way for a
+ * couple of reasons:
+ *
+ * 1. The timer counter underflows, but we haven't handled the
+ * resulting interrupt and incremented jiffies yet.
+ * 2. Hardware problem with the timer, not giving us continuous time,
+ * the counter does small "jumps" upwards on some Pentium systems,
+ * (see c't 95/10 page 335 for Neptun bug.)
+ *
+ * Previous attempts to handle these cases intelligently were
+ * buggy, so we just do the simple thing now.
+ */
+ if (count > old_count && jifs == old_jifs) {
+ count = old_count;
+ }
+ old_count = count;
+ old_jifs = jifs;
+
+ spin_unlock_irqrestore(&i8253_lock, flags);
+
+ count = (LATCH - 1) - count;
+
+ return (cycle_t)(jifs * LATCH) + count;
+}
+
+static struct clocksource clocksource_pit = {
+ .name = "pit",
+ .rating = 110,
+ .read = pit_read,
+ .mask = CLOCKSOURCE_MASK(32),
+ .mult = 0,
+ .shift = 20,
+};
+
+static int __init init_pit_clocksource(void)
+{
+ if (num_possible_cpus() > 1) /* PIT does not scale! */
+ return 0;
+
+ clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
+ return clocksource_register(&clocksource_pit);
+}
+arch_initcall(init_pit_clocksource);
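
A hedged, standalone rework of the PIT numbers used above. CLOCK_TICK_RATE
on PCs is the 1193182 Hz PIT input clock; HZ=250 is only an example. LATCH
is the per-jiffy reload value programmed into channel 0, and mult mirrors
what clocksource_hz2mult(CLOCK_TICK_RATE, 20) computes:

	#include <stdio.h>
	#include <stdint.h>

	#define CLOCK_TICK_RATE 1193182UL	/* PIT input clock, in Hz */
	#define HZ 250				/* example tick rate */

	int main(void)
	{
		unsigned long latch = (CLOCK_TICK_RATE + HZ / 2) / HZ;
		/* ns per PIT cycle, scaled by 2^20 */
		uint32_t mult = (uint32_t)((1000000000ULL << 20) / CLOCK_TICK_RATE);

		printf("LATCH = %lu ticks per jiffy\n", latch);	/* 4773 */
		printf("one PIT tick = %llu ns\n",
		       (unsigned long long)(((uint64_t)mult) >> 20)); /* ~838 */
		return 0;
	}
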
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c
new file mode 100644
index 000000000000..0499cbe9871a
--- /dev/null
+++ b/arch/x86/kernel/i8259_32.c
@@ -0,0 +1,420 @@
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/8253pit.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/timer.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/arch_hooks.h>
+#include <asm/i8259.h>
+
+#include <io_ports.h>
+
+/*
+ * This is the 'legacy' 8259A Programmable Interrupt Controller,
+ * present in the majority of PC/AT boxes.
+ * plus some generic x86-specific things, if generic specifics make
+ * any sense at all.
+ * This file should become arch/i386/kernel/irq.c when the old irq.c
+ * moves to arch-independent land.
+ */
+
+static int i8259A_auto_eoi;
+DEFINE_SPINLOCK(i8259A_lock);
+static void mask_and_ack_8259A(unsigned int);
+
+static struct irq_chip i8259A_chip = {
+ .name = "XT-PIC",
+ .mask = disable_8259A_irq,
+ .disable = disable_8259A_irq,
+ .unmask = enable_8259A_irq,
+ .mask_ack = mask_and_ack_8259A,
+};
+
+/*
+ * 8259A PIC functions to handle ISA devices:
+ */
+
+/*
+ * This contains the irq mask for both 8259A irq controllers.
+ */
+unsigned int cached_irq_mask = 0xffff;
+
+/*
+ * Not all IRQs can be routed through the IO-APIC, e.g. on certain (older)
+ * boards the timer interrupt is not really connected to any IO-APIC pin,
+ * it's fed to the master 8259A's IR0 line only.
+ *
+ * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
+ * This 'mixed mode' IRQ handling costs nothing because it's only used
+ * at IRQ setup time.
+ */
+unsigned long io_apic_irqs;
+
+void disable_8259A_irq(unsigned int irq)
+{
+ unsigned int mask = 1 << irq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+ cached_irq_mask |= mask;
+ if (irq & 8)
+ outb(cached_slave_mask, PIC_SLAVE_IMR);
+ else
+ outb(cached_master_mask, PIC_MASTER_IMR);
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+void enable_8259A_irq(unsigned int irq)
+{
+ unsigned int mask = ~(1 << irq);
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+ cached_irq_mask &= mask;
+ if (irq & 8)
+ outb(cached_slave_mask, PIC_SLAVE_IMR);
+ else
+ outb(cached_master_mask, PIC_MASTER_IMR);
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+int i8259A_irq_pending(unsigned int irq)
+{
+ unsigned int mask = 1<<irq;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+ if (irq < 8)
+ ret = inb(PIC_MASTER_CMD) & mask;
+ else
+ ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ return ret;
+}
+
+void make_8259A_irq(unsigned int irq)
+{
+ disable_irq_nosync(irq);
+ io_apic_irqs &= ~(1<<irq);
+ set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
+ "XT");
+ enable_irq(irq);
+}
+
+/*
+ * This function assumes to be called rarely. Switching between
+ * 8259A registers is slow.
+ * This has to be protected by the irq controller spinlock
+ * before being called.
+ */
+static inline int i8259A_irq_real(unsigned int irq)
+{
+ int value;
+ int irqmask = 1<<irq;
+
+ if (irq < 8) {
+ outb(0x0B,PIC_MASTER_CMD); /* ISR register */
+ value = inb(PIC_MASTER_CMD) & irqmask;
+ outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */
+ return value;
+ }
+ outb(0x0B,PIC_SLAVE_CMD); /* ISR register */
+ value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
+ outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */
+ return value;
+}
+
+/*
+ * Careful! The 8259A is a fragile beast, it pretty
+ * much _has_ to be done exactly like this (mask it
+ * first, _then_ send the EOI), and the order of EOI
+ * to the two 8259s is important!
+ */
+static void mask_and_ack_8259A(unsigned int irq)
+{
+ unsigned int irqmask = 1 << irq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+ /*
+ * Lightweight spurious IRQ detection. We do not want
+ * to overdo spurious IRQ handling - it's usually a sign
+ * of hardware problems, so we only do the checks we can
+ * do without slowing down good hardware unnecessarily.
+ *
+ * Note that IRQ7 and IRQ15 (the two spurious IRQs
+ * usually resulting from the 8259A-1|2 PICs) occur
+ * even if the IRQ is masked in the 8259A. Thus we
+ * can check spurious 8259A IRQs without doing the
+ * quite slow i8259A_irq_real() call for every IRQ.
+ * This does not cover 100% of spurious interrupts,
+ * but should be enough to warn the user that there
+ * is something bad going on ...
+ */
+ if (cached_irq_mask & irqmask)
+ goto spurious_8259A_irq;
+ cached_irq_mask |= irqmask;
+
+handle_real_irq:
+ if (irq & 8) {
+ inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
+ outb(cached_slave_mask, PIC_SLAVE_IMR);
+ outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
+ outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
+ } else {
+ inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
+ outb(cached_master_mask, PIC_MASTER_IMR);
+ outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI' to master */
+ }
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+ return;
+
+spurious_8259A_irq:
+ /*
+ * this is the slow path - should happen rarely.
+ */
+ if (i8259A_irq_real(irq))
+ /*
+ * oops, the IRQ _is_ in service according to the
+ * 8259A - not spurious, go handle it.
+ */
+ goto handle_real_irq;
+
+ {
+ static int spurious_irq_mask;
+ /*
+ * At this point we can be sure the IRQ is spurious,
+ * let's ACK and report it. [once per IRQ]
+ */
+ if (!(spurious_irq_mask & irqmask)) {
+ printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
+ spurious_irq_mask |= irqmask;
+ }
+ atomic_inc(&irq_err_count);
+ /*
+ * Theoretically we do not have to handle this IRQ,
+ * but in Linux this does not cause problems and is
+ * simpler for us.
+ */
+ goto handle_real_irq;
+ }
+}
+
+static char irq_trigger[2];
+/*
+ * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
+ */
+static void restore_ELCR(char *trigger)
+{
+ outb(trigger[0], 0x4d0);
+ outb(trigger[1], 0x4d1);
+}
+
+static void save_ELCR(char *trigger)
+{
+ /* IRQ 0,1,2,8,13 are marked as reserved */
+ trigger[0] = inb(0x4d0) & 0xF8;
+ trigger[1] = inb(0x4d1) & 0xDE;
+}
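+
+/*
+ * For illustration, each ELCR bit selects the trigger mode of one
+ * IRQ line (1 = level, 0 = edge), low byte at port 0x4d0 (IRQ0-7),
+ * high byte at 0x4d1 (IRQ8-15). A minimal sketch of querying a
+ * single line, say IRQ5:
+ *
+ *	int level = (inb(0x4d0) >> 5) & 1;
+ *
+ * which is exactly what EISA_ELCR() in io_apic_32.c does for EISA
+ * interrupts.
+ */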
+
+static int i8259A_resume(struct sys_device *dev)
+{
+ init_8259A(i8259A_auto_eoi);
+ restore_ELCR(irq_trigger);
+ return 0;
+}
+
+static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
+{
+ save_ELCR(irq_trigger);
+ return 0;
+}
+
+static int i8259A_shutdown(struct sys_device *dev)
+{
+ /* Put the i8259A into a quiescent state that
+ * the kernel initialization code can get it
+ * out of.
+ */
+ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
+ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
+ return 0;
+}
+
+static struct sysdev_class i8259_sysdev_class = {
+ set_kset_name("i8259"),
+ .suspend = i8259A_suspend,
+ .resume = i8259A_resume,
+ .shutdown = i8259A_shutdown,
+};
+
+static struct sys_device device_i8259A = {
+ .id = 0,
+ .cls = &i8259_sysdev_class,
+};
+
+static int __init i8259A_init_sysfs(void)
+{
+ int error = sysdev_class_register(&i8259_sysdev_class);
+ if (!error)
+ error = sysdev_register(&device_i8259A);
+ return error;
+}
+
+device_initcall(i8259A_init_sysfs);
+
+void init_8259A(int auto_eoi)
+{
+ unsigned long flags;
+
+ i8259A_auto_eoi = auto_eoi;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+
+ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
+ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
+
+ /*
+ * outb_p - this has to work on a wide range of PC hardware.
+ */
+ outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
+ outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
+ outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */
+ if (auto_eoi) /* master does Auto EOI */
+ outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
+ else /* master expects normal EOI */
+ outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
+
+ outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
+ outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
+ outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */
+ outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
+ if (auto_eoi)
+ /*
+ * In AEOI mode we just have to mask the interrupt
+ * when acking.
+ */
+ i8259A_chip.mask_ack = disable_8259A_irq;
+ else
+ i8259A_chip.mask_ack = mask_and_ack_8259A;
+
+ udelay(100); /* wait for 8259A to initialize */
+
+ outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
+ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
+
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+}
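+
+/*
+ * To summarize the effect of the sequence above: after init_8259A(0),
+ * master IRQ0-7 arrive at CPU vectors 0x20-0x27 and slave IRQ8-15 at
+ * 0x28-0x2f (so e.g. the timer on IRQ0 shows up as vector 0x20), with
+ * the slave cascaded through master pin 2 and a specific EOI expected
+ * for each interrupt; init_8259A(1) selects Auto-EOI instead, in
+ * which case mask_ack only has to mask.
+ */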
+
+/*
+ * Note that on a 486, we don't want to do a SIGFPE on an irq13
+ * as the irq is unreliable, and exception 16 works correctly
+ * (i.e. as explained in the Intel literature). On a 386, you
+ * can't use exception 16 due to bad IBM design, so we have to
+ * rely on the less exact irq13.
+ *
+ * Careful.. Not only is IRQ13 unreliable, but it also
+ * leads to races. IBM designers who came up with it should
+ * be shot.
+ */
+
+
+static irqreturn_t math_error_irq(int cpl, void *dev_id)
+{
+ extern void math_error(void __user *);
+ outb(0,0xF0);
+ if (ignore_fpu_irq || !boot_cpu_data.hard_math)
+ return IRQ_NONE;
+ math_error((void __user *)get_irq_regs()->eip);
+ return IRQ_HANDLED;
+}
+
+/*
+ * New motherboards sometimes make IRQ 13 be a PCI interrupt,
+ * so allow interrupt sharing.
+ */
+static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL };
+
+void __init init_ISA_irqs (void)
+{
+ int i;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ init_bsp_APIC();
+#endif
+ init_8259A(0);
+
+ for (i = 0; i < NR_IRQS; i++) {
+ irq_desc[i].status = IRQ_DISABLED;
+ irq_desc[i].action = NULL;
+ irq_desc[i].depth = 1;
+
+ if (i < 16) {
+ /*
+ * 16 old-style INTA-cycle interrupts:
+ */
+ set_irq_chip_and_handler_name(i, &i8259A_chip,
+ handle_level_irq, "XT");
+ } else {
+ /*
+ * 'high' PCI IRQs filled in on demand
+ */
+ irq_desc[i].chip = &no_irq_chip;
+ }
+ }
+}
+
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+ int i;
+
+ /* all the setup needed before the call gates are initialised */
+ pre_intr_init_hook();
+
+ /*
+ * Cover the whole vector space, no vector can escape
+ * us. (some of these will be overridden and become
+ * 'special' SMP interrupts)
+ */
+ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+ int vector = FIRST_EXTERNAL_VECTOR + i;
+ if (i >= NR_IRQS)
+ break;
+ if (vector != SYSCALL_VECTOR)
+ set_intr_gate(vector, interrupt[i]);
+ }
+
+ /* setup after call gates are initialised (usually add in
+ * the architecture specific gates)
+ */
+ intr_init_hook();
+
+ /*
+ * External FPU? Set up irq13 if so, for
+ * original braindamaged IBM FERR coupling.
+ */
+ if (boot_cpu_data.hard_math && !cpu_has_fpu)
+ setup_irq(FPU_IRQ, &fpu_irq);
+
+ irq_ctx_init(smp_processor_id());
+}
diff --git a/arch/x86/kernel/init_task_32.c b/arch/x86/kernel/init_task_32.c
new file mode 100644
index 000000000000..d26fc063a760
--- /dev/null
+++ b/arch/x86/kernel/init_task_32.c
@@ -0,0 +1,46 @@
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+#include <linux/mqueue.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+
+static struct fs_struct init_fs = INIT_FS;
+static struct files_struct init_files = INIT_FILES;
+static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
+static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
+struct mm_struct init_mm = INIT_MM(init_mm);
+
+EXPORT_SYMBOL(init_mm);
+
+/*
+ * Initial thread structure.
+ *
+ * We need to make sure that this is THREAD_SIZE aligned due to the
+ * way process stacks are handled. This is done by having a special
+ * "init_task" linker map entry..
+ */
+union thread_union init_thread_union
+ __attribute__((__section__(".data.init_task"))) =
+ { INIT_THREAD_INFO(init_task) };
+
+/*
+ * Initial task structure.
+ *
+ * All other task structs will be allocated on slabs in fork.c
+ */
+struct task_struct init_task = INIT_TASK(init_task);
+
+EXPORT_SYMBOL(init_task);
+
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's.
+ */
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
new file mode 100644
index 000000000000..e2f4a1c68547
--- /dev/null
+++ b/arch/x86/kernel/io_apic_32.c
@@ -0,0 +1,2847 @@
+/*
+ * Intel IO-APIC support for multi-Pentium hosts.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
+ *
+ * Many thanks to Stig Venaas for trying out countless experimental
+ * patches and reporting/debugging problems patiently!
+ *
+ * (c) 1999, Multiple IO-APIC support, developed by
+ * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
+ * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
+ * further tested and cleaned up by Zach Brown <zab@redhat.com>
+ * and Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
+ * thanks to Eric Gilmore
+ * and Rolf G. Tews
+ * for testing these extensively
+ * Paul Diefenbaugh : Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/mc146818rtc.h>
+#include <linux/compiler.h>
+#include <linux/acpi.h>
+#include <linux/module.h>
+#include <linux/sysdev.h>
+#include <linux/pci.h>
+#include <linux/msi.h>
+#include <linux/htirq.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
+#include <asm/nmi.h>
+#include <asm/msidef.h>
+#include <asm/hypertransport.h>
+
+#include <mach_apic.h>
+#include <mach_apicdef.h>
+
+#include "io_ports.h"
+
+int (*ioapic_renumber_irq)(int ioapic, int irq);
+atomic_t irq_mis_count;
+
+/* Where, if anywhere, the i8259 is connected in external int mode */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+
+static DEFINE_SPINLOCK(ioapic_lock);
+static DEFINE_SPINLOCK(vector_lock);
+
+int timer_over_8254 __initdata = 1;
+
+/*
+ * Is the SiS APIC rmw bug present?
+ * -1 = don't know, 0 = no, 1 = yes
+ */
+int sis_apic_bug = -1;
+
+/*
+ * # of IRQ routing registers
+ */
+int nr_ioapic_registers[MAX_IO_APICS];
+
+static int disable_timer_pin_1 __initdata;
+
+/*
+ * A rough estimate of how many shared IRQs there are; it can
+ * be changed at any time.
+ */
+#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+
+/*
+ * This is performance-critical; we want to do it in O(1).
+ *
+ * The indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+static struct irq_pin_list {
+ int apic, pin, next;
+} irq_2_pin[PIN_MAP_SIZE];
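+
+/*
+ * A sketch of the traversal convention used throughout this file:
+ * slot 'irq' is the list head for that IRQ, extra pins live in the
+ * overflow slots above NR_IRQS and are chained via ->next (0 ends
+ * the chain, pin == -1 marks an unused head):
+ *
+ *	struct irq_pin_list *entry = irq_2_pin + irq;
+ *	for (;;) {
+ *		if (entry->pin == -1)
+ *			break;
+ *		do_something(entry->apic, entry->pin);
+ *		if (!entry->next)
+ *			break;
+ *		entry = irq_2_pin + entry->next;
+ *	}
+ *
+ * where do_something() stands in for the per-pin work; see
+ * __modify_IO_APIC_irq() below for the real pattern.
+ */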
+
+struct io_apic {
+ unsigned int index;
+ unsigned int unused[3];
+ unsigned int data;
+};
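+
+/*
+ * The struct above mirrors the memory-mapped register window of the
+ * IO-APIC: the index register at offset 0x00 selects which internal
+ * register the data register at offset 0x10 accesses (the three
+ * unused words pad the gap). Every access below is therefore a
+ * two-step "select, then read/write" sequence, which is why all
+ * callers serialize on ioapic_lock.
+ */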
+
+static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+{
+ return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+ + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+}
+
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ return readl(&io_apic->data);
+}
+
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ writel(value, &io_apic->data);
+}
+
+/*
+ * Re-write a value: to be used for read-modify-write
+ * cycles where the read already set up the index register.
+ *
+ * Older SiS APICs require that we rewrite the index register.
+ */
+static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+{
+ volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
+ if (sis_apic_bug)
+ writel(reg, &io_apic->index);
+ writel(value, &io_apic->data);
+}
+
+union entry_union {
+ struct { u32 w1, w2; };
+ struct IO_APIC_route_entry entry;
+};
+
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+ union entry_union eu;
+ unsigned long flags;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ return eu.entry;
+}
+
+/*
+ * When we write a new IO APIC routing entry, we need to write the high
+ * word first! If the mask bit in the low word is clear, we will enable
+ * the interrupt, and we need to make sure the entry is fully populated
+ * before that happens.
+ */
+static void
+__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ union entry_union eu;
+ eu.entry = e;
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __ioapic_write_entry(apic, pin, e);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * When we mask an IO APIC routing entry, we need to write the low
+ * word first, in order to set the mask bit before we change the
+ * high bits!
+ */
+static void ioapic_mask_entry(int apic, int pin)
+{
+ unsigned long flags;
+ union entry_union eu = { .entry.mask = 1 };
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
+ * shared ISA-space IRQs, so we have to support them. We are super
+ * fast in the common case, and fast for shared ISA-space IRQs.
+ */
+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+{
+ static int first_free_entry = NR_IRQS;
+ struct irq_pin_list *entry = irq_2_pin + irq;
+
+ while (entry->next)
+ entry = irq_2_pin + entry->next;
+
+ if (entry->pin != -1) {
+ entry->next = first_free_entry;
+ entry = irq_2_pin + entry->next;
+ if (++first_free_entry >= PIN_MAP_SIZE)
+ panic("io_apic.c: whoops");
+ }
+ entry->apic = apic;
+ entry->pin = pin;
+}
+
+/*
+ * Reroute an IRQ to a different pin.
+ */
+static void __init replace_pin_at_irq(unsigned int irq,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry = irq_2_pin + irq;
+
+ while (1) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ }
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+}
+
+static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
+{
+ struct irq_pin_list *entry = irq_2_pin + irq;
+ unsigned int pin, reg;
+
+ for (;;) {
+ pin = entry->pin;
+ if (pin == -1)
+ break;
+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ reg &= ~disable;
+ reg |= enable;
+ io_apic_modify(entry->apic, 0x10 + pin*2, reg);
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+}
+
+/* mask = 1 */
+static void __mask_IO_APIC_irq (unsigned int irq)
+{
+ __modify_IO_APIC_irq(irq, 0x00010000, 0);
+}
+
+/* mask = 0 */
+static void __unmask_IO_APIC_irq (unsigned int irq)
+{
+ __modify_IO_APIC_irq(irq, 0, 0x00010000);
+}
+
+/* mask = 1, trigger = 0 */
+static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
+{
+ __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
+}
+
+/* mask = 0, trigger = 1 */
+static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
+{
+ __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
+}
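+
+/*
+ * The magic numbers above pick bits out of the low dword of a
+ * redirection entry: bit 16 (0x00010000) is the mask bit and bit 15
+ * (0x00008000) the trigger mode (1 = level, 0 = edge), matching the
+ * bitfields of struct IO_APIC_route_entry. So "mask and set edge"
+ * sets bit 16 and clears bit 15 in one read-modify-write of
+ * register 0x10 + pin*2.
+ */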
+
+static void mask_IO_APIC_irq (unsigned int irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __mask_IO_APIC_irq(irq);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void unmask_IO_APIC_irq (unsigned int irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __unmask_IO_APIC_irq(irq);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+{
+ struct IO_APIC_route_entry entry;
+
+ /* Check delivery_mode to be sure we're not clearing an SMI pin */
+ entry = ioapic_read_entry(apic, pin);
+ if (entry.delivery_mode == dest_SMI)
+ return;
+
+ /*
+ * Disable it in the IO-APIC irq-routing table:
+ */
+ ioapic_mask_entry(apic, pin);
+}
+
+static void clear_IO_APIC (void)
+{
+ int apic, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+ clear_IO_APIC_pin(apic, pin);
+}
+
+#ifdef CONFIG_SMP
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
+{
+ unsigned long flags;
+ int pin;
+ struct irq_pin_list *entry = irq_2_pin + irq;
+ unsigned int apicid_value;
+ cpumask_t tmp;
+
+ cpus_and(tmp, cpumask, cpu_online_map);
+ if (cpus_empty(tmp))
+ tmp = TARGET_CPUS;
+
+ cpus_and(cpumask, tmp, CPU_MASK_ALL);
+
+ apicid_value = cpu_mask_to_apicid(cpumask);
+ /* Prepare to do the io_apic_write */
+ apicid_value = apicid_value << 24;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ for (;;) {
+ pin = entry->pin;
+ if (pin == -1)
+ break;
+ io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+ irq_desc[irq].affinity = cpumask;
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#if defined(CONFIG_IRQBALANCE)
+# include <asm/processor.h> /* kernel_thread() */
+# include <linux/kernel_stat.h> /* kstat */
+# include <linux/slab.h> /* kmalloc() */
+# include <linux/timer.h> /* time_after() */
+
+#define IRQBALANCE_CHECK_ARCH -999
+#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
+#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
+#define BALANCED_IRQ_MORE_DELTA (HZ/10)
+#define BALANCED_IRQ_LESS_DELTA (HZ)
+
+static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
+static int physical_balance __read_mostly;
+static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
+
+static struct irq_cpu_info {
+ unsigned long * last_irq;
+ unsigned long * irq_delta;
+ unsigned long irq;
+} irq_cpu_data[NR_CPUS];
+
+#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
+#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
+#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
+
+#define IDLE_ENOUGH(cpu,now) \
+ (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
+
+#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
+
+#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
+
+static cpumask_t balance_irq_affinity[NR_IRQS] = {
+ [0 ... NR_IRQS-1] = CPU_MASK_ALL
+};
+
+void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+ balance_irq_affinity[irq] = mask;
+}
+
+static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
+ unsigned long now, int direction)
+{
+ int search_idle = 1;
+ int cpu = curr_cpu;
+
+ goto inside;
+
+ do {
+ if (unlikely(cpu == curr_cpu))
+ search_idle = 0;
+inside:
+ if (direction == 1) {
+ cpu++;
+ if (cpu >= NR_CPUS)
+ cpu = 0;
+ } else {
+ cpu--;
+ if (cpu == -1)
+ cpu = NR_CPUS-1;
+ }
+ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
+ (search_idle && !IDLE_ENOUGH(cpu,now)));
+
+ return cpu;
+}
+
+static inline void balance_irq(int cpu, int irq)
+{
+ unsigned long now = jiffies;
+ cpumask_t allowed_mask;
+ unsigned int new_cpu;
+
+ if (irqbalance_disabled)
+ return;
+
+ cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
+ new_cpu = move(cpu, allowed_mask, now, 1);
+ if (cpu != new_cpu) {
+ set_pending_irq(irq, cpumask_of_cpu(new_cpu));
+ }
+}
+
+static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
+{
+ int i, j;
+
+ for_each_online_cpu(i) {
+ for (j = 0; j < NR_IRQS; j++) {
+ if (!irq_desc[j].action)
+ continue;
+ /* Is it a significant load ? */
+ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
+ useful_load_threshold)
+ continue;
+ balance_irq(i, j);
+ }
+ }
+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ return;
+}
+
+static void do_irq_balance(void)
+{
+ int i, j;
+ unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
+ unsigned long move_this_load = 0;
+ int max_loaded = 0, min_loaded = 0;
+ int load;
+ unsigned long useful_load_threshold = balanced_irq_interval + 10;
+ int selected_irq;
+ int tmp_loaded, first_attempt = 1;
+ unsigned long tmp_cpu_irq;
+ unsigned long imbalance = 0;
+ cpumask_t allowed_mask, target_cpu_mask, tmp;
+
+ for_each_possible_cpu(i) {
+ int package_index;
+ CPU_IRQ(i) = 0;
+ if (!cpu_online(i))
+ continue;
+ package_index = CPU_TO_PACKAGEINDEX(i);
+ for (j = 0; j < NR_IRQS; j++) {
+ unsigned long value_now, delta;
+ /* Is this an active IRQ or balancing disabled ? */
+ if (!irq_desc[j].action || irq_balancing_disabled(j))
+ continue;
+ if (package_index == i)
+ IRQ_DELTA(package_index,j) = 0;
+ /* Determine the total count per processor per IRQ */
+ value_now = (unsigned long) kstat_cpu(i).irqs[j];
+
+ /* Determine the activity per processor per IRQ */
+ delta = value_now - LAST_CPU_IRQ(i,j);
+
+ /* Update last_cpu_irq[][] for the next time */
+ LAST_CPU_IRQ(i,j) = value_now;
+
+ /* Ignore IRQs whose rate is less than the clock */
+ if (delta < useful_load_threshold)
+ continue;
+ /* update the load for the processor or package total */
+ IRQ_DELTA(package_index,j) += delta;
+
+ /* Keep track of the higher numbered sibling as well */
+ if (i != package_index)
+ CPU_IRQ(i) += delta;
+ /*
+ * We have sibling A and sibling B in the package
+ *
+ * cpu_irq[A] = load for cpu A + load for cpu B
+ * cpu_irq[B] = load for cpu B
+ */
+ CPU_IRQ(package_index) += delta;
+ }
+ }
+ /* Find the least loaded processor package */
+ for_each_online_cpu(i) {
+ if (i != CPU_TO_PACKAGEINDEX(i))
+ continue;
+ if (min_cpu_irq > CPU_IRQ(i)) {
+ min_cpu_irq = CPU_IRQ(i);
+ min_loaded = i;
+ }
+ }
+ max_cpu_irq = ULONG_MAX;
+
+tryanothercpu:
+ /* Look for heaviest loaded processor.
+ * We may come back to get the next heaviest loaded processor.
+ * Skip processors with trivial loads.
+ */
+ tmp_cpu_irq = 0;
+ tmp_loaded = -1;
+ for_each_online_cpu(i) {
+ if (i != CPU_TO_PACKAGEINDEX(i))
+ continue;
+ if (max_cpu_irq <= CPU_IRQ(i))
+ continue;
+ if (tmp_cpu_irq < CPU_IRQ(i)) {
+ tmp_cpu_irq = CPU_IRQ(i);
+ tmp_loaded = i;
+ }
+ }
+
+ if (tmp_loaded == -1) {
+ /* In the case of a small number of heavy interrupt sources,
+ * some of the cpus get loaded too much. We use Ingo's original
+ * approach to rotate them around.
+ */
+ if (!first_attempt && imbalance >= useful_load_threshold) {
+ rotate_irqs_among_cpus(useful_load_threshold);
+ return;
+ }
+ goto not_worth_the_effort;
+ }
+
+ first_attempt = 0; /* heaviest search */
+ max_cpu_irq = tmp_cpu_irq; /* load */
+ max_loaded = tmp_loaded; /* processor */
+ imbalance = (max_cpu_irq - min_cpu_irq) / 2;
+
+ /* If the imbalance is less than approx 10% of the max load,
+ * we are seeing diminishing returns - quit.
+ */
+ if (imbalance < (max_cpu_irq >> 3))
+ goto not_worth_the_effort;
+
+tryanotherirq:
+ /* if we select an IRQ to move that can't go where we want, then
+ * see if there is another one to try.
+ */
+ move_this_load = 0;
+ selected_irq = -1;
+ for (j = 0; j < NR_IRQS; j++) {
+ /* Is this an active IRQ? */
+ if (!irq_desc[j].action)
+ continue;
+ if (imbalance <= IRQ_DELTA(max_loaded,j))
+ continue;
+ /* Try to find the IRQ that is closest to the imbalance
+ * without going over.
+ */
+ if (move_this_load < IRQ_DELTA(max_loaded,j)) {
+ move_this_load = IRQ_DELTA(max_loaded,j);
+ selected_irq = j;
+ }
+ }
+ if (selected_irq == -1) {
+ goto tryanothercpu;
+ }
+
+ imbalance = move_this_load;
+
+ /* For the physical_balance case, we accumulated both load
+ * values in one of the siblings' cpu_irq[],
+ * to use the same code for physical and logical processors
+ * as much as possible.
+ *
+ * NOTE: the cpu_irq[] array holds the sum of the load for
+ * sibling A and sibling B in the slot for the lowest numbered
+ * sibling (A), _AND_ the load for sibling B in the slot for
+ * the higher numbered sibling.
+ *
+ * We seek the least loaded sibling by making the comparison
+ * (A+B)/2 vs B
+ */
+ load = CPU_IRQ(min_loaded) >> 1;
+ for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
+ if (load > CPU_IRQ(j)) {
+ /* This won't change cpu_sibling_map[min_loaded] */
+ load = CPU_IRQ(j);
+ min_loaded = j;
+ }
+ }
+
+ cpus_and(allowed_mask,
+ cpu_online_map,
+ balance_irq_affinity[selected_irq]);
+ target_cpu_mask = cpumask_of_cpu(min_loaded);
+ cpus_and(tmp, target_cpu_mask, allowed_mask);
+
+ if (!cpus_empty(tmp)) {
+ /* mark for change destination */
+ set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
+
+ /* Since we made a change, come back sooner to
+ * check for more variation.
+ */
+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ return;
+ }
+ goto tryanotherirq;
+
+not_worth_the_effort:
+ /*
+ * If we did not find an IRQ to move, then adjust the time interval
+ * upward.
+ */
+ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
+ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
+ return;
+}
+
+static int balanced_irq(void *unused)
+{
+ int i;
+ unsigned long prev_balance_time = jiffies;
+ long time_remaining = balanced_irq_interval;
+
+ /* push everything to CPU 0 to give us a starting point. */
+ for (i = 0 ; i < NR_IRQS ; i++) {
+ irq_desc[i].pending_mask = cpumask_of_cpu(0);
+ set_pending_irq(i, cpumask_of_cpu(0));
+ }
+
+ set_freezable();
+ for ( ; ; ) {
+ time_remaining = schedule_timeout_interruptible(time_remaining);
+ try_to_freeze();
+ if (time_after(jiffies,
+ prev_balance_time+balanced_irq_interval)) {
+ preempt_disable();
+ do_irq_balance();
+ prev_balance_time = jiffies;
+ time_remaining = balanced_irq_interval;
+ preempt_enable();
+ }
+ }
+ return 0;
+}
+
+static int __init balanced_irq_init(void)
+{
+ int i;
+ struct cpuinfo_x86 *c;
+ cpumask_t tmp;
+
+ cpus_shift_right(tmp, cpu_online_map, 2);
+ c = &boot_cpu_data;
+ /* When not overridden on the command line, ask the subarchitecture. */
+ if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
+ irqbalance_disabled = NO_BALANCE_IRQ;
+ if (irqbalance_disabled)
+ return 0;
+
+ /* disable irqbalance completely if there is only one processor online */
+ if (num_online_cpus() < 2) {
+ irqbalance_disabled = 1;
+ return 0;
+ }
+ /*
+ * Enable physical balance only if more than 1 physical processor
+ * is present
+ */
+ if (smp_num_siblings > 1 && !cpus_empty(tmp))
+ physical_balance = 1;
+
+ for_each_online_cpu(i) {
+ irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
+ printk(KERN_ERR "balanced_irq_init: out of memory");
+ goto failed;
+ }
+ memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
+ memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
+ }
+
+ printk(KERN_INFO "Starting balanced_irq\n");
+ if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
+ return 0;
+ printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
+failed:
+ for_each_possible_cpu(i) {
+ kfree(irq_cpu_data[i].irq_delta);
+ irq_cpu_data[i].irq_delta = NULL;
+ kfree(irq_cpu_data[i].last_irq);
+ irq_cpu_data[i].last_irq = NULL;
+ }
+ return 0;
+}
+
+int __devinit irqbalance_disable(char *str)
+{
+ irqbalance_disabled = 1;
+ return 1;
+}
+
+__setup("noirqbalance", irqbalance_disable);
+
+late_initcall(balanced_irq_init);
+#endif /* CONFIG_IRQBALANCE */
+#endif /* CONFIG_SMP */
+
+#ifndef CONFIG_SMP
+void fastcall send_IPI_self(int vector)
+{
+ unsigned int cfg;
+
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+ cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ apic_write_around(APIC_ICR, cfg);
+}
+#endif /* !CONFIG_SMP */
+
+/*
+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+static int pirq_entries [MAX_PIRQS];
+static int pirqs_enabled;
+int skip_ioapic_setup;
+
+static int __init ioapic_pirq_setup(char *str)
+{
+ int i, max;
+ int ints[MAX_PIRQS+1];
+
+ get_options(str, ARRAY_SIZE(ints), ints);
+
+ for (i = 0; i < MAX_PIRQS; i++)
+ pirq_entries[i] = -1;
+
+ pirqs_enabled = 1;
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "PIRQ redirection, working around broken MP-BIOS.\n");
+ max = MAX_PIRQS;
+ if (ints[0] < MAX_PIRQS)
+ max = ints[0];
+
+ for (i = 0; i < max; i++) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
+ /*
+ * PIRQs are mapped upside down, usually.
+ */
+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
+ }
+ return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+
+/*
+ * Find the IRQ entry number of a certain pin.
+ */
+static int find_irq_entry(int apic, int pin, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].mpc_irqtype == type &&
+ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].mpc_dstirq == pin)
+ return i;
+
+ return -1;
+}
+
+/*
+ * Find the pin to which IRQ[irq] (ISA) is connected
+ */
+static int __init find_isa_irq_pin(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].mpc_srcbus;
+
+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
+ mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
+ mp_bus_id_to_type[lbus] == MP_BUS_MCA
+ ) &&
+ (mp_irqs[i].mpc_irqtype == type) &&
+ (mp_irqs[i].mpc_srcbusirq == irq))
+
+ return mp_irqs[i].mpc_dstirq;
+ }
+ return -1;
+}
+
+static int __init find_isa_irq_apic(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].mpc_srcbus;
+
+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
+ mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
+ mp_bus_id_to_type[lbus] == MP_BUS_MCA
+ ) &&
+ (mp_irqs[i].mpc_irqtype == type) &&
+ (mp_irqs[i].mpc_srcbusirq == irq))
+ break;
+ }
+ if (i < mp_irq_entries) {
+ int apic;
+ for(apic = 0; apic < nr_ioapics; apic++) {
+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+ return apic;
+ }
+ }
+
+ return -1;
+}
+
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+static int pin_2_irq(int idx, int apic, int pin);
+
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
+{
+ int apic, i, best_guess = -1;
+
+ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
+ "slot:%d, pin:%d.\n", bus, slot, pin);
+ if (mp_bus_id_to_pci_bus[bus] == -1) {
+ printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ return -1;
+ }
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].mpc_srcbus;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ break;
+
+ if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
+ !mp_irqs[i].mpc_irqtype &&
+ (bus == lbus) &&
+ (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+
+ if (!(apic || IO_APIC_IRQ(irq)))
+ continue;
+
+ if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+ return irq;
+ /*
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
+ */
+ if (best_guess < 0)
+ best_guess = irq;
+ }
+ }
+ return best_guess;
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
+/*
+ * This function currently is only a helper for the i386 smp boot process where
+ * we need to reprogram the ioredtbls to cater for the cpus which have come online
+ * so the mask in all cases should simply be TARGET_CPUS.
+ */
+#ifdef CONFIG_SMP
+void __init setup_ioapic_dest(void)
+{
+ int pin, ioapic, irq, irq_entry;
+
+ if (skip_ioapic_setup == 1)
+ return;
+
+ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
+ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+ irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+ if (irq_entry == -1)
+ continue;
+ irq = pin_2_irq(irq_entry, ioapic, pin);
+ set_ioapic_affinity_irq(irq, TARGET_CPUS);
+ }
+ }
+}
+#endif
+
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static int EISA_ELCR(unsigned int irq)
+{
+ if (irq < 16) {
+ unsigned int port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "Broken MPtable reports ISA irq %d\n", irq);
+ return 0;
+}
+
+/* EISA interrupts are always polarity zero and can be edge or level
+ * trigger depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must
+ * be read in from the ELCR */
+
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
+#define default_EISA_polarity(idx) (0)
+
+/* ISA interrupts are always polarity zero edge triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_ISA_trigger(idx) (0)
+#define default_ISA_polarity(idx) (0)
+
+/* PCI interrupts are always polarity one level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_PCI_trigger(idx) (1)
+#define default_PCI_polarity(idx) (1)
+
+/* MCA interrupts are always polarity zero level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_MCA_trigger(idx) (1)
+#define default_MCA_polarity(idx) (0)
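+
+/*
+ * For reference, mpc_irqflag as decoded below (per the Intel MP
+ * spec): bits 0-1 give the polarity (00 = conforms to the bus,
+ * 01 = active high, 10 = reserved, 11 = active low) and bits 2-3
+ * the trigger mode (00 = conforms, 01 = edge, 10 = reserved,
+ * 11 = level).
+ */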
+
+static int __init MPBIOS_polarity(int idx)
+{
+ int bus = mp_irqs[idx].mpc_srcbus;
+ int polarity;
+
+ /*
+ * Determine IRQ line polarity (high active or low active):
+ */
+ switch (mp_irqs[idx].mpc_irqflag & 3)
+ {
+ case 0: /* conforms, i.e. bus-type dependent polarity */
+ {
+ switch (mp_bus_id_to_type[bus])
+ {
+ case MP_BUS_ISA: /* ISA pin */
+ {
+ polarity = default_ISA_polarity(idx);
+ break;
+ }
+ case MP_BUS_EISA: /* EISA pin */
+ {
+ polarity = default_EISA_polarity(idx);
+ break;
+ }
+ case MP_BUS_PCI: /* PCI pin */
+ {
+ polarity = default_PCI_polarity(idx);
+ break;
+ }
+ case MP_BUS_MCA: /* MCA pin */
+ {
+ polarity = default_MCA_polarity(idx);
+ break;
+ }
+ default:
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ }
+ break;
+ }
+ case 1: /* high active */
+ {
+ polarity = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ case 3: /* low active */
+ {
+ polarity = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ }
+ return polarity;
+}
+
+static int MPBIOS_trigger(int idx)
+{
+ int bus = mp_irqs[idx].mpc_srcbus;
+ int trigger;
+
+ /*
+ * Determine IRQ trigger mode (edge or level sensitive):
+ */
+ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+ {
+ case 0: /* conforms, i.e. bus-type dependent */
+ {
+ switch (mp_bus_id_to_type[bus])
+ {
+ case MP_BUS_ISA: /* ISA pin */
+ {
+ trigger = default_ISA_trigger(idx);
+ break;
+ }
+ case MP_BUS_EISA: /* EISA pin */
+ {
+ trigger = default_EISA_trigger(idx);
+ break;
+ }
+ case MP_BUS_PCI: /* PCI pin */
+ {
+ trigger = default_PCI_trigger(idx);
+ break;
+ }
+ case MP_BUS_MCA: /* MCA pin */
+ {
+ trigger = default_MCA_trigger(idx);
+ break;
+ }
+ default:
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ }
+ break;
+ }
+ case 1: /* edge */
+ {
+ trigger = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ case 3: /* level */
+ {
+ trigger = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 0;
+ break;
+ }
+ }
+ return trigger;
+}
+
+static inline int irq_polarity(int idx)
+{
+ return MPBIOS_polarity(idx);
+}
+
+static inline int irq_trigger(int idx)
+{
+ return MPBIOS_trigger(idx);
+}
+
+static int pin_2_irq(int idx, int apic, int pin)
+{
+ int irq, i;
+ int bus = mp_irqs[idx].mpc_srcbus;
+
+ /*
+ * Debugging check, we are in big trouble if this message pops up!
+ */
+ if (mp_irqs[idx].mpc_dstirq != pin)
+ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
+
+ switch (mp_bus_id_to_type[bus])
+ {
+ case MP_BUS_ISA: /* ISA pin */
+ case MP_BUS_EISA:
+ case MP_BUS_MCA:
+ {
+ irq = mp_irqs[idx].mpc_srcbusirq;
+ break;
+ }
+ case MP_BUS_PCI: /* PCI pin */
+ {
+ /*
+ * PCI IRQs are mapped in order
+ */
+ i = irq = 0;
+ while (i < apic)
+ irq += nr_ioapic_registers[i++];
+ irq += pin;
+
+ /*
+ * For MPS mode, so far only needed by ES7000 platform
+ */
+ if (ioapic_renumber_irq)
+ irq = ioapic_renumber_irq(apic, irq);
+
+ break;
+ }
+ default:
+ {
+ printk(KERN_ERR "unknown bus type %d.\n",bus);
+ irq = 0;
+ break;
+ }
+ }
+
+ /*
+ * PCI IRQ command line redirection. Yes, limits are hardcoded.
+ */
+ if ((pin >= 16) && (pin <= 23)) {
+ if (pirq_entries[pin-16] != -1) {
+ if (!pirq_entries[pin-16]) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "disabling PIRQ%d\n", pin-16);
+ } else {
+ irq = pirq_entries[pin-16];
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "using PIRQ%d -> IRQ %d\n",
+ pin-16, irq);
+ }
+ }
+ }
+ return irq;
+}
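+
+/*
+ * A worked example of the PCI branch above, assuming two IO-APICs
+ * with 24 redirection entries each: pin 3 of APIC 0 becomes IRQ 3,
+ * while pin 3 of APIC 1 becomes IRQ 24 + 3 = 27 - PCI pins are
+ * simply numbered consecutively across all IO-APICs.
+ */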
+
+static inline int IO_APIC_irq_trigger(int irq)
+{
+ int apic, idx, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ idx = find_irq_entry(apic,pin,mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
+ return irq_trigger(idx);
+ }
+ }
+ /*
+ * nonexistent IRQs are edge default
+ */
+ return 0;
+}
+
+/* irq_vector[] is sized by the sum of all RTEs in all I/O APICs. */
+static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
+
+static int __assign_irq_vector(int irq)
+{
+ static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+ int vector, offset, i;
+
+ BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
+
+ if (irq_vector[irq] > 0)
+ return irq_vector[irq];
+
+ vector = current_vector;
+ offset = current_offset;
+next:
+ vector += 8;
+ if (vector >= FIRST_SYSTEM_VECTOR) {
+ offset = (offset + 1) % 8;
+ vector = FIRST_DEVICE_VECTOR + offset;
+ }
+ if (vector == current_vector)
+ return -ENOSPC;
+ if (vector == SYSCALL_VECTOR)
+ goto next;
+ for (i = 0; i < NR_IRQ_VECTORS; i++)
+ if (irq_vector[i] == vector)
+ goto next;
+
+ current_vector = vector;
+ current_offset = offset;
+ irq_vector[irq] = vector;
+
+ return vector;
+}
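+
+/*
+ * A worked example of the allocator above, assuming the usual i386
+ * values FIRST_DEVICE_VECTOR = 0x31 and FIRST_SYSTEM_VECTOR = 0xef:
+ * vectors are handed out 8 apart (0x31, 0x39, 0x41, ...) so that
+ * consecutive IRQs land in different priority classes (the class is
+ * vector >> 4). When the top is reached the offset rotates and the
+ * scan restarts at 0x32, 0x3a, ...; 0x80 is skipped because it is
+ * SYSCALL_VECTOR, and -ENOSPC is returned once the scan comes back
+ * to its starting vector.
+ */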
+
+static int assign_irq_vector(int irq)
+{
+ unsigned long flags;
+ int vector;
+
+ spin_lock_irqsave(&vector_lock, flags);
+ vector = __assign_irq_vector(irq);
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+ return vector;
+}
+static struct irq_chip ioapic_chip;
+
+#define IOAPIC_AUTO -1
+#define IOAPIC_EDGE 0
+#define IOAPIC_LEVEL 1
+
+static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
+{
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL) {
+ irq_desc[irq].status |= IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &ioapic_chip,
+ handle_fasteoi_irq, "fasteoi");
+ } else {
+ irq_desc[irq].status &= ~IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &ioapic_chip,
+ handle_edge_irq, "edge");
+ }
+ set_intr_gate(vector, interrupt[irq]);
+}
+
+static void __init setup_IO_APIC_irqs(void)
+{
+ struct IO_APIC_route_entry entry;
+ int apic, pin, idx, irq, first_notcon = 1, vector;
+ unsigned long flags;
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+
+ /*
+ * add it to the IO-APIC irq-routing table:
+ */
+ memset(&entry,0,sizeof(entry));
+
+ entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.dest_mode = INT_DEST_MODE;
+ entry.mask = 0; /* enable IRQ */
+ entry.dest.logical.logical_dest =
+ cpu_mask_to_apicid(TARGET_CPUS);
+
+ idx = find_irq_entry(apic,pin,mp_INT);
+ if (idx == -1) {
+ if (first_notcon) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ " IO-APIC (apicid-pin) %d-%d",
+ mp_ioapics[apic].mpc_apicid,
+ pin);
+ first_notcon = 0;
+ } else
+ apic_printk(APIC_VERBOSE, ", %d-%d",
+ mp_ioapics[apic].mpc_apicid, pin);
+ continue;
+ }
+
+ entry.trigger = irq_trigger(idx);
+ entry.polarity = irq_polarity(idx);
+
+ if (irq_trigger(idx)) {
+ entry.trigger = 1;
+ entry.mask = 1;
+ }
+
+ irq = pin_2_irq(idx, apic, pin);
+ /*
+ * skip adding the timer int on secondary nodes, which causes
+ * a small but painful rift in the time-space continuum
+ */
+ if (multi_timer_check(apic, irq))
+ continue;
+ else
+ add_pin_to_irq(irq, apic, pin);
+
+ if (!apic && !IO_APIC_IRQ(irq))
+ continue;
+
+ if (IO_APIC_IRQ(irq)) {
+ vector = assign_irq_vector(irq);
+ entry.vector = vector;
+ ioapic_register_intr(irq, vector, IOAPIC_AUTO);
+
+ if (!apic && (irq < 16))
+ disable_8259A_irq(irq);
+ }
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __ioapic_write_entry(apic, pin, entry);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+ }
+
+ if (!first_notcon)
+ apic_printk(APIC_VERBOSE, " not connected.\n");
+}
+
+/*
+ * Set up the 8259A-master output pin:
+ */
+static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+{
+ struct IO_APIC_route_entry entry;
+
+ memset(&entry,0,sizeof(entry));
+
+ disable_8259A_irq(0);
+
+ /* mask LVT0 */
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+
+ /*
+ * We use logical delivery to get the timer IRQ
+ * to the first CPU.
+ */
+ entry.dest_mode = INT_DEST_MODE;
+ entry.mask = 0; /* unmask IRQ now */
+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+ entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.polarity = 0;
+ entry.trigger = 0;
+ entry.vector = vector;
+
+ /*
+ * The timer IRQ doesn't have to know that behind the
+ * scenes we have an 8259A-master in AEOI mode ...
+ */
+ irq_desc[0].chip = &ioapic_chip;
+ set_irq_handler(0, handle_edge_irq);
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ ioapic_write_entry(apic, pin, entry);
+
+ enable_8259A_irq(0);
+}
+
+void __init print_IO_APIC(void)
+{
+ int apic, i;
+ union IO_APIC_reg_00 reg_00;
+ union IO_APIC_reg_01 reg_01;
+ union IO_APIC_reg_02 reg_02;
+ union IO_APIC_reg_03 reg_03;
+ unsigned long flags;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+ for (i = 0; i < nr_ioapics; i++)
+ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
+ mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+
+ /*
+ * We are a bit conservative about what we expect. We have to
+ * know about every hardware change ASAP.
+ */
+ printk(KERN_INFO "testing the IO APIC.......................\n");
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ reg_01.raw = io_apic_read(apic, 1);
+ if (reg_01.bits.version >= 0x10)
+ reg_02.raw = io_apic_read(apic, 2);
+ if (reg_01.bits.version >= 0x20)
+ reg_03.raw = io_apic_read(apic, 3);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
+ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
+ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
+ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
+
+ printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
+ printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
+
+ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
+ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
+
+ /*
+ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
+ * but reading reg_02 returns the previously read register's
+ * value, so ignore it if reg_02 == reg_01.
+ */
+ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
+ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
+ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
+ }
+
+ /*
+ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
+ * or reg_03, but reading reg_0[23] returns the previously read
+ * register's value, so ignore it if reg_03 == reg_0[12].
+ */
+ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
+ reg_03.raw != reg_01.raw) {
+ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
+ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
+ }
+
+ printk(KERN_DEBUG ".... IRQ redirection table:\n");
+
+ printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
+ " Stat Dest Deli Vect: \n");
+
+ for (i = 0; i <= reg_01.bits.entries; i++) {
+ struct IO_APIC_route_entry entry;
+
+ entry = ioapic_read_entry(apic, i);
+
+ printk(KERN_DEBUG " %02x %03X %02X ",
+ i,
+ entry.dest.logical.logical_dest,
+ entry.dest.physical.physical_dest
+ );
+
+ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
+ entry.mask,
+ entry.trigger,
+ entry.irr,
+ entry.polarity,
+ entry.delivery_status,
+ entry.dest_mode,
+ entry.delivery_mode,
+ entry.vector
+ );
+ }
+ }
+ printk(KERN_DEBUG "IRQ to pin mappings:\n");
+ for (i = 0; i < NR_IRQS; i++) {
+ struct irq_pin_list *entry = irq_2_pin + i;
+ if (entry->pin < 0)
+ continue;
+ printk(KERN_DEBUG "IRQ%d ", i);
+ for (;;) {
+ printk("-> %d:%d", entry->apic, entry->pin);
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+ printk("\n");
+ }
+
+ printk(KERN_INFO ".................................... done.\n");
+
+ return;
+}
+
+#if 0
+
+static void print_APIC_bitfield (int base)
+{
+ unsigned int v;
+ int i, j;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
+ for (i = 0; i < 8; i++) {
+ v = apic_read(base + i*0x10);
+ for (j = 0; j < 32; j++) {
+ if (v & (1<<j))
+ printk("1");
+ else
+ printk("0");
+ }
+ printk("\n");
+ }
+}
+
+void /*__init*/ print_local_APIC(void * dummy)
+{
+ unsigned int v, ver, maxlvt;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
+ smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
+ v = apic_read(APIC_LVR);
+ printk(KERN_INFO "... APIC VERSION: %08x\n", v);
+ ver = GET_APIC_VERSION(v);
+ maxlvt = lapic_get_maxlvt();
+
+ v = apic_read(APIC_TASKPRI);
+ printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ v = apic_read(APIC_ARBPRI);
+ printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+ v & APIC_ARBPRI_MASK);
+ v = apic_read(APIC_PROCPRI);
+ printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+ }
+
+ v = apic_read(APIC_EOI);
+ printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
+ v = apic_read(APIC_RRR);
+ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+ v = apic_read(APIC_LDR);
+ printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
+ v = apic_read(APIC_DFR);
+ printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+ v = apic_read(APIC_SPIV);
+ printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
+
+ printk(KERN_DEBUG "... APIC ISR field:\n");
+ print_APIC_bitfield(APIC_ISR);
+ printk(KERN_DEBUG "... APIC TMR field:\n");
+ print_APIC_bitfield(APIC_TMR);
+ printk(KERN_DEBUG "... APIC IRR field:\n");
+ print_APIC_bitfield(APIC_IRR);
+
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ v = apic_read(APIC_ESR);
+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_ICR);
+ printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
+ v = apic_read(APIC_ICR2);
+ printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
+
+ v = apic_read(APIC_LVTT);
+ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
+
+ if (maxlvt > 3) { /* PC is LVT#4. */
+ v = apic_read(APIC_LVTPC);
+ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
+ }
+ v = apic_read(APIC_LVT0);
+ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
+ v = apic_read(APIC_LVT1);
+ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
+
+ if (maxlvt > 2) { /* ERR is LVT#3. */
+ v = apic_read(APIC_LVTERR);
+ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_TMICT);
+ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
+ v = apic_read(APIC_TMCCT);
+ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
+ v = apic_read(APIC_TDCR);
+ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+ printk("\n");
+}
+
+void print_all_local_APICs (void)
+{
+ on_each_cpu(print_local_APIC, NULL, 1, 1);
+}
+
+void /*__init*/ print_PIC(void)
+{
+ unsigned int v;
+ unsigned long flags;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk(KERN_DEBUG "\nprinting PIC contents\n");
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+
+ v = inb(0xa1) << 8 | inb(0x21);
+ printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
+
+ v = inb(0xa0) << 8 | inb(0x20);
+ printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
+
+ outb(0x0b,0xa0);
+ outb(0x0b,0x20);
+ v = inb(0xa0) << 8 | inb(0x20);
+ outb(0x0a,0xa0);
+ outb(0x0a,0x20);
+
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
+
+ v = inb(0x4d1) << 8 | inb(0x4d0);
+ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
+}
+
+#endif /* 0 */
+
+static void __init enable_IO_APIC(void)
+{
+ union IO_APIC_reg_01 reg_01;
+ int i8259_apic, i8259_pin;
+ int i, apic;
+ unsigned long flags;
+
+ for (i = 0; i < PIN_MAP_SIZE; i++) {
+ irq_2_pin[i].pin = -1;
+ irq_2_pin[i].next = 0;
+ }
+ if (!pirqs_enabled)
+ for (i = 0; i < MAX_PIRQS; i++)
+ pirq_entries[i] = -1;
+
+ /*
+ * The number of IO-APIC IRQ registers (== #pins):
+ */
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(apic, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ nr_ioapic_registers[apic] = reg_01.bits.entries+1;
+ }
+ for(apic = 0; apic < nr_ioapics; apic++) {
+ int pin;
+ /* See if any of the pins is in ExtINT mode */
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ struct IO_APIC_route_entry entry;
+ entry = ioapic_read_entry(apic, pin);
+
+ /* If the interrupt line is enabled and in ExtInt mode
+ * I have found the pin where the i8259 is connected.
+ */
+ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+ ioapic_i8259.apic = apic;
+ ioapic_i8259.pin = pin;
+ goto found_i8259;
+ }
+ }
+ }
+ found_i8259:
+ /* Look to see if the MP table has reported the ExtINT */
+ /* If we could not find the appropriate pin by looking at the ioapic
+ * the i8259 probably is not connected to the ioapic but give the
+ * mptable a chance anyway.
+ */
+ i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
+ i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
+ /* Trust the MP table if nothing is set up in the hardware */
+ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
+ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+ ioapic_i8259.pin = i8259_pin;
+ ioapic_i8259.apic = i8259_apic;
+ }
+ /* Complain if the MP table and the hardware disagree */
+ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+ {
+ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
+ }
+
+ /*
+ * Do not trust the IO-APIC being empty at bootup
+ */
+ clear_IO_APIC();
+}
+
+/*
+ * Not an __init, needed by the reboot code
+ */
+void disable_IO_APIC(void)
+{
+ /*
+ * Clear the IO-APIC before rebooting:
+ */
+ clear_IO_APIC();
+
+ /*
+ * If the i8259 is routed through an IOAPIC,
+ * put that IOAPIC in virtual wire mode
+ * so legacy interrupts can be delivered.
+ */
+ if (ioapic_i8259.pin != -1) {
+ struct IO_APIC_route_entry entry;
+
+ memset(&entry, 0, sizeof(entry));
+ entry.mask = 0; /* Enabled */
+ entry.trigger = 0; /* Edge */
+ entry.irr = 0;
+ entry.polarity = 0; /* High */
+ entry.delivery_status = 0;
+ entry.dest_mode = 0; /* Physical */
+ entry.delivery_mode = dest_ExtINT; /* ExtInt */
+ entry.vector = 0;
+ entry.dest.physical.physical_dest =
+ GET_APIC_ID(apic_read(APIC_ID));
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
+ }
+ disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+}
+
+/*
+ * function to set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ *
+ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
+ */
+
+#ifndef CONFIG_X86_NUMAQ
+static void __init setup_ioapic_ids_from_mpc(void)
+{
+ union IO_APIC_reg_00 reg_00;
+ physid_mask_t phys_id_present_map;
+ int apic;
+ int i;
+ unsigned char old_id;
+ unsigned long flags;
+
+ /*
+ * Don't check I/O APIC IDs for xAPIC systems. They have
+ * no meaning without the serial APIC bus.
+ */
+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return;
+ /*
+ * This is broken; anything with a real cpu count has to
+ * circumvent this idiocy regardless.
+ */
+ phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+ /*
+ * Set the IOAPIC ID to the value stored in the MPC table.
+ */
+ for (apic = 0; apic < nr_ioapics; apic++) {
+
+ /* Read the register 0 value */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ old_id = mp_ioapics[apic].mpc_apicid;
+
+ if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
+ apic, mp_ioapics[apic].mpc_apicid);
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ reg_00.bits.ID);
+ mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
+ }
+
+ /*
+ * Sanity check, is the ID really free? Every APIC in a
+ * system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (check_apicid_used(phys_id_present_map,
+ mp_ioapics[apic].mpc_apicid)) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
+ apic, mp_ioapics[apic].mpc_apicid);
+ for (i = 0; i < get_physical_broadcast(); i++)
+ if (!physid_isset(i, phys_id_present_map))
+ break;
+ if (i >= get_physical_broadcast())
+ panic("Max APIC ID exceeded!\n");
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ i);
+ physid_set(i, phys_id_present_map);
+ mp_ioapics[apic].mpc_apicid = i;
+ } else {
+ physid_mask_t tmp;
+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
+ apic_printk(APIC_VERBOSE, "Setting %d in the "
+ "phys_id_present_map\n",
+ mp_ioapics[apic].mpc_apicid);
+ physids_or(phys_id_present_map, phys_id_present_map, tmp);
+ }
+
+ /*
+ * We need to adjust the IRQ routing table
+ * if the ID changed.
+ */
+ if (old_id != mp_ioapics[apic].mpc_apicid)
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].mpc_dstapic == old_id)
+ mp_irqs[i].mpc_dstapic
+ = mp_ioapics[apic].mpc_apicid;
+
+ /*
+ * Read the right value from the MPC table and
+ * write it into the ID register.
+ */
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "...changing IO-APIC physical APIC ID to %d ...",
+ mp_ioapics[apic].mpc_apicid);
+
+ reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0, reg_00.raw);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /*
+ * Sanity check
+ */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
+ printk("could not set ID!\n");
+ else
+ apic_printk(APIC_VERBOSE, " ok.\n");
+ }
+}
+#else
+static void __init setup_ioapic_ids_from_mpc(void) { }
+#endif
+
+int no_timer_check __initdata;
+
+static int __init notimercheck(char *s)
+{
+ no_timer_check = 1;
+ return 1;
+}
+__setup("no_timer_check", notimercheck);
+
+/*
+ * There is a nasty bug in some older SMP boards, their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ * - timer IRQ defaults to IO-APIC IRQ
+ * - if this function detects that timer IRQs are defunct, then we fall
+ * back to ISA timer IRQs
+ */
+static int __init timer_irq_works(void)
+{
+ unsigned long t1 = jiffies;
+
+ if (no_timer_check)
+ return 1;
+
+ local_irq_enable();
+ /* Let ten ticks pass... */
+ mdelay((10 * 1000) / HZ);
+
+ /*
+ * Expect a few ticks at least, to be sure some possible
+ * glue logic does not lock up after one or two first
+ * ticks in a non-ExtINT mode. Also the local APIC
+ * might have cached one ExtINT interrupt. Finally, at
+ * least one tick may be lost due to delays.
+ */
+ if (jiffies - t1 > 4)
+ return 1;
+
+ return 0;
+}
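+
+/*
+ * Arithmetic note for the check above: mdelay((10 * 1000) / HZ)
+ * busy-waits for ten timer periods (10ms at HZ=1000, 100ms at
+ * HZ=100), so a working timer should advance jiffies by about 10;
+ * requiring only "> 4" tolerates the cached-ExtINT and lost-tick
+ * effects described in the comment.
+ */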
+
+/*
+ * In the SMP+IOAPIC case it might happen that there is an unspecified
+ * number of pending IRQ events left unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
+ * better to do it this way as thus we do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge-triggered IRQs need any interrupt that was
+ * delayed to be resent, but this is now handled in the
+ * device-independent code.
+ */
+
+/*
+ * Startup quirk:
+ *
+ * Starting up an edge-triggered IO-APIC interrupt is
+ * nasty - we need to make sure that we get the edge.
+ * If it is already asserted for some reason, we need to
+ * return 1 to indicate that it was pending.
+ *
+ * This is not complete - we should be able to fake
+ * an edge even if it isn't on the 8259A...
+ *
+ * (We do this for level-triggered IRQs too - it cannot hurt.)
+ */
+static unsigned int startup_ioapic_irq(unsigned int irq)
+{
+ int was_pending = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ if (irq < 16) {
+ disable_8259A_irq(irq);
+ if (i8259A_irq_pending(irq))
+ was_pending = 1;
+ }
+ __unmask_IO_APIC_irq(irq);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return was_pending;
+}
+
+static void ack_ioapic_irq(unsigned int irq)
+{
+ move_native_irq(irq);
+ ack_APIC_irq();
+}
+
+static void ack_ioapic_quirk_irq(unsigned int irq)
+{
+ unsigned long v;
+ int i;
+
+ move_native_irq(irq);
+/*
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets). Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as edge-triggered one but the respective IRR
+ * bit gets set nevertheless. As a result the I/O unit expects an EOI
+ * message but it will never arrive and further interrupts are blocked
+ * from the source. The exact reason is so far unknown, but the
+ * phenomenon was observed when two consecutive interrupt requests
+ * from a given source get delivered to the same CPU and the source is
+ * temporarily disabled in between.
+ *
+ * A workaround is to simulate an EOI message manually. We achieve it
+ * by setting the trigger mode to edge and then to level when the edge
+ * trigger mode gets detected in the TMR of a local APIC for a
+ * level-triggered interrupt. We mask the source for the time of the
+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
+ * The idea is from Manfred Spraul. --macro
+ */
+ i = irq_vector[irq];
+
+ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
+
+ ack_APIC_irq();
+
+ if (!(v & (1 << (i & 0x1f)))) {
+ atomic_inc(&irq_mis_count);
+ spin_lock(&ioapic_lock);
+ __mask_and_edge_IO_APIC_irq(irq);
+ __unmask_and_level_IO_APIC_irq(irq);
+ spin_unlock(&ioapic_lock);
+ }
+}
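+
+/*
+ * Indexing example for the TMR access above (illustrative, not from
+ * the source): for vector i = 0x31, (i & ~0x1f) >> 1 = 0x10, so we
+ * read the 32-bit TMR word covering vectors 0x20-0x3f, and bit
+ * (i & 0x1f) = 0x11 in it says whether the interrupt was latched as
+ * level-triggered.
+ */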
+
+static int ioapic_retrigger_irq(unsigned int irq)
+{
+ send_IPI_self(irq_vector[irq]);
+
+ return 1;
+}
+
+static struct irq_chip ioapic_chip __read_mostly = {
+ .name = "IO-APIC",
+ .startup = startup_ioapic_irq,
+ .mask = mask_IO_APIC_irq,
+ .unmask = unmask_IO_APIC_irq,
+ .ack = ack_ioapic_irq,
+ .eoi = ack_ioapic_quirk_irq,
+#ifdef CONFIG_SMP
+ .set_affinity = set_ioapic_affinity_irq,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+
+
+static inline void init_IO_APIC_traps(void)
+{
+ int irq;
+
+ /*
+ * NOTE! The local APIC isn't very good at handling
+ * multiple interrupts at the same interrupt level.
+ * As the interrupt level is determined by taking the
+ * vector number and shifting that right by 4, we
+ * want to spread these out a bit so that they don't
+ * all fall in the same interrupt level.
+ *
+ * Also, we've got to be careful not to trash gate
+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
+ */
+ for (irq = 0; irq < NR_IRQS ; irq++) {
+ int tmp = irq;
+ if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
+ /*
+ * Hmm.. We don't have an entry for this,
+ * so default to an old-fashioned 8259
+ * interrupt if we can..
+ */
+ if (irq < 16)
+ make_8259A_irq(irq);
+ else
+ /* Strange. Oh, well.. */
+ irq_desc[irq].chip = &no_irq_chip;
+ }
+ }
+}
+
+/*
+ * The local APIC irq-chip implementation:
+ */
+
+static void ack_apic(unsigned int irq)
+{
+ ack_APIC_irq();
+}
+
+static void mask_lapic_irq (unsigned int irq)
+{
+ unsigned long v;
+
+ v = apic_read(APIC_LVT0);
+ apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+}
+
+static void unmask_lapic_irq (unsigned int irq)
+{
+ unsigned long v;
+
+ v = apic_read(APIC_LVT0);
+ apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+}
+
+static struct irq_chip lapic_chip __read_mostly = {
+ .name = "local-APIC-edge",
+ .mask = mask_lapic_irq,
+ .unmask = unmask_lapic_irq,
+ .eoi = ack_apic,
+};
+
+static void setup_nmi (void)
+{
+ /*
+ * Dirty trick to enable the NMI watchdog ...
+ * We put the 8259A master into AEOI mode and
+ * unmask on all local APICs LVT0 as NMI.
+ *
+ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
+ * is from Maciej W. Rozycki - so we do not have to EOI from
+ * the NMI handler or the timer interrupt.
+ */
+ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
+
+ on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
+
+ apic_printk(APIC_VERBOSE, " done.\n");
+}
+
+/*
+ * This looks a bit hackish but it's about the only way of sending
+ * a few INTA cycles to 8259As and any associated glue logic. ICR does
+ * not support the ExtINT mode, unfortunately. We need to send these
+ * cycles as some i82489DX-based boards have glue logic that keeps the
+ * 8259A interrupt line asserted until INTA. --macro
+ */
+static inline void unlock_ExtINT_logic(void)
+{
+ int apic, pin, i;
+ struct IO_APIC_route_entry entry0, entry1;
+ unsigned char save_control, save_freq_select;
+
+ pin = find_isa_irq_pin(8, mp_INT);
+ if (pin == -1) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ apic = find_isa_irq_apic(8, mp_INT);
+ if (apic == -1) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ entry0 = ioapic_read_entry(apic, pin);
+ clear_IO_APIC_pin(apic, pin);
+
+ memset(&entry1, 0, sizeof(entry1));
+
+ entry1.dest_mode = 0; /* physical delivery */
+ entry1.mask = 0; /* unmask IRQ now */
+ entry1.dest.physical.physical_dest = hard_smp_processor_id();
+ entry1.delivery_mode = dest_ExtINT;
+ entry1.polarity = entry0.polarity;
+ entry1.trigger = 0;
+ entry1.vector = 0;
+
+ ioapic_write_entry(apic, pin, entry1);
+
+ save_control = CMOS_READ(RTC_CONTROL);
+ save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
+ RTC_FREQ_SELECT);
+ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
+
+ i = 100;
+ while (i-- > 0) {
+ mdelay(10);
+ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
+ i -= 10;
+ }
+
+ CMOS_WRITE(save_control, RTC_CONTROL);
+ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+ clear_IO_APIC_pin(apic, pin);
+
+ ioapic_write_entry(apic, pin, entry0);
+}
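+
+/*
+ * Note on the CMOS programming above (an assumption about the RTC, not
+ * stated in the source): rate-select value 0x6 requests a 1024 Hz
+ * periodic interrupt and RTC_PIE enables it; every RTC_PF flag the
+ * polling loop observes burns 10 extra iterations, so a working
+ * interrupt path ends the wait early.
+ */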
+
+int timer_uses_ioapic_pin_0;
+
+/*
+ * This code may look a bit paranoid, but it's supposed to cooperate with
+ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
+ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
+ * fanatically on his truly buggy board.
+ */
+static inline void __init check_timer(void)
+{
+ int apic1, pin1, apic2, pin2;
+ int vector;
+
+ /*
+ * get/set the timer IRQ vector:
+ */
+ disable_8259A_irq(0);
+ vector = assign_irq_vector(0);
+ set_intr_gate(vector, interrupt[0]);
+
+ /*
+ * Subtle, code in do_timer_interrupt() expects an AEOI
+ * mode for the 8259A whenever interrupts are routed
+ * through I/O APICs. Also IRQ0 has to be enabled in
+ * the 8259A which implies the virtual wire has to be
+ * disabled in the local APIC.
+ */
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ init_8259A(1);
+ timer_ack = 1;
+ if (timer_over_8254 > 0)
+ enable_8259A_irq(0);
+
+ pin1 = find_isa_irq_pin(0, mp_INT);
+ apic1 = find_isa_irq_apic(0, mp_INT);
+ pin2 = ioapic_i8259.pin;
+ apic2 = ioapic_i8259.apic;
+
+ if (pin1 == 0)
+ timer_uses_ioapic_pin_0 = 1;
+
+ printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ vector, apic1, pin1, apic2, pin2);
+
+ if (pin1 != -1) {
+ /*
+ * Ok, does IRQ0 through the IOAPIC work?
+ */
+ unmask_IO_APIC_irq(0);
+ if (timer_irq_works()) {
+ if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
+ setup_nmi();
+ enable_8259A_irq(0);
+ }
+ if (disable_timer_pin_1 > 0)
+ clear_IO_APIC_pin(0, pin1);
+ return;
+ }
+ clear_IO_APIC_pin(apic1, pin1);
+ printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
+ "IO-APIC\n");
+ }
+
+ printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
+ if (pin2 != -1) {
+ printk("\n..... (found pin %d) ...", pin2);
+ /*
+ * legacy devices should be connected to IO APIC #0
+ */
+ setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
+ if (timer_irq_works()) {
+ printk("works.\n");
+ if (pin1 != -1)
+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ else
+ add_pin_to_irq(0, apic2, pin2);
+ if (nmi_watchdog == NMI_IO_APIC) {
+ setup_nmi();
+ }
+ return;
+ }
+ /*
+ * Cleanup, just in case ...
+ */
+ clear_IO_APIC_pin(apic2, pin2);
+ }
+ printk(" failed.\n");
+
+ if (nmi_watchdog == NMI_IO_APIC) {
+ printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+ nmi_watchdog = 0;
+ }
+
+ printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+
+ disable_8259A_irq(0);
+ set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
+ "fasteoi");
+ apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
+ enable_8259A_irq(0);
+
+ if (timer_irq_works()) {
+ printk(" works.\n");
+ return;
+ }
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+ printk(" failed.\n");
+
+ printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+
+ timer_ack = 0;
+ init_8259A(0);
+ make_8259A_irq(0);
+ apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+
+ unlock_ExtINT_logic();
+
+ if (timer_irq_works()) {
+ printk(" works.\n");
+ return;
+ }
+ printk(" failed :(.\n");
+ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
+ "report. Then try booting with the 'noapic' option");
+}
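+
+/*
+ * Recap of the fallback chain above (editorial summary): IO-APIC pin
+ * for IRQ0 -> 8259-routed IO-APIC pin -> local APIC LVT0 in fixed
+ * mode (virtual wire) -> LVT0 as ExtINT; only when all four attempts
+ * fail does check_timer() panic.
+ */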
+
+/*
+ *
+ * IRQ's that are handled by the PIC in the MPS IOAPIC case.
+ * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
+ * Linux doesn't really care, as it's not actually used
+ * for any interrupt handling anyway.
+ */
+#define PIC_IRQS (1 << PIC_CASCADE_IR)
+
+void __init setup_IO_APIC(void)
+{
+ enable_IO_APIC();
+
+ if (acpi_ioapic)
+ io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
+ else
+ io_apic_irqs = ~PIC_IRQS;
+
+ printk("ENABLING IO-APIC IRQs\n");
+
+ /*
+ * Set up IO-APIC IRQ routing.
+ */
+ if (!acpi_ioapic)
+ setup_ioapic_ids_from_mpc();
+ sync_Arb_IDs();
+ setup_IO_APIC_irqs();
+ init_IO_APIC_traps();
+ check_timer();
+ if (!acpi_ioapic)
+ print_IO_APIC();
+}
+
+static int __init setup_disable_8254_timer(char *s)
+{
+ timer_over_8254 = -1;
+ return 1;
+}
+static int __init setup_enable_8254_timer(char *s)
+{
+ timer_over_8254 = 2;
+ return 1;
+}
+
+__setup("disable_8254_timer", setup_disable_8254_timer);
+__setup("enable_8254_timer", setup_enable_8254_timer);
+
+/*
+ * Called after all the initialization is done. If we didn't find any
+ * APIC bugs then we can allow the modify fast path.
+ */
+
+static int __init io_apic_bug_finalize(void)
+{
+ if (sis_apic_bug == -1)
+ sis_apic_bug = 0;
+ return 0;
+}
+
+late_initcall(io_apic_bug_finalize);
+
+struct sysfs_ioapic_data {
+ struct sys_device dev;
+ struct IO_APIC_route_entry entry[0];
+};
+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+
+static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
+{
+ struct IO_APIC_route_entry *entry;
+ struct sysfs_ioapic_data *data;
+ int i;
+
+ data = container_of(dev, struct sysfs_ioapic_data, dev);
+ entry = data->entry;
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
+ entry[i] = ioapic_read_entry(dev->id, i);
+
+ return 0;
+}
+
+static int ioapic_resume(struct sys_device *dev)
+{
+ struct IO_APIC_route_entry *entry;
+ struct sysfs_ioapic_data *data;
+ unsigned long flags;
+ union IO_APIC_reg_00 reg_00;
+ int i;
+
+ data = container_of(dev, struct sysfs_ioapic_data, dev);
+ entry = data->entry;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(dev->id, 0);
+ if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+ io_apic_write(dev->id, 0, reg_00.raw);
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
+ ioapic_write_entry(dev->id, i, entry[i]);
+
+ return 0;
+}
+
+static struct sysdev_class ioapic_sysdev_class = {
+ set_kset_name("ioapic"),
+ .suspend = ioapic_suspend,
+ .resume = ioapic_resume,
+};
+
+static int __init ioapic_init_sysfs(void)
+{
+ struct sys_device * dev;
+ int i, size, error = 0;
+
+ error = sysdev_class_register(&ioapic_sysdev_class);
+ if (error)
+ return error;
+
+ for (i = 0; i < nr_ioapics; i++) {
+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
+ * sizeof(struct IO_APIC_route_entry);
+ mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+ if (!mp_ioapic_data[i]) {
+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+ continue;
+ }
+ memset(mp_ioapic_data[i], 0, size);
+ dev = &mp_ioapic_data[i]->dev;
+ dev->id = i;
+ dev->cls = &ioapic_sysdev_class;
+ error = sysdev_register(dev);
+ if (error) {
+ kfree(mp_ioapic_data[i]);
+ mp_ioapic_data[i] = NULL;
+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+ continue;
+ }
+ }
+
+ return 0;
+}
+
+device_initcall(ioapic_init_sysfs);
+
+/*
+ * Dynamic irq allocation and deallocation
+ */
+int create_irq(void)
+{
+ /* Allocate an unused irq */
+ int irq, new, vector = 0;
+ unsigned long flags;
+
+ irq = -ENOSPC;
+ spin_lock_irqsave(&vector_lock, flags);
+ for (new = (NR_IRQS - 1); new >= 0; new--) {
+ if (platform_legacy_irq(new))
+ continue;
+ if (irq_vector[new] != 0)
+ continue;
+ vector = __assign_irq_vector(new);
+ if (likely(vector > 0))
+ irq = new;
+ break;
+ }
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+ if (irq >= 0) {
+ set_intr_gate(vector, interrupt[irq]);
+ dynamic_irq_init(irq);
+ }
+ return irq;
+}
+
+void destroy_irq(unsigned int irq)
+{
+ unsigned long flags;
+
+ dynamic_irq_cleanup(irq);
+
+ spin_lock_irqsave(&vector_lock, flags);
+ irq_vector[irq] = 0;
+ spin_unlock_irqrestore(&vector_lock, flags);
+}
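+
+/*
+ * Hypothetical usage of the pair above (not from the source; chip and
+ * handler names are placeholders):
+ *
+ *	int irq = create_irq();
+ *	if (irq >= 0) {
+ *		set_irq_chip_and_handler(irq, &some_chip, handle_edge_irq);
+ *		...
+ *		destroy_irq(irq);
+ *	}
+ *
+ * create_irq() returns -ENOSPC when no free vector-backed irq exists.
+ */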
+
+/*
+ * MSI message composition
+ */
+#ifdef CONFIG_PCI_MSI
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+{
+ int vector;
+ unsigned dest;
+
+ vector = assign_irq_vector(irq);
+ if (vector >= 0) {
+ dest = cpu_mask_to_apicid(TARGET_CPUS);
+
+ msg->address_hi = MSI_ADDR_BASE_HI;
+ msg->address_lo =
+ MSI_ADDR_BASE_LO |
+ ((INT_DEST_MODE == 0) ?
+ MSI_ADDR_DEST_MODE_PHYSICAL:
+ MSI_ADDR_DEST_MODE_LOGICAL) |
+ ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ MSI_ADDR_REDIRECTION_CPU:
+ MSI_ADDR_REDIRECTION_LOWPRI) |
+ MSI_ADDR_DEST_ID(dest);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ MSI_DATA_DELIVERY_FIXED:
+ MSI_DATA_DELIVERY_LOWPRI) |
+ MSI_DATA_VECTOR(vector);
+ }
+ return vector;
+}
+
+#ifdef CONFIG_SMP
+static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+ struct msi_msg msg;
+ unsigned int dest;
+ cpumask_t tmp;
+ int vector;
+
+ cpus_and(tmp, mask, cpu_online_map);
+ if (cpus_empty(tmp))
+ tmp = TARGET_CPUS;
+
+ vector = assign_irq_vector(irq);
+ if (vector < 0)
+ return;
+
+ dest = cpu_mask_to_apicid(mask);
+
+ read_msi_msg(irq, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+ write_msi_msg(irq, &msg);
+ irq_desc[irq].affinity = mask;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip msi_chip = {
+ .name = "PCI-MSI",
+ .unmask = unmask_msi_irq,
+ .mask = mask_msi_irq,
+ .ack = ack_ioapic_irq,
+#ifdef CONFIG_SMP
+ .set_affinity = set_msi_irq_affinity,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+{
+ struct msi_msg msg;
+ int irq, ret;
+ irq = create_irq();
+ if (irq < 0)
+ return irq;
+
+ ret = msi_compose_msg(dev, irq, &msg);
+ if (ret < 0) {
+ destroy_irq(irq);
+ return ret;
+ }
+
+ set_irq_msi(irq, desc);
+ write_msi_msg(irq, &msg);
+
+ set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
+ "edge");
+
+ return 0;
+}
+
+void arch_teardown_msi_irq(unsigned int irq)
+{
+ destroy_irq(irq);
+}
+
+#endif /* CONFIG_PCI_MSI */
+
+/*
+ * Hypertransport interrupt support
+ */
+#ifdef CONFIG_HT_IRQ
+
+#ifdef CONFIG_SMP
+
+static void target_ht_irq(unsigned int irq, unsigned int dest)
+{
+ struct ht_irq_msg msg;
+ fetch_ht_irq_msg(irq, &msg);
+
+ msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
+ msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+
+ msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
+ msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
+
+ write_ht_irq_msg(irq, &msg);
+}
+
+static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+ unsigned int dest;
+ cpumask_t tmp;
+
+ cpus_and(tmp, mask, cpu_online_map);
+ if (cpus_empty(tmp))
+ tmp = TARGET_CPUS;
+
+ cpus_and(mask, tmp, CPU_MASK_ALL);
+
+ dest = cpu_mask_to_apicid(mask);
+
+ target_ht_irq(irq, dest);
+ irq_desc[irq].affinity = mask;
+}
+#endif
+
+static struct irq_chip ht_irq_chip = {
+ .name = "PCI-HT",
+ .mask = mask_ht_irq,
+ .unmask = unmask_ht_irq,
+ .ack = ack_ioapic_irq,
+#ifdef CONFIG_SMP
+ .set_affinity = set_ht_irq_affinity,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+{
+ int vector;
+
+ vector = assign_irq_vector(irq);
+ if (vector >= 0) {
+ struct ht_irq_msg msg;
+ unsigned dest;
+ cpumask_t tmp;
+
+ cpus_clear(tmp);
+ cpu_set(vector >> 8, tmp);
+ dest = cpu_mask_to_apicid(tmp);
+
+ msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+
+ msg.address_lo =
+ HT_IRQ_LOW_BASE |
+ HT_IRQ_LOW_DEST_ID(dest) |
+ HT_IRQ_LOW_VECTOR(vector) |
+ ((INT_DEST_MODE == 0) ?
+ HT_IRQ_LOW_DM_PHYSICAL :
+ HT_IRQ_LOW_DM_LOGICAL) |
+ HT_IRQ_LOW_RQEOI_EDGE |
+ ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ HT_IRQ_LOW_MT_FIXED :
+ HT_IRQ_LOW_MT_ARBITRATED) |
+ HT_IRQ_LOW_IRQ_MASKED;
+
+ write_ht_irq_msg(irq, &msg);
+
+ set_irq_chip_and_handler_name(irq, &ht_irq_chip,
+ handle_edge_irq, "edge");
+ }
+ return vector;
+}
+#endif /* CONFIG_HT_IRQ */
+
+/* --------------------------------------------------------------------------
+ ACPI-based IOAPIC Configuration
+ -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI
+
+int __init io_apic_get_unique_id (int ioapic, int apic_id)
+{
+ union IO_APIC_reg_00 reg_00;
+ static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
+ physid_mask_t tmp;
+ unsigned long flags;
+ int i = 0;
+
+ /*
+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
+ * buses (one for LAPICs, one for IOAPICs), whereas its predecessors
+ * only support up to 16 on one shared APIC bus.
+ *
+ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+ * advantage of new APIC bus architecture.
+ */
+
+ if (physids_empty(apic_id_map))
+ apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ if (apic_id >= get_physical_broadcast()) {
+ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+ "%d\n", ioapic, apic_id, reg_00.bits.ID);
+ apic_id = reg_00.bits.ID;
+ }
+
+ /*
+ * Every APIC in a system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (check_apicid_used(apic_id_map, apic_id)) {
+
+ for (i = 0; i < get_physical_broadcast(); i++) {
+ if (!check_apicid_used(apic_id_map, i))
+ break;
+ }
+
+ if (i == get_physical_broadcast())
+ panic("Max apic_id exceeded!\n");
+
+ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+ "trying %d\n", ioapic, apic_id, i);
+
+ apic_id = i;
+ }
+
+ tmp = apicid_to_cpu_present(apic_id);
+ physids_or(apic_id_map, apic_id_map, tmp);
+
+ if (reg_00.bits.ID != apic_id) {
+ reg_00.bits.ID = apic_id;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(ioapic, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /* Sanity check */
+ if (reg_00.bits.ID != apic_id) {
+ printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+ return -1;
+ }
+ }
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+
+ return apic_id;
+}
+
+
+int __init io_apic_get_version (int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return reg_01.bits.version;
+}
+
+
+int __init io_apic_get_redir_entries (int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return reg_01.bits.entries;
+}
+
+
+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+{
+ struct IO_APIC_route_entry entry;
+ unsigned long flags;
+
+ if (!IO_APIC_IRQ(irq)) {
+ printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+ ioapic);
+ return -EINVAL;
+ }
+
+ /*
+ * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
+ * Note that we mask (disable) IRQs now -- these get enabled when the
+ * corresponding device driver registers for this IRQ.
+ */
+
+ memset(&entry, 0, sizeof(entry));
+
+ entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.dest_mode = INT_DEST_MODE;
+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+ entry.trigger = edge_level;
+ entry.polarity = active_high_low;
+ entry.mask = 1;
+
+ /*
+ * IRQs < 16 are already in the irq_2_pin[] map
+ */
+ if (irq >= 16)
+ add_pin_to_irq(irq, ioapic, pin);
+
+ entry.vector = assign_irq_vector(irq);
+
+ apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
+ "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
+ mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+ edge_level, active_high_low);
+
+ ioapic_register_intr(irq, entry.vector, edge_level);
+
+ if (!ioapic && (irq < 16))
+ disable_8259A_irq(irq);
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __ioapic_write_entry(ioapic, pin, entry);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return 0;
+}
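+
+/*
+ * Example invocation (hypothetical values): the ACPI layer might call
+ * io_apic_set_pci_routing(0, 18, 18, 1, 1) to route a level-triggered,
+ * active-low PCI interrupt on pin 18 of IOAPIC 0 to IRQ 18. The entry
+ * is written masked; it is unmasked when a driver requests the IRQ.
+ */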
+
+#endif /* CONFIG_ACPI */
+
+static int __init parse_disable_timer_pin_1(char *arg)
+{
+ disable_timer_pin_1 = 1;
+ return 0;
+}
+early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
+
+static int __init parse_enable_timer_pin_1(char *arg)
+{
+ disable_timer_pin_1 = -1;
+ return 0;
+}
+early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
+
+static int __init parse_noapic(char *arg)
+{
+ /* disable IO-APIC */
+ disable_ioapic_setup();
+ return 0;
+}
+early_param("noapic", parse_noapic);
diff --git a/arch/x86/kernel/ioport_32.c b/arch/x86/kernel/ioport_32.c
new file mode 100644
index 000000000000..3d310a946d76
--- /dev/null
+++ b/arch/x86/kernel/ioport_32.c
@@ -0,0 +1,153 @@
+/*
+ * linux/arch/i386/kernel/ioport.c
+ *
+ * This contains the io-permission bitmap code - written by obz, with changes
+ * by Linus.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/ioport.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/syscalls.h>
+
+/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
+static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
+{
+ unsigned long mask;
+ unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
+ unsigned int low_index = base & (BITS_PER_LONG-1);
+ int length = low_index + extent;
+
+ if (low_index != 0) {
+ mask = (~0UL << low_index);
+ if (length < BITS_PER_LONG)
+ mask &= ~(~0UL << length);
+ if (new_value)
+ *bitmap_base++ |= mask;
+ else
+ *bitmap_base++ &= ~mask;
+ length -= BITS_PER_LONG;
+ }
+
+ mask = (new_value ? ~0UL : 0UL);
+ while (length >= BITS_PER_LONG) {
+ *bitmap_base++ = mask;
+ length -= BITS_PER_LONG;
+ }
+
+ if (length > 0) {
+ mask = ~(~0UL << length);
+ if (new_value)
+ *bitmap_base++ |= mask;
+ else
+ *bitmap_base++ &= ~mask;
+ }
+}
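+
+/*
+ * Worked example (illustrative, BITS_PER_LONG=32):
+ * set_bitmap(bm, 0x3f8, 8, 1) gives low_index = 0x3f8 % 32 = 24 and
+ * length = 32, so the first branch builds mask = ~0UL << 24 and sets
+ * bits 24-31 of word 0x1f -- exactly the eight ports 0x3f8-0x3ff --
+ * after which length drops to 0 and both later loops are skipped.
+ */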
+
+
+/*
+ * this changes the io permissions bitmap in the current task.
+ */
+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+ unsigned long i, max_long, bytes, bytes_updated;
+ struct thread_struct * t = &current->thread;
+ struct tss_struct * tss;
+ unsigned long *bitmap;
+
+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+ return -EINVAL;
+ if (turn_on && !capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ /*
+ * If it's the first ioperm() call in this thread's lifetime, set the
+ * IO bitmap up. ioperm() is much less timing critical than clone(),
+ * this is why we delay this operation until now:
+ */
+ if (!t->io_bitmap_ptr) {
+ bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+ if (!bitmap)
+ return -ENOMEM;
+
+ memset(bitmap, 0xff, IO_BITMAP_BYTES);
+ t->io_bitmap_ptr = bitmap;
+ set_thread_flag(TIF_IO_BITMAP);
+ }
+
+ /*
+ * do it in the per-thread copy and in the TSS ...
+ *
+ * Disable preemption via get_cpu() - we must not switch away
+ * because the ->io_bitmap_max value must match the bitmap
+ * contents:
+ */
+ tss = &per_cpu(init_tss, get_cpu());
+
+ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+
+ /*
+ * Search for a (possibly new) maximum. This is simple and stupid,
+ * to keep it obviously correct:
+ */
+ max_long = 0;
+ for (i = 0; i < IO_BITMAP_LONGS; i++)
+ if (t->io_bitmap_ptr[i] != ~0UL)
+ max_long = i;
+
+ bytes = (max_long + 1) * sizeof(long);
+ bytes_updated = max(bytes, t->io_bitmap_max);
+
+ t->io_bitmap_max = bytes;
+
+ /*
+ * Sets the lazy trigger so that the next I/O operation will
+ * reload the correct bitmap.
+ * Reset the owner so that a process switch will not set
+ * tss->io_bitmap_base to IO_BITMAP_OFFSET.
+ */
+ tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
+ tss->io_bitmap_owner = NULL;
+
+ put_cpu();
+
+ return 0;
+}
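+
+/*
+ * Userspace view (illustrative sketch): a process with CAP_SYS_RAWIO
+ * can do
+ *
+ *	if (ioperm(0x378, 3, 1) == 0)
+ *		outb(0xff, 0x378);
+ *
+ * to unlock and drive the parallel port registers. Note the inverted
+ * sense inside the kernel: set_bitmap() gets !turn_on because a
+ * cleared bit in the TSS bitmap means "access allowed".
+ */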
+
+/*
+ * sys_iopl has to be used when you want to access the IO ports
+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ *
+ * Here we just change the eflags value on the stack: we allow
+ * only the super-user to do it. This depends on the stack-layout
+ * on system-call entry - see also fork() and the signal handling
+ * code.
+ */
+
+asmlinkage long sys_iopl(unsigned long unused)
+{
+ volatile struct pt_regs * regs = (struct pt_regs *) &unused;
+ unsigned int level = regs->ebx;
+ unsigned int old = (regs->eflags >> 12) & 3;
+ struct thread_struct *t = &current->thread;
+
+ if (level > 3)
+ return -EINVAL;
+ /* Trying to gain more privileges? */
+ if (level > old) {
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+ }
+ t->iopl = level << 12;
+ regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl;
+ set_iopl_mask(t->iopl);
+ return 0;
+}
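+
+/*
+ * Illustrative effect: iopl(3) from a CAP_SYS_RAWIO process sets the
+ * two IOPL bits (eflags bits 12-13) to 3, so in/out at CPL 3 bypasses
+ * the TSS bitmap entirely, and cli/sti become legal as well.
+ */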
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
new file mode 100644
index 000000000000..dd2b97fc00b2
--- /dev/null
+++ b/arch/x86/kernel/irq_32.c
@@ -0,0 +1,343 @@
+/*
+ * linux/arch/i386/kernel/irq.c
+ *
+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the lowest level x86-specific interrupt
+ * entry, irq-stacks and irq statistics code. All the remaining
+ * irq logic is done by the generic kernel/irq/ code and
+ * by the x86-specific irq controller code. (e.g. i8259.c and
+ * io_apic.c.)
+ */
+
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+
+#include <asm/apic.h>
+#include <asm/uaccess.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector?'
+ * Each architecture has to answer this itself.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+ printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * Currently unexpected vectors happen only on SMP and APIC.
+ * We _must_ ack these because every local APIC has only N
+ * irq slots per priority level, and a 'hanging, unacked' IRQ
+ * holds up an irq slot - in excessive cases (when multiple
+ * unexpected vectors occur) that might lock up the APIC
+ * completely.
+ * But only ack when the APIC is enabled -AK
+ */
+ if (cpu_has_apic)
+ ack_APIC_irq();
+#endif
+}
+
+#ifdef CONFIG_4KSTACKS
+/*
+ * per-CPU IRQ handling contexts (thread information and stack)
+ */
+union irq_ctx {
+ struct thread_info tinfo;
+ u32 stack[THREAD_SIZE/sizeof(u32)];
+};
+
+static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
+static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
+#endif
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+fastcall unsigned int do_IRQ(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs;
+ /* high bit used in ret_from_ code */
+ int irq = ~regs->orig_eax;
+ struct irq_desc *desc = irq_desc + irq;
+#ifdef CONFIG_4KSTACKS
+ union irq_ctx *curctx, *irqctx;
+ u32 *isp;
+#endif
+
+ if (unlikely((unsigned)irq >= NR_IRQS)) {
+ printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
+ __FUNCTION__, irq);
+ BUG();
+ }
+
+ old_regs = set_irq_regs(regs);
+ irq_enter();
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+ /* Debugging check for stack overflow: is there less than 1KB free? */
+ {
+ long esp;
+
+ __asm__ __volatile__("andl %%esp,%0" :
+ "=r" (esp) : "0" (THREAD_SIZE - 1));
+ if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
+ printk("do_IRQ: stack overflow: %ld\n",
+ esp - sizeof(struct thread_info));
+ dump_stack();
+ }
+ }
+#endif
+
+#ifdef CONFIG_4KSTACKS
+
+ curctx = (union irq_ctx *) current_thread_info();
+ irqctx = hardirq_ctx[smp_processor_id()];
+
+ /*
+ * this is where we switch to the IRQ stack. However, if we are
+ * already using the IRQ stack (because we interrupted a hardirq
+ * handler) we can't do that and just have to keep using the
+ * current stack (which is the irq stack already after all)
+ */
+ if (curctx != irqctx) {
+ int arg1, arg2, ebx;
+
+ /* build the stack frame on the IRQ stack */
+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+ irqctx->tinfo.task = curctx->tinfo.task;
+ irqctx->tinfo.previous_esp = current_stack_pointer;
+
+ /*
+ * Copy the softirq bits in preempt_count so that the
+ * softirq checks work in the hardirq context.
+ */
+ irqctx->tinfo.preempt_count =
+ (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
+ (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+
+ asm volatile(
+ " xchgl %%ebx,%%esp \n"
+ " call *%%edi \n"
+ " movl %%ebx,%%esp \n"
+ : "=a" (arg1), "=d" (arg2), "=b" (ebx)
+ : "0" (irq), "1" (desc), "2" (isp),
+ "D" (desc->handle_irq)
+ : "memory", "cc"
+ );
+ } else
+#endif
+ desc->handle_irq(irq, desc);
+
+ irq_exit();
+ set_irq_regs(old_regs);
+ return 1;
+}
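+
+/*
+ * The inline asm above is effectively (illustration, assuming the
+ * kernel's regparm(3) calling convention): save %esp in %ebx, point
+ * %esp at isp on the IRQ stack, call desc->handle_irq(irq, desc) with
+ * the arguments pre-loaded in %eax/%edx, then swap the old stack
+ * pointer back from %ebx.
+ */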
+
+#ifdef CONFIG_4KSTACKS
+
+static char softirq_stack[NR_CPUS * THREAD_SIZE]
+ __attribute__((__section__(".bss.page_aligned")));
+
+static char hardirq_stack[NR_CPUS * THREAD_SIZE]
+ __attribute__((__section__(".bss.page_aligned")));
+
+/*
+ * allocate per-cpu stacks for hardirq and for softirq processing
+ */
+void irq_ctx_init(int cpu)
+{
+ union irq_ctx *irqctx;
+
+ if (hardirq_ctx[cpu])
+ return;
+
+ irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+ irqctx->tinfo.task = NULL;
+ irqctx->tinfo.exec_domain = NULL;
+ irqctx->tinfo.cpu = cpu;
+ irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
+
+ hardirq_ctx[cpu] = irqctx;
+
+ irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
+ irqctx->tinfo.task = NULL;
+ irqctx->tinfo.exec_domain = NULL;
+ irqctx->tinfo.cpu = cpu;
+ irqctx->tinfo.preempt_count = 0;
+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
+
+ softirq_ctx[cpu] = irqctx;
+
+ printk("CPU %u irqstacks, hard=%p soft=%p\n",
+ cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
+}
+
+void irq_ctx_exit(int cpu)
+{
+ hardirq_ctx[cpu] = NULL;
+}
+
+extern asmlinkage void __do_softirq(void);
+
+asmlinkage void do_softirq(void)
+{
+ unsigned long flags;
+ struct thread_info *curctx;
+ union irq_ctx *irqctx;
+ u32 *isp;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+
+ if (local_softirq_pending()) {
+ curctx = current_thread_info();
+ irqctx = softirq_ctx[smp_processor_id()];
+ irqctx->tinfo.task = curctx->task;
+ irqctx->tinfo.previous_esp = current_stack_pointer;
+
+ /* build the stack frame on the softirq stack */
+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+
+ asm volatile(
+ " xchgl %%ebx,%%esp \n"
+ " call __do_softirq \n"
+ " movl %%ebx,%%esp \n"
+ : "=b"(isp)
+ : "0"(isp)
+ : "memory", "cc", "edx", "ecx", "eax"
+ );
+ /*
+ * Shouldn't happen; we returned above if in_interrupt():
+ */
+ WARN_ON_ONCE(softirq_count());
+ }
+
+ local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(do_softirq);
+#endif
+
+/*
+ * Interrupt statistics:
+ */
+
+atomic_t irq_err_count;
+
+/*
+ * /proc/interrupts printing:
+ */
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+ int i = *(loff_t *) v, j;
+ struct irqaction * action;
+ unsigned long flags;
+
+ if (i == 0) {
+ seq_printf(p, " ");
+ for_each_online_cpu(j)
+ seq_printf(p, "CPU%-8d",j);
+ seq_putc(p, '\n');
+ }
+
+ if (i < NR_IRQS) {
+ spin_lock_irqsave(&irq_desc[i].lock, flags);
+ action = irq_desc[i].action;
+ if (!action)
+ goto skip;
+ seq_printf(p, "%3d: ",i);
+#ifndef CONFIG_SMP
+ seq_printf(p, "%10u ", kstat_irqs(i));
+#else
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+#endif
+ seq_printf(p, " %8s", irq_desc[i].chip->name);
+ seq_printf(p, "-%-8s", irq_desc[i].name);
+ seq_printf(p, " %s", action->name);
+
+ for (action=action->next; action; action = action->next)
+ seq_printf(p, ", %s", action->name);
+
+ seq_putc(p, '\n');
+skip:
+ spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+ } else if (i == NR_IRQS) {
+ seq_printf(p, "NMI: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", nmi_count(j));
+ seq_putc(p, '\n');
+#ifdef CONFIG_X86_LOCAL_APIC
+ seq_printf(p, "LOC: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ per_cpu(irq_stat,j).apic_timer_irqs);
+ seq_putc(p, '\n');
+#endif
+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+#if defined(CONFIG_X86_IO_APIC)
+ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+#endif
+ }
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+#include <mach_apic.h>
+
+void fixup_irqs(cpumask_t map)
+{
+ unsigned int irq;
+ static int warned;
+
+ for (irq = 0; irq < NR_IRQS; irq++) {
+ cpumask_t mask;
+ if (irq == 2)
+ continue;
+
+ cpus_and(mask, irq_desc[irq].affinity, map);
+ if (any_online_cpu(mask) == NR_CPUS) {
+ printk("Breaking affinity for irq %i\n", irq);
+ mask = map;
+ }
+ if (irq_desc[irq].chip->set_affinity)
+ irq_desc[irq].chip->set_affinity(irq, mask);
+ else if (irq_desc[irq].action && !(warned++))
+ printk("Cannot set affinity for irq %i\n", irq);
+ }
+
+#if 0
+ barrier();
+ /* Ingo Molnar says: "after the IO-APIC masks have been redirected
+ [note the nop - the interrupt-enable boundary on x86 is two
+ instructions from sti] - to flush out pending hardirqs and
+ IPIs. After this point nothing is supposed to reach this CPU." */
+ __asm__ __volatile__("sti; nop; cli");
+ barrier();
+#else
+ /* That doesn't seem sufficient. Give it 1ms. */
+ local_irq_enable();
+ mdelay(1);
+ local_irq_disable();
+#endif
+}
+#endif
+
diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c
new file mode 100644
index 000000000000..448a50b1324c
--- /dev/null
+++ b/arch/x86/kernel/kprobes_32.c
@@ -0,0 +1,751 @@
+/*
+ * Kernel Probes (KProbes)
+ * arch/i386/kernel/kprobes.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ * Probes initial implementation ( includes contributions from
+ * Rusty Russell).
+ * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ * interface to access function arguments.
+ * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ * <prasanna@in.ibm.com> added function-return probes.
+ */
+
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/preempt.h>
+#include <linux/kdebug.h>
+#include <asm/cacheflush.h>
+#include <asm/desc.h>
+#include <asm/uaccess.h>
+#include <asm/alternative.h>
+
+void jprobe_return_end(void);
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+/* insert a jmp code */
+static __always_inline void set_jmp_op(void *from, void *to)
+{
+ struct __arch_jmp_op {
+ char op;
+ long raddr;
+ } __attribute__((packed)) *jop;
+ jop = (struct __arch_jmp_op *)from;
+ jop->raddr = (long)(to) - ((long)(from) + 5);
+ jop->op = RELATIVEJUMP_INSTRUCTION;
+}
+
+/*
+ * returns non-zero if opcodes can be boosted.
+ */
+static __always_inline int can_boost(kprobe_opcode_t *opcodes)
+{
+#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
+ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
+ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
+ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
+ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
+ << (row % 32))
+ /*
+ * Undefined/reserved opcodes, conditional jump, Opcode Extension
+ * Groups, and some special opcodes cannot be boosted.
+ */
+ static const unsigned long twobyte_is_boostable[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ------------------------------- */
+ W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */
+ W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */
+ W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */
+ W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
+ W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
+ W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */
+ W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */
+ W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
+ W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */
+ W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */
+ W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */
+ W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */
+ W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */
+ W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */
+ W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */
+ W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */
+ /* ------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ };
+#undef W
+ kprobe_opcode_t opcode;
+ kprobe_opcode_t *orig_opcodes = opcodes;
+retry:
+ if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
+ return 0;
+ opcode = *(opcodes++);
+
+ /* 2nd-byte opcode */
+ if (opcode == 0x0f) {
+ if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
+ return 0;
+ return test_bit(*opcodes, twobyte_is_boostable);
+ }
+
+ switch (opcode & 0xf0) {
+ case 0x60:
+ if (0x63 < opcode && opcode < 0x67)
+ goto retry; /* prefixes */
+ /* can't boost Address-size override and bound */
+ return (opcode != 0x62 && opcode != 0x67);
+ case 0x70:
+ return 0; /* can't boost conditional jump */
+ case 0xc0:
+ /* can't boost software interrupts */
+ return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
+ case 0xd0:
+ /* can boost AA* and XLAT */
+ return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
+ case 0xe0:
+ /* can boost in/out and absolute jmps */
+ return ((opcode & 0x04) || opcode == 0xea);
+ case 0xf0:
+ if ((opcode & 0x0c) == 0 && opcode != 0xf1)
+ goto retry; /* lock/rep(ne) prefix */
+ /* clear and set flags can be boosted */
+ return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
+ default:
+ if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
+ goto retry; /* prefixes */
+ /* can't boost CS override and call */
+ return (opcode != 0x2e && opcode != 0x9a);
+ }
+}
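+
+/*
+ * Illustrative walk-through (not from the source): a one-byte
+ * "incl %eax" (0x40) reaches the default case and is reported
+ * boostable, a conditional "jz" (0x74) is rejected by the 0x70 case,
+ * and an 0x0f escape byte defers to the twobyte_is_boostable[] table.
+ */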
+
+/*
+ * returns non-zero if opcode modifies the interrupt flag.
+ */
+static int __kprobes is_IF_modifier(kprobe_opcode_t opcode)
+{
+ switch (opcode) {
+ case 0xfa: /* cli */
+ case 0xfb: /* sti */
+ case 0xcf: /* iret/iretd */
+ case 0x9d: /* popf/popfd */
+ return 1;
+ }
+ return 0;
+}
+
+int __kprobes arch_prepare_kprobe(struct kprobe *p)
+{
+ /* insn: must be on special executable page on i386. */
+ p->ainsn.insn = get_insn_slot();
+ if (!p->ainsn.insn)
+ return -ENOMEM;
+
+ memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+ p->opcode = *p->addr;
+ if (can_boost(p->addr)) {
+ p->ainsn.boostable = 0;
+ } else {
+ p->ainsn.boostable = -1;
+ }
+ return 0;
+}
+
+void __kprobes arch_arm_kprobe(struct kprobe *p)
+{
+ text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
+}
+
+void __kprobes arch_disarm_kprobe(struct kprobe *p)
+{
+ text_poke(p->addr, &p->opcode, 1);
+}
+
+void __kprobes arch_remove_kprobe(struct kprobe *p)
+{
+ mutex_lock(&kprobe_mutex);
+ free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
+ mutex_unlock(&kprobe_mutex);
+}
+
+static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+ kcb->prev_kprobe.kp = kprobe_running();
+ kcb->prev_kprobe.status = kcb->kprobe_status;
+ kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags;
+ kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags;
+}
+
+static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+ __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+ kcb->kprobe_status = kcb->prev_kprobe.status;
+ kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags;
+ kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags;
+}
+
+static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ __get_cpu_var(current_kprobe) = p;
+ kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags
+ = (regs->eflags & (TF_MASK | IF_MASK));
+ if (is_IF_modifier(p->opcode))
+ kcb->kprobe_saved_eflags &= ~IF_MASK;
+}
+
+static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+{
+ regs->eflags |= TF_MASK;
+ regs->eflags &= ~IF_MASK;
+ /* single-step inline if the instruction is an int3 */
+ if (p->opcode == BREAKPOINT_INSTRUCTION)
+ regs->eip = (unsigned long)p->addr;
+ else
+ regs->eip = (unsigned long)p->ainsn.insn;
+}
+
+/* Called with kretprobe_lock held */
+void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
+ struct pt_regs *regs)
+{
+ unsigned long *sara = (unsigned long *)&regs->esp;
+
+ ri->ret_addr = (kprobe_opcode_t *) *sara;
+
+ /* Replace the return addr with trampoline addr */
+ *sara = (unsigned long) &kretprobe_trampoline;
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled throughout this function.
+ */
+static int __kprobes kprobe_handler(struct pt_regs *regs)
+{
+ struct kprobe *p;
+ int ret = 0;
+ kprobe_opcode_t *addr;
+ struct kprobe_ctlblk *kcb;
+
+ addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
+
+ /*
+ * We don't want to be preempted for the entire
+ * duration of kprobe processing
+ */
+ preempt_disable();
+ kcb = get_kprobe_ctlblk();
+
+ /* Check we're not actually recursing */
+ if (kprobe_running()) {
+ p = get_kprobe(addr);
+ if (p) {
+ if (kcb->kprobe_status == KPROBE_HIT_SS &&
+ *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
+ regs->eflags &= ~TF_MASK;
+ regs->eflags |= kcb->kprobe_saved_eflags;
+ goto no_kprobe;
+ }
+ /* We have reentered the kprobe_handler(), since
+ * another probe was hit while within the handler.
+ * Here we save the original kprobe's variables and
+ * just single-step the instruction of the new probe
+ * without calling any user handlers.
+ */
+ save_previous_kprobe(kcb);
+ set_current_kprobe(p, regs, kcb);
+ kprobes_inc_nmissed_count(p);
+ prepare_singlestep(p, regs);
+ kcb->kprobe_status = KPROBE_REENTER;
+ return 1;
+ } else {
+ if (*addr != BREAKPOINT_INSTRUCTION) {
+ /* The breakpoint instruction was removed by
+ * another cpu right after we hit, no further
+ * handling of this interrupt is appropriate
+ */
+ regs->eip -= sizeof(kprobe_opcode_t);
+ ret = 1;
+ goto no_kprobe;
+ }
+ p = __get_cpu_var(current_kprobe);
+ if (p->break_handler && p->break_handler(p, regs)) {
+ goto ss_probe;
+ }
+ }
+ goto no_kprobe;
+ }
+
+ p = get_kprobe(addr);
+ if (!p) {
+ if (*addr != BREAKPOINT_INSTRUCTION) {
+ /*
+ * The breakpoint instruction was removed right
+ * after we hit it. Another cpu has removed
+ * either a probepoint or a debugger breakpoint
+ * at this address. In either case, no further
+ * handling of this interrupt is appropriate.
+ * Back up over the (now missing) int3 and run
+ * the original instruction.
+ */
+ regs->eip -= sizeof(kprobe_opcode_t);
+ ret = 1;
+ }
+ /* Not one of ours: let kernel handle it */
+ goto no_kprobe;
+ }
+
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ if (p->pre_handler && p->pre_handler(p, regs))
+ /* handler has already set things up, so skip ss setup */
+ return 1;
+
+ss_probe:
+#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
+ if (p->ainsn.boostable == 1 && !p->post_handler){
+ /* Boost up -- we can execute copied instructions directly */
+ reset_current_kprobe();
+ regs->eip = (unsigned long)p->ainsn.insn;
+ preempt_enable_no_resched();
+ return 1;
+ }
+#endif
+ prepare_singlestep(p, regs);
+ kcb->kprobe_status = KPROBE_HIT_SS;
+ return 1;
+
+no_kprobe:
+ preempt_enable_no_resched();
+ return ret;
+}
+
+/*
+ * For function-return probes, init_kprobes() establishes a probepoint
+ * here. When a retprobed function returns, this probe is hit and
+ * trampoline_probe_handler() runs, calling the kretprobe's handler.
+ */
+void __kprobes kretprobe_trampoline_holder(void)
+{
+ asm volatile ( ".global kretprobe_trampoline\n"
+ "kretprobe_trampoline: \n"
+ " pushf\n"
+ /* skip cs, eip, orig_eax */
+ " subl $12, %esp\n"
+ " pushl %fs\n"
+ " pushl %ds\n"
+ " pushl %es\n"
+ " pushl %eax\n"
+ " pushl %ebp\n"
+ " pushl %edi\n"
+ " pushl %esi\n"
+ " pushl %edx\n"
+ " pushl %ecx\n"
+ " pushl %ebx\n"
+ " movl %esp, %eax\n"
+ " call trampoline_handler\n"
+ /* move eflags to cs */
+ " movl 52(%esp), %edx\n"
+ " movl %edx, 48(%esp)\n"
+ /* save true return address on eflags */
+ " movl %eax, 52(%esp)\n"
+ " popl %ebx\n"
+ " popl %ecx\n"
+ " popl %edx\n"
+ " popl %esi\n"
+ " popl %edi\n"
+ " popl %ebp\n"
+ " popl %eax\n"
+ /* skip eip, orig_eax, es, ds, fs */
+ " addl $20, %esp\n"
+ " popf\n"
+ " ret\n");
+}
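+
+/*
+ * Stack-slot sketch for the trampoline above (illustrative): after all
+ * the pushes, 48(%esp) is the cs slot and 52(%esp) the eflags slot of
+ * the hand-built pt_regs. Copying eflags down into the cs slot and
+ * parking trampoline_handler()'s return value (the real return
+ * address) in the eflags slot lets the closing "popf; ret" restore
+ * flags and jump back to the caller in one motion.
+ */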
+
+/*
+ * Called from kretprobe_trampoline
+ */
+fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
+{
+ struct kretprobe_instance *ri = NULL;
+ struct hlist_head *head, empty_rp;
+ struct hlist_node *node, *tmp;
+ unsigned long flags, orig_ret_address = 0;
+ unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
+
+ INIT_HLIST_HEAD(&empty_rp);
+ spin_lock_irqsave(&kretprobe_lock, flags);
+ head = kretprobe_inst_table_head(current);
+ /* fixup registers */
+ regs->xcs = __KERNEL_CS | get_kernel_rpl();
+ regs->eip = trampoline_address;
+ regs->orig_eax = 0xffffffff;
+
+ /*
+ * It is possible to have multiple instances associated with a given
+ * task either because multiple functions in the call path
+ * have a return probe installed on them, and/or more than one
+ * return probe was registered for a target function.
+ *
+ * We can handle this because:
+ * - instances are always inserted at the head of the list
+ * - when multiple return probes are registered for the same
+ * function, the first instance's ret_addr will point to the
+ * real return address, and all the rest will point to
+ * kretprobe_trampoline
+ */
+ hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+
+ if (ri->rp && ri->rp->handler){
+ __get_cpu_var(current_kprobe) = &ri->rp->kp;
+ get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
+ ri->rp->handler(ri, regs);
+ __get_cpu_var(current_kprobe) = NULL;
+ }
+
+ orig_ret_address = (unsigned long)ri->ret_addr;
+ recycle_rp_inst(ri, &empty_rp);
+
+ if (orig_ret_address != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
+ }
+
+ kretprobe_assert(ri, orig_ret_address, trampoline_address);
+ spin_unlock_irqrestore(&kretprobe_lock, flags);
+
+ hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+ hlist_del(&ri->hlist);
+ kfree(ri);
+ }
+ return (void*)orig_ret_address;
+}
+
+/*
+ * Called after single-stepping. p->addr is the address of the
+ * instruction whose first byte has been replaced by the "int 3"
+ * instruction. To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction. The address of this
+ * copy is p->ainsn.insn.
+ *
+ * This function prepares to return from the post-single-step
+ * interrupt. We have to fix up the stack as follows:
+ *
+ * 0) Except in the case of absolute or indirect jump or call instructions,
+ * the new eip is relative to the copied instruction. We need to make
+ * it relative to the original instruction.
+ *
+ * 1) If the single-stepped instruction was pushfl, then the TF and IF
+ * flags are set in the just-pushed eflags, and may need to be cleared.
+ *
+ * 2) If the single-stepped instruction was a call, the return address
+ * that is atop the stack is the address following the copied instruction.
+ * We need to make it the address following the original instruction.
+ *
+ * This function also checks instruction size for preparing direct execution.
+ */
+static void __kprobes resume_execution(struct kprobe *p,
+ struct pt_regs *regs, struct kprobe_ctlblk *kcb)
+{
+ unsigned long *tos = (unsigned long *)&regs->esp;
+ unsigned long copy_eip = (unsigned long)p->ainsn.insn;
+ unsigned long orig_eip = (unsigned long)p->addr;
+
+ regs->eflags &= ~TF_MASK;
+ switch (p->ainsn.insn[0]) {
+ case 0x9c: /* pushfl */
+ *tos &= ~(TF_MASK | IF_MASK);
+ *tos |= kcb->kprobe_old_eflags;
+ break;
+ case 0xc2: /* iret/ret/lret */
+ case 0xc3:
+ case 0xca:
+ case 0xcb:
+ case 0xcf:
+ case 0xea: /* jmp absolute -- eip is correct */
+ /* eip is already adjusted, no more changes required */
+ p->ainsn.boostable = 1;
+ goto no_change;
+ case 0xe8: /* call relative - Fix return addr */
+ *tos = orig_eip + (*tos - copy_eip);
+ break;
+ case 0x9a: /* call absolute -- same as call absolute, indirect */
+ *tos = orig_eip + (*tos - copy_eip);
+ goto no_change;
+ case 0xff:
+ if ((p->ainsn.insn[1] & 0x30) == 0x10) {
+ /*
+ * call absolute, indirect
+ * Fix return addr; eip is correct.
+ * But this is not boostable
+ */
+ *tos = orig_eip + (*tos - copy_eip);
+ goto no_change;
+ } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
+ ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
+ /* eip is correct. And this is boostable */
+ p->ainsn.boostable = 1;
+ goto no_change;
+ }
+ default:
+ break;
+ }
+
+ if (p->ainsn.boostable == 0) {
+ if ((regs->eip > copy_eip) &&
+ (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) {
+ /*
+ * These instructions can be executed directly if it
+ * jumps back to correct address.
+ */
+ set_jmp_op((void *)regs->eip,
+ (void *)orig_eip + (regs->eip - copy_eip));
+ p->ainsn.boostable = 1;
+ } else {
+ p->ainsn.boostable = -1;
+ }
+ }
+
+ regs->eip = orig_eip + (regs->eip - copy_eip);
+
+no_change:
+ return;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled throughout this function.
+ */
+static int __kprobes post_kprobe_handler(struct pt_regs *regs)
+{
+ struct kprobe *cur = kprobe_running();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ if (!cur)
+ return 0;
+
+ if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ cur->post_handler(cur, regs, 0);
+ }
+
+ resume_execution(cur, regs, kcb);
+ regs->eflags |= kcb->kprobe_saved_eflags;
+
+ /* Restore the original saved kprobes variables and continue. */
+ if (kcb->kprobe_status == KPROBE_REENTER) {
+ restore_previous_kprobe(kcb);
+ goto out;
+ }
+ reset_current_kprobe();
+out:
+ preempt_enable_no_resched();
+
+ /*
+ * if somebody else is singlestepping across a probe point, eflags
+ * will have TF set, in which case, continue the remaining processing
+ * of do_debug, as if this is not a probe hit.
+ */
+ if (regs->eflags & TF_MASK)
+ return 0;
+
+ return 1;
+}
+
+static int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+ struct kprobe *cur = kprobe_running();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ switch(kcb->kprobe_status) {
+ case KPROBE_HIT_SS:
+ case KPROBE_REENTER:
+ /*
+ * We are here because the instruction being single
+ * stepped caused a page fault. We reset the current
+ * kprobe and the eip points back to the probe address
+ * and allow the page fault handler to continue as a
+ * normal page fault.
+ */
+ regs->eip = (unsigned long)cur->addr;
+ regs->eflags |= kcb->kprobe_old_eflags;
+ if (kcb->kprobe_status == KPROBE_REENTER)
+ restore_previous_kprobe(kcb);
+ else
+ reset_current_kprobe();
+ preempt_enable_no_resched();
+ break;
+ case KPROBE_HIT_ACTIVE:
+ case KPROBE_HIT_SSDONE:
+ /*
+ * We increment the nmissed count for accounting,
+ * we can also use npre/npostfault count for accounting
+ * these specific fault cases.
+ */
+ kprobes_inc_nmissed_count(cur);
+
+ /*
+ * We come here because instructions in the pre/post
+ * handler caused the page_fault, this could happen
+ * if handler tries to access user space by
+ * copy_from_user(), get_user() etc. Let the
+ * user-specified handler try to fix it first.
+ */
+ if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
+ return 1;
+
+ /*
+ * In case the user-specified fault handler returned
+ * zero, try to fix up.
+ */
+ if (fixup_exception(regs))
+ return 1;
+
+ /*
+ * fixup_exception() could not handle it,
+ * Let do_page_fault() fix it.
+ */
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Wrapper routine for handling exceptions.
+ */
+int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct die_args *args = (struct die_args *)data;
+ int ret = NOTIFY_DONE;
+
+ if (args->regs && user_mode_vm(args->regs))
+ return ret;
+
+ switch (val) {
+ case DIE_INT3:
+ if (kprobe_handler(args->regs))
+ ret = NOTIFY_STOP;
+ break;
+ case DIE_DEBUG:
+ if (post_kprobe_handler(args->regs))
+ ret = NOTIFY_STOP;
+ break;
+ case DIE_GPF:
+ case DIE_PAGE_FAULT:
+ /* kprobe_running() needs smp_processor_id() */
+ preempt_disable();
+ if (kprobe_running() &&
+ kprobe_fault_handler(args->regs, args->trapnr))
+ ret = NOTIFY_STOP;
+ preempt_enable();
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct jprobe *jp = container_of(p, struct jprobe, kp);
+ unsigned long addr;
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ kcb->jprobe_saved_regs = *regs;
+ kcb->jprobe_saved_esp = &regs->esp;
+ addr = (unsigned long)(kcb->jprobe_saved_esp);
+
+ /*
+ * TBD: As Linus pointed out, gcc assumes that the callee
+ * owns the argument space and could overwrite it, e.g.
+ * tailcall optimization. So, to be absolutely safe
+ * we also save and restore enough stack bytes to cover
+ * the argument area.
+ */
+ memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
+ MIN_STACK_SIZE(addr));
+ regs->eflags &= ~IF_MASK;
+ regs->eip = (unsigned long)(jp->entry);
+ return 1;
+}
+
+void __kprobes jprobe_return(void)
+{
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ asm volatile (" xchgl %%ebx,%%esp \n"
+ " int3 \n"
+ " .globl jprobe_return_end \n"
+ " jprobe_return_end: \n"
+ " nop \n"::"b"
+ (kcb->jprobe_saved_esp):"memory");
+}
+
+int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ u8 *addr = (u8 *) (regs->eip - 1);
+ unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp);
+ struct jprobe *jp = container_of(p, struct jprobe, kp);
+
+ if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
+ if (&regs->esp != kcb->jprobe_saved_esp) {
+ struct pt_regs *saved_regs =
+ container_of(kcb->jprobe_saved_esp,
+ struct pt_regs, esp);
+ printk("current esp %p does not match saved esp %p\n",
+ &regs->esp, kcb->jprobe_saved_esp);
+ printk("Saved registers for jprobe %p\n", jp);
+ show_registers(saved_regs);
+ printk("Current registers\n");
+ show_registers(regs);
+ BUG();
+ }
+ *regs = kcb->jprobe_saved_regs;
+ memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
+ MIN_STACK_SIZE(stack_addr));
+ preempt_enable_no_resched();
+ return 1;
+ }
+ return 0;
+}
+
+int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+{
+ return 0;
+}
+
+int __init arch_init_kprobes(void)
+{
+ return 0;
+}
diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt_32.c
new file mode 100644
index 000000000000..e0b2d17f4f10
--- /dev/null
+++ b/arch/x86/kernel/ldt_32.c
@@ -0,0 +1,250 @@
+/*
+ * linux/arch/i386/kernel/ldt.c
+ *
+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/mmu_context.h>
+
+#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
+static void flush_ldt(void *null)
+{
+ if (current->active_mm)
+ load_LDT(&current->active_mm->context);
+}
+#endif
+
+static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+{
+ void *oldldt;
+ void *newldt;
+ int oldsize;
+
+ if (mincount <= pc->size)
+ return 0;
+ oldsize = pc->size;
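+	/*
+	 * Round up to a multiple of 512 entries; at 8 bytes per
+	 * descriptor that makes the LDT grow in whole pages.
+	 */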
+ mincount = (mincount+511)&(~511);
+ if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
+ newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
+ else
+ newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
+
+ if (!newldt)
+ return -ENOMEM;
+
+ if (oldsize)
+ memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
+ oldldt = pc->ldt;
+ memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
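+	/*
+	 * Install the new table before the new size: a reader that
+	 * observes the larger size must also observe the new pointer,
+	 * which is what the write barriers below guarantee.
+	 */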
+ pc->ldt = newldt;
+ wmb();
+ pc->size = mincount;
+ wmb();
+
+ if (reload) {
+#ifdef CONFIG_SMP
+ cpumask_t mask;
+ preempt_disable();
+ load_LDT(pc);
+ mask = cpumask_of_cpu(smp_processor_id());
+ if (!cpus_equal(current->mm->cpu_vm_mask, mask))
+ smp_call_function(flush_ldt, NULL, 1, 1);
+ preempt_enable();
+#else
+ load_LDT(pc);
+#endif
+ }
+ if (oldsize) {
+ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(oldldt);
+ else
+ kfree(oldldt);
+ }
+ return 0;
+}
+
+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+{
+ int err = alloc_ldt(new, old->size, 0);
+ if (err < 0)
+ return err;
+ memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
+ return 0;
+}
+
+/*
+ * we do not have to muck with descriptors here, that is
+ * done in switch_mm() as needed.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+ struct mm_struct * old_mm;
+ int retval = 0;
+
+ init_MUTEX(&mm->context.sem);
+ mm->context.size = 0;
+ old_mm = current->mm;
+ if (old_mm && old_mm->context.size > 0) {
+ down(&old_mm->context.sem);
+ retval = copy_ldt(&mm->context, &old_mm->context);
+ up(&old_mm->context.sem);
+ }
+ return retval;
+}
+
+/*
+ * No need to lock the MM as we are the last user
+ */
+void destroy_context(struct mm_struct *mm)
+{
+ if (mm->context.size) {
+ if (mm == current->active_mm)
+ clear_LDT();
+ if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(mm->context.ldt);
+ else
+ kfree(mm->context.ldt);
+ mm->context.size = 0;
+ }
+}
+
+static int read_ldt(void __user * ptr, unsigned long bytecount)
+{
+ int err;
+ unsigned long size;
+ struct mm_struct * mm = current->mm;
+
+ if (!mm->context.size)
+ return 0;
+ if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
+ bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
+
+ down(&mm->context.sem);
+ size = mm->context.size*LDT_ENTRY_SIZE;
+ if (size > bytecount)
+ size = bytecount;
+
+ err = 0;
+ if (copy_to_user(ptr, mm->context.ldt, size))
+ err = -EFAULT;
+ up(&mm->context.sem);
+ if (err < 0)
+ goto error_return;
+ if (size != bytecount) {
+ /* zero-fill the rest */
+ if (clear_user(ptr+size, bytecount-size) != 0) {
+ err = -EFAULT;
+ goto error_return;
+ }
+ }
+ return bytecount;
+error_return:
+ return err;
+}
+
+static int read_default_ldt(void __user * ptr, unsigned long bytecount)
+{
+ int err;
+ unsigned long size;
+
+ err = 0;
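+	/* The default LDT holds no real entries; report up to five zeroed descriptors. */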
+ size = 5*sizeof(struct desc_struct);
+ if (size > bytecount)
+ size = bytecount;
+
+ err = size;
+ if (clear_user(ptr, size))
+ err = -EFAULT;
+
+ return err;
+}
+
+static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
+{
+ struct mm_struct * mm = current->mm;
+ __u32 entry_1, entry_2;
+ int error;
+ struct user_desc ldt_info;
+
+ error = -EINVAL;
+ if (bytecount != sizeof(ldt_info))
+ goto out;
+ error = -EFAULT;
+ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
+ goto out;
+
+ error = -EINVAL;
+ if (ldt_info.entry_number >= LDT_ENTRIES)
+ goto out;
+ if (ldt_info.contents == 3) {
+ if (oldmode)
+ goto out;
+ if (ldt_info.seg_not_present == 0)
+ goto out;
+ }
+
+ down(&mm->context.sem);
+ if (ldt_info.entry_number >= mm->context.size) {
+ error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
+ if (error < 0)
+ goto out_unlock;
+ }
+
+ /* Allow LDTs to be cleared by the user. */
+ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+ if (oldmode || LDT_empty(&ldt_info)) {
+ entry_1 = 0;
+ entry_2 = 0;
+ goto install;
+ }
+ }
+
+ entry_1 = LDT_entry_a(&ldt_info);
+ entry_2 = LDT_entry_b(&ldt_info);
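+	/* The old interface cannot set the AVL bit (bit 20 of the high word), so force it clear. */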
+ if (oldmode)
+ entry_2 &= ~(1 << 20);
+
+ /* Install the new entry ... */
+install:
+ write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2);
+ error = 0;
+
+out_unlock:
+ up(&mm->context.sem);
+out:
+ return error;
+}
+
+asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
+{
+ int ret = -ENOSYS;
+
+ switch (func) {
+ case 0:
+ ret = read_ldt(ptr, bytecount);
+ break;
+ case 1:
+ ret = write_ldt(ptr, bytecount, 1);
+ break;
+ case 2:
+ ret = read_default_ldt(ptr, bytecount);
+ break;
+ case 0x11:
+ ret = write_ldt(ptr, bytecount, 0);
+ break;
+ }
+ return ret;
+}
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
new file mode 100644
index 000000000000..91966bafb3dc
--- /dev/null
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -0,0 +1,171 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/desc.h>
+#include <asm/system.h>
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+static u32 kexec_pgd[1024] PAGE_ALIGNED;
+#ifdef CONFIG_X86_PAE
+static u32 kexec_pmd0[1024] PAGE_ALIGNED;
+static u32 kexec_pmd1[1024] PAGE_ALIGNED;
+#endif
+static u32 kexec_pte0[1024] PAGE_ALIGNED;
+static u32 kexec_pte1[1024] PAGE_ALIGNED;
+
+static void set_idt(void *newidt, __u16 limit)
+{
+ struct Xgt_desc_struct curidt;
+
+	/* ia32 supports unaligned loads & stores */
+ curidt.size = limit;
+ curidt.address = (unsigned long)newidt;
+
+ load_idt(&curidt);
+};
+
+
+static void set_gdt(void *newgdt, __u16 limit)
+{
+ struct Xgt_desc_struct curgdt;
+
+ /* ia32 supports unaligned loads & stores */
+ curgdt.size = limit;
+ curgdt.address = (unsigned long)newgdt;
+
+ load_gdt(&curgdt);
+};
+
+static void load_segments(void)
+{
+#define __STR(X) #X
+#define STR(X) __STR(X)
+
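+	/*
+	 * The far jump reloads %cs with __KERNEL_CS; the moves that
+	 * follow reload the remaining segment registers, so all of
+	 * them cache known-good descriptors before the GDT is
+	 * replaced with an invalid one below.
+	 */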
+ __asm__ __volatile__ (
+ "\tljmp $"STR(__KERNEL_CS)",$1f\n"
+ "\t1:\n"
+ "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+ "\tmovl %%eax,%%ds\n"
+ "\tmovl %%eax,%%es\n"
+ "\tmovl %%eax,%%fs\n"
+ "\tmovl %%eax,%%gs\n"
+ "\tmovl %%eax,%%ss\n"
+ ::: "eax", "memory");
+#undef STR
+#undef __STR
+}
+
+/*
+ * An architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed. The pages for KEXEC_CONTROL_CODE_SIZE
+ * have been allocated, but the segments have not yet
+ * been copied into the kernel.
+ *
+ * Do whatever setup is needed on the image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * Currently nothing.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+ return 0;
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+ unsigned long page_list[PAGES_NR];
+ void *control_page;
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+
+ control_page = page_address(image->control_code_page);
+ memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
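+	/*
+	 * Pass relocate_kernel() both the physical and the virtual
+	 * address of each page-table page: it starts out on the
+	 * current page tables and then switches to an identity-mapped
+	 * set built from these pages.
+	 */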
+ page_list[PA_CONTROL_PAGE] = __pa(control_page);
+ page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+ page_list[PA_PGD] = __pa(kexec_pgd);
+ page_list[VA_PGD] = (unsigned long)kexec_pgd;
+#ifdef CONFIG_X86_PAE
+ page_list[PA_PMD_0] = __pa(kexec_pmd0);
+ page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
+ page_list[PA_PMD_1] = __pa(kexec_pmd1);
+ page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
+#endif
+ page_list[PA_PTE_0] = __pa(kexec_pte0);
+ page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
+ page_list[PA_PTE_1] = __pa(kexec_pte1);
+ page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+
+ /* The segment registers are funny things, they have both a
+ * visible and an invisible part. Whenever the visible part is
+ * set to a specific selector, the invisible part is loaded
+	 * from a table in memory. At no other time is the
+ * descriptor table in memory accessed.
+ *
+ * I take advantage of this here by force loading the
+ * segments, before I zap the gdt with an invalid value.
+ */
+ load_segments();
+ /* The gdt & idt are now invalid.
+ * If you want to load them you must set up your own idt & gdt.
+ */
+ set_gdt(phys_to_virt(0),0);
+ set_idt(phys_to_virt(0),0);
+
+ /* now call it */
+ relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+ image->start, cpu_has_pae);
+}
+
+/* crashkernel=size@addr specifies the location to reserve for
+ * a crash kernel. By reserving this memory we guarantee
+ * that linux never sets it up as a DMA target.
+ * Useful for holding code to do something appropriate
+ * after a kernel panic.
+ */
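+/*
+ * Example (illustrative values): booting with "crashkernel=64M@16M"
+ * reserves 64 MB of RAM starting at physical address 16 MB.
+ */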
+static int __init parse_crashkernel(char *arg)
+{
+ unsigned long size, base;
+ size = memparse(arg, &arg);
+ if (*arg == '@') {
+ base = memparse(arg+1, &arg);
+ /* FIXME: Do I want a sanity check
+ * to validate the memory range?
+ */
+ crashk_res.start = base;
+ crashk_res.end = base + size - 1;
+ }
+ return 0;
+}
+early_param("crashkernel", parse_crashkernel);
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
new file mode 100644
index 000000000000..b83672b89527
--- /dev/null
+++ b/arch/x86/kernel/mca_32.c
@@ -0,0 +1,470 @@
+/*
+ * linux/arch/i386/kernel/mca.c
+ * Written by Martin Kolinek, February 1996
+ *
+ * Changes:
+ *
+ * Chris Beauregard July 28th, 1996
+ * - Fixed up integrated SCSI detection
+ *
+ * Chris Beauregard August 3rd, 1996
+ * - Made mca_info local
+ * - Made integrated registers accessible through standard function calls
+ * - Added name field
+ * - More sanity checking
+ *
+ * Chris Beauregard August 9th, 1996
+ * - Rewrote /proc/mca
+ *
+ * Chris Beauregard January 7th, 1997
+ * - Added basic NMI-processing
+ * - Added more information to mca_info structure
+ *
+ * David Weinehall October 12th, 1998
+ * - Made a lot of cleaning up in the source
+ * - Added use of save_flags / restore_flags
+ * - Added the 'driver_loaded' flag in MCA_adapter
+ * - Added an alternative implementation of ZP Gu's mca_find_unused_adapter
+ *
+ * David Weinehall March 24th, 1999
+ * - Fixed the output of 'Driver Installed' in /proc/mca/pos
+ * - Made the Integrated Video & SCSI show up even if they have id 0000
+ *
+ * Alexander Viro November 9th, 1999
+ * - Switched to regular procfs methods
+ *
+ * Alfred Arnold & David Weinehall August 23rd, 2000
+ * - Added support for Planar POS-registers
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/mca.h>
+#include <linux/kprobes.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <linux/proc_fs.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/ioport.h>
+#include <asm/uaccess.h>
+#include <linux/init.h>
+#include <asm/arch_hooks.h>
+
+static unsigned char which_scsi = 0;
+
+int MCA_bus = 0;
+EXPORT_SYMBOL(MCA_bus);
+
+/*
+ * Motherboard register spinlock. Untested on SMP at the moment, but
+ * are there any MCA SMP boxes?
+ *
+ * Yes - Alan
+ */
+static DEFINE_SPINLOCK(mca_lock);
+
+/* Build the status info for the adapter */
+
+static void mca_configure_adapter_status(struct mca_device *mca_dev)
+{
+ mca_dev->status = MCA_ADAPTER_NONE;
+
+ mca_dev->pos_id = mca_dev->pos[0]
+ + (mca_dev->pos[1] << 8);
+
+ if(!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
+
+ /* id = 0x0000 usually indicates hardware failure,
+	 * however, ZP Gu <zpg@castle.net> reports that his 9556
+	 * has 0x0000 as id and everything still works. There
+	 * also seems to be an adapter with id = 0x0000: the
+	 * NCR Parallel Bus Memory Card. Until this is confirmed,
+ * however, this code will stay.
+ */
+
+ mca_dev->status = MCA_ADAPTER_ERROR;
+
+ return;
+ } else if(mca_dev->pos_id != 0xffff) {
+
+ /* 0xffff usually indicates that there's no adapter,
+ * however, some integrated adapters may have 0xffff as
+ * their id and still be valid. Examples are on-board
+ * VGA of the 55sx, the integrated SCSI of the 56 & 57,
+ * and possibly also the 95 ULTIMEDIA.
+ */
+
+ mca_dev->status = MCA_ADAPTER_NORMAL;
+ }
+
+ if((mca_dev->pos_id == 0xffff ||
+ mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
+ int j;
+
+ for(j = 2; j < 8; j++) {
+ if(mca_dev->pos[j] != 0xff) {
+ mca_dev->status = MCA_ADAPTER_NORMAL;
+ break;
+ }
+ }
+ }
+
+ if(!(mca_dev->pos[2] & MCA_ENABLED)) {
+
+ /* enabled bit is in POS 2 */
+
+ mca_dev->status = MCA_ADAPTER_DISABLED;
+ }
+} /* mca_configure_adapter_status */
+
+/*--------------------------------------------------------------------*/
+
+static struct resource mca_standard_resources[] = {
+ { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
+ { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
+ { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
+ { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
+ { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
+ { .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
+ { .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
+};
+
+#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources)
+
+/**
+ * mca_read_and_store_pos - read the POS registers into a memory buffer
+ * @pos: a char pointer to 8 bytes, contains the POS register value on
+ * successful return
+ *
+ * Returns 1 if a card actually exists (i.e. the pos isn't
+ * all 0xff) or 0 otherwise
+ */
+static int mca_read_and_store_pos(unsigned char *pos)
+{
+ int j;
+ int found = 0;
+
+ for(j=0; j<8; j++) {
+ if((pos[j] = inb_p(MCA_POS_REG(j))) != 0xff) {
+ /* 0xff all across means no device. 0x00 means
+ * something's broken, but a device is
+ * probably there. However, if you get 0x00
+ * from a motherboard register it won't matter
+ * what we find. For the record, on the
+ * 57SLC, the integrated SCSI adapter has
+ * 0xffff for the adapter ID, but nonzero for
+ * other registers. */
+
+ found = 1;
+ }
+ }
+ return found;
+}
+
+static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
+{
+ unsigned char byte;
+ unsigned long flags;
+
+ if(reg < 0 || reg >= 8)
+ return 0;
+
+ spin_lock_irqsave(&mca_lock, flags);
+ if(mca_dev->pos_register) {
+ /* Disable adapter setup, enable motherboard setup */
+
+ outb_p(0, MCA_ADAPTER_SETUP_REG);
+ outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
+
+ byte = inb_p(MCA_POS_REG(reg));
+ outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
+ } else {
+
+ /* Make sure motherboard setup is off */
+
+ outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
+
+ /* Read the appropriate register */
+
+ outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
+ byte = inb_p(MCA_POS_REG(reg));
+ outb_p(0, MCA_ADAPTER_SETUP_REG);
+ }
+ spin_unlock_irqrestore(&mca_lock, flags);
+
+ mca_dev->pos[reg] = byte;
+
+ return byte;
+}
+
+static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
+ unsigned char byte)
+{
+ unsigned long flags;
+
+ if(reg < 0 || reg >= 8)
+ return;
+
+ spin_lock_irqsave(&mca_lock, flags);
+
+ /* Make sure motherboard setup is off */
+
+ outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
+
+ /* Read in the appropriate register */
+
+ outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
+ outb_p(byte, MCA_POS_REG(reg));
+ outb_p(0, MCA_ADAPTER_SETUP_REG);
+
+ spin_unlock_irqrestore(&mca_lock, flags);
+
+ /* Update the global register list, while we have the byte */
+
+ mca_dev->pos[reg] = byte;
+
+}
+
+/* for the primary MCA bus, we have identity transforms */
+static int mca_dummy_transform_irq(struct mca_device * mca_dev, int irq)
+{
+ return irq;
+}
+
+static int mca_dummy_transform_ioport(struct mca_device * mca_dev, int port)
+{
+ return port;
+}
+
+static void *mca_dummy_transform_memory(struct mca_device * mca_dev, void *mem)
+{
+ return mem;
+}
+
+
+static int __init mca_init(void)
+{
+ unsigned int i, j;
+ struct mca_device *mca_dev;
+ unsigned char pos[8];
+ short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
+ struct mca_bus *bus;
+
+ /* WARNING: Be careful when making changes here. Putting an adapter
+ * and the motherboard simultaneously into setup mode may result in
+	 * damage to chips (according to The Indispensable PC Hardware Book
+ * by Hans-Peter Messmer). Also, we disable system interrupts (so
+ * that we are not disturbed in the middle of this).
+ */
+
+ /* Make sure the MCA bus is present */
+
+ if (mca_system_init()) {
+ printk(KERN_ERR "MCA bus system initialisation failed\n");
+ return -ENODEV;
+ }
+
+ if (!MCA_bus)
+ return -ENODEV;
+
+ printk(KERN_INFO "Micro Channel bus detected.\n");
+
+ /* All MCA systems have at least a primary bus */
+ bus = mca_attach_bus(MCA_PRIMARY_BUS);
+ if (!bus)
+ goto out_nomem;
+ bus->default_dma_mask = 0xffffffffLL;
+ bus->f.mca_write_pos = mca_pc_write_pos;
+ bus->f.mca_read_pos = mca_pc_read_pos;
+ bus->f.mca_transform_irq = mca_dummy_transform_irq;
+ bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
+ bus->f.mca_transform_memory = mca_dummy_transform_memory;
+
+ /* get the motherboard device */
+ mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
+ if(unlikely(!mca_dev))
+ goto out_nomem;
+
+ /*
+ * We do not expect many MCA interrupts during initialization,
+ * but let us be safe:
+ */
+ spin_lock_irq(&mca_lock);
+
+ /* Make sure adapter setup is off */
+
+ outb_p(0, MCA_ADAPTER_SETUP_REG);
+
+ /* Read motherboard POS registers */
+
+ mca_dev->pos_register = 0x7f;
+ outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
+ mca_dev->name[0] = 0;
+ mca_read_and_store_pos(mca_dev->pos);
+ mca_configure_adapter_status(mca_dev);
+ /* fake POS and slot for a motherboard */
+ mca_dev->pos_id = MCA_MOTHERBOARD_POS;
+ mca_dev->slot = MCA_MOTHERBOARD;
+ mca_register_device(MCA_PRIMARY_BUS, mca_dev);
+
+ mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
+ if(unlikely(!mca_dev))
+ goto out_unlock_nomem;
+
+ /* Put motherboard into video setup mode, read integrated video
+ * POS registers, and turn motherboard setup off.
+ */
+
+ mca_dev->pos_register = 0xdf;
+ outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
+ mca_dev->name[0] = 0;
+ mca_read_and_store_pos(mca_dev->pos);
+ mca_configure_adapter_status(mca_dev);
+ /* fake POS and slot for the integrated video */
+ mca_dev->pos_id = MCA_INTEGVIDEO_POS;
+ mca_dev->slot = MCA_INTEGVIDEO;
+ mca_register_device(MCA_PRIMARY_BUS, mca_dev);
+
+ /* Put motherboard into scsi setup mode, read integrated scsi
+ * POS registers, and turn motherboard setup off.
+ *
+ * It seems there are two possible SCSI registers. Martin says that
+	 * for the 56 and 57, 0xf7 is the one, but it fails on the 76.
+ * Alfredo (apena@vnet.ibm.com) says
+ * 0xfd works on his machine. We'll try both of them. I figure it's
+ * a good bet that only one could be valid at a time. This could
+ * screw up though if one is used for something else on the other
+ * machine.
+ */
+
+ for(i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
+ outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
+ if(mca_read_and_store_pos(pos))
+ break;
+ }
+ if(which_scsi) {
+ /* found a scsi card */
+ mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
+ if(unlikely(!mca_dev))
+ goto out_unlock_nomem;
+
+ for(j = 0; j < 8; j++)
+ mca_dev->pos[j] = pos[j];
+
+ mca_configure_adapter_status(mca_dev);
+ /* fake POS and slot for integrated SCSI controller */
+ mca_dev->pos_id = MCA_INTEGSCSI_POS;
+ mca_dev->slot = MCA_INTEGSCSI;
+ mca_dev->pos_register = which_scsi;
+ mca_register_device(MCA_PRIMARY_BUS, mca_dev);
+ }
+
+ /* Turn off motherboard setup */
+
+ outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
+
+ /* Now loop over MCA slots: put each adapter into setup mode, and
+ * read its POS registers. Then put adapter setup off.
+ */
+
+ for(i=0; i<MCA_MAX_SLOT_NR; i++) {
+ outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
+ if(!mca_read_and_store_pos(pos))
+ continue;
+
+ mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
+ if(unlikely(!mca_dev))
+ goto out_unlock_nomem;
+
+ for(j=0; j<8; j++)
+ mca_dev->pos[j]=pos[j];
+
+ mca_dev->driver_loaded = 0;
+ mca_dev->slot = i;
+ mca_dev->pos_register = 0;
+ mca_configure_adapter_status(mca_dev);
+ mca_register_device(MCA_PRIMARY_BUS, mca_dev);
+ }
+ outb_p(0, MCA_ADAPTER_SETUP_REG);
+
+ /* Enable interrupts and return memory start */
+ spin_unlock_irq(&mca_lock);
+
+ for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
+ request_resource(&ioport_resource, mca_standard_resources + i);
+
+ mca_do_proc_init();
+
+ return 0;
+
+ out_unlock_nomem:
+ spin_unlock_irq(&mca_lock);
+ out_nomem:
+ printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
+ return -ENOMEM;
+}
+
+subsys_initcall(mca_init);
+
+/*--------------------------------------------------------------------*/
+
+static __kprobes void
+mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
+{
+ int slot = mca_dev->slot;
+
+ if(slot == MCA_INTEGSCSI) {
+ printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
+ mca_dev->name);
+ } else if(slot == MCA_INTEGVIDEO) {
+ printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
+ mca_dev->name);
+ } else if(slot == MCA_MOTHERBOARD) {
+ printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
+ mca_dev->name);
+ }
+
+ /* More info available in POS 6 and 7? */
+
+ if(check_flag) {
+ unsigned char pos6, pos7;
+
+ pos6 = mca_device_read_pos(mca_dev, 6);
+ pos7 = mca_device_read_pos(mca_dev, 7);
+
+ printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
+ }
+
+} /* mca_handle_nmi_device */
+
+/*--------------------------------------------------------------------*/
+
+static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
+{
+ struct mca_device *mca_dev = to_mca_device(dev);
+ unsigned char pos5;
+
+ pos5 = mca_device_read_pos(mca_dev, 5);
+
+ if(!(pos5 & 0x80)) {
+ /* Bit 7 of POS 5 is reset when this adapter has a hardware
+	 * error. Bit 6 is reset if there's error information
+ * available in POS 6 and 7.
+ */
+ mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
+ return 1;
+ }
+ return 0;
+}
+
+void __kprobes mca_handle_nmi(void)
+{
+ /* First try - scan the various adapters and see if a specific
+ * adapter was responsible for the error.
+ */
+ bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
+
+ mca_nmi_hook();
+} /* mca_handle_nmi */
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
new file mode 100644
index 000000000000..09cf78110358
--- /dev/null
+++ b/arch/x86/kernel/microcode.c
@@ -0,0 +1,850 @@
+/*
+ * Intel CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ *
+ * This driver allows updating the microcode on Intel processors
+ * belonging to the IA-32 family: Pentium Pro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
+ *
+ * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
+ * Order Number 245472 or free download from:
+ *
+ * http://developer.intel.com/design/pentium4/manuals/245472.htm
+ *
+ * For more information, go to http://www.urbanmyth.org/microcode
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ * Initial release.
+ * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ * Added read() support + cleanups.
+ * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ * Added 'device trimming' support. open(O_WRONLY) zeroes
+ * and frees the saved copy of applied microcode.
+ * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ * Made to use devfs (/dev/cpu/microcode) + cleanups.
+ * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
+ * Added misc device support (now uses both devfs and misc).
+ * Added MICROCODE_IOCFREE ioctl to clear memory.
+ * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
+ * Messages for error cases (non Intel & no suitable microcode).
+ * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
+ * Removed ->release(). Removed exclusive open and status bitmap.
+ * Added microcode_rwsem to serialize read()/write()/ioctl().
+ * Removed global kernel lock usage.
+ * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
+ * Write 0 to 0x8B msr and then cpuid before reading revision,
+ * so that it works even if there were no update done by the
+ * BIOS. Otherwise, reading from 0x8B gives junk (which happened
+ * to be 0 on my machine which is why it worked even when I
+ * disabled update by the BIOS)
+ * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
+ * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
+ * Tigran Aivazian <tigran@veritas.com>
+ * Intel Pentium 4 processor support and bugfixes.
+ * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
+ * Bugfix for HT (Hyper-Threading) enabled processors
+ * whereby processor resources are shared by all logical processors
+ * in a single CPU package.
+ * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
+ * Tigran Aivazian <tigran@veritas.com>,
+ * Serialize updates as required on HT processors due to speculative
+ * nature of implementation.
+ * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
+ * Fix the panic when writing zero-length microcode chunk.
+ * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
+ * Jun Nakajima <jun.nakajima@intel.com>
+ * Support for the microcode updates in the new format.
+ * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
+ * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
+ * because we no longer hold a copy of applied microcode
+ * in kernel memory.
+ * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
+ * Fix sigmatch() macro to handle old CPUs with pf == 0.
+ * Thanks to Stuart Swales for pointing out this bug.
+ */
+
+//#define DEBUG /* pr_debug */
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+
+MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_LICENSE("GPL");
+
+#define MICROCODE_VERSION "1.14a"
+
+#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
+#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
+#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
+#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
+#define DWSIZE (sizeof (u32))
+#define get_totalsize(mc) \
+ (((microcode_t *)mc)->hdr.totalsize ? \
+ ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
+#define get_datasize(mc) \
+ (((microcode_t *)mc)->hdr.datasize ? \
+ ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
+
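+/*
+ * Signatures match when the CPU signatures are equal and the
+ * processor-flag masks share a bit; microcode carrying pf == 0
+ * only matches CPUs that also report pf == 0.
+ */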
+#define sigmatch(s1, s2, p1, p2) \
+ (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
+
+#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
+
+/* serialize access to the physical write to MSR 0x79 */
+static DEFINE_SPINLOCK(microcode_update_lock);
+
+/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+static DEFINE_MUTEX(microcode_mutex);
+
+static struct ucode_cpu_info {
+ int valid;
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int rev;
+ microcode_t *mc;
+} ucode_cpu_info[NR_CPUS];
+
+static void collect_cpu_info(int cpu_num)
+{
+ struct cpuinfo_x86 *c = cpu_data + cpu_num;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
+ unsigned int val[2];
+
+ /* We should bind the task to the CPU */
+ BUG_ON(raw_smp_processor_id() != cpu_num);
+ uci->pf = uci->rev = 0;
+ uci->mc = NULL;
+ uci->valid = 1;
+
+ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+ cpu_has(c, X86_FEATURE_IA64)) {
+ printk(KERN_ERR "microcode: CPU%d not a capable Intel "
+ "processor\n", cpu_num);
+ uci->valid = 0;
+ return;
+ }
+
+ uci->sig = cpuid_eax(0x00000001);
+
+ if ((c->x86_model >= 5) || (c->x86 > 6)) {
+ /* get processor flags from MSR 0x17 */
+ rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ uci->pf = 1 << ((val[1] >> 18) & 7);
+ }
+
+ wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+ /* see notes above for revision 1.07. Apparent chip bug */
+ sync_core();
+ /* get the current revision from MSR 0x8B */
+ rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
+ pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
+ uci->sig, uci->pf, uci->rev);
+}
+
+static inline int microcode_update_match(int cpu_num,
+ microcode_header_t *mc_header, int sig, int pf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
+
+ if (!sigmatch(sig, uci->sig, pf, uci->pf)
+ || mc_header->rev <= uci->rev)
+ return 0;
+ return 1;
+}
+
+static int microcode_sanity_check(void *mc)
+{
+ microcode_header_t *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ struct extended_signature *ext_sig;
+ unsigned long total_size, data_size, ext_table_size;
+ int sum, orig_sum, ext_sigcount = 0, i;
+
+ total_size = get_totalsize(mc_header);
+ data_size = get_datasize(mc_header);
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ printk(KERN_ERR "microcode: error! "
+ "Bad data size in microcode data file\n");
+ return -EINVAL;
+ }
+
+ if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+ printk(KERN_ERR "microcode: error! "
+ "Unknown microcode update format\n");
+ return -EINVAL;
+ }
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ if ((ext_table_size < EXT_HEADER_SIZE)
+ || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ printk(KERN_ERR "microcode: error! "
+ "Small exttable size in microcode data file\n");
+ return -EINVAL;
+ }
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ printk(KERN_ERR "microcode: error! "
+ "Bad exttable size in microcode data file\n");
+ return -EFAULT;
+ }
+ ext_sigcount = ext_header->count;
+ }
+
+ /* check extended table checksum */
+ if (ext_table_size) {
+ int ext_table_sum = 0;
+ int *ext_tablep = (int *)ext_header;
+
+ i = ext_table_size / DWSIZE;
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+ if (ext_table_sum) {
+ printk(KERN_WARNING "microcode: aborting, "
+ "bad extended signature table checksum\n");
+ return -EINVAL;
+ }
+ }
+
+ /* calculate the checksum */
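+	/* A valid update sums to zero over all of its 32-bit words, header included. */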
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+ while (i--)
+ orig_sum += ((int *)mc)[i];
+ if (orig_sum) {
+ printk(KERN_ERR "microcode: aborting, bad checksum\n");
+ return -EINVAL;
+ }
+ if (!ext_table_size)
+ return 0;
+ /* check extended signature checksum */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (struct extended_signature *)((void *)ext_header
+ + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i);
+ sum = orig_sum
+ - (mc_header->sig + mc_header->pf + mc_header->cksum)
+ + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ printk(KERN_ERR "microcode: aborting, bad checksum\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ * return < 0 - error
+ */
+static int get_matching_microcode(void *mc, int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ microcode_header_t *mc_header = mc;
+ struct extended_sigtable *ext_header;
+ unsigned long total_size = get_totalsize(mc_header);
+ int ext_sigcount, i;
+ struct extended_signature *ext_sig;
+ void *new_mc;
+
+ if (microcode_update_match(cpu, mc_header,
+ mc_header->sig, mc_header->pf))
+ goto find;
+
+ if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+ return 0;
+
+ ext_header = (struct extended_sigtable *)(mc +
+ get_datasize(mc_header) + MC_HEADER_SIZE);
+ ext_sigcount = ext_header->count;
+ ext_sig = (struct extended_signature *)((void *)ext_header
+ + EXT_HEADER_SIZE);
+ for (i = 0; i < ext_sigcount; i++) {
+ if (microcode_update_match(cpu, mc_header,
+ ext_sig->sig, ext_sig->pf))
+ goto find;
+ ext_sig++;
+ }
+ return 0;
+find:
+ pr_debug("microcode: CPU %d found a matching microcode update with"
+ " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
+ new_mc = vmalloc(total_size);
+ if (!new_mc) {
+ printk(KERN_ERR "microcode: error! Can not allocate memory\n");
+ return -ENOMEM;
+ }
+
+ /* free previous update file */
+ vfree(uci->mc);
+
+ memcpy(new_mc, mc, total_size);
+ uci->mc = new_mc;
+ return 1;
+}
+
+static void apply_microcode(int cpu)
+{
+ unsigned long flags;
+ unsigned int val[2];
+ int cpu_num = raw_smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
+
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu_num != cpu);
+
+ if (uci->mc == NULL)
+ return;
+
+ /* serialize access to the physical write to MSR 0x79 */
+ spin_lock_irqsave(&microcode_update_lock, flags);
+
+ /* write microcode via MSR 0x79 */
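+	/*
+	 * The double 16-bit shift yields the upper half of the address
+	 * on 64-bit builds and 0 on 32-bit builds, avoiding the
+	 * undefined behaviour of a single shift by 32.
+	 */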
+ wrmsr(MSR_IA32_UCODE_WRITE,
+ (unsigned long) uci->mc->bits,
+ (unsigned long) uci->mc->bits >> 16 >> 16);
+ wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+ /* see notes above for revision 1.07. Apparent chip bug */
+ sync_core();
+
+ /* get the current revision from MSR 0x8B */
+ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+
+ spin_unlock_irqrestore(&microcode_update_lock, flags);
+ if (val[1] != uci->mc->hdr.rev) {
+ printk(KERN_ERR "microcode: CPU%d updated from revision "
+ "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
+ return;
+ }
+ pr_debug("microcode: CPU%d updated from revision "
+ "0x%x to 0x%x, date = %08x \n",
+ cpu_num, uci->rev, val[1], uci->mc->hdr.date);
+ uci->rev = val[1];
+}
+
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+static void __user *user_buffer; /* user area microcode data buffer */
+static unsigned int user_buffer_size;	/* its size */
+
+static long get_next_ucode(void **mc, long offset)
+{
+ microcode_header_t mc_header;
+ unsigned long total_size;
+
+ /* No more data */
+ if (offset >= user_buffer_size)
+ return 0;
+ if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
+ printk(KERN_ERR "microcode: error! Can not read user data\n");
+ return -EFAULT;
+ }
+ total_size = get_totalsize(&mc_header);
+ if (offset + total_size > user_buffer_size) {
+ printk(KERN_ERR "microcode: error! Bad total size in microcode "
+ "data file\n");
+ return -EINVAL;
+ }
+ *mc = vmalloc(total_size);
+ if (!*mc)
+ return -ENOMEM;
+ if (copy_from_user(*mc, user_buffer + offset, total_size)) {
+ printk(KERN_ERR "microcode: error! Can not read user data\n");
+ vfree(*mc);
+ return -EFAULT;
+ }
+ return offset + total_size;
+}
+
+static int do_microcode_update (void)
+{
+ long cursor = 0;
+ int error = 0;
+ void *new_mc = NULL;
+ int cpu;
+ cpumask_t old;
+
+ old = current->cpus_allowed;
+
+ while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
+ error = microcode_sanity_check(new_mc);
+ if (error)
+ goto out;
+ /*
+		 * It's possible the data file has multiple matching ucode
+		 * entries; keep searching until we have the latest version.
+ */
+ for_each_online_cpu(cpu) {
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (!uci->valid)
+ continue;
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
+			error = get_matching_microcode(new_mc, cpu);
+ if (error < 0)
+ goto out;
+ if (error == 1)
+ apply_microcode(cpu);
+ }
+ vfree(new_mc);
+ }
+out:
+ if (cursor > 0)
+ vfree(new_mc);
+ if (cursor < 0)
+ error = cursor;
+ set_cpus_allowed(current, old);
+ return error;
+}
+
+static int microcode_open (struct inode *unused1, struct file *unused2)
+{
+ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+}
+
+static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
+{
+ ssize_t ret;
+
+ if ((len >> PAGE_SHIFT) > num_physpages) {
+ printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
+ return -EINVAL;
+ }
+
+ lock_cpu_hotplug();
+ mutex_lock(&microcode_mutex);
+
+ user_buffer = (void __user *) buf;
+ user_buffer_size = (int) len;
+
+ ret = do_microcode_update();
+ if (!ret)
+ ret = (ssize_t)len;
+
+ mutex_unlock(&microcode_mutex);
+ unlock_cpu_hotplug();
+
+ return ret;
+}
+
+static const struct file_operations microcode_fops = {
+ .owner = THIS_MODULE,
+ .write = microcode_write,
+ .open = microcode_open,
+};
+
+static struct miscdevice microcode_dev = {
+ .minor = MICROCODE_MINOR,
+ .name = "microcode",
+ .fops = &microcode_fops,
+};
+
+static int __init microcode_dev_init (void)
+{
+ int error;
+
+ error = misc_register(&microcode_dev);
+ if (error) {
+ printk(KERN_ERR
+ "microcode: can't misc_register on minor=%d\n",
+ MICROCODE_MINOR);
+ return error;
+ }
+
+ return 0;
+}
+
+static void microcode_dev_exit (void)
+{
+ misc_deregister(&microcode_dev);
+}
+
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+#else
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while(0)
+#endif
+
+static long get_next_ucode_from_buffer(void **mc, void *buf,
+ unsigned long size, long offset)
+{
+ microcode_header_t *mc_header;
+ unsigned long total_size;
+
+ /* No more data */
+ if (offset >= size)
+ return 0;
+ mc_header = (microcode_header_t *)(buf + offset);
+ total_size = get_totalsize(mc_header);
+
+ if (offset + total_size > size) {
+ printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
+ return -EINVAL;
+ }
+
+ *mc = vmalloc(total_size);
+ if (!*mc) {
+ printk(KERN_ERR "microcode: error! Can not allocate memory\n");
+ return -ENOMEM;
+ }
+ memcpy(*mc, buf + offset, total_size);
+ return offset + total_size;
+}
+
+/* fake device for request_firmware */
+static struct platform_device *microcode_pdev;
+
+static int cpu_request_microcode(int cpu)
+{
+ char name[30];
+ struct cpuinfo_x86 *c = cpu_data + cpu;
+ const struct firmware *firmware;
+ void *buf;
+ unsigned long size;
+ long offset = 0;
+ int error;
+ void *mc;
+
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu != raw_smp_processor_id());
+	sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ c->x86, c->x86_model, c->x86_mask);
+ error = request_firmware(&firmware, name, &microcode_pdev->dev);
+ if (error) {
+ pr_debug("ucode data file %s load failed\n", name);
+ return error;
+ }
+ buf = (void *)firmware->data;
+ size = firmware->size;
+ while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
+ > 0) {
+ error = microcode_sanity_check(mc);
+ if (error)
+ break;
+		error = get_matching_microcode(mc, cpu);
+ if (error < 0)
+ break;
+ /*
+		 * It's possible the data file has multiple matching ucode
+		 * entries; keep searching until we have the latest version.
+ */
+ if (error == 1) {
+ apply_microcode(cpu);
+ error = 0;
+ }
+ vfree(mc);
+ }
+ if (offset > 0)
+ vfree(mc);
+ if (offset < 0)
+ error = offset;
+ release_firmware(firmware);
+
+ return error;
+}
+
+static int apply_microcode_check_cpu(int cpu)
+{
+ struct cpuinfo_x86 *c = cpu_data + cpu;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ cpumask_t old;
+ unsigned int val[2];
+ int err = 0;
+
+ /* Check if the microcode is available */
+ if (!uci->mc)
+ return 0;
+
+ old = current->cpus_allowed;
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
+
+ /* Check if the microcode we have in memory matches the CPU */
+ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+ cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
+ err = -EINVAL;
+
+ if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
+ /* get processor flags from MSR 0x17 */
+ rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ if (uci->pf != (1 << ((val[1] >> 18) & 7)))
+ err = -EINVAL;
+ }
+
+ if (!err) {
+ wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+ /* see notes above for revision 1.07. Apparent chip bug */
+ sync_core();
+ /* get the current revision from MSR 0x8B */
+ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+ if (uci->rev != val[1])
+ err = -EINVAL;
+ }
+
+ if (!err)
+ apply_microcode(cpu);
+ else
+ printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
+ " sig=0x%x, pf=0x%x, rev=0x%x\n",
+ cpu, uci->sig, uci->pf, uci->rev);
+
+ set_cpus_allowed(current, old);
+ return err;
+}
+
+static void microcode_init_cpu(int cpu, int resume)
+{
+ cpumask_t old;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ old = current->cpus_allowed;
+
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ mutex_lock(&microcode_mutex);
+ collect_cpu_info(cpu);
+ if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
+ cpu_request_microcode(cpu);
+ mutex_unlock(&microcode_mutex);
+ set_cpus_allowed(current, old);
+}
+
+static void microcode_fini_cpu(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ mutex_lock(&microcode_mutex);
+ uci->valid = 0;
+ vfree(uci->mc);
+ uci->mc = NULL;
+ mutex_unlock(&microcode_mutex);
+}
+
+static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+ char *end;
+ unsigned long val = simple_strtoul(buf, &end, 0);
+ int err = 0;
+ int cpu = dev->id;
+
+ if (end == buf)
+ return -EINVAL;
+ if (val == 1) {
+ cpumask_t old;
+
+ old = current->cpus_allowed;
+
+ lock_cpu_hotplug();
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
+
+ mutex_lock(&microcode_mutex);
+ if (uci->valid)
+ err = cpu_request_microcode(cpu);
+ mutex_unlock(&microcode_mutex);
+ unlock_cpu_hotplug();
+ set_cpus_allowed(current, old);
+ }
+ if (err)
+ return err;
+ return sz;
+}
+
+static ssize_t version_show(struct sys_device *dev, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->rev);
+}
+
+static ssize_t pf_show(struct sys_device *dev, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->pf);
+}
+
+static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
+static SYSDEV_ATTR(version, 0400, version_show, NULL);
+static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
+
+static struct attribute *mc_default_attrs[] = {
+ &attr_reload.attr,
+ &attr_version.attr,
+ &attr_processor_flags.attr,
+ NULL
+};
+
+static struct attribute_group mc_attr_group = {
+ .attrs = mc_default_attrs,
+ .name = "microcode",
+};
+
+static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
+{
+ int err, cpu = sys_dev->id;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (!cpu_online(cpu))
+ return 0;
+
+ pr_debug("Microcode:CPU %d added\n", cpu);
+ memset(uci, 0, sizeof(*uci));
+
+ err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
+ if (err)
+ return err;
+
+ microcode_init_cpu(cpu, resume);
+
+ return 0;
+}
+
+static int mc_sysdev_add(struct sys_device *sys_dev)
+{
+ return __mc_sysdev_add(sys_dev, 0);
+}
+
+static int mc_sysdev_remove(struct sys_device *sys_dev)
+{
+ int cpu = sys_dev->id;
+
+ if (!cpu_online(cpu))
+ return 0;
+
+ pr_debug("Microcode:CPU %d removed\n", cpu);
+ microcode_fini_cpu(cpu);
+ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ return 0;
+}
+
+static int mc_sysdev_resume(struct sys_device *dev)
+{
+ int cpu = dev->id;
+
+ if (!cpu_online(cpu))
+ return 0;
+ pr_debug("Microcode:CPU %d resumed\n", cpu);
+ /* only CPU 0 will apply ucode here */
+ apply_microcode(0);
+ return 0;
+}
+
+static struct sysdev_driver mc_sysdev_driver = {
+ .add = mc_sysdev_add,
+ .remove = mc_sysdev_remove,
+ .resume = mc_sysdev_resume,
+};
+
+static __cpuinit int
+mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ struct sys_device *sys_dev;
+
+ sys_dev = get_cpu_sysdev(cpu);
+ switch (action) {
+ case CPU_UP_CANCELED_FROZEN:
+ /* The CPU refused to come up during a system resume */
+ microcode_fini_cpu(cpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ mc_sysdev_add(sys_dev);
+ break;
+ case CPU_ONLINE_FROZEN:
+ /* System-wide resume is in progress, try to apply microcode */
+ if (apply_microcode_check_cpu(cpu)) {
+ /* The application of microcode failed */
+ microcode_fini_cpu(cpu);
+ __mc_sysdev_add(sys_dev, 1);
+ break;
+ }
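+		/* fall through: the update succeeded, recreate the sysfs group */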
+ case CPU_DOWN_FAILED_FROZEN:
+ if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
+ printk(KERN_ERR "Microcode: Failed to create the sysfs "
+ "group for CPU%d\n", cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ mc_sysdev_remove(sys_dev);
+ break;
+ case CPU_DOWN_PREPARE_FROZEN:
+ /* Suspend is in progress, only remove the interface */
+ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata mc_cpu_notifier = {
+ .notifier_call = mc_cpu_callback,
+};
+
+static int __init microcode_init (void)
+{
+ int error;
+
+ error = microcode_dev_init();
+ if (error)
+ return error;
+ microcode_pdev = platform_device_register_simple("microcode", -1,
+ NULL, 0);
+ if (IS_ERR(microcode_pdev)) {
+ microcode_dev_exit();
+ return PTR_ERR(microcode_pdev);
+ }
+
+ lock_cpu_hotplug();
+ error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+ unlock_cpu_hotplug();
+ if (error) {
+ microcode_dev_exit();
+ platform_device_unregister(microcode_pdev);
+ return error;
+ }
+
+ register_hotcpu_notifier(&mc_cpu_notifier);
+
+ printk(KERN_INFO
+ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
+ return 0;
+}
+
+static void __exit microcode_exit (void)
+{
+ microcode_dev_exit();
+
+ unregister_hotcpu_notifier(&mc_cpu_notifier);
+
+ lock_cpu_hotplug();
+ sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+ unlock_cpu_hotplug();
+
+ platform_device_unregister(microcode_pdev);
+}
+
+module_init(microcode_init)
+module_exit(microcode_exit)
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
new file mode 100644
index 000000000000..3db0a5442eb1
--- /dev/null
+++ b/arch/x86/kernel/module_32.c
@@ -0,0 +1,152 @@
+/* Kernel module help for i386.
+ Copyright (C) 2001 Rusty Russell.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include <linux/moduleloader.h>
+#include <linux/elf.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/bug.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(fmt...)
+#endif
+
+void *module_alloc(unsigned long size)
+{
+ if (size == 0)
+ return NULL;
+ return vmalloc_exec(size);
+}
+
+
+/* Free memory returned from module_alloc */
+void module_free(struct module *mod, void *module_region)
+{
+ vfree(module_region);
+ /* FIXME: If module_region == mod->init_region, trim exception
+ table entries. */
+}
+
+/* We don't need anything special. */
+int module_frob_arch_sections(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ char *secstrings,
+ struct module *mod)
+{
+ return 0;
+}
+
+int apply_relocate(Elf32_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ unsigned int i;
+ Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
+ Elf32_Sym *sym;
+ uint32_t *location;
+
+ DEBUGP("Applying relocate section %u to %u\n", relsec,
+ sechdrs[relsec].sh_info);
+ for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+ /* This is where to make the change */
+ location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+ + rel[i].r_offset;
+ /* This is the symbol it is referring to. Note that all
+ undefined symbols have been resolved. */
+ sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
+ + ELF32_R_SYM(rel[i].r_info);
+
+ switch (ELF32_R_TYPE(rel[i].r_info)) {
+ case R_386_32:
+ /* We add the value into the location given */
+ *location += sym->st_value;
+ break;
+ case R_386_PC32:
+			/* Add the value, subtract its position */
+ *location += sym->st_value - (uint32_t)location;
+ break;
+ default:
+ printk(KERN_ERR "module %s: Unknown relocation: %u\n",
+ me->name, ELF32_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+}
+
+int apply_relocate_add(Elf32_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
+ me->name);
+ return -ENOEXEC;
+}
+
+int module_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *me)
+{
+ const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+ *para = NULL;
+ char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+ for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
+ if (!strcmp(".text", secstrings + s->sh_name))
+ text = s;
+ if (!strcmp(".altinstructions", secstrings + s->sh_name))
+ alt = s;
+ if (!strcmp(".smp_locks", secstrings + s->sh_name))
+			locks = s;
+ if (!strcmp(".parainstructions", secstrings + s->sh_name))
+ para = s;
+ }
+
+ if (alt) {
+ /* patch .altinstructions */
+ void *aseg = (void *)alt->sh_addr;
+ apply_alternatives(aseg, aseg + alt->sh_size);
+ }
+ if (locks && text) {
+ void *lseg = (void *)locks->sh_addr;
+ void *tseg = (void *)text->sh_addr;
+ alternatives_smp_module_add(me, me->name,
+ lseg, lseg + locks->sh_size,
+ tseg, tseg + text->sh_size);
+ }
+
+ if (para) {
+ void *pseg = (void *)para->sh_addr;
+ apply_paravirt(pseg, pseg + para->sh_size);
+ }
+
+ return module_bug_finalize(hdr, sechdrs, me);
+}
+
+void module_arch_cleanup(struct module *mod)
+{
+ alternatives_smp_module_del(mod);
+ module_bug_cleanup(mod);
+}
diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c
new file mode 100644
index 000000000000..13abb4ebfb79
--- /dev/null
+++ b/arch/x86/kernel/mpparse_32.c
@@ -0,0 +1,1132 @@
+/*
+ * Intel Multiprocessor Specification 1.1 and 1.4
+ * compliant MP-table parsing routines.
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes
+ * Erich Boleyn : MP v1.4 and additional changes.
+ * Alan Cox : Added EBDA scanning
+ * Ingo Molnar : various cleanups and rewrites
+ * Maciej W. Rozycki: Bits for default MP configurations
+ * Paul Diefenbaugh: Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/bitops.h>
+
+#include <asm/smp.h>
+#include <asm/acpi.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
+
+#include <mach_apic.h>
+#include <mach_apicdef.h>
+#include <mach_mpparse.h>
+#include <bios_ebda.h>
+
+/* Have we found an MP table */
+int smp_found_config;
+unsigned int __cpuinitdata maxcpus = NR_CPUS;
+
+/*
+ * Various Linux-internal data structures created from the
+ * MP-table.
+ */
+int apic_version [MAX_APICS];
+int mp_bus_id_to_type [MAX_MP_BUSSES];
+int mp_bus_id_to_node [MAX_MP_BUSSES];
+int mp_bus_id_to_local [MAX_MP_BUSSES];
+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
+static int mp_current_pci_id;
+
+/* I/O APIC entries */
+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+
+/* # of MP IRQ source entries */
+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* MP IRQ source entries */
+int mp_irq_entries;
+
+int nr_ioapics;
+
+int pic_mode;
+unsigned long mp_lapic_addr;
+
+unsigned int def_to_bigsmp = 0;
+
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
+/* Internal processor count */
+unsigned int __cpuinitdata num_processors;
+
+/* Bitmask of physically existing CPUs */
+physid_mask_t phys_cpu_present_map;
+
+u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+
+/*
+ * Intel MP BIOS table parsing routines:
+ */
+
+
+/*
+ * Checksum an MP configuration block.
+ */
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+ int sum = 0;
+
+ while (len--)
+ sum += *mp++;
+
+ return sum & 0xFF;
+}
+
+/*
+ * Have to match translation table entries to main table entries by counter,
+ * hence the mpc_record variable .... can't see a less disgusting way of
+ * doing this ....
+ */
+
+static int mpc_record;
+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
+
+static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
+{
+ int ver, apicid;
+ physid_mask_t phys_cpu;
+
+ if (!(m->mpc_cpuflag & CPU_ENABLED))
+ return;
+
+ apicid = mpc_apic_id(m, translation_table[mpc_record]);
+
+ if (m->mpc_featureflag&(1<<0))
+ Dprintk(" Floating point unit present.\n");
+ if (m->mpc_featureflag&(1<<7))
+ Dprintk(" Machine Exception supported.\n");
+ if (m->mpc_featureflag&(1<<8))
+ Dprintk(" 64 bit compare & exchange supported.\n");
+ if (m->mpc_featureflag&(1<<9))
+ Dprintk(" Internal APIC present.\n");
+ if (m->mpc_featureflag&(1<<11))
+ Dprintk(" SEP present.\n");
+ if (m->mpc_featureflag&(1<<12))
+ Dprintk(" MTRR present.\n");
+ if (m->mpc_featureflag&(1<<13))
+ Dprintk(" PGE present.\n");
+ if (m->mpc_featureflag&(1<<14))
+ Dprintk(" MCA present.\n");
+ if (m->mpc_featureflag&(1<<15))
+ Dprintk(" CMOV present.\n");
+ if (m->mpc_featureflag&(1<<16))
+ Dprintk(" PAT present.\n");
+ if (m->mpc_featureflag&(1<<17))
+ Dprintk(" PSE present.\n");
+ if (m->mpc_featureflag&(1<<18))
+ Dprintk(" PSN present.\n");
+ if (m->mpc_featureflag&(1<<19))
+ Dprintk(" Cache Line Flush Instruction present.\n");
+ /* 20 Reserved */
+ if (m->mpc_featureflag&(1<<21))
+ Dprintk(" Debug Trace and EMON Store present.\n");
+ if (m->mpc_featureflag&(1<<22))
+ Dprintk(" ACPI Thermal Throttle Registers present.\n");
+ if (m->mpc_featureflag&(1<<23))
+ Dprintk(" MMX present.\n");
+ if (m->mpc_featureflag&(1<<24))
+ Dprintk(" FXSR present.\n");
+ if (m->mpc_featureflag&(1<<25))
+ Dprintk(" XMM present.\n");
+ if (m->mpc_featureflag&(1<<26))
+ Dprintk(" Willamette New Instructions present.\n");
+ if (m->mpc_featureflag&(1<<27))
+ Dprintk(" Self Snoop present.\n");
+ if (m->mpc_featureflag&(1<<28))
+ Dprintk(" HT present.\n");
+ if (m->mpc_featureflag&(1<<29))
+ Dprintk(" Thermal Monitor present.\n");
+ /* 30, 31 Reserved */
+
+
+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+ Dprintk(" Bootup CPU\n");
+ boot_cpu_physical_apicid = m->mpc_apicid;
+ }
+
+ ver = m->mpc_apicver;
+
+ /*
+ * Validate version
+ */
+ if (ver == 0x0) {
+ printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
+ "fixing up to 0x10. (tell your hw vendor)\n",
+ m->mpc_apicid);
+ ver = 0x10;
+ }
+ apic_version[m->mpc_apicid] = ver;
+
+ phys_cpu = apicid_to_cpu_present(apicid);
+ physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
+
+ if (num_processors >= NR_CPUS) {
+ printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
+ " Processor ignored.\n", NR_CPUS);
+ return;
+ }
+
+ if (num_processors >= maxcpus) {
+ printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
+ " Processor ignored.\n", maxcpus);
+ return;
+ }
+
+ cpu_set(num_processors, cpu_possible_map);
+ num_processors++;
+
+ /*
+ * It would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y,
+ * but we need to work out other dependencies like SMP_SUSPEND etc.
+ * before this can be done without some confusion.
+ * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
+ * - Ashok Raj <ashok.raj@intel.com>
+ */
+ if (num_processors > 8) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (!APIC_XAPIC(ver)) {
+ def_to_bigsmp = 0;
+ break;
+ }
+ /* P4 and above: fall through */
+ case X86_VENDOR_AMD:
+ def_to_bigsmp = 1;
+ }
+ }
+ bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
+}
+
+static void __init MP_bus_info (struct mpc_config_bus *m)
+{
+ char str[7];
+
+ memcpy(str, m->mpc_bustype, 6);
+ str[6] = 0;
+
+ mpc_oem_bus_info(m, str, translation_table[mpc_record]);
+
+#if MAX_MP_BUSSES < 256
+ if (m->mpc_busid >= MAX_MP_BUSSES) {
+ printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
+ " is too large, max. supported is %d\n",
+ m->mpc_busid, str, MAX_MP_BUSSES - 1);
+ return;
+ }
+#endif
+
+ if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
+ } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
+ } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
+ mpc_oem_pci_bus(m, translation_table[mpc_record]);
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+ mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
+ mp_current_pci_id++;
+ } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
+ } else {
+ printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
+ }
+}
+
+static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
+{
+ if (!(m->mpc_flags & MPC_APIC_USABLE))
+ return;
+
+ printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
+ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
+ if (nr_ioapics >= MAX_IO_APICS) {
+ printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
+ MAX_IO_APICS, nr_ioapics);
+ panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
+ }
+ if (!m->mpc_apicaddr) {
+ printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
+ " found in MP table, skipping!\n");
+ return;
+ }
+ mp_ioapics[nr_ioapics] = *m;
+ nr_ioapics++;
+}
+
+static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
+{
+ mp_irqs [mp_irq_entries] = *m;
+ Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ m->mpc_irqtype, m->mpc_irqflag & 3,
+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
+ m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
+}
+
+static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
+{
+ Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
+ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+ m->mpc_irqtype, m->mpc_irqflag & 3,
+ (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
+}
+
+#ifdef CONFIG_X86_NUMAQ
+static void __init MP_translation_info (struct mpc_config_translation *m)
+{
+ printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
+
+ if (mpc_record >= MAX_MPC_ENTRY)
+ printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
+ else
+ translation_table[mpc_record] = m; /* stash this for later */
+ if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
+ node_set_online(m->trans_quad);
+}
+
+/*
+ * Read/parse the MPC oem tables
+ */
+
+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
+ unsigned short oemsize)
+{
+ int count = sizeof (*oemtable); /* the header size */
+ unsigned char *oemptr = ((unsigned char *)oemtable)+count;
+
+ mpc_record = 0;
+ printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
+ if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
+ {
+ printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
+ oemtable->oem_signature[0],
+ oemtable->oem_signature[1],
+ oemtable->oem_signature[2],
+ oemtable->oem_signature[3]);
+ return;
+ }
+ if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
+ {
+ printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
+ return;
+ }
+ while (count < oemtable->oem_length) {
+ switch (*oemptr) {
+ case MP_TRANSLATION:
+ {
+ struct mpc_config_translation *m=
+ (struct mpc_config_translation *)oemptr;
+ MP_translation_info(m);
+ oemptr += sizeof(*m);
+ count += sizeof(*m);
+ ++mpc_record;
+ break;
+ }
+ default:
+ {
+ printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
+ return;
+ }
+ }
+ }
+}
+
+static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid)
+{
+ if (strncmp(oem, "IBM NUMA", 8))
+ printk("Warning! May not be a NUMA-Q system!\n");
+ if (mpc->mpc_oemptr)
+ smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
+ mpc->mpc_oemsize);
+}
+#endif /* CONFIG_X86_NUMAQ */
+
+/*
+ * Read/parse the MPC
+ */
+
+static int __init smp_read_mpc(struct mp_config_table *mpc)
+{
+ char str[16];
+ char oem[10];
+ int count=sizeof(*mpc);
+ unsigned char *mpt=((unsigned char *)mpc)+count;
+
+ if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
+ printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
+ *(u32 *)mpc->mpc_signature);
+ return 0;
+ }
+ if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
+ printk(KERN_ERR "SMP mptable: checksum error!\n");
+ return 0;
+ }
+ if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
+ printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
+ mpc->mpc_spec);
+ return 0;
+ }
+ if (!mpc->mpc_lapic) {
+ printk(KERN_ERR "SMP mptable: null local APIC address!\n");
+ return 0;
+ }
+ memcpy(oem,mpc->mpc_oem,8);
+ oem[8]=0;
+ printk(KERN_INFO "OEM ID: %s ",oem);
+
+ memcpy(str,mpc->mpc_productid,12);
+ str[12]=0;
+ printk("Product ID: %s ",str);
+
+ mps_oem_check(mpc, oem, str);
+
+ printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
+
+ /*
+ * Save the local APIC address (it might be non-default) -- but only
+ * if we're not using ACPI.
+ */
+ if (!acpi_lapic)
+ mp_lapic_addr = mpc->mpc_lapic;
+
+ /*
+ * Now process the configuration blocks.
+ */
+ mpc_record = 0;
+ while (count < mpc->mpc_length) {
+ switch(*mpt) {
+ case MP_PROCESSOR:
+ {
+ struct mpc_config_processor *m=
+ (struct mpc_config_processor *)mpt;
+ /* ACPI may have already provided this data */
+ if (!acpi_lapic)
+ MP_processor_info(m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_BUS:
+ {
+ struct mpc_config_bus *m=
+ (struct mpc_config_bus *)mpt;
+ MP_bus_info(m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_IOAPIC:
+ {
+ struct mpc_config_ioapic *m=
+ (struct mpc_config_ioapic *)mpt;
+ MP_ioapic_info(m);
+ mpt+=sizeof(*m);
+ count+=sizeof(*m);
+ break;
+ }
+ case MP_INTSRC:
+ {
+ struct mpc_config_intsrc *m=
+ (struct mpc_config_intsrc *)mpt;
+
+ MP_intsrc_info(m);
+ mpt+=sizeof(*m);
+ count+=sizeof(*m);
+ break;
+ }
+ case MP_LINTSRC:
+ {
+ struct mpc_config_lintsrc *m=
+ (struct mpc_config_lintsrc *)mpt;
+ MP_lintsrc_info(m);
+ mpt+=sizeof(*m);
+ count+=sizeof(*m);
+ break;
+ }
+ default:
+ {
+ count = mpc->mpc_length;
+ break;
+ }
+ }
+ ++mpc_record;
+ }
+ setup_apic_routing();
+ if (!num_processors)
+ printk(KERN_ERR "SMP mptable: no processors registered!\n");
+ return num_processors;
+}
+
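+/*
+ * The ELCR (Edge/Level Control Register) lives at I/O ports 0x4d0
+ * (IRQs 0-7) and 0x4d1 (IRQs 8-15), one bit per IRQ; a set bit means
+ * the line is level-triggered.
+ */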
+static int __init ELCR_trigger(unsigned int irq)
+{
+ unsigned int port;
+
+ port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+}
+
+static void __init construct_default_ioirq_mptable(int mpc_default_type)
+{
+ struct mpc_config_intsrc intsrc;
+ int i;
+ int ELCR_fallback = 0;
+
+ intsrc.mpc_type = MP_INTSRC;
+ intsrc.mpc_irqflag = 0; /* conforming */
+ intsrc.mpc_srcbus = 0;
+ intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+
+ intsrc.mpc_irqtype = mp_INT;
+
+ /*
+ * In the type-5 default configuration we have an ISA/PCI system with
+ * no IRQ entries in the MP table. To prevent the PCI interrupts from
+ * being set up incorrectly, we try to use the ELCR. The sanity check
+ * for good ELCR data is very simple - IRQ0, 1, 2 and 13 can
+ * never be level sensitive, so we simply see if the ELCR agrees.
+ * If it does, we assume it's valid.
+ */
+ if (mpc_default_type == 5) {
+ printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
+
+ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
+ printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
+ else {
+ printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
+ ELCR_fallback = 1;
+ }
+ }
+
+ for (i = 0; i < 16; i++) {
+ switch (mpc_default_type) {
+ case 2:
+ if (i == 0 || i == 13)
+ continue; /* IRQ0 & IRQ13 not connected */
+ /* fall through */
+ default:
+ if (i == 2)
+ continue; /* IRQ2 is never connected */
+ }
+
+ if (ELCR_fallback) {
+ /*
+ * If the ELCR indicates a level-sensitive interrupt, we
+ * copy that information over to the MP table in the
+ * irqflag field (level sensitive, active high polarity).
+ */
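+ /* 13 = (trigger 3 << 2) | polarity 1: level-triggered, active high */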
+ if (ELCR_trigger(i))
+ intsrc.mpc_irqflag = 13;
+ else
+ intsrc.mpc_irqflag = 0;
+ }
+
+ intsrc.mpc_srcbusirq = i;
+ intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
+ MP_intsrc_info(&intsrc);
+ }
+
+ intsrc.mpc_irqtype = mp_ExtINT;
+ intsrc.mpc_srcbusirq = 0;
+ intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
+ MP_intsrc_info(&intsrc);
+}
+
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+ struct mpc_config_processor processor;
+ struct mpc_config_bus bus;
+ struct mpc_config_ioapic ioapic;
+ struct mpc_config_lintsrc lintsrc;
+ int linttypes[2] = { mp_ExtINT, mp_NMI };
+ int i;
+
+ /*
+ * local APIC has default address
+ */
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+ /*
+ * 2 CPUs, numbered 0 & 1.
+ */
+ processor.mpc_type = MP_PROCESSOR;
+ /* Either an integrated APIC or a discrete 82489DX. */
+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ processor.mpc_cpuflag = CPU_ENABLED;
+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+ (boot_cpu_data.x86_model << 4) |
+ boot_cpu_data.x86_mask;
+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+ processor.mpc_reserved[0] = 0;
+ processor.mpc_reserved[1] = 0;
+ for (i = 0; i < 2; i++) {
+ processor.mpc_apicid = i;
+ MP_processor_info(&processor);
+ }
+
+ bus.mpc_type = MP_BUS;
+ bus.mpc_busid = 0;
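+ /* MP spec default configs: 1/5 = ISA, 2/3/6 = EISA, 4/7 = MCA; types 5-7 also have a PCI bus. */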
+ switch (mpc_default_type) {
+ default:
+ printk("???\n");
+ printk(KERN_ERR "Unknown standard configuration %d\n",
+ mpc_default_type);
+ /* fall through */
+ case 1:
+ case 5:
+ memcpy(bus.mpc_bustype, "ISA ", 6);
+ break;
+ case 2:
+ case 6:
+ case 3:
+ memcpy(bus.mpc_bustype, "EISA ", 6);
+ break;
+ case 4:
+ case 7:
+ memcpy(bus.mpc_bustype, "MCA ", 6);
+ }
+ MP_bus_info(&bus);
+ if (mpc_default_type > 4) {
+ bus.mpc_busid = 1;
+ memcpy(bus.mpc_bustype, "PCI ", 6);
+ MP_bus_info(&bus);
+ }
+
+ ioapic.mpc_type = MP_IOAPIC;
+ ioapic.mpc_apicid = 2;
+ ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ ioapic.mpc_flags = MPC_APIC_USABLE;
+ ioapic.mpc_apicaddr = 0xFEC00000;
+ MP_ioapic_info(&ioapic);
+
+ /*
+ * We set up most of the low 16 IO-APIC pins according to MPS rules.
+ */
+ construct_default_ioirq_mptable(mpc_default_type);
+
+ lintsrc.mpc_type = MP_LINTSRC;
+ lintsrc.mpc_irqflag = 0; /* conforming */
+ lintsrc.mpc_srcbusid = 0;
+ lintsrc.mpc_srcbusirq = 0;
+ lintsrc.mpc_destapic = MP_APIC_ALL;
+ for (i = 0; i < 2; i++) {
+ lintsrc.mpc_irqtype = linttypes[i];
+ lintsrc.mpc_destapiclint = i;
+ MP_lintsrc_info(&lintsrc);
+ }
+}
+
+static struct intel_mp_floating *mpf_found;
+
+/*
+ * Scan the memory blocks for an SMP configuration block.
+ */
+void __init get_smp_config (void)
+{
+ struct intel_mp_floating *mpf = mpf_found;
+
+ /*
+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
+ * processors, whereas MPS only supports physical.
+ */
+ if (acpi_lapic && acpi_ioapic) {
+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
+ return;
+ }
+ else if (acpi_lapic)
+ printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
+
+ printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
+ if (mpf->mpf_feature2 & (1<<7)) {
+ printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
+ pic_mode = 1;
+ } else {
+ printk(KERN_INFO " Virtual Wire compatibility mode.\n");
+ pic_mode = 0;
+ }
+
+ /*
+ * Now see if we need to read further.
+ */
+ if (mpf->mpf_feature1 != 0) {
+
+ printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
+ construct_default_ISA_mptable(mpf->mpf_feature1);
+
+ } else if (mpf->mpf_physptr) {
+
+ /*
+ * Read the physical hardware table. Anything here will
+ * override the defaults.
+ */
+ if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
+ smp_found_config = 0;
+ printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
+ printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
+ return;
+ }
+ /*
+ * If there are no explicit MP IRQ entries, then we are
+ * broken. We set up most of the low 16 IO-APIC pins to
+ * ISA defaults and hope it will work.
+ */
+ if (!mp_irq_entries) {
+ struct mpc_config_bus bus;
+
+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
+
+ bus.mpc_type = MP_BUS;
+ bus.mpc_busid = 0;
+ memcpy(bus.mpc_bustype, "ISA ", 6);
+ MP_bus_info(&bus);
+
+ construct_default_ioirq_mptable(0);
+ }
+
+ } else
+ BUG();
+
+ printk(KERN_INFO "Processors: %d\n", num_processors);
+ /*
+ * Only use the first configuration found.
+ */
+}
+
+static int __init smp_scan_config (unsigned long base, unsigned long length)
+{
+ unsigned long *bp = phys_to_virt(base);
+ struct intel_mp_floating *mpf;
+
+ Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
+ if (sizeof(*mpf) != 16)
+ printk("Error: MPF size\n");
+
+ while (length > 0) {
+ mpf = (struct intel_mp_floating *)bp;
+ if ((*bp == SMP_MAGIC_IDENT) &&
+ (mpf->mpf_length == 1) &&
+ !mpf_checksum((unsigned char *)bp, 16) &&
+ ((mpf->mpf_specification == 1)
+ || (mpf->mpf_specification == 4)) ) {
+
+ smp_found_config = 1;
+ printk(KERN_INFO "found SMP MP-table at %08lx\n",
+ virt_to_phys(mpf));
+ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
+ if (mpf->mpf_physptr) {
+ /*
+ * We cannot access the MPC table to compute its
+ * size yet, as only the first few megabytes of
+ * memory are mapped at this point.
+ * The PC-9800 places its MPC table at the very
+ * end of physical memory, so blindly reserving
+ * PAGE_SIZE from mpf->mpf_physptr would trigger
+ * BUG() in reserve_bootmem.
+ */
+ unsigned long size = PAGE_SIZE;
+ unsigned long end = max_low_pfn * PAGE_SIZE;
+ if (mpf->mpf_physptr + size > end)
+ size = end - mpf->mpf_physptr;
+ reserve_bootmem(mpf->mpf_physptr, size);
+ }
+
+ mpf_found = mpf;
+ return 1;
+ }
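+ /* The 16-byte floating pointer is paragraph-aligned; bp is a long *, so bp += 4 advances 16 bytes. */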
+ bp += 4;
+ length -= 16;
+ }
+ return 0;
+}
+
+void __init find_smp_config (void)
+{
+ unsigned int address;
+
+ /*
+ * FIXME: Linux assumes you have 640K of base RAM;
+ * this scan perpetuates that error...
+ *
+ * 1) Scan the bottom 1K for a signature
+ * 2) Scan the top 1K of base RAM
+ * 3) Scan the 64K of bios
+ */
+ if (smp_scan_config(0x0,0x400) ||
+ smp_scan_config(639*0x400,0x400) ||
+ smp_scan_config(0xF0000,0x10000))
+ return;
+ /*
+ * If it is an SMP machine we should know now, unless the
+ * configuration is in an EISA/MCA bus machine with an
+ * extended bios data area.
+ *
+ * There is a real-mode segmented pointer to the
+ * 4K EBDA area at 0x40E; calculate and scan it here.
+ *
+ * NOTE! There are Linux loaders that will corrupt the EBDA
+ * area, and as such this kind of SMP config may be less
+ * trustworthy, simply because the SMP table may have been
+ * stomped on during early boot. These loaders are buggy and
+ * should be fixed.
+ *
+ * The MP 1.4 spec states to scan only the first 1K of the 4K EBDA.
+ */
+
+ address = get_bios_ebda();
+ if (address)
+ smp_scan_config(address, 0x400);
+}
+
+int es7000_plat;
+
+/* --------------------------------------------------------------------------
+ ACPI-based MP Configuration
+ -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI
+
+void __init mp_register_lapic_address(u64 address)
+{
+ mp_lapic_addr = (unsigned long) address;
+
+ set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
+
+ if (boot_cpu_physical_apicid == -1U)
+ boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+
+ Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
+}
+
+void __cpuinit mp_register_lapic (u8 id, u8 enabled)
+{
+ struct mpc_config_processor processor;
+ int boot_cpu = 0;
+
+ if (MAX_APICS - id <= 0) {
+ printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
+ id, MAX_APICS);
+ return;
+ }
+
+ if (id == boot_cpu_physical_apicid)
+ boot_cpu = 1;
+
+ processor.mpc_type = MP_PROCESSOR;
+ processor.mpc_apicid = id;
+ processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
+ processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
+ processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+ processor.mpc_reserved[0] = 0;
+ processor.mpc_reserved[1] = 0;
+
+ MP_processor_info(&processor);
+}
+
+#ifdef CONFIG_X86_IO_APIC
+
+#define MP_ISA_BUS 0
+#define MP_MAX_IOAPIC_PIN 127
+
+static struct mp_ioapic_routing {
+ int apic_id;
+ int gsi_base;
+ int gsi_end;
+ u32 pin_programmed[4];
+} mp_ioapic_routing[MAX_IO_APICS];
+
+static int mp_find_ioapic (int gsi)
+{
+ int i = 0;
+
+ /* Find the IOAPIC that manages this GSI. */
+ for (i = 0; i < nr_ioapics; i++) {
+ if ((gsi >= mp_ioapic_routing[i].gsi_base)
+ && (gsi <= mp_ioapic_routing[i].gsi_end))
+ return i;
+ }
+
+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+
+ return -1;
+}
+
+void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
+{
+ int idx = 0;
+ int tmpid;
+
+ if (nr_ioapics >= MAX_IO_APICS) {
+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+ }
+ if (!address) {
+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+ " found in MADT table, skipping!\n");
+ return;
+ }
+
+ idx = nr_ioapics++;
+
+ mp_ioapics[idx].mpc_type = MP_IOAPIC;
+ mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
+ mp_ioapics[idx].mpc_apicaddr = address;
+
+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ tmpid = io_apic_get_unique_id(idx, id);
+ else
+ tmpid = id;
+ if (tmpid == -1) {
+ nr_ioapics--;
+ return;
+ }
+ mp_ioapics[idx].mpc_apicid = tmpid;
+ mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
+
+ /*
+ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+ */
+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
+ mp_ioapic_routing[idx].gsi_base = gsi_base;
+ mp_ioapic_routing[idx].gsi_end = gsi_base +
+ io_apic_get_redir_entries(idx);
+
+ printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
+ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
+ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+ mp_ioapic_routing[idx].gsi_base,
+ mp_ioapic_routing[idx].gsi_end);
+}
+
+void __init
+mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+{
+ struct mpc_config_intsrc intsrc;
+ int ioapic = -1;
+ int pin = -1;
+
+ /*
+ * Convert 'gsi' to 'ioapic.pin'.
+ */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return;
+ pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+ /*
+ * TBD: This check is for faulty timer entries, where the override
+ * erroneously sets the trigger to level, resulting in a HUGE
+ * increase of timer interrupts!
+ */
+ if ((bus_irq == 0) && (trigger == 3))
+ trigger = 1;
+
+ intsrc.mpc_type = MP_INTSRC;
+ intsrc.mpc_irqtype = mp_INT;
+ intsrc.mpc_irqflag = (trigger << 2) | polarity;
+ intsrc.mpc_srcbus = MP_ISA_BUS;
+ intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
+ intsrc.mpc_dstirq = pin; /* INTIN# */
+
+ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
+ intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
+ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
+ intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
+
+ mp_irqs[mp_irq_entries] = intsrc;
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!\n");
+}
+
+void __init mp_config_acpi_legacy_irqs (void)
+{
+ struct mpc_config_intsrc intsrc;
+ int i = 0;
+ int ioapic = -1;
+
+ /*
+ * Fabricate the legacy ISA bus (bus MP_ISA_BUS).
+ */
+ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+
+ /*
+ * Older generations of ES7000 have no legacy identity mappings.
+ */
+ if (es7000_plat == 1)
+ return;
+
+ /*
+ * Locate the IOAPIC that manages the ISA IRQs (0-15).
+ */
+ ioapic = mp_find_ioapic(0);
+ if (ioapic < 0)
+ return;
+
+ intsrc.mpc_type = MP_INTSRC;
+ intsrc.mpc_irqflag = 0; /* Conforming */
+ intsrc.mpc_srcbus = MP_ISA_BUS;
+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+
+ /*
+ * Use the default configuration for IRQs 0-15, unless
+ * overridden by (MADT) interrupt source override entries.
+ */
+ for (i = 0; i < 16; i++) {
+ int idx;
+
+ for (idx = 0; idx < mp_irq_entries; idx++) {
+ struct mpc_config_intsrc *irq = mp_irqs + idx;
+
+ /* Do we already have a mapping for this ISA IRQ? */
+ if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
+ break;
+
+ /* Do we already have a mapping for this IOAPIC pin */
+ if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
+ (irq->mpc_dstirq == i))
+ break;
+ }
+
+ if (idx != mp_irq_entries) {
+ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+ continue; /* IRQ already used */
+ }
+
+ intsrc.mpc_irqtype = mp_INT;
+ intsrc.mpc_srcbusirq = i; /* Identity mapped */
+ intsrc.mpc_dstirq = i;
+
+ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
+ "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
+ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
+ intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
+ intsrc.mpc_dstirq);
+
+ mp_irqs[mp_irq_entries] = intsrc;
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!\n");
+ }
+}
+
+#define MAX_GSI_NUM 4096
+
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
+{
+ int ioapic = -1;
+ int ioapic_pin = 0;
+ int idx, bit = 0;
+ static int pci_irq = 16;
+ /*
+ * Mapping between Global System Interrupts, which
+ * represent all possible interrupts, and IRQs
+ * assigned to actual devices.
+ */
+ static int gsi_to_irq[MAX_GSI_NUM];
+
+ /* Don't set up the ACPI SCI because it's already set up */
+ if (acpi_gbl_FADT.sci_interrupt == gsi)
+ return gsi;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0) {
+ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+ return gsi;
+ }
+
+ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+ if (ioapic_renumber_irq)
+ gsi = ioapic_renumber_irq(ioapic, gsi);
+
+ /*
+ * Avoid pin reprogramming. PRTs typically include entries
+ * with redundant pin->gsi mappings (but unique PCI devices);
+ * we only program the IOAPIC on the first.
+ */
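+ /* pin_programmed[] is 4 x u32: one bit for each of the 128 possible pins. */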
+ bit = ioapic_pin % 32;
+ idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
+ if (idx > 3) {
+ printk(KERN_ERR "Invalid reference to IOAPIC pin "
+ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+ ioapic_pin);
+ return gsi;
+ }
+ if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
+ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+ mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+ return gsi_to_irq[gsi];
+ }
+
+ mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
+
+ if (triggering == ACPI_LEVEL_SENSITIVE) {
+ /*
+ * For PCI devices assign IRQs in order, avoiding gaps
+ * due to unused I/O APIC pins.
+ */
+ int irq = gsi;
+ if (gsi < MAX_GSI_NUM) {
+ /*
+ * Retain the VIA chipset work-around (gsi > 15), but
+ * avoid a problem where the 8254 timer (IRQ0) is set up
+ * via an override (so it's not on pin 0 of the ioapic),
+ * and at the same time, the pin 0 interrupt is a PCI
+ * type. The gsi > 15 test could cause these two pins
+ * to be shared as IRQ0, and they are not shareable.
+ * So test for this condition, and if necessary, avoid
+ * the pin collision.
+ */
+ if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
+ gsi = pci_irq++;
+ /*
+ * Don't assign IRQ used by ACPI SCI
+ */
+ if (gsi == acpi_gbl_FADT.sci_interrupt)
+ gsi = pci_irq++;
+ gsi_to_irq[irq] = gsi;
+ } else {
+ printk(KERN_ERR "GSI %u is too high\n", gsi);
+ return gsi;
+ }
+ }
+
+ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+ return gsi;
+}
+
+#endif /* CONFIG_X86_IO_APIC */
+#endif /* CONFIG_ACPI */
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
new file mode 100644
index 000000000000..0c1069b8d638
--- /dev/null
+++ b/arch/x86/kernel/msr.c
@@ -0,0 +1,224 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2000 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ * USA; either version 2 of the License, or (at your option) any later
+ * version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * msr.c
+ *
+ * x86 MSR access device
+ *
+ * This device is accessed by an lseek() to the appropriate register number
+ * followed by reads or writes in chunks of 8 bytes. A larger size means multiple
+ * reads or writes of the same register.
+ *
+ * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
+ * an SMP box will direct the access to CPU %d.
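+ *
+ * A minimal userspace sketch of that pattern (illustrative only;
+ * MSR 0x10, the time-stamp counter, is just an example register):
+ *
+ *	int fd = open("/dev/cpu/0/msr", O_RDONLY);
+ *	unsigned long long val;
+ *	lseek(fd, 0x10, SEEK_SET);
+ *	read(fd, &val, 8);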
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/major.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+static struct class *msr_class;
+
+static loff_t msr_seek(struct file *file, loff_t offset, int orig)
+{
+ loff_t ret = -EINVAL;
+
+ lock_kernel();
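+ /* orig: 0 = SEEK_SET, 1 = SEEK_CUR; SEEK_END has no meaning for MSRs. */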
+ switch (orig) {
+ case 0:
+ file->f_pos = offset;
+ ret = file->f_pos;
+ break;
+ case 1:
+ file->f_pos += offset;
+ ret = file->f_pos;
+ }
+ unlock_kernel();
+ return ret;
+}
+
+static ssize_t msr_read(struct file *file, char __user * buf,
+ size_t count, loff_t * ppos)
+{
+ u32 __user *tmp = (u32 __user *) buf;
+ u32 data[2];
+ u32 reg = *ppos;
+ int cpu = iminor(file->f_path.dentry->d_inode);
+ int err;
+
+ if (count % 8)
+ return -EINVAL; /* Invalid chunk size */
+
+ for (; count; count -= 8) {
+ err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
+ if (err)
+ return -EIO;
+ if (copy_to_user(tmp, &data, 8))
+ return -EFAULT;
+ tmp += 2;
+ }
+
+ return ((char __user *)tmp) - buf;
+}
+
+static ssize_t msr_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ const u32 __user *tmp = (const u32 __user *)buf;
+ u32 data[2];
+ u32 reg = *ppos;
+ int cpu = iminor(file->f_path.dentry->d_inode);
+ int err;
+
+ if (count % 8)
+ return -EINVAL; /* Invalid chunk size */
+
+ for (; count; count -= 8) {
+ if (copy_from_user(&data, tmp, 8))
+ return -EFAULT;
+ err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
+ if (err)
+ return -EIO;
+ tmp += 2;
+ }
+
+ return ((char __user *)tmp) - buf;
+}
+
+static int msr_open(struct inode *inode, struct file *file)
+{
+ unsigned int cpu = iminor(file->f_path.dentry->d_inode);
+ struct cpuinfo_x86 *c = &(cpu_data)[cpu];
+
+ if (cpu >= NR_CPUS || !cpu_online(cpu))
+ return -ENXIO; /* No such CPU */
+ if (!cpu_has(c, X86_FEATURE_MSR))
+ return -EIO; /* MSR not supported */
+
+ return 0;
+}
+
+/*
+ * File operations we support
+ */
+static const struct file_operations msr_fops = {
+ .owner = THIS_MODULE,
+ .llseek = msr_seek,
+ .read = msr_read,
+ .write = msr_write,
+ .open = msr_open,
+};
+
+static int msr_device_create(int i)
+{
+ int err = 0;
+ struct device *dev;
+
+ dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i);
+ if (IS_ERR(dev))
+ err = PTR_ERR(dev);
+ return err;
+}
+
+static int msr_class_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ msr_device_create(cpu);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
+{
+ .notifier_call = msr_class_cpu_callback,
+};
+
+static int __init msr_init(void)
+{
+ int i, err = 0;
+
+ if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
+ printk(KERN_ERR "msr: unable to get major %d for msr\n",
+ MSR_MAJOR);
+ err = -EBUSY;
+ goto out;
+ }
+ msr_class = class_create(THIS_MODULE, "msr");
+ if (IS_ERR(msr_class)) {
+ err = PTR_ERR(msr_class);
+ goto out_chrdev;
+ }
+ for_each_online_cpu(i) {
+ err = msr_device_create(i);
+ if (err != 0)
+ goto out_class;
+ }
+ register_hotcpu_notifier(&msr_class_cpu_notifier);
+
+ err = 0;
+ goto out;
+
+out_class:
+ for_each_online_cpu(i)
+ device_destroy(msr_class, MKDEV(MSR_MAJOR, i));
+ class_destroy(msr_class);
+out_chrdev:
+ unregister_chrdev(MSR_MAJOR, "cpu/msr");
+out:
+ return err;
+}
+
+static void __exit msr_exit(void)
+{
+ int cpu = 0;
+ for_each_online_cpu(cpu)
+ device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
+ class_destroy(msr_class);
+ unregister_chrdev(MSR_MAJOR, "cpu/msr");
+ unregister_hotcpu_notifier(&msr_class_cpu_notifier);
+}
+
+module_init(msr_init);
+module_exit(msr_exit);
+
+MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
+MODULE_DESCRIPTION("x86 generic MSR driver");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
new file mode 100644
index 000000000000..c7227e2180f8
--- /dev/null
+++ b/arch/x86/kernel/nmi_32.c
@@ -0,0 +1,468 @@
+/*
+ * linux/arch/x86/kernel/nmi_32.c
+ *
+ * NMI watchdog support on APIC systems
+ *
+ * Started by Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes:
+ * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
+ * Mikael Pettersson : Power Management for local APIC NMI watchdog.
+ * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
+ * Pavel Machek and
+ * Mikael Pettersson : PM converted to driver model. Disable/enable API.
+ */
+
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/sysdev.h>
+#include <linux/sysctl.h>
+#include <linux/percpu.h>
+#include <linux/kprobes.h>
+#include <linux/cpumask.h>
+#include <linux/kernel_stat.h>
+#include <linux/kdebug.h>
+
+#include <asm/smp.h>
+#include <asm/nmi.h>
+
+#include "mach_traps.h"
+
+int unknown_nmi_panic;
+int nmi_watchdog_enabled;
+
+static cpumask_t backtrace_mask = CPU_MASK_NONE;
+
+/* nmi_active:
+ * >0: the lapic NMI watchdog is active, but can be disabled
+ * <0: the lapic NMI watchdog has not been set up, and cannot
+ * be enabled
+ * 0: the lapic NMI watchdog is disabled, but can be enabled
+ */
+atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
+
+unsigned int nmi_watchdog = NMI_DEFAULT;
+static unsigned int nmi_hz = HZ;
+
+static DEFINE_PER_CPU(short, wd_enabled);
+
+/* local prototypes */
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
+
+static int endflag __initdata = 0;
+
+#ifdef CONFIG_SMP
+/* The performance counters used by NMI_LOCAL_APIC don't trigger when
+ * the CPU is idle. To make sure the NMI watchdog really ticks on all
+ * CPUs during the test, make them busy.
+ */
+static __init void nmi_cpu_busy(void *data)
+{
+ local_irq_enable_in_hardirq();
+ /* Intentionally don't use cpu_relax here. This is
+ to make sure that the performance counter really ticks,
+ even if there is a simulator or similar that catches the
+ pause instruction. On a real HT machine this is fine because
+ all other CPUs are busy with "useless" delay loops and don't
+ care if they get somewhat fewer cycles. */
+ while (endflag == 0)
+ mb();
+}
+#endif
+
+static int __init check_nmi_watchdog(void)
+{
+ unsigned int *prev_nmi_count;
+ int cpu;
+
+ if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
+ return 0;
+
+ if (!atomic_read(&nmi_active))
+ return 0;
+
+ prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
+ if (!prev_nmi_count)
+ return -1;
+
+ printk(KERN_INFO "Testing NMI watchdog ... ");
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
+
+ for_each_possible_cpu(cpu)
+ prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
+ local_irq_enable();
+ mdelay((20*1000)/nmi_hz); /* wait 20 ticks */
+
+ for_each_possible_cpu(cpu) {
+#ifdef CONFIG_SMP
+ /* Check cpu_callin_map here because that is set
+ after the timer is started. */
+ if (!cpu_isset(cpu, cpu_callin_map))
+ continue;
+#endif
+ if (!per_cpu(wd_enabled, cpu))
+ continue;
+ if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
+ printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
+ cpu,
+ prev_nmi_count[cpu],
+ nmi_count(cpu));
+ per_cpu(wd_enabled, cpu) = 0;
+ atomic_dec(&nmi_active);
+ }
+ }
+ endflag = 1;
+ if (!atomic_read(&nmi_active)) {
+ kfree(prev_nmi_count);
+ atomic_set(&nmi_active, -1);
+ return -1;
+ }
+ printk("OK.\n");
+
+ /* now that we know it works we can reduce NMI frequency to
+ something more reasonable; makes a difference in some configs */
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ nmi_hz = lapic_adjust_nmi_hz(1);
+
+ kfree(prev_nmi_count);
+ return 0;
+}
+/* This needs to happen later in boot so counters are working */
+late_initcall(check_nmi_watchdog);
+
+static int __init setup_nmi_watchdog(char *str)
+{
+ int nmi;
+
+ get_option(&str, &nmi);
+
+ if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
+ return 0;
+
+ nmi_watchdog = nmi;
+ return 1;
+}
+
+__setup("nmi_watchdog=", setup_nmi_watchdog);
+
+
+/* Suspend/resume support */
+
+#ifdef CONFIG_PM
+
+static int nmi_pm_active; /* nmi_active before suspend */
+
+static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
+{
+ /* only CPU0 goes here, other CPUs should be offline */
+ nmi_pm_active = atomic_read(&nmi_active);
+ stop_apic_nmi_watchdog(NULL);
+ BUG_ON(atomic_read(&nmi_active) != 0);
+ return 0;
+}
+
+static int lapic_nmi_resume(struct sys_device *dev)
+{
+ /* only CPU0 goes here, other CPUs should be offline */
+ if (nmi_pm_active > 0) {
+ setup_apic_nmi_watchdog(NULL);
+ touch_nmi_watchdog();
+ }
+ return 0;
+}
+
+
+static struct sysdev_class nmi_sysclass = {
+ set_kset_name("lapic_nmi"),
+ .resume = lapic_nmi_resume,
+ .suspend = lapic_nmi_suspend,
+};
+
+static struct sys_device device_lapic_nmi = {
+ .id = 0,
+ .cls = &nmi_sysclass,
+};
+
+static int __init init_lapic_nmi_sysfs(void)
+{
+ int error;
+
+ /* should really be a BUG_ON but because this is an
+ * init call, it just doesn't work. -dcz
+ */
+ if (nmi_watchdog != NMI_LOCAL_APIC)
+ return 0;
+
+ if (atomic_read(&nmi_active) < 0)
+ return 0;
+
+ error = sysdev_class_register(&nmi_sysclass);
+ if (!error)
+ error = sysdev_register(&device_lapic_nmi);
+ return error;
+}
+/* must come after the local APIC's device_initcall() */
+late_initcall(init_lapic_nmi_sysfs);
+
+#endif /* CONFIG_PM */
+
+static void __acpi_nmi_enable(void *__unused)
+{
+ apic_write_around(APIC_LVT0, APIC_DM_NMI);
+}
+
+/*
+ * Enable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_enable(void)
+{
+ if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+ on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
+}
+
+static void __acpi_nmi_disable(void *__unused)
+{
+ apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
+/*
+ * Disable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_disable(void)
+{
+ if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+ on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
+}
+
+void setup_apic_nmi_watchdog (void *unused)
+{
+ if (__get_cpu_var(wd_enabled))
+ return;
+
+ /* cheap hack to support suspend/resume */
+ /* if cpu0 is not active, neither should the other cpus be */
+ if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
+ return;
+
+ switch (nmi_watchdog) {
+ case NMI_LOCAL_APIC:
+ __get_cpu_var(wd_enabled) = 1; /* enable it first to avoid a race with the handler */
+ if (lapic_watchdog_init(nmi_hz) < 0) {
+ __get_cpu_var(wd_enabled) = 0;
+ return;
+ }
+ /* FALL THROUGH */
+ case NMI_IO_APIC:
+ __get_cpu_var(wd_enabled) = 1;
+ atomic_inc(&nmi_active);
+ }
+}
+
+void stop_apic_nmi_watchdog(void *unused)
+{
+ /* only support LOCAL and IO APICs for now */
+ if ((nmi_watchdog != NMI_LOCAL_APIC) &&
+ (nmi_watchdog != NMI_IO_APIC))
+ return;
+ if (__get_cpu_var(wd_enabled) == 0)
+ return;
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ lapic_watchdog_stop();
+ __get_cpu_var(wd_enabled) = 0;
+ atomic_dec(&nmi_active);
+}
+
+/*
+ * the best way to detect whether a CPU has a 'hard lockup' problem
+ * is to check its local APIC timer IRQ counts. If they are not
+ * changing, then that CPU has some problem.
+ *
+ * as these watchdog NMI IRQs are generated on every CPU, we only
+ * have to check the current processor.
+ *
+ * since NMIs don't listen to _any_ locks, we have to be extremely
+ * careful not to rely on unsafe variables. The printk might lock
+ * up though, so we have to break up any console locks first ...
+ * [when there will be more tty-related locks, break them up
+ * here too!]
+ */
+
+static unsigned int
+ last_irq_sums [NR_CPUS],
+ alert_counter [NR_CPUS];
+
+void touch_nmi_watchdog(void)
+{
+ if (nmi_watchdog > 0) {
+ unsigned cpu;
+
+ /*
+ * Just reset the alert counters, (other CPUs might be
+ * spinning on locks we hold):
+ */
+ for_each_present_cpu(cpu) {
+ if (alert_counter[cpu])
+ alert_counter[cpu] = 0;
+ }
+ }
+
+ /*
+ * Tickle the softlockup detector too:
+ */
+ touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+extern void die_nmi(struct pt_regs *, const char *msg);
+
+__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
+{
+
+ /*
+ * Since current_thread_info()-> is always on the stack, and we
+ * always switch the stack NMI-atomically, it's safe to use
+ * smp_processor_id().
+ */
+ unsigned int sum;
+ int touched = 0;
+ int cpu = smp_processor_id();
+ int rc=0;
+
+ /* check for other users first */
+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+ == NOTIFY_STOP) {
+ rc = 1;
+ touched = 1;
+ }
+
+ if (cpu_isset(cpu, backtrace_mask)) {
+ static DEFINE_SPINLOCK(lock); /* Serialise the printks */
+
+ spin_lock(&lock);
+ printk("NMI backtrace for cpu %d\n", cpu);
+ dump_stack();
+ spin_unlock(&lock);
+ cpu_clear(cpu, backtrace_mask);
+ }
+
+ /*
+ * Take the local apic timer and PIT/HPET into account. We don't
+ * know which one is active when we have highres/dyntick on.
+ */
+ sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_cpu(cpu).irqs[0];
+
+ /* if none of the timers is firing, this cpu isn't doing much */
+ if (!touched && last_irq_sums[cpu] == sum) {
+ /*
+ * Ayiee, looks like this CPU is stuck ...
+ * wait a few IRQs (5 seconds) before doing the oops ...
+ */
+ alert_counter[cpu]++;
+ if (alert_counter[cpu] == 5*nmi_hz)
+ /*
+ * die_nmi will return ONLY if NOTIFY_STOP happens..
+ */
+ die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
+ } else {
+ last_irq_sums[cpu] = sum;
+ alert_counter[cpu] = 0;
+ }
+ /* see if the nmi watchdog went off */
+ if (!__get_cpu_var(wd_enabled))
+ return rc;
+ switch (nmi_watchdog) {
+ case NMI_LOCAL_APIC:
+ rc |= lapic_wd_event(nmi_hz);
+ break;
+ case NMI_IO_APIC:
+ /* don't know how to accurately check for this.
+ * just assume it was a watchdog timer interrupt;
+ * this matches the old behaviour.
+ */
+ rc = 1;
+ break;
+ }
+ return rc;
+}
+
+int do_nmi_callback(struct pt_regs * regs, int cpu)
+{
+#ifdef CONFIG_SYSCTL
+ if (unknown_nmi_panic)
+ return unknown_nmi_panic_callback(regs, cpu);
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
+{
+ unsigned char reason = get_nmi_reason();
+ char buf[64];
+
+ sprintf(buf, "NMI received for unknown reason %02x\n", reason);
+ die_nmi(regs, buf);
+ return 0;
+}
+
+/*
+ * proc handler for /proc/sys/kernel/nmi
+ */
+int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int old_state;
+
+ nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
+ old_state = nmi_watchdog_enabled;
+ proc_dointvec(table, write, file, buffer, length, ppos);
+ if (!!old_state == !!nmi_watchdog_enabled)
+ return 0;
+
+ if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
+ printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
+ return -EIO;
+ }
+
+ if (nmi_watchdog == NMI_DEFAULT) {
+ if (lapic_watchdog_ok())
+ nmi_watchdog = NMI_LOCAL_APIC;
+ else
+ nmi_watchdog = NMI_IO_APIC;
+ }
+
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ if (nmi_watchdog_enabled)
+ enable_lapic_nmi_watchdog();
+ else
+ disable_lapic_nmi_watchdog();
+ } else {
+ printk( KERN_WARNING
+ "NMI watchdog doesn't know what hardware to touch\n");
+ return -EIO;
+ }
+ return 0;
+}
+
+#endif
+
+void __trigger_all_cpu_backtrace(void)
+{
+ int i;
+
+ backtrace_mask = cpu_online_map;
+ /* Wait for up to 10 seconds for all CPUs to do the backtrace */
+ for (i = 0; i < 10 * 1000; i++) {
+ if (cpus_empty(backtrace_mask))
+ break;
+ mdelay(1);
+ }
+}
+
+EXPORT_SYMBOL(nmi_active);
+EXPORT_SYMBOL(nmi_watchdog);
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
new file mode 100644
index 000000000000..9000d82c6dc0
--- /dev/null
+++ b/arch/x86/kernel/numaq_32.c
@@ -0,0 +1,89 @@
+/*
+ * Written by: Patricia Gaughen, IBM Corporation
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <gone@us.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <asm/numaq.h>
+#include <asm/topology.h>
+#include <asm/processor.h>
+
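+/* Convert a size in megabytes into a number of pages (2^20 bytes per MB). */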
+#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
+
+/*
+ * Function: smp_dump_qct()
+ *
+ * Description: gets memory layout from the quad config table. This
+ * function also updates node_online_map with the nodes (quads) present.
+ */
+static void __init smp_dump_qct(void)
+{
+ int node;
+ struct eachquadmem *eq;
+ struct sys_cfg_data *scd =
+ (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR);
+
+ nodes_clear(node_online_map);
+ for_each_node(node) {
+ if (scd->quads_present31_0 & (1 << node)) {
+ node_set_online(node);
+ eq = &scd->eq[node];
+ /* Convert to pages */
+ node_start_pfn[node] = MB_TO_PAGES(
+ eq->hi_shrd_mem_start - eq->priv_mem_size);
+ node_end_pfn[node] = MB_TO_PAGES(
+ eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
+
+ memory_present(node,
+ node_start_pfn[node], node_end_pfn[node]);
+ node_remap_size[node] = node_memmap_size_bytes(node,
+ node_start_pfn[node],
+ node_end_pfn[node]);
+ }
+ }
+}
+
+/*
+ * Unlike Summit, we don't really care to let the NUMA-Q
+ * fall back to flat mode. Don't compile for NUMA-Q
+ * unless you really need it!
+ */
+int __init get_memcfg_numaq(void)
+{
+ smp_dump_qct();
+ return 1;
+}
+
+static int __init numaq_tsc_disable(void)
+{
+ if (num_online_nodes() > 1) {
+ printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
+ tsc_disable = 1;
+ }
+ return 0;
+}
+arch_initcall(numaq_tsc_disable);
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c
new file mode 100644
index 000000000000..739cfb207dd7
--- /dev/null
+++ b/arch/x86/kernel/paravirt_32.c
@@ -0,0 +1,392 @@
+/* Paravirtualization interfaces
+ Copyright (C) 2006 Rusty Russell IBM Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/bcd.h>
+#include <linux/highmem.h>
+
+#include <asm/bug.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/time.h>
+#include <asm/irq.h>
+#include <asm/delay.h>
+#include <asm/fixmap.h>
+#include <asm/apic.h>
+#include <asm/tlbflush.h>
+#include <asm/timer.h>
+
+/* nop stub */
+void _paravirt_nop(void)
+{
+}
+
+static void __init default_banner(void)
+{
+ printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+ paravirt_ops.name);
+}
+
+char *memory_setup(void)
+{
+ return paravirt_ops.memory_setup();
+}
+
+/* Simple instruction patching code. */
+#define DEF_NATIVE(name, code) \
+ extern const char start_##name[], end_##name[]; \
+ asm("start_" #name ": " code "; end_" #name ":")
+
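+/*
+ * Each DEF_NATIVE below drops the native instruction sequence for one
+ * operation into the image, bracketed by start_<name>/end_<name>
+ * labels, so that native_patch() can copy it over an indirect call
+ * site when it fits.
+ */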
+DEF_NATIVE(irq_disable, "cli");
+DEF_NATIVE(irq_enable, "sti");
+DEF_NATIVE(restore_fl, "push %eax; popf");
+DEF_NATIVE(save_fl, "pushf; pop %eax");
+DEF_NATIVE(iret, "iret");
+DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
+DEF_NATIVE(read_cr2, "mov %cr2, %eax");
+DEF_NATIVE(write_cr3, "mov %eax, %cr3");
+DEF_NATIVE(read_cr3, "mov %cr3, %eax");
+DEF_NATIVE(clts, "clts");
+DEF_NATIVE(read_tsc, "rdtsc");
+
+DEF_NATIVE(ud2a, "ud2a");
+
+static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len)
+{
+ const unsigned char *start, *end;
+ unsigned ret;
+
+ switch(type) {
+#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site
+ SITE(irq_disable);
+ SITE(irq_enable);
+ SITE(restore_fl);
+ SITE(save_fl);
+ SITE(iret);
+ SITE(irq_enable_sysexit);
+ SITE(read_cr2);
+ SITE(read_cr3);
+ SITE(write_cr3);
+ SITE(clts);
+ SITE(read_tsc);
+#undef SITE
+
+ patch_site:
+ ret = paravirt_patch_insns(ibuf, len, start, end);
+ break;
+
+ case PARAVIRT_PATCH(make_pgd):
+ case PARAVIRT_PATCH(make_pte):
+ case PARAVIRT_PATCH(pgd_val):
+ case PARAVIRT_PATCH(pte_val):
+#ifdef CONFIG_X86_PAE
+ case PARAVIRT_PATCH(make_pmd):
+ case PARAVIRT_PATCH(pmd_val):
+#endif
+ /* These functions end up returning exactly what
+ they're passed, in the same registers. */
+ ret = paravirt_patch_nop();
+ break;
+
+ default:
+ ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
+ break;
+ }
+
+ return ret;
+}
+
+unsigned paravirt_patch_nop(void)
+{
+ return 0;
+}
+
+unsigned paravirt_patch_ignore(unsigned len)
+{
+ return len;
+}
+
+struct branch {
+ unsigned char opcode;
+ u32 delta;
+} __attribute__((packed));
+
+unsigned paravirt_patch_call(void *insnbuf,
+ const void *target, u16 tgt_clobbers,
+ unsigned long addr, u16 site_clobbers,
+ unsigned len)
+{
+ struct branch *b = insnbuf;
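+ /* E8/E9 rel32 displacements are relative to the end of the 5-byte instruction. */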
+ unsigned long delta = (unsigned long)target - (addr+5);
+
+ if (tgt_clobbers & ~site_clobbers)
+ return len; /* target would clobber too much for this site */
+ if (len < 5)
+ return len; /* call too long for patch site */
+
+ b->opcode = 0xe8; /* call */
+ b->delta = delta;
+ BUILD_BUG_ON(sizeof(*b) != 5);
+
+ return 5;
+}
+
+unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
+ unsigned long addr, unsigned len)
+{
+ struct branch *b = insnbuf;
+ unsigned long delta = (unsigned long)target - (addr+5);
+
+ if (len < 5)
+ return len; /* jmp too long for patch site */
+
+ b->opcode = 0xe9; /* jmp */
+ b->delta = delta;
+
+ return 5;
+}
+
+unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
+ unsigned long addr, unsigned len)
+{
+ void *opfunc = *((void **)&paravirt_ops + type);
+ unsigned ret;
+
+ if (opfunc == NULL)
+ /* If there's no function, patch it with a ud2a (BUG) */
+ ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a);
+ else if (opfunc == paravirt_nop)
+ /* If the operation is a nop, then nop the callsite */
+ ret = paravirt_patch_nop();
+ else if (type == PARAVIRT_PATCH(iret) ||
+ type == PARAVIRT_PATCH(irq_enable_sysexit))
+ /* If operation requires a jmp, then jmp */
+ ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len);
+ else
+ /* Otherwise call the function; assume target could
+ clobber any caller-save reg */
+ ret = paravirt_patch_call(insnbuf, opfunc, CLBR_ANY,
+ addr, clobbers, len);
+
+ return ret;
+}
+
+unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
+ const char *start, const char *end)
+{
+ unsigned insn_len = end - start;
+
+ if (insn_len > len || start == NULL)
+ insn_len = len;
+ else
+ memcpy(insnbuf, start, insn_len);
+
+ return insn_len;
+}
+
+void init_IRQ(void)
+{
+ paravirt_ops.init_IRQ();
+}
+
+static void native_flush_tlb(void)
+{
+ __native_flush_tlb();
+}
+
+/*
+ * Global pages have to be flushed a bit differently. Not a real
+ * performance problem because this does not happen often.
+ */
+static void native_flush_tlb_global(void)
+{
+ __native_flush_tlb_global();
+}
+
+static void native_flush_tlb_single(unsigned long addr)
+{
+ __native_flush_tlb_single(addr);
+}
+
+/* These are in entry.S */
+extern void native_iret(void);
+extern void native_irq_enable_sysexit(void);
+
+static int __init print_banner(void)
+{
+ paravirt_ops.banner();
+ return 0;
+}
+core_initcall(print_banner);
+
+static struct resource reserve_ioports = {
+ .start = 0,
+ .end = IO_SPACE_LIMIT,
+ .name = "paravirt-ioport",
+ .flags = IORESOURCE_IO | IORESOURCE_BUSY,
+};
+
+static struct resource reserve_iomem = {
+ .start = 0,
+ .end = -1,
+ .name = "paravirt-iomem",
+ .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
+/*
+ * Reserve the whole legacy IO space to prevent any legacy drivers
+ * from wasting time probing for their hardware. This is a fairly
+ * brute-force approach to disabling all non-virtual drivers.
+ *
+ * Note that this must be called very early to have any effect.
+ */
+int paravirt_disable_iospace(void)
+{
+ int ret;
+
+ ret = request_resource(&ioport_resource, &reserve_ioports);
+ if (ret == 0) {
+ ret = request_resource(&iomem_resource, &reserve_iomem);
+ if (ret)
+ release_resource(&reserve_ioports);
+ }
+
+ return ret;
+}
+
+struct paravirt_ops paravirt_ops = {
+ .name = "bare hardware",
+ .paravirt_enabled = 0,
+ .kernel_rpl = 0,
+ .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
+
+ .patch = native_patch,
+ .banner = default_banner,
+ .arch_setup = paravirt_nop,
+ .memory_setup = machine_specific_memory_setup,
+ .get_wallclock = native_get_wallclock,
+ .set_wallclock = native_set_wallclock,
+ .time_init = hpet_time_init,
+ .init_IRQ = native_init_IRQ,
+
+ .cpuid = native_cpuid,
+ .get_debugreg = native_get_debugreg,
+ .set_debugreg = native_set_debugreg,
+ .clts = native_clts,
+ .read_cr0 = native_read_cr0,
+ .write_cr0 = native_write_cr0,
+ .read_cr2 = native_read_cr2,
+ .write_cr2 = native_write_cr2,
+ .read_cr3 = native_read_cr3,
+ .write_cr3 = native_write_cr3,
+ .read_cr4 = native_read_cr4,
+ .read_cr4_safe = native_read_cr4_safe,
+ .write_cr4 = native_write_cr4,
+ .save_fl = native_save_fl,
+ .restore_fl = native_restore_fl,
+ .irq_disable = native_irq_disable,
+ .irq_enable = native_irq_enable,
+ .safe_halt = native_safe_halt,
+ .halt = native_halt,
+ .wbinvd = native_wbinvd,
+ .read_msr = native_read_msr_safe,
+ .write_msr = native_write_msr_safe,
+ .read_tsc = native_read_tsc,
+ .read_pmc = native_read_pmc,
+ .sched_clock = native_sched_clock,
+ .get_cpu_khz = native_calculate_cpu_khz,
+ .load_tr_desc = native_load_tr_desc,
+ .set_ldt = native_set_ldt,
+ .load_gdt = native_load_gdt,
+ .load_idt = native_load_idt,
+ .store_gdt = native_store_gdt,
+ .store_idt = native_store_idt,
+ .store_tr = native_store_tr,
+ .load_tls = native_load_tls,
+ .write_ldt_entry = write_dt_entry,
+ .write_gdt_entry = write_dt_entry,
+ .write_idt_entry = write_dt_entry,
+ .load_esp0 = native_load_esp0,
+
+ .set_iopl_mask = native_set_iopl_mask,
+ .io_delay = native_io_delay,
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ .apic_write = native_apic_write,
+ .apic_write_atomic = native_apic_write_atomic,
+ .apic_read = native_apic_read,
+ .setup_boot_clock = setup_boot_APIC_clock,
+ .setup_secondary_clock = setup_secondary_APIC_clock,
+ .startup_ipi_hook = paravirt_nop,
+#endif
+ .set_lazy_mode = paravirt_nop,
+
+ .pagetable_setup_start = native_pagetable_setup_start,
+ .pagetable_setup_done = native_pagetable_setup_done,
+
+ .flush_tlb_user = native_flush_tlb,
+ .flush_tlb_kernel = native_flush_tlb_global,
+ .flush_tlb_single = native_flush_tlb_single,
+ .flush_tlb_others = native_flush_tlb_others,
+
+ .alloc_pt = paravirt_nop,
+ .alloc_pd = paravirt_nop,
+ .alloc_pd_clone = paravirt_nop,
+ .release_pt = paravirt_nop,
+ .release_pd = paravirt_nop,
+
+ .set_pte = native_set_pte,
+ .set_pte_at = native_set_pte_at,
+ .set_pmd = native_set_pmd,
+ .pte_update = paravirt_nop,
+ .pte_update_defer = paravirt_nop,
+
+#ifdef CONFIG_HIGHPTE
+ .kmap_atomic_pte = kmap_atomic,
+#endif
+
+#ifdef CONFIG_X86_PAE
+ .set_pte_atomic = native_set_pte_atomic,
+ .set_pte_present = native_set_pte_present,
+ .set_pud = native_set_pud,
+ .pte_clear = native_pte_clear,
+ .pmd_clear = native_pmd_clear,
+
+ .pmd_val = native_pmd_val,
+ .make_pmd = native_make_pmd,
+#endif
+
+ .pte_val = native_pte_val,
+ .pgd_val = native_pgd_val,
+
+ .make_pte = native_make_pte,
+ .make_pgd = native_make_pgd,
+
+ .irq_enable_sysexit = native_irq_enable_sysexit,
+ .iret = native_iret,
+
+ .dup_mmap = paravirt_nop,
+ .exit_mmap = paravirt_nop,
+ .activate_mm = paravirt_nop,
+};
+
+EXPORT_SYMBOL(paravirt_ops);
diff --git a/arch/x86/kernel/pci-dma_32.c b/arch/x86/kernel/pci-dma_32.c
new file mode 100644
index 000000000000..048f09b62553
--- /dev/null
+++ b/arch/x86/kernel/pci-dma_32.c
@@ -0,0 +1,177 @@
+/*
+ * Dynamic DMA mapping support.
+ *
+ * On i386 there is no hardware dynamic DMA address translation,
+ * so consistent alloc/free are merely page allocation/freeing.
+ * The rest of the dynamic DMA mapping interface is implemented
+ * in asm/pci.h.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <asm/io.h>
+
+struct dma_coherent_mem {
+ void *virt_base;
+ u32 device_base;
+ int size;
+ int flags;
+ unsigned long *bitmap;
+};
+
+void *dma_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp)
+{
+ void *ret;
+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+ int order = get_order(size);
+ /* ignore region specifiers */
+ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
+
+ if (mem) {
+ int page = bitmap_find_free_region(mem->bitmap, mem->size,
+ order);
+ if (page >= 0) {
+ *dma_handle = mem->device_base + (page << PAGE_SHIFT);
+ ret = mem->virt_base + (page << PAGE_SHIFT);
+ memset(ret, 0, size);
+ return ret;
+ }
+ if (mem->flags & DMA_MEMORY_EXCLUSIVE)
+ return NULL;
+ }
+
+ if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
+ gfp |= GFP_DMA;
+
+ ret = (void *)__get_free_pages(gfp, order);
+
+ if (ret != NULL) {
+ memset(ret, 0, size);
+ *dma_handle = virt_to_phys(ret);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(dma_alloc_coherent);
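+/*
+ * Illustrative usage sketch (not part of the original patch): a PCI
+ * driver would typically allocate a coherent descriptor ring like this
+ * ("ring" and "ring_dma" are hypothetical names):
+ *
+ *	dma_addr_t ring_dma;
+ *	void *ring = dma_alloc_coherent(&pdev->dev, PAGE_SIZE,
+ *					&ring_dma, GFP_KERNEL);
+ *	if (!ring)
+ *		return -ENOMEM;
+ *	...
+ *	dma_free_coherent(&pdev->dev, PAGE_SIZE, ring, ring_dma);
+ */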
+
+void dma_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_handle)
+{
+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+ int order = get_order(size);
+
+	if (mem && vaddr >= mem->virt_base &&
+	    vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
+
+ bitmap_release_region(mem->bitmap, page, order);
+ } else
+ free_pages((unsigned long)vaddr, order);
+}
+EXPORT_SYMBOL(dma_free_coherent);
+
+int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
+ dma_addr_t device_addr, size_t size, int flags)
+{
+ void __iomem *mem_base = NULL;
+ int pages = size >> PAGE_SHIFT;
+ int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
+
+ if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
+ goto out;
+ if (!size)
+ goto out;
+ if (dev->dma_mem)
+ goto out;
+
+ /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
+
+ mem_base = ioremap(bus_addr, size);
+ if (!mem_base)
+ goto out;
+
+ dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
+ if (!dev->dma_mem)
+ goto out;
+ dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ if (!dev->dma_mem->bitmap)
+ goto free1_out;
+
+ dev->dma_mem->virt_base = mem_base;
+ dev->dma_mem->device_base = device_addr;
+ dev->dma_mem->size = pages;
+ dev->dma_mem->flags = flags;
+
+ if (flags & DMA_MEMORY_MAP)
+ return DMA_MEMORY_MAP;
+
+ return DMA_MEMORY_IO;
+
+ free1_out:
+ kfree(dev->dma_mem);
+ out:
+ if (mem_base)
+ iounmap(mem_base);
+ return 0;
+}
+EXPORT_SYMBOL(dma_declare_coherent_memory);
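+/*
+ * Illustrative sketch (not part of the original patch): a driver for a
+ * device with dedicated on-board memory could steer coherent allocations
+ * into it ("BOARD_SRAM_BUS" and "BOARD_SRAM_DEV" are hypothetical
+ * addresses):
+ *
+ *	if (!dma_declare_coherent_memory(&pdev->dev, BOARD_SRAM_BUS,
+ *					 BOARD_SRAM_DEV, 0x10000,
+ *					 DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE))
+ *		return -ENOMEM;
+ *	...
+ *	dma_release_declared_memory(&pdev->dev);
+ */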
+
+void dma_release_declared_memory(struct device *dev)
+{
+ struct dma_coherent_mem *mem = dev->dma_mem;
+
+	if (!mem)
+ return;
+ dev->dma_mem = NULL;
+ iounmap(mem->virt_base);
+ kfree(mem->bitmap);
+ kfree(mem);
+}
+EXPORT_SYMBOL(dma_release_declared_memory);
+
+void *dma_mark_declared_memory_occupied(struct device *dev,
+ dma_addr_t device_addr, size_t size)
+{
+ struct dma_coherent_mem *mem = dev->dma_mem;
+ int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ int pos, err;
+
+ if (!mem)
+ return ERR_PTR(-EINVAL);
+
+ pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
+ err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
+ if (err != 0)
+ return ERR_PTR(err);
+ return mem->virt_base + (pos << PAGE_SHIFT);
+}
+EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
+
+#ifdef CONFIG_PCI
+/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
+
+int forbid_dac;
+EXPORT_SYMBOL(forbid_dac);
+
+static __devinit void via_no_dac(struct pci_dev *dev)
+{
+ if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+ printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
+ forbid_dac = 1;
+ }
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+
+static int check_iommu(char *s)
+{
+ if (!strcmp(s, "usedac")) {
+ forbid_dac = -1;
+ return 1;
+ }
+ return 0;
+}
+__setup("iommu=", check_iommu);
+#endif
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
new file mode 100644
index 000000000000..bc1f2d3ea277
--- /dev/null
+++ b/arch/x86/kernel/pcspeaker.c
@@ -0,0 +1,20 @@
+#include <linux/platform_device.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+static __init int add_pcspkr(void)
+{
+ struct platform_device *pd;
+ int ret;
+
+ pd = platform_device_alloc("pcspkr", -1);
+ if (!pd)
+ return -ENOMEM;
+
+ ret = platform_device_add(pd);
+ if (ret)
+ platform_device_put(pd);
+
+ return ret;
+}
+device_initcall(add_pcspkr);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
new file mode 100644
index 000000000000..84664710b784
--- /dev/null
+++ b/arch/x86/kernel/process_32.c
@@ -0,0 +1,951 @@
+/*
+ * linux/arch/i386/kernel/process.c
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * This file handles the architecture-dependent parts of process handling..
+ */
+
+#include <stdarg.h>
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/elfcore.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/interrupt.h>
+#include <linux/utsname.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/mc146818rtc.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/ptrace.h>
+#include <linux/random.h>
+#include <linux/personality.h>
+#include <linux/tick.h>
+#include <linux/percpu.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/ldt.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/desc.h>
+#include <asm/vm86.h>
+#ifdef CONFIG_MATH_EMULATION
+#include <asm/math_emu.h>
+#endif
+
+#include <linux/err.h>
+
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
+
+asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+
+static int hlt_counter;
+
+unsigned long boot_option_idle_override = 0;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+DEFINE_PER_CPU(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
+
+/*
+ * Return saved PC of a blocked thread.
+ */
+unsigned long thread_saved_pc(struct task_struct *tsk)
+{
+ return ((unsigned long *)tsk->thread.esp)[3];
+}
+
+/*
+ * Power management idle function, if any..
+ */
+void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
+static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
+
+void disable_hlt(void)
+{
+ hlt_counter++;
+}
+
+EXPORT_SYMBOL(disable_hlt);
+
+void enable_hlt(void)
+{
+ hlt_counter--;
+}
+
+EXPORT_SYMBOL(enable_hlt);
+
+/*
+ * We use this if we don't have any better
+ * idle routine..
+ */
+void default_idle(void)
+{
+ if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
+ current_thread_info()->status &= ~TS_POLLING;
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
+
+ local_irq_disable();
+ if (!need_resched())
+ safe_halt(); /* enables interrupts racelessly */
+ else
+ local_irq_enable();
+ current_thread_info()->status |= TS_POLLING;
+ } else {
+ /* loop is done by the caller */
+ cpu_relax();
+ }
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);
+#endif
+
+/*
+ * On SMP it's slightly faster (but much more power-consuming!)
+ * to poll the ->work.need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ */
+static void poll_idle(void)
+{
+ cpu_relax();
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+#include <asm/nmi.h>
+/* We don't actually take the CPU down, just spin without interrupts. */
+static inline void play_dead(void)
+{
+ /* This must be done before dead CPU ack */
+ cpu_exit_clear();
+ wbinvd();
+ mb();
+ /* Ack it */
+ __get_cpu_var(cpu_state) = CPU_DEAD;
+
+ /*
+ * With physical CPU hotplug, we should halt the cpu
+ */
+ local_irq_disable();
+ while (1)
+ halt();
+}
+#else
+static inline void play_dead(void)
+{
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * The idle thread. There's no useful work to be
+ * done, so just try to conserve power and have a
+ * low exit latency (ie sit in a loop waiting for
+ * somebody to say that they'd like to reschedule)
+ */
+void cpu_idle(void)
+{
+ int cpu = smp_processor_id();
+
+ current_thread_info()->status |= TS_POLLING;
+
+ /* endless idle loop with no priority at all */
+ while (1) {
+ tick_nohz_stop_sched_tick();
+ while (!need_resched()) {
+ void (*idle)(void);
+
+ if (__get_cpu_var(cpu_idle_state))
+ __get_cpu_var(cpu_idle_state) = 0;
+
+ check_pgt_cache();
+ rmb();
+ idle = pm_idle;
+
+ if (!idle)
+ idle = default_idle;
+
+ if (cpu_is_offline(cpu))
+ play_dead();
+
+ __get_cpu_var(irq_stat).idle_timestamp = jiffies;
+ idle();
+ }
+ tick_nohz_restart_sched_tick();
+ preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+ }
+}
+
+void cpu_idle_wait(void)
+{
+ unsigned int cpu, this_cpu = get_cpu();
+ cpumask_t map, tmp = current->cpus_allowed;
+
+ set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
+ put_cpu();
+
+ cpus_clear(map);
+ for_each_online_cpu(cpu) {
+ per_cpu(cpu_idle_state, cpu) = 1;
+ cpu_set(cpu, map);
+ }
+
+ __get_cpu_var(cpu_idle_state) = 0;
+
+ wmb();
+ do {
+ ssleep(1);
+ for_each_online_cpu(cpu) {
+ if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
+ cpu_clear(cpu, map);
+ }
+ cpus_and(map, map, cpu_online_map);
+ } while (!cpus_empty(map));
+
+ set_cpus_allowed(current, tmp);
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+/*
+ * This uses the new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate the IPI otherwise needed to trigger a check of
+ * need_resched. We execute MONITOR against need_resched and enter an
+ * optimized wait state through MWAIT. Whenever someone changes
+ * need_resched, we are woken up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+{
+ if (!need_resched()) {
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __mwait(eax, ecx);
+ }
+}
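+/*
+ * Illustrative sketch (not part of the original patch): a caller such as
+ * an ACPI idle driver would pass the target C-state hint in eax and set
+ * ecx bit 0 so that interrupts break the wait even when masked, e.g.:
+ *
+ *	mwait_idle_with_hints(cstate_hint, 1);
+ *
+ * ("cstate_hint" is a hypothetical value taken from firmware tables.)
+ */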
+
+/* Default MONITOR/MWAIT with no hints, used for default C1 state */
+static void mwait_idle(void)
+{
+ local_irq_enable();
+ mwait_idle_with_hints(0, 0);
+}
+
+void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_MWAIT)) {
+ printk("monitor/mwait feature present.\n");
+ /*
+		 * Skip if setup has overridden idle.
+		 * One CPU supports mwait => all CPUs support mwait.
+ */
+ if (!pm_idle) {
+ printk("using mwait in idle threads.\n");
+ pm_idle = mwait_idle;
+ }
+ }
+}
+
+static int __init idle_setup(char *str)
+{
+ if (!strcmp(str, "poll")) {
+ printk("using polling idle threads.\n");
+ pm_idle = poll_idle;
+#ifdef CONFIG_X86_SMP
+ if (smp_num_siblings > 1)
+ printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
+#endif
+ } else if (!strcmp(str, "mwait"))
+ force_mwait = 1;
+ else
+ return -1;
+
+ boot_option_idle_override = 1;
+ return 0;
+}
+early_param("idle", idle_setup);
+
+void show_regs(struct pt_regs * regs)
+{
+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
+ unsigned long d0, d1, d2, d3, d6, d7;
+
+ printk("\n");
+ printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
+	printk("EIP: %04x:[<%08lx>] CPU: %d\n", 0xffff & regs->xcs,
+	       regs->eip, smp_processor_id());
+ print_symbol("EIP is at %s\n", regs->eip);
+
+ if (user_mode_vm(regs))
+		printk(" ESP: %04x:%08lx", 0xffff & regs->xss, regs->esp);
+ printk(" EFLAGS: %08lx %s (%s %.*s)\n",
+ regs->eflags, print_tainted(), init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+		regs->eax, regs->ebx, regs->ecx, regs->edx);
+ printk("ESI: %08lx EDI: %08lx EBP: %08lx",
+ regs->esi, regs->edi, regs->ebp);
+ printk(" DS: %04x ES: %04x FS: %04x\n",
+		0xffff & regs->xds, 0xffff & regs->xes, 0xffff & regs->xfs);
+
+ cr0 = read_cr0();
+ cr2 = read_cr2();
+ cr3 = read_cr3();
+ cr4 = read_cr4_safe();
+ printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
+
+ get_debugreg(d0, 0);
+ get_debugreg(d1, 1);
+ get_debugreg(d2, 2);
+ get_debugreg(d3, 3);
+ printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+ d0, d1, d2, d3);
+ get_debugreg(d6, 6);
+ get_debugreg(d7, 7);
+ printk("DR6: %08lx DR7: %08lx\n", d6, d7);
+
+ show_trace(NULL, regs, &regs->esp);
+}
+
+/*
+ * This gets run with %ebx containing the
+ * function to call, and %edx containing
+ * the "args".
+ */
+extern void kernel_thread_helper(void);
+
+/*
+ * Create a kernel thread
+ */
+int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+{
+ struct pt_regs regs;
+
+ memset(&regs, 0, sizeof(regs));
+
+ regs.ebx = (unsigned long) fn;
+ regs.edx = (unsigned long) arg;
+
+ regs.xds = __USER_DS;
+ regs.xes = __USER_DS;
+ regs.xfs = __KERNEL_PERCPU;
+ regs.orig_eax = -1;
+ regs.eip = (unsigned long) kernel_thread_helper;
+ regs.xcs = __KERNEL_CS | get_kernel_rpl();
+ regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
+
+ /* Ok, create the new process.. */
+ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
+}
+EXPORT_SYMBOL(kernel_thread);
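+/*
+ * Illustrative usage sketch (not part of the original patch): an
+ * in-kernel caller would typically start a helper thread like this
+ * ("my_thread_fn" is a hypothetical int (*)(void *) function):
+ *
+ *	pid_t pid = kernel_thread(my_thread_fn, NULL,
+ *				  CLONE_FS | CLONE_FILES | SIGCHLD);
+ *	if (pid < 0)
+ *		return pid;
+ */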
+
+/*
+ * Free current thread data structures etc..
+ */
+void exit_thread(void)
+{
+ /* The process may have allocated an io port bitmap... nuke it. */
+ if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
+ struct task_struct *tsk = current;
+ struct thread_struct *t = &tsk->thread;
+ int cpu = get_cpu();
+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
+
+ kfree(t->io_bitmap_ptr);
+ t->io_bitmap_ptr = NULL;
+ clear_thread_flag(TIF_IO_BITMAP);
+ /*
+ * Careful, clear this in the TSS too:
+ */
+ memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
+ t->io_bitmap_max = 0;
+ tss->io_bitmap_owner = NULL;
+ tss->io_bitmap_max = 0;
+ tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
+ put_cpu();
+ }
+}
+
+void flush_thread(void)
+{
+ struct task_struct *tsk = current;
+
+ memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ clear_tsk_thread_flag(tsk, TIF_DEBUG);
+ /*
+ * Forget coprocessor state..
+ */
+ clear_fpu(tsk);
+ clear_used_math();
+}
+
+void release_thread(struct task_struct *dead_task)
+{
+ BUG_ON(dead_task->mm);
+ release_vm86_irqs(dead_task);
+}
+
+/*
+ * This gets called before we allocate a new thread and copy
+ * the current task into it.
+ */
+void prepare_to_copy(struct task_struct *tsk)
+{
+ unlazy_fpu(tsk);
+}
+
+int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
+ unsigned long unused,
+ struct task_struct * p, struct pt_regs * regs)
+{
+ struct pt_regs * childregs;
+ struct task_struct *tsk;
+ int err;
+
+ childregs = task_pt_regs(p);
+ *childregs = *regs;
+ childregs->eax = 0;
+ childregs->esp = esp;
+
+ p->thread.esp = (unsigned long) childregs;
+ p->thread.esp0 = (unsigned long) (childregs+1);
+
+ p->thread.eip = (unsigned long) ret_from_fork;
+
+ savesegment(gs,p->thread.gs);
+
+ tsk = current;
+ if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
+ p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES, GFP_KERNEL);
+ if (!p->thread.io_bitmap_ptr) {
+ p->thread.io_bitmap_max = 0;
+ return -ENOMEM;
+ }
+ set_tsk_thread_flag(p, TIF_IO_BITMAP);
+ }
+
+ /*
+ * Set a new TLS for the child thread?
+ */
+ if (clone_flags & CLONE_SETTLS) {
+ struct desc_struct *desc;
+ struct user_desc info;
+ int idx;
+
+ err = -EFAULT;
+ if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
+ goto out;
+ err = -EINVAL;
+ if (LDT_empty(&info))
+ goto out;
+
+ idx = info.entry_number;
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ goto out;
+
+ desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
+ desc->a = LDT_entry_a(&info);
+ desc->b = LDT_entry_b(&info);
+ }
+
+ err = 0;
+ out:
+ if (err && p->thread.io_bitmap_ptr) {
+ kfree(p->thread.io_bitmap_ptr);
+ p->thread.io_bitmap_max = 0;
+ }
+ return err;
+}
+
+/*
+ * fill in the user structure for a core dump..
+ */
+void dump_thread(struct pt_regs * regs, struct user * dump)
+{
+ int i;
+
+/* changed the size calculations - should hopefully work better. lbt */
+ dump->magic = CMAGIC;
+ dump->start_code = 0;
+ dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
+ dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
+ dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
+ dump->u_dsize -= dump->u_tsize;
+ dump->u_ssize = 0;
+ for (i = 0; i < 8; i++)
+ dump->u_debugreg[i] = current->thread.debugreg[i];
+
+ if (dump->start_stack < TASK_SIZE)
+ dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
+
+ dump->regs.ebx = regs->ebx;
+ dump->regs.ecx = regs->ecx;
+ dump->regs.edx = regs->edx;
+ dump->regs.esi = regs->esi;
+ dump->regs.edi = regs->edi;
+ dump->regs.ebp = regs->ebp;
+ dump->regs.eax = regs->eax;
+ dump->regs.ds = regs->xds;
+ dump->regs.es = regs->xes;
+ dump->regs.fs = regs->xfs;
+ savesegment(gs,dump->regs.gs);
+ dump->regs.orig_eax = regs->orig_eax;
+ dump->regs.eip = regs->eip;
+ dump->regs.cs = regs->xcs;
+ dump->regs.eflags = regs->eflags;
+ dump->regs.esp = regs->esp;
+ dump->regs.ss = regs->xss;
+
+ dump->u_fpvalid = dump_fpu (regs, &dump->i387);
+}
+EXPORT_SYMBOL(dump_thread);
+
+/*
+ * Capture the user space registers if the task is not running (in user space)
+ */
+int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
+{
+ struct pt_regs ptregs = *task_pt_regs(tsk);
+ ptregs.xcs &= 0xffff;
+ ptregs.xds &= 0xffff;
+ ptregs.xes &= 0xffff;
+ ptregs.xss &= 0xffff;
+
+ elf_core_copy_regs(regs, &ptregs);
+
+ return 1;
+}
+
+#ifdef CONFIG_SECCOMP
+void hard_disable_TSC(void)
+{
+ write_cr4(read_cr4() | X86_CR4_TSD);
+}
+void disable_TSC(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_disable_TSC();
+ preempt_enable();
+}
+void hard_enable_TSC(void)
+{
+ write_cr4(read_cr4() & ~X86_CR4_TSD);
+}
+#endif /* CONFIG_SECCOMP */
+
+static noinline void
+__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+ struct tss_struct *tss)
+{
+ struct thread_struct *next;
+
+ next = &next_p->thread;
+
+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+ set_debugreg(next->debugreg[0], 0);
+ set_debugreg(next->debugreg[1], 1);
+ set_debugreg(next->debugreg[2], 2);
+ set_debugreg(next->debugreg[3], 3);
+ /* no 4 and 5 */
+ set_debugreg(next->debugreg[6], 6);
+ set_debugreg(next->debugreg[7], 7);
+ }
+
+#ifdef CONFIG_SECCOMP
+ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+ test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+ /* prev and next are different */
+ if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+ hard_disable_TSC();
+ else
+ hard_enable_TSC();
+ }
+#endif
+
+ if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
+ /*
+ * Disable the bitmap via an invalid offset. We still cache
+ * the previous bitmap owner and the IO bitmap contents:
+ */
+ tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
+ return;
+ }
+
+ if (likely(next == tss->io_bitmap_owner)) {
+ /*
+ * Previous owner of the bitmap (hence the bitmap content)
+		 * matches the next task; we don't have to do anything but
+		 * set a valid offset in the TSS:
+ */
+ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
+ return;
+ }
+ /*
+	 * Lazy TSS I/O bitmap copy. We set an invalid offset here and
+	 * let the task get a GPF in case an I/O instruction is
+	 * performed. The GPF handler will verify that the faulting
+	 * task has a valid I/O bitmap and, if true, do the real copy
+	 * and restart the instruction. This will save us
+ * redundant copies when the currently switched task does not
+ * perform any I/O during its timeslice.
+ */
+ tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
+}
+
+/*
+ * switch_to(x,y) should switch tasks from x to y.
+ *
+ * We fsave/fwait so that an exception goes off at the right time
+ * (as a call from the fsave or fwait in effect) rather than to
+ * the wrong process. Lazy FP saving no longer makes any sense
+ * with modern CPUs, and this simplifies a lot of things (SMP
+ * and UP become the same).
+ *
+ * NOTE! We used to use the x86 hardware context switching. The
+ * reason for not using it any more becomes apparent when you
+ * try to recover gracefully from saved state that is no longer
+ * valid (stale segment register values in particular). With the
+ * hardware task-switch, there is no way to fix up bad state in
+ * a reasonable manner.
+ *
+ * The fact that Intel documents the hardware task-switching to
+ * be slow is a fairly red herring - this code is not noticeably
+ * faster. However, there _is_ some room for improvement here,
+ * so the performance issues may eventually be a valid point.
+ * More important, however, is the fact that this allows us much
+ * more flexibility.
+ *
+ * The return value (in %eax) will be the "prev" task after
+ * the task-switch, and shows up in ret_from_fork in entry.S,
+ * for example.
+ */
+struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+ struct thread_struct *prev = &prev_p->thread,
+ *next = &next_p->thread;
+ int cpu = smp_processor_id();
+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
+
+ /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
+
+ __unlazy_fpu(prev_p);
+
+ /* we're going to use this soon, after a few expensive things */
+ if (next_p->fpu_counter > 5)
+ prefetch(&next->i387.fxsave);
+
+ /*
+ * Reload esp0.
+ */
+ load_esp0(tss, next);
+
+ /*
+ * Save away %gs. No need to save %fs, as it was saved on the
+ * stack on entry. No need to save %es and %ds, as those are
+ * always kernel segments while inside the kernel. Doing this
+ * before setting the new TLS descriptors avoids the situation
+ * where we temporarily have non-reloadable segments in %fs
+ * and %gs. This could be an issue if the NMI handler ever
+ * used %fs or %gs (it does not today), or if the kernel is
+ * running inside of a hypervisor layer.
+ */
+ savesegment(gs, prev->gs);
+
+ /*
+ * Load the per-thread Thread-Local Storage descriptor.
+ */
+ load_TLS(next, cpu);
+
+ /*
+ * Restore IOPL if needed. In normal use, the flags restore
+ * in the switch assembly will handle this. But if the kernel
+ * is running virtualized at a non-zero CPL, the popf will
+ * not restore flags, so it must be done in a separate step.
+ */
+ if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
+ set_iopl_mask(next->iopl);
+
+ /*
+ * Now maybe handle debug registers and/or IO bitmaps
+ */
+ if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
+ task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
+ __switch_to_xtra(prev_p, next_p, tss);
+
+ /*
+ * Leave lazy mode, flushing any hypercalls made here.
+ * This must be done before restoring TLS segments so
+ * the GDT and LDT are properly updated, and must be
+ * done before math_state_restore, so the TS bit is up
+ * to date.
+ */
+ arch_leave_lazy_cpu_mode();
+
+ /* If the task has used fpu the last 5 timeslices, just do a full
+ * restore of the math state immediately to avoid the trap; the
+ * chances of needing FPU soon are obviously high now
+ */
+ if (next_p->fpu_counter > 5)
+ math_state_restore();
+
+ /*
+ * Restore %gs if needed (which is common)
+ */
+ if (prev->gs | next->gs)
+ loadsegment(gs, next->gs);
+
+ x86_write_percpu(current_task, next_p);
+
+ return prev_p;
+}
+
+asmlinkage int sys_fork(struct pt_regs regs)
+{
+ return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
+}
+
+asmlinkage int sys_clone(struct pt_regs regs)
+{
+ unsigned long clone_flags;
+ unsigned long newsp;
+ int __user *parent_tidptr, *child_tidptr;
+
+ clone_flags = regs.ebx;
+ newsp = regs.ecx;
+ parent_tidptr = (int __user *)regs.edx;
+ child_tidptr = (int __user *)regs.edi;
+ if (!newsp)
+ newsp = regs.esp;
+ return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
+}
+
+/*
+ * This is trivial, and on the face of it looks like it
+ * could equally well be done in user mode.
+ *
+ * Not so, for quite unobvious reasons - register pressure.
+ * In user mode vfork() cannot have a stack frame, and if
+ * done by calling the "clone()" system call directly, you
+ * do not have enough call-clobbered registers to hold all
+ * the information you need.
+ */
+asmlinkage int sys_vfork(struct pt_regs regs)
+{
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
+}
+
+/*
+ * sys_execve() executes a new program.
+ */
+asmlinkage int sys_execve(struct pt_regs regs)
+{
+ int error;
+ char * filename;
+
+ filename = getname((char __user *) regs.ebx);
+ error = PTR_ERR(filename);
+ if (IS_ERR(filename))
+ goto out;
+ error = do_execve(filename,
+ (char __user * __user *) regs.ecx,
+ (char __user * __user *) regs.edx,
+ &regs);
+ if (error == 0) {
+ task_lock(current);
+ current->ptrace &= ~PT_DTRACE;
+ task_unlock(current);
+ /* Make sure we don't return using sysenter.. */
+ set_thread_flag(TIF_IRET);
+ }
+ putname(filename);
+out:
+ return error;
+}
+
+#define top_esp (THREAD_SIZE - sizeof(unsigned long))
+#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
+
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long ebp, esp, eip;
+ unsigned long stack_page;
+ int count = 0;
+ if (!p || p == current || p->state == TASK_RUNNING)
+ return 0;
+ stack_page = (unsigned long)task_stack_page(p);
+ esp = p->thread.esp;
+ if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
+ return 0;
+ /* include/asm-i386/system.h:switch_to() pushes ebp last. */
+ ebp = *(unsigned long *) esp;
+ do {
+ if (ebp < stack_page || ebp > top_ebp+stack_page)
+ return 0;
+ eip = *(unsigned long *) (ebp+4);
+ if (!in_sched_functions(eip))
+ return eip;
+ ebp = *(unsigned long *) ebp;
+ } while (count++ < 16);
+ return 0;
+}
+
+/*
+ * sys_alloc_thread_area: get a yet unused TLS descriptor index.
+ */
+static int get_free_idx(void)
+{
+ struct thread_struct *t = &current->thread;
+ int idx;
+
+ for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
+ if (desc_empty(t->tls_array + idx))
+ return idx + GDT_ENTRY_TLS_MIN;
+ return -ESRCH;
+}
+
+/*
+ * Set a given TLS descriptor:
+ */
+asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
+{
+ struct thread_struct *t = &current->thread;
+ struct user_desc info;
+ struct desc_struct *desc;
+ int cpu, idx;
+
+ if (copy_from_user(&info, u_info, sizeof(info)))
+ return -EFAULT;
+ idx = info.entry_number;
+
+ /*
+ * index -1 means the kernel should try to find and
+ * allocate an empty descriptor:
+ */
+ if (idx == -1) {
+ idx = get_free_idx();
+ if (idx < 0)
+ return idx;
+ if (put_user(idx, &u_info->entry_number))
+ return -EFAULT;
+ }
+
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return -EINVAL;
+
+ desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
+
+ /*
+ * We must not get preempted while modifying the TLS.
+ */
+ cpu = get_cpu();
+
+ if (LDT_empty(&info)) {
+ desc->a = 0;
+ desc->b = 0;
+ } else {
+ desc->a = LDT_entry_a(&info);
+ desc->b = LDT_entry_b(&info);
+ }
+ load_TLS(t, cpu);
+
+ put_cpu();
+
+ return 0;
+}
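+/*
+ * Illustrative sketch (not part of the original patch): userspace (e.g.
+ * a threading library) usually lets the kernel pick the slot by passing
+ * entry_number = -1 ("tls_block" is a hypothetical pointer):
+ *
+ *	struct user_desc ud = {
+ *		.entry_number	= -1,
+ *		.base_addr	= (unsigned long) tls_block,
+ *		.limit		= 0xfffff,
+ *		.seg_32bit	= 1,
+ *		.limit_in_pages	= 1,
+ *		.useable	= 1,
+ *	};
+ *	syscall(SYS_set_thread_area, &ud);
+ *
+ * On success, ud.entry_number then holds the allocated GDT slot.
+ */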
+
+/*
+ * Get the current Thread-Local Storage area:
+ */
+
+#define GET_BASE(desc) ( \
+ (((desc)->a >> 16) & 0x0000ffff) | \
+ (((desc)->b << 16) & 0x00ff0000) | \
+ ( (desc)->b & 0xff000000) )
+
+#define GET_LIMIT(desc) ( \
+ ((desc)->a & 0x0ffff) | \
+ ((desc)->b & 0xf0000) )
+
+#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
+#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
+#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
+#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
+#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
+#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
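+/*
+ * Worked example (illustrative, not part of the original patch): for a
+ * descriptor with a = 0x5678ffff and b = 0x00cf9a12, GET_BASE() takes
+ * base[15:0] from a[31:16] (0x5678), base[23:16] from b[7:0] (0x12) and
+ * base[31:24] from b[31:24] (0x00), giving 0x00125678, while GET_LIMIT()
+ * combines a[15:0] with b[19:16] into 0xfffff.
+ */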
+
+asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
+{
+ struct user_desc info;
+ struct desc_struct *desc;
+ int idx;
+
+ if (get_user(idx, &u_info->entry_number))
+ return -EFAULT;
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return -EINVAL;
+
+ memset(&info, 0, sizeof(info));
+
+ desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
+
+ info.entry_number = idx;
+ info.base_addr = GET_BASE(desc);
+ info.limit = GET_LIMIT(desc);
+ info.seg_32bit = GET_32BIT(desc);
+ info.contents = GET_CONTENTS(desc);
+ info.read_exec_only = !GET_WRITABLE(desc);
+ info.limit_in_pages = GET_LIMIT_PAGES(desc);
+ info.seg_not_present = !GET_PRESENT(desc);
+ info.useable = GET_USEABLE(desc);
+
+ if (copy_to_user(u_info, &info, sizeof(info)))
+ return -EFAULT;
+ return 0;
+}
+
+unsigned long arch_align_stack(unsigned long sp)
+{
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+ sp -= get_random_int() % 8192;
+ return sp & ~0xf;
+}
diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c
new file mode 100644
index 000000000000..7c1b92522e95
--- /dev/null
+++ b/arch/x86/kernel/ptrace_32.c
@@ -0,0 +1,723 @@
+/* ptrace.c */
+/* By Ross Biro 1/23/92 */
+/*
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/security.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+#include <linux/signal.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/debugreg.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+
+/*
+ * Does not yet catch signals sent when the child dies;
+ * those are handled in exit.c or in signal.c.
+ */
+
+/*
+ * Determines which flags the user has access to [1 = access, 0 = no access].
+ * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9).
+ * Also masks reserved bits (31-22, 15, 5, 3, 1).
+ */
+#define FLAG_MASK 0x00050dd5
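+/*
+ * Worked example (illustrative, not part of the original patch): if the
+ * tracer writes 0xffffffff into EFL, putreg() below keeps only the bits
+ * in FLAG_MASK (CF, PF, AF, ZF, SF, TF, DF, OF, RF and AC) and preserves
+ * the child's original values for everything else:
+ *
+ *	value = (0xffffffff & FLAG_MASK) |
+ *		(get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK);
+ */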
+
+/* The trap flag (TF) in EFLAGS. */
+#define TRAP_FLAG 0x100
+
+/*
+ * Offset of eflags on child stack..
+ */
+#define EFL_OFFSET offsetof(struct pt_regs, eflags)
+
+static inline struct pt_regs *get_child_regs(struct task_struct *task)
+{
+ void *stack_top = (void *)task->thread.esp0;
+ return stack_top - sizeof(struct pt_regs);
+}
+
+/*
+ * This routine will get a word off of the process's privileged stack.
+ * The offset is in bytes into the pt_regs structure on the stack.
+ * This routine assumes that all the privileged stacks are in our
+ * data space.
+ */
+static inline int get_stack_long(struct task_struct *task, int offset)
+{
+ unsigned char *stack;
+
+ stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
+ stack += offset;
+ return (*((int *)stack));
+}
+
+/*
+ * This routine will put a word on the process's privileged stack.
+ * The offset is in bytes into the pt_regs structure on the stack.
+ * This routine assumes that all the privileged stacks are in our
+ * data space.
+ */
+static inline int put_stack_long(struct task_struct *task, int offset,
+ unsigned long data)
+{
+ unsigned char * stack;
+
+ stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
+ stack += offset;
+ *(unsigned long *) stack = data;
+ return 0;
+}
+
+static int putreg(struct task_struct *child,
+ unsigned long regno, unsigned long value)
+{
+ switch (regno >> 2) {
+ case GS:
+ if (value && (value & 3) != 3)
+ return -EIO;
+ child->thread.gs = value;
+ return 0;
+ case DS:
+ case ES:
+ case FS:
+ if (value && (value & 3) != 3)
+ return -EIO;
+ value &= 0xffff;
+ break;
+ case SS:
+ case CS:
+ if ((value & 3) != 3)
+ return -EIO;
+ value &= 0xffff;
+ break;
+ case EFL:
+ value &= FLAG_MASK;
+ value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
+ break;
+ }
+ if (regno > FS*4)
+ regno -= 1*4;
+ put_stack_long(child, regno, value);
+ return 0;
+}
+
+static unsigned long getreg(struct task_struct *child,
+ unsigned long regno)
+{
+ unsigned long retval = ~0UL;
+
+ switch (regno >> 2) {
+ case GS:
+ retval = child->thread.gs;
+ break;
+ case DS:
+ case ES:
+ case FS:
+ case SS:
+ case CS:
+ retval = 0xffff;
+ /* fall through */
+ default:
+ if (regno > FS*4)
+ regno -= 1*4;
+ retval &= get_stack_long(child, regno);
+ }
+ return retval;
+}
+
+#define LDT_SEGMENT 4
+
+static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs)
+{
+ unsigned long addr, seg;
+
+ addr = regs->eip;
+ seg = regs->xcs & 0xffff;
+ if (regs->eflags & VM_MASK) {
+ addr = (addr & 0xffff) + (seg << 4);
+ return addr;
+ }
+
+ /*
+ * We'll assume that the code segments in the GDT
+ * are all zero-based. That is largely true: the
+ * TLS segments are used for data, and the PNPBIOS
+ * and APM bios ones we just ignore here.
+ */
+ if (seg & LDT_SEGMENT) {
+ u32 *desc;
+ unsigned long base;
+
+ seg &= ~7UL;
+
+ down(&child->mm->context.sem);
+ if (unlikely((seg >> 3) >= child->mm->context.size))
+ addr = -1L; /* bogus selector, access would fault */
+ else {
+ desc = child->mm->context.ldt + seg;
+ base = ((desc[0] >> 16) |
+ ((desc[1] & 0xff) << 16) |
+ (desc[1] & 0xff000000));
+
+ /* 16-bit code segment? */
+ if (!((desc[1] >> 22) & 1))
+ addr &= 0xffff;
+ addr += base;
+ }
+ up(&child->mm->context.sem);
+ }
+ return addr;
+}
+
+static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
+{
+ int i, copied;
+ unsigned char opcode[15];
+ unsigned long addr = convert_eip_to_linear(child, regs);
+
+ copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
+ for (i = 0; i < copied; i++) {
+ switch (opcode[i]) {
+ /* popf and iret */
+ case 0x9d: case 0xcf:
+ return 1;
+ /* opcode and address size prefixes */
+ case 0x66: case 0x67:
+ continue;
+ /* irrelevant prefixes (segment overrides and repeats) */
+ case 0x26: case 0x2e:
+ case 0x36: case 0x3e:
+ case 0x64: case 0x65:
+ case 0xf0: case 0xf2: case 0xf3:
+ continue;
+
+ /*
+ * pushf: NOTE! We should probably not let
+ * the user see the TF bit being set. But
+ * it's more pain than it's worth to avoid
+ * it, and a debugger could emulate this
+ * all in user space if it _really_ cares.
+ */
+ case 0x9c:
+ default:
+ return 0;
+ }
+ }
+ return 0;
+}
+
+static void set_singlestep(struct task_struct *child)
+{
+ struct pt_regs *regs = get_child_regs(child);
+
+ /*
+ * Always set TIF_SINGLESTEP - this guarantees that
+ * we single-step system calls etc.. This will also
+ * cause us to set TF when returning to user mode.
+ */
+ set_tsk_thread_flag(child, TIF_SINGLESTEP);
+
+ /*
+ * If TF was already set, don't do anything else
+ */
+ if (regs->eflags & TRAP_FLAG)
+ return;
+
+ /* Set TF on the kernel stack.. */
+ regs->eflags |= TRAP_FLAG;
+
+ /*
+ * ..but if TF is changed by the instruction we will trace,
+ * don't mark it as being "us" that set it, so that we
+ * won't clear it by hand later.
+ */
+ if (is_setting_trap_flag(child, regs))
+ return;
+
+ child->ptrace |= PT_DTRACE;
+}
+
+static void clear_singlestep(struct task_struct *child)
+{
+ /* Always clear TIF_SINGLESTEP... */
+ clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+
+ /* But touch TF only if it was set by us.. */
+ if (child->ptrace & PT_DTRACE) {
+ struct pt_regs *regs = get_child_regs(child);
+ regs->eflags &= ~TRAP_FLAG;
+ child->ptrace &= ~PT_DTRACE;
+ }
+}
+
+/*
+ * Called by kernel/ptrace.c when detaching..
+ *
+ * Make sure the single step bit is not set.
+ */
+void ptrace_disable(struct task_struct *child)
+{
+ clear_singlestep(child);
+ clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+}
+
+/*
+ * Perform get_thread_area on behalf of the traced child.
+ */
+static int
+ptrace_get_thread_area(struct task_struct *child,
+ int idx, struct user_desc __user *user_desc)
+{
+ struct user_desc info;
+ struct desc_struct *desc;
+
+/*
+ * Get the current Thread-Local Storage area:
+ */
+
+#define GET_BASE(desc) ( \
+ (((desc)->a >> 16) & 0x0000ffff) | \
+ (((desc)->b << 16) & 0x00ff0000) | \
+ ( (desc)->b & 0xff000000) )
+
+#define GET_LIMIT(desc) ( \
+ ((desc)->a & 0x0ffff) | \
+ ((desc)->b & 0xf0000) )
+
+#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
+#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
+#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
+#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
+#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
+#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
+
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return -EINVAL;
+
+ desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
+
+ info.entry_number = idx;
+ info.base_addr = GET_BASE(desc);
+ info.limit = GET_LIMIT(desc);
+ info.seg_32bit = GET_32BIT(desc);
+ info.contents = GET_CONTENTS(desc);
+ info.read_exec_only = !GET_WRITABLE(desc);
+ info.limit_in_pages = GET_LIMIT_PAGES(desc);
+ info.seg_not_present = !GET_PRESENT(desc);
+ info.useable = GET_USEABLE(desc);
+
+ if (copy_to_user(user_desc, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * Perform set_thread_area on behalf of the traced child.
+ */
+static int
+ptrace_set_thread_area(struct task_struct *child,
+ int idx, struct user_desc __user *user_desc)
+{
+ struct user_desc info;
+ struct desc_struct *desc;
+
+ if (copy_from_user(&info, user_desc, sizeof(info)))
+ return -EFAULT;
+
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return -EINVAL;
+
+ desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
+ if (LDT_empty(&info)) {
+ desc->a = 0;
+ desc->b = 0;
+ } else {
+ desc->a = LDT_entry_a(&info);
+ desc->b = LDT_entry_b(&info);
+ }
+
+ return 0;
+}
+
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
+{
+ struct user * dummy = NULL;
+ int i, ret;
+ unsigned long __user *datap = (unsigned long __user *)data;
+
+ switch (request) {
+ /* when I and D space are separate, these will need to be fixed. */
+ case PTRACE_PEEKTEXT: /* read word at location addr. */
+ case PTRACE_PEEKDATA:
+ ret = generic_ptrace_peekdata(child, addr, data);
+ break;
+
+ /* read the word at location addr in the USER area. */
+ case PTRACE_PEEKUSR: {
+ unsigned long tmp;
+
+ ret = -EIO;
+ if ((addr & 3) || addr < 0 ||
+ addr > sizeof(struct user) - 3)
+ break;
+
+ tmp = 0; /* Default return condition */
+		if (addr < FRAME_SIZE*sizeof(long))
+			tmp = getreg(child, addr);
+		if (addr >= (long) &dummy->u_debugreg[0] &&
+		    addr <= (long) &dummy->u_debugreg[7]) {
+ addr -= (long) &dummy->u_debugreg[0];
+ addr = addr >> 2;
+ tmp = child->thread.debugreg[addr];
+ }
+ ret = put_user(tmp, datap);
+ break;
+ }
+
+ /* when I and D space are separate, this will have to be fixed. */
+ case PTRACE_POKETEXT: /* write the word at location addr. */
+ case PTRACE_POKEDATA:
+ ret = generic_ptrace_pokedata(child, addr, data);
+ break;
+
+ case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
+ ret = -EIO;
+ if ((addr & 3) || addr < 0 ||
+ addr > sizeof(struct user) - 3)
+ break;
+
+ if (addr < FRAME_SIZE*sizeof(long)) {
+ ret = putreg(child, addr, data);
+ break;
+ }
+ /* We need to be very careful here. We implicitly
+ want to modify a portion of the task_struct, and we
+ have to be selective about what portions we allow someone
+ to modify. */
+
+ ret = -EIO;
+		if (addr >= (long) &dummy->u_debugreg[0] &&
+		    addr <= (long) &dummy->u_debugreg[7]) {
+
+			if (addr == (long) &dummy->u_debugreg[4]) break;
+			if (addr == (long) &dummy->u_debugreg[5]) break;
+			if (addr < (long) &dummy->u_debugreg[4] &&
+			    ((unsigned long) data) >= TASK_SIZE-3) break;
+
+ /* Sanity-check data. Take one half-byte at once with
+ * check = (val >> (16 + 4*i)) & 0xf. It contains the
+ * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
+ * 2 and 3 are LENi. Given a list of invalid values,
+ * we do mask |= 1 << invalid_value, so that
+ * (mask >> check) & 1 is a correct test for invalid
+ * values.
+ *
+ * R/Wi contains the type of the breakpoint /
+ * watchpoint, LENi contains the length of the watched
+ * data in the watchpoint case.
+ *
+ * The invalid values are:
+ * - LENi == 0x10 (undefined), so mask |= 0x0f00.
+ * - R/Wi == 0x10 (break on I/O reads or writes), so
+ * mask |= 0x4444.
+ * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
+ * 0x1110.
+ *
+ * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
+ *
+ * See the Intel Manual "System Programming Guide",
+ * 15.2.4
+ *
+ * Note that LENi == 0x10 is defined on x86_64 in long
+ * mode (i.e. even for 32-bit userspace software, but
+ * 64-bit kernel), so the x86_64 mask value is 0x5454.
+ * See the AMD manual no. 24593 (AMD64 System
+ * Programming)*/
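+			/*
+			 * Worked example (illustrative, not part of the
+			 * original patch): a 4-byte write watchpoint has
+			 * R/Wi == 0b01 and LENi == 0b11, so check == 0xd
+			 * and (0x5f54 >> 0xd) & 1 == 0: accepted. An I/O
+			 * breakpoint has R/Wi == 0b10, so check is one of
+			 * 2, 6, 10 or 14, all of which hit a set bit in
+			 * 0x5f54 and are rejected.
+			 */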
+
+			if (addr == (long) &dummy->u_debugreg[7]) {
+				data &= ~DR_CONTROL_RESERVED;
+				for (i = 0; i < 4; i++)
+ if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
+ goto out_tsk;
+ if (data)
+ set_tsk_thread_flag(child, TIF_DEBUG);
+ else
+ clear_tsk_thread_flag(child, TIF_DEBUG);
+ }
+ addr -= (long) &dummy->u_debugreg;
+ addr = addr >> 2;
+ child->thread.debugreg[addr] = data;
+ ret = 0;
+ }
+ break;
+
+ case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
+ case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
+ case PTRACE_CONT: /* restart after signal. */
+ ret = -EIO;
+ if (!valid_signal(data))
+ break;
+ if (request == PTRACE_SYSEMU) {
+ set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+ clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ } else if (request == PTRACE_SYSCALL) {
+ set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+ } else {
+ clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+ clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ }
+ child->exit_code = data;
+ /* make sure the single step bit is not set. */
+ clear_singlestep(child);
+ wake_up_process(child);
+ ret = 0;
+ break;
+
+/*
+ * Make the child exit. The best we can do is send it a SIGKILL;
+ * perhaps it should be put in the status that it wants to
+ * exit.
+ */
+ case PTRACE_KILL:
+ ret = 0;
+ if (child->exit_state == EXIT_ZOMBIE) /* already dead */
+ break;
+ child->exit_code = SIGKILL;
+ /* make sure the single step bit is not set. */
+ clear_singlestep(child);
+ wake_up_process(child);
+ break;
+
+ case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */
+ case PTRACE_SINGLESTEP: /* set the trap flag. */
+ ret = -EIO;
+ if (!valid_signal(data))
+ break;
+
+ if (request == PTRACE_SYSEMU_SINGLESTEP)
+ set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+ else
+ clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+
+ clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ set_singlestep(child);
+ child->exit_code = data;
+ /* give it a chance to run. */
+ wake_up_process(child);
+ ret = 0;
+ break;
+
+ case PTRACE_DETACH:
+ /* detach a process that was attached. */
+ ret = ptrace_detach(child, data);
+ break;
+
+ case PTRACE_GETREGS: { /* Get all gp regs from the child. */
+ if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) {
+ ret = -EIO;
+ break;
+ }
+		for (i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long)) {
+ __put_user(getreg(child, i), datap);
+ datap++;
+ }
+ ret = 0;
+ break;
+ }
+
+ case PTRACE_SETREGS: { /* Set all gp regs in the child. */
+ unsigned long tmp;
+ if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) {
+ ret = -EIO;
+ break;
+ }
+		for (i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long)) {
+ __get_user(tmp, datap);
+ putreg(child, i, tmp);
+ datap++;
+ }
+ ret = 0;
+ break;
+ }
+
+ case PTRACE_GETFPREGS: { /* Get the child FPU state. */
+ if (!access_ok(VERIFY_WRITE, datap,
+ sizeof(struct user_i387_struct))) {
+ ret = -EIO;
+ break;
+ }
+ ret = 0;
+ if (!tsk_used_math(child))
+ init_fpu(child);
+ get_fpregs((struct user_i387_struct __user *)data, child);
+ break;
+ }
+
+ case PTRACE_SETFPREGS: { /* Set the child FPU state. */
+ if (!access_ok(VERIFY_READ, datap,
+ sizeof(struct user_i387_struct))) {
+ ret = -EIO;
+ break;
+ }
+ set_stopped_child_used_math(child);
+ set_fpregs(child, (struct user_i387_struct __user *)data);
+ ret = 0;
+ break;
+ }
+
+ case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */
+ if (!access_ok(VERIFY_WRITE, datap,
+ sizeof(struct user_fxsr_struct))) {
+ ret = -EIO;
+ break;
+ }
+ if (!tsk_used_math(child))
+ init_fpu(child);
+ ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
+ break;
+ }
+
+ case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */
+ if (!access_ok(VERIFY_READ, datap,
+ sizeof(struct user_fxsr_struct))) {
+ ret = -EIO;
+ break;
+ }
+ set_stopped_child_used_math(child);
+ ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
+ break;
+ }
+
+ case PTRACE_GET_THREAD_AREA:
+ ret = ptrace_get_thread_area(child, addr,
+ (struct user_desc __user *) data);
+ break;
+
+ case PTRACE_SET_THREAD_AREA:
+ ret = ptrace_set_thread_area(child, addr,
+ (struct user_desc __user *) data);
+ break;
+
+ default:
+ ret = ptrace_request(child, request, addr, data);
+ break;
+ }
+ out_tsk:
+ return ret;
+}
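+/*
+ * Illustrative sketch (not part of the original patch): a tracer reads a
+ * stopped child's debug control register through the USER-area
+ * interface, e.g.:
+ *
+ *	long dr7 = ptrace(PTRACE_PEEKUSER, pid,
+ *			  offsetof(struct user, u_debugreg[7]), 0);
+ */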
+
+void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
+{
+ struct siginfo info;
+
+ tsk->thread.trap_no = 1;
+ tsk->thread.error_code = error_code;
+
+ memset(&info, 0, sizeof(info));
+ info.si_signo = SIGTRAP;
+ info.si_code = TRAP_BRKPT;
+
+ /* User-mode eip? */
+ info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL;
+
+ /* Send us the fakey SIGTRAP */
+ force_sig_info(SIGTRAP, &info, tsk);
+}
+
+/* notification of system call entry/exit
+ * - triggered by current->work.syscall_trace
+ */
+__attribute__((regparm(3)))
+int do_syscall_trace(struct pt_regs *regs, int entryexit)
+{
+ int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
+ /*
+ * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
+ * interception
+ */
+ int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
+ int ret = 0;
+
+ /* do the secure computing check first */
+ if (!entryexit)
+ secure_computing(regs->orig_eax);
+
+ if (unlikely(current->audit_context)) {
+ if (entryexit)
+ audit_syscall_exit(AUDITSC_RESULT(regs->eax),
+ regs->eax);
+ /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
+ * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
+ * not used, entry.S will call us only on syscall exit, not
+ * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
+ * calling send_sigtrap() on syscall entry.
+ *
+ * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
+	 * is_singlestep is false, despite its name, so we will still do
+ * the correct thing.
+ */
+ else if (is_singlestep)
+ goto out;
+ }
+
+ if (!(current->ptrace & PT_PTRACED))
+ goto out;
+
+ /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
+ * and then is resumed with SYSEMU_SINGLESTEP, it will come in
+	 * here. We have to check for this and return. */
+ if (is_sysemu && entryexit)
+ return 0;
+
+ /* Fake a debug trap */
+ if (is_singlestep)
+ send_sigtrap(current, regs, 0);
+
+ if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
+ goto out;
+
+ /* the 0x80 provides a way for the tracing parent to distinguish
+ between a syscall stop and SIGTRAP delivery */
+ /* Note that the debugger could change the result of test_thread_flag!*/
+ ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
+
+ /*
+ * this isn't the same as continuing with a signal, but it will do
+ * for normal use. strace only continues with a signal if the
+ * stopping signal is not SIGTRAP. -brl
+ */
+ if (current->exit_code) {
+ send_sig(current->exit_code, current, 1);
+ current->exit_code = 0;
+ }
+ ret = is_sysemu;
+out:
+ if (unlikely(current->audit_context) && !entryexit)
+ audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax,
+ regs->ebx, regs->ecx, regs->edx, regs->esi);
+ if (ret == 0)
+ return 0;
+
+ regs->orig_eax = -1; /* force skip of syscall restarting */
+ if (unlikely(current->audit_context))
+ audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax);
+ return 1;
+}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
new file mode 100644
index 000000000000..6722469c2633
--- /dev/null
+++ b/arch/x86/kernel/quirks.c
@@ -0,0 +1,49 @@
+/*
+ * This file contains work-arounds for x86 and x86_64 platform bugs.
+ */
+#include <linux/pci.h>
+#include <linux/irq.h>
+
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
+
+static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
+{
+ u8 config, rev;
+ u32 word;
+
+ /* BIOS may enable hardware IRQ balancing for
+	 * E7520/E7320/E7525 (revision ID 0x9 and below)
+ * based platforms.
+ * Disable SW irqbalance/affinity on those platforms.
+ */
+ pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
+ if (rev > 0x9)
+ return;
+
+	/* enable access to config space */
+ pci_read_config_byte(dev, 0xf4, &config);
+ pci_write_config_byte(dev, 0xf4, config|0x2);
+
+ /* read xTPR register */
+ raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
+
+ if (!(word & (1 << 13))) {
+ printk(KERN_INFO "Intel E7520/7320/7525 detected. "
+ "Disabling irq balancing and affinity\n");
+#ifdef CONFIG_IRQBALANCE
+ irqbalance_disable("");
+#endif
+ noirqdebug_setup("");
+#ifdef CONFIG_PROC_FS
+ no_irq_affinity = 1;
+#endif
+ }
+
+	/* put back the original value for config space */
+ if (!(config & 0x2))
+ pci_write_config_byte(dev, 0xf4, config);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
+#endif
diff --git a/arch/x86/kernel/reboot_32.c b/arch/x86/kernel/reboot_32.c
new file mode 100644
index 000000000000..0d796248866c
--- /dev/null
+++ b/arch/x86/kernel/reboot_32.c
@@ -0,0 +1,413 @@
+/*
+ * linux/arch/i386/kernel/reboot.c
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/mc146818rtc.h>
+#include <linux/efi.h>
+#include <linux/dmi.h>
+#include <linux/ctype.h>
+#include <linux/pm.h>
+#include <linux/reboot.h>
+#include <asm/uaccess.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include "mach_reboot.h"
+#include <asm/reboot_fixups.h>
+#include <asm/reboot.h>
+
+/*
+ * Power off function, if any
+ */
+void (*pm_power_off)(void);
+EXPORT_SYMBOL(pm_power_off);
+
+static int reboot_mode;
+static int reboot_thru_bios;
+
+#ifdef CONFIG_SMP
+static int reboot_cpu = -1;
+#endif
+static int __init reboot_setup(char *str)
+{
+	while (1) {
+ switch (*str) {
+ case 'w': /* "warm" reboot (no memory testing etc) */
+ reboot_mode = 0x1234;
+ break;
+ case 'c': /* "cold" reboot (with memory testing etc) */
+ reboot_mode = 0x0;
+ break;
+ case 'b': /* "bios" reboot by jumping through the BIOS */
+ reboot_thru_bios = 1;
+ break;
+ case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
+ reboot_thru_bios = 0;
+ break;
+#ifdef CONFIG_SMP
+ case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
+ if (isdigit(*(str+1))) {
+ reboot_cpu = (int) (*(str+1) - '0');
+ if (isdigit(*(str+2)))
+ reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
+ }
+ /* we will leave sorting out the final value
+ when we are ready to reboot, since we might not
+ have set up boot_cpu_id or smp_num_cpu */
+ break;
+#endif
+ }
+		if ((str = strchr(str, ',')) != NULL)
+ str++;
+ else
+ break;
+ }
+ return 1;
+}
+
+__setup("reboot=", reboot_setup);
+
+/*
+ * Reboot options and system auto-detection code provided by
+ * Dell Inc. so their systems "just work". :-)
+ */
+
+/*
+ * Some machines require the "reboot=b" command line option; this quirk makes that automatic.
+ */
+static int __init set_bios_reboot(struct dmi_system_id *d)
+{
+ if (!reboot_thru_bios) {
+ reboot_thru_bios = 1;
+ printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
+ }
+ return 0;
+}
+
+static struct dmi_system_id __initdata reboot_dmi_table[] = {
+ { /* Handle problems with rebooting on Dell E520's */
+ .callback = set_bios_reboot,
+ .ident = "Dell E520",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell 1300's */
+ .callback = set_bios_reboot,
+ .ident = "Dell PowerEdge 1300",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell 300's */
+ .callback = set_bios_reboot,
+ .ident = "Dell PowerEdge 300",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 745",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+ DMI_MATCH(DMI_BOARD_NAME, "0WF810"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell 2400's */
+ .callback = set_bios_reboot,
+ .ident = "Dell PowerEdge 2400",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
+ },
+ },
+ { /* Handle problems with rebooting on HP laptops */
+ .callback = set_bios_reboot,
+ .ident = "HP Compaq Laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
+ },
+ },
+ { }
+};
+
+static int __init reboot_init(void)
+{
+ dmi_check_system(reboot_dmi_table);
+ return 0;
+}
+
+core_initcall(reboot_init);
+
+/* The following code and data reboots the machine by switching to real
+ mode and jumping to the BIOS reset entry point, as if the CPU has
+ really been reset. The previous version asked the keyboard
+ controller to pulse the CPU reset line, which is more thorough, but
+ doesn't work with at least one type of 486 motherboard. It is easy
+ to stop this code working; hence the copious comments. */
+
+static unsigned long long
+real_mode_gdt_entries [3] =
+{
+ 0x0000000000000000ULL, /* Null descriptor */
+ 0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
+ 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
+};
+
+static struct Xgt_desc_struct
+real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
+real_mode_idt = { 0x3ff, 0 },
+no_idt = { 0, 0 };
+
+
+/* This is 16-bit protected mode code to disable paging and the cache,
+ switch to real mode and jump to the BIOS reset code.
+
+ The instruction that switches to real mode by writing to CR0 must be
+   followed immediately by a far jump instruction, which sets CS to a
+ valid value for real mode, and flushes the prefetch queue to avoid
+ running instructions that have already been decoded in protected
+ mode.
+
+ Clears all the flags except ET, especially PG (paging), PE
+ (protected-mode enable) and TS (task switch for coprocessor state
+ save). Flushes the TLB after paging has been disabled. Sets CD and
+ NW, to disable the cache on a 486, and invalidates the cache. This
+ is more like the state of a 486 after reset. I don't know if
+ something else should be done for other chips.
+
+ More could be done here to set up the registers as if a CPU reset had
+ occurred; hopefully real BIOSs don't assume much. */
+
+static unsigned char real_mode_switch [] =
+{
+ 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
+ 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
+ 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */
+ 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */
+ 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */
+ 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */
+ 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
+ 0x74, 0x02, /* jz f */
+ 0x0f, 0x09, /* wbinvd */
+ 0x24, 0x10, /* f: andb $0x10,al */
+ 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
+};
+static unsigned char jump_to_bios [] =
+{
+ 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
+};
+
+/*
+ * Switch to real mode and then execute the code
+ * specified by the code and length parameters.
+ * We assume that length will always be less than 100!
+ */
+void machine_real_restart(unsigned char *code, int length)
+{
+ local_irq_disable();
+
+ /* Write zero to CMOS register number 0x0f, which the BIOS POST
+ routine will recognize as telling it to do a proper reboot. (Well
+ that's what this book in front of me says -- it may only apply to
+ the Phoenix BIOS though, it's not clear). At the same time,
+ disable NMIs by setting the top bit in the CMOS address register,
+ as we're about to do peculiar things to the CPU. I'm not sure if
+ `outb_p' is needed instead of just `outb'. Use it to be on the
+ safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
+ */
+
+ spin_lock(&rtc_lock);
+ CMOS_WRITE(0x00, 0x8f);
+ spin_unlock(&rtc_lock);
+
+ /* Remap the kernel at virtual address zero, as well as offset zero
+ from the kernel segment. This assumes the kernel segment starts at
+ virtual address PAGE_OFFSET. */
+
+ memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+ sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
+
+ /*
+ * Use `swapper_pg_dir' as our page directory.
+ */
+ load_cr3(swapper_pg_dir);
+
+ /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
+ this on booting to tell it to "Bypass memory test (also warm
+ boot)". This seems like a fairly standard thing that gets set by
+ REBOOT.COM programs, and the previous reset routine did this
+ too. */
+
+ *((unsigned short *)0x472) = reboot_mode;
+
+ /* For the switch to real mode, copy some code to low memory. It has
+ to be in the first 64k because it is running in 16-bit mode, and it
+ has to have the same physical and virtual address, because it turns
+ off paging. Copy it near the end of the first page, out of the way
+ of BIOS variables. */
+
+ memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
+ real_mode_switch, sizeof (real_mode_switch));
+ memcpy ((void *) (0x1000 - 100), code, length);
+
+ /* Set up the IDT for real mode. */
+
+ load_idt(&real_mode_idt);
+
+ /* Set up a GDT from which we can load segment descriptors for real
+ mode. The GDT is not used in real mode; it is just needed here to
+ prepare the descriptors. */
+
+ load_gdt(&real_mode_gdt);
+
+ /* Load the data segment registers, and thus the descriptors ready for
+ real mode. The base address of each segment is 0x100, 16 times the
+ selector value being loaded here. This is so that the segment
+ registers don't have to be reloaded after switching to real mode:
+ the values are consistent for real mode operation already. */
+
+ __asm__ __volatile__ ("movl $0x0010,%%eax\n"
+ "\tmovl %%eax,%%ds\n"
+ "\tmovl %%eax,%%es\n"
+ "\tmovl %%eax,%%fs\n"
+ "\tmovl %%eax,%%gs\n"
+ "\tmovl %%eax,%%ss" : : : "eax");
+
+ /* Jump to the 16-bit code that we copied earlier. It disables paging
+ and the cache, switches to real mode, and jumps to the BIOS reset
+ entry point. */
+
+ __asm__ __volatile__ ("ljmp $0x0008,%0"
+ :
+ : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(machine_real_restart);
+#endif
+
+static void native_machine_shutdown(void)
+{
+#ifdef CONFIG_SMP
+ int reboot_cpu_id;
+
+ /* The boot cpu is always logical cpu 0 */
+ reboot_cpu_id = 0;
+
+ /* See if there has been given a command line override */
+ if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
+ cpu_isset(reboot_cpu, cpu_online_map)) {
+ reboot_cpu_id = reboot_cpu;
+ }
+
+ /* Make certain the cpu I'm rebooting on is online */
+ if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
+ reboot_cpu_id = smp_processor_id();
+ }
+
+ /* Make certain I only run on the appropriate processor */
+ set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
+
+ /* O.K. Now that I'm on the appropriate processor, stop
+ * all of the others, and disable their local APICs.
+ */
+
+ smp_send_stop();
+#endif /* CONFIG_SMP */
+
+ lapic_shutdown();
+
+#ifdef CONFIG_X86_IO_APIC
+ disable_IO_APIC();
+#endif
+}
+
+void __attribute__((weak)) mach_reboot_fixups(void)
+{
+}
+
+static void native_machine_emergency_restart(void)
+{
+ if (!reboot_thru_bios) {
+ if (efi_enabled) {
+ efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL);
+ load_idt(&no_idt);
+ __asm__ __volatile__("int3");
+ }
+ /* rebooting needs to touch the page at absolute addr 0 */
+ *((unsigned short *)__va(0x472)) = reboot_mode;
+ for (;;) {
+ mach_reboot_fixups(); /* for board specific fixups */
+ mach_reboot();
+ /* That didn't work - force a triple fault.. */
+ load_idt(&no_idt);
+ __asm__ __volatile__("int3");
+ }
+ }
+ if (efi_enabled)
+ efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL);
+
+ machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
+}
+
+static void native_machine_restart(char * __unused)
+{
+ machine_shutdown();
+ machine_emergency_restart();
+}
+
+static void native_machine_halt(void)
+{
+}
+
+static void native_machine_power_off(void)
+{
+ if (pm_power_off) {
+ machine_shutdown();
+ pm_power_off();
+ }
+}
+
+
+struct machine_ops machine_ops = {
+ .power_off = native_machine_power_off,
+ .shutdown = native_machine_shutdown,
+ .emergency_restart = native_machine_emergency_restart,
+ .restart = native_machine_restart,
+ .halt = native_machine_halt,
+};
+
+void machine_power_off(void)
+{
+ machine_ops.power_off();
+}
+
+void machine_shutdown(void)
+{
+ machine_ops.shutdown();
+}
+
+void machine_emergency_restart(void)
+{
+ machine_ops.emergency_restart();
+}
+
+void machine_restart(char *cmd)
+{
+ machine_ops.restart(cmd);
+}
+
+void machine_halt(void)
+{
+ machine_ops.halt();
+}
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
new file mode 100644
index 000000000000..03e1cce58f49
--- /dev/null
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -0,0 +1,68 @@
+/*
+ * linux/arch/i386/kernel/reboot_fixups.c
+ *
+ * This is a good place to put board specific reboot fixups.
+ *
+ * List of supported fixups:
+ * geode-gx1/cs5530a - Jaya Kumar <jayalk@intworks.biz>
+ * geode-gx/lx/cs5536 - Andres Salomon <dilinger@debian.org>
+ *
+ */
+
+#include <asm/delay.h>
+#include <linux/pci.h>
+#include <asm/reboot_fixups.h>
+#include <asm/msr.h>
+
+static void cs5530a_warm_reset(struct pci_dev *dev)
+{
+	/* writing 1 to the reset control register at 0x44 causes the
+	   cs5530a to perform a system warm reset */
+ pci_write_config_byte(dev, 0x44, 0x1);
+ udelay(50); /* shouldn't get here but be safe and spin-a-while */
+}
+
+static void cs5536_warm_reset(struct pci_dev *dev)
+{
+ /*
+ * 6.6.2.12 Soft Reset (DIVIL_SOFT_RESET)
+ * writing 1 to the LSB of this MSR causes a hard reset.
+ */
+ wrmsrl(0x51400017, 1ULL);
+ udelay(50); /* shouldn't get here but be safe and spin a while */
+}
+
+struct device_fixup {
+ unsigned int vendor;
+ unsigned int device;
+ void (*reboot_fixup)(struct pci_dev *);
+};
+
+static struct device_fixup fixups_table[] = {
+{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
+{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
+};
+
+/*
+ * We see if any fixup is available for our current hardware. If there
+ * is a fixup, we call it and we expect never to return from it. If we
+ * do return, we keep looking and then eventually fall back to the
+ * standard mach_reboot on return.
+ */
+void mach_reboot_fixups(void)
+{
+ struct device_fixup *cur;
+ struct pci_dev *dev;
+ int i;
+
+ for (i=0; i < ARRAY_SIZE(fixups_table); i++) {
+ cur = &(fixups_table[i]);
+ dev = pci_get_device(cur->vendor, cur->device, NULL);
+ if (!dev)
+ continue;
+
+ cur->reboot_fixup(dev);
+ }
+}
+
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
new file mode 100644
index 000000000000..f151d6fae462
--- /dev/null
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -0,0 +1,252 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+#include <asm/kexec.h>
+
+/*
+ * Must be relocatable PIC code callable as a C function
+ */
+
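+/* PTR(x) scales an index into the boot-time list of pages into a byte
+   offset; each entry in the list is 4 bytes wide. */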
+#define PTR(x) (x << 2)
+#define PAGE_ALIGNED (1 << PAGE_SHIFT)
+#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
+#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
+
+ .text
+ .align PAGE_ALIGNED
+ .globl relocate_kernel
+relocate_kernel:
+ movl 8(%esp), %ebp /* list of pages */
+
+#ifdef CONFIG_X86_PAE
+ /* map the control page at its virtual address */
+
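+	/* The index math below turns a virtual address into a byte offset
+	 * into each table: PAE has 4 PGD entries of 8 bytes
+	 * ((va & 0xc0000000) >> 27), 512 PMD entries of 8 bytes
+	 * ((va & 0x3fe00000) >> 18) and 512 PTE entries of 8 bytes
+	 * ((va & 0x001ff000) >> 9).
+	 */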
+ movl PTR(VA_PGD)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0xc0000000, %eax
+ shrl $27, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PMD_0)(%ebp), %edx
+ orl $PAE_PGD_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PMD_0)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x3fe00000, %eax
+ shrl $18, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PTE_0)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PTE_0)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x001ff000, %eax
+ shrl $9, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ /* identity map the control page at its physical address */
+
+ movl PTR(VA_PGD)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0xc0000000, %eax
+ shrl $27, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PMD_1)(%ebp), %edx
+ orl $PAE_PGD_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PMD_1)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x3fe00000, %eax
+ shrl $18, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PTE_1)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PTE_1)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x001ff000, %eax
+ shrl $9, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+#else
+ /* map the control page at its virtual address */
+
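+	/* Without PAE the index math is: 1024 PGD entries of 4 bytes
+	 * ((va & 0xffc00000) >> 20) and 1024 PTE entries of 4 bytes
+	 * ((va & 0x003ff000) >> 10).
+	 */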
+ movl PTR(VA_PGD)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0xffc00000, %eax
+ shrl $20, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PTE_0)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PTE_0)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x003ff000, %eax
+ shrl $10, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ /* identity map the control page at its physical address */
+
+ movl PTR(VA_PGD)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0xffc00000, %eax
+ shrl $20, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PTE_1)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PTE_1)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x003ff000, %eax
+ shrl $10, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+#endif
+
+relocate_new_kernel:
+ /* read the arguments and say goodbye to the stack */
+ movl 4(%esp), %ebx /* page_list */
+ movl 8(%esp), %ebp /* list of pages */
+ movl 12(%esp), %edx /* start address */
+ movl 16(%esp), %ecx /* cpu_has_pae */
+
+ /* zero out flags, and disable interrupts */
+ pushl $0
+ popfl
+
+ /* get physical address of control page now */
+ /* this is impossible after page table switch */
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
+
+ /* switch to new set of page tables */
+ movl PTR(PA_PGD)(%ebp), %eax
+ movl %eax, %cr3
+
+ /* setup a new stack at the end of the physical control page */
+ lea 4096(%edi), %esp
+
+ /* jump to identity mapped page */
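+	/* push/ret is used because this code is position independent and
+	 * must compute its jump target at run time */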
+ movl %edi, %eax
+ addl $(identity_mapped - relocate_kernel), %eax
+ pushl %eax
+ ret
+
+identity_mapped:
+ /* store the start address on the stack */
+ pushl %edx
+
+ /* Set cr0 to a known state:
+ * 31 0 == Paging disabled
+ * 18 0 == Alignment check disabled
+ * 16 0 == Write protect disabled
+ * 3 0 == No task switch
+ * 2 0 == Don't do FP software emulation.
+	 *  0 1 == Protected mode enabled
+ */
+ movl %cr0, %eax
+ andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
+ orl $(1<<0), %eax
+ movl %eax, %cr0
+
+ /* clear cr4 if applicable */
+ testl %ecx, %ecx
+ jz 1f
+ /* Set cr4 to a known state:
+ * Setting everything to zero seems safe.
+ */
+ movl %cr4, %eax
+ andl $0, %eax
+ movl %eax, %cr4
+
+ jmp 1f
+1:
+
+ /* Flush the TLB (needed?) */
+ xorl %eax, %eax
+ movl %eax, %cr3
+
+ /* Do the copies */
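+	/* Each entry in the indirection list is a page address with flag
+	 * bits in the low bits: 0x1 destination, 0x2 indirection page,
+	 * 0x4 done, 0x8 source page.
+	 */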
+ movl %ebx, %ecx
+ jmp 1f
+
+0: /* top, read another word from the indirection page */
+ movl (%ebx), %ecx
+ addl $4, %ebx
+1:
+ testl $0x1, %ecx /* is it a destination page */
+ jz 2f
+ movl %ecx, %edi
+ andl $0xfffff000, %edi
+ jmp 0b
+2:
+ testl $0x2, %ecx /* is it an indirection page */
+ jz 2f
+ movl %ecx, %ebx
+ andl $0xfffff000, %ebx
+ jmp 0b
+2:
+ testl $0x4, %ecx /* is it the done indicator */
+ jz 2f
+ jmp 3f
+2:
+ testl $0x8, %ecx /* is it the source indicator */
+ jz 0b /* Ignore it otherwise */
+ movl %ecx, %esi /* For every source page do a copy */
+ andl $0xfffff000, %esi
+
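+	/* copy one page: 1024 longwords from %esi to %edi */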
+ movl $1024, %ecx
+ rep ; movsl
+ jmp 0b
+
+3:
+
+ /* To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+	 * So I flush the TLB; it's handy and not processor dependent.
+ */
+ xorl %eax, %eax
+ movl %eax, %cr3
+
+ /* set all of the registers to known values */
+ /* leave %esp alone */
+
+ xorl %eax, %eax
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ xorl %edx, %edx
+ xorl %esi, %esi
+ xorl %edi, %edi
+ xorl %ebp, %ebp
+ ret
diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/kernel/scx200_32.c
new file mode 100644
index 000000000000..c7d3df23f589
--- /dev/null
+++ b/arch/x86/kernel/scx200_32.c
@@ -0,0 +1,131 @@
+/* linux/arch/i386/kernel/scx200.c
+
+ Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>
+
+ National Semiconductor SCx200 support. */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+
+#include <linux/scx200.h>
+#include <linux/scx200_gpio.h>
+
+/* Verify that the configuration block really is there */
+#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base))
+
+#define NAME "scx200"
+
+MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>");
+MODULE_DESCRIPTION("NatSemi SCx200 Driver");
+MODULE_LICENSE("GPL");
+
+unsigned scx200_gpio_base = 0;
+long scx200_gpio_shadow[2];
+
+unsigned scx200_cb_base = 0;
+
+static struct pci_device_id scx200_tbl[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) },
+ { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) },
+ { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) },
+ { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) },
+ { },
+};
+MODULE_DEVICE_TABLE(pci, scx200_tbl);
+
+static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *);
+
+static struct pci_driver scx200_pci_driver = {
+ .name = "scx200",
+ .id_table = scx200_tbl,
+ .probe = scx200_probe,
+};
+
+static DEFINE_MUTEX(scx200_gpio_config_lock);
+
+static void __devinit scx200_init_shadow(void)
+{
+ int bank;
+
+ /* read the current values driven on the GPIO signals */
+ for (bank = 0; bank < 2; ++bank)
+ scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
+}
+
+static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ unsigned base;
+
+ if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
+ pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) {
+ base = pci_resource_start(pdev, 0);
+ printk(KERN_INFO NAME ": GPIO base 0x%x\n", base);
+
+ if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) {
+ printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n");
+ return -EBUSY;
+ }
+
+ scx200_gpio_base = base;
+ scx200_init_shadow();
+
+ } else {
+ /* find the base of the Configuration Block */
+ if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) {
+ scx200_cb_base = SCx200_CB_BASE_FIXED;
+ } else {
+ pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base);
+ if (scx200_cb_probe(base)) {
+ scx200_cb_base = base;
+ } else {
+ printk(KERN_WARNING NAME ": Configuration Block not found\n");
+ return -ENODEV;
+ }
+ }
+ printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base);
+ }
+
+ return 0;
+}
+
+u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
+{
+ u32 config, new_config;
+
+ mutex_lock(&scx200_gpio_config_lock);
+
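+	/* Select the pin by writing its index to base + 0x20, then
+	   read-modify-write its configuration word at base + 0x24. */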
+ outl(index, scx200_gpio_base + 0x20);
+ config = inl(scx200_gpio_base + 0x24);
+
+ new_config = (config & mask) | bits;
+ outl(new_config, scx200_gpio_base + 0x24);
+
+ mutex_unlock(&scx200_gpio_config_lock);
+
+ return config;
+}
+
+static int __init scx200_init(void)
+{
+ printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
+
+ return pci_register_driver(&scx200_pci_driver);
+}
+
+static void __exit scx200_cleanup(void)
+{
+ pci_unregister_driver(&scx200_pci_driver);
+ release_region(scx200_gpio_base, SCx200_GPIO_SIZE);
+}
+
+module_init(scx200_init);
+module_exit(scx200_cleanup);
+
+EXPORT_SYMBOL(scx200_gpio_base);
+EXPORT_SYMBOL(scx200_gpio_shadow);
+EXPORT_SYMBOL(scx200_gpio_configure);
+EXPORT_SYMBOL(scx200_cb_base);
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
new file mode 100644
index 000000000000..d474cd639bcb
--- /dev/null
+++ b/arch/x86/kernel/setup_32.c
@@ -0,0 +1,653 @@
+/*
+ * linux/arch/i386/kernel/setup.c
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *
+ * Memory region support
+ * David Parsons <orc@pell.chi.il.us>, July-August 1999
+ *
+ * Added E820 sanitization routine (removes overlapping memory regions);
+ * Brian Moyle <bmoyle@mvista.com>, February 2001
+ *
+ * Moved CPU detection code to cpu/${cpu}.c
+ * Patrick Mochel <mochel@osdl.org>, March 2002
+ *
+ * Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach <xela@slit.de>, December 2002.
+ *
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/screen_info.h>
+#include <linux/ioport.h>
+#include <linux/acpi.h>
+#include <linux/apm_bios.h>
+#include <linux/initrd.h>
+#include <linux/bootmem.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/mca.h>
+#include <linux/root_dev.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/edd.h>
+#include <linux/nodemask.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
+#include <linux/dmi.h>
+#include <linux/pfn.h>
+
+#include <video/edid.h>
+
+#include <asm/apic.h>
+#include <asm/e820.h>
+#include <asm/mpspec.h>
+#include <asm/mmzone.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/sections.h>
+#include <asm/io_apic.h>
+#include <asm/ist.h>
+#include <asm/io.h>
+#include <asm/vmi.h>
+#include <setup_arch.h>
+#include <bios_ebda.h>
+
+/* This value is set up by the early boot code to point to the value
+ immediately after the boot time page tables. It contains a *physical*
+ address, and must not be in the .bss segment! */
+unsigned long init_pg_tables_end __initdata = ~0UL;
+
+int disable_pse __devinitdata = 0;
+
+/*
+ * Machine setup..
+ */
+extern struct resource code_resource;
+extern struct resource data_resource;
+
+/* cpu data as detected by the assembly code in head.S */
+struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+/* common cpu data for all cpus */
+struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+EXPORT_SYMBOL(boot_cpu_data);
+
+unsigned long mmu_cr4_features;
+
+/* for MCA, but anyone else can use it if they want */
+unsigned int machine_id;
+#ifdef CONFIG_MCA
+EXPORT_SYMBOL(machine_id);
+#endif
+unsigned int machine_submodel_id;
+unsigned int BIOS_revision;
+unsigned int mca_pentium_flag;
+
+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
+int bootloader_type;
+
+/* user-defined highmem size */
+static unsigned int highmem_pages = -1;
+
+/*
+ * Setup options
+ */
+struct screen_info screen_info;
+EXPORT_SYMBOL(screen_info);
+struct apm_info apm_info;
+EXPORT_SYMBOL(apm_info);
+struct edid_info edid_info;
+EXPORT_SYMBOL_GPL(edid_info);
+struct ist_info ist_info;
+#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
+ defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
+EXPORT_SYMBOL(ist_info);
+#endif
+
+extern void early_cpu_init(void);
+extern int root_mountflags;
+
+unsigned long saved_videomode;
+
+#define RAMDISK_IMAGE_START_MASK 0x07FF
+#define RAMDISK_PROMPT_FLAG 0x8000
+#define RAMDISK_LOAD_FLAG 0x4000
+
+static char __initdata command_line[COMMAND_LINE_SIZE];
+
+struct boot_params __initdata boot_params;
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+struct edd edd;
+#ifdef CONFIG_EDD_MODULE
+EXPORT_SYMBOL(edd);
+#endif
+/**
+ * copy_edd() - Copy the BIOS EDD information
+ * from boot_params into a safe place.
+ *
+ */
+static inline void copy_edd(void)
+{
+ memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
+ memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
+ edd.mbr_signature_nr = EDD_MBR_SIG_NR;
+ edd.edd_info_nr = EDD_NR;
+}
+#else
+static inline void copy_edd(void)
+{
+}
+#endif
+
+int __initdata user_defined_memmap = 0;
+
+/*
+ * "mem=nopentium" disables the 4MB page tables.
+ * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
+ * to <mem>, overriding the bios size.
+ * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
+ * <start> to <start>+<mem>, overriding the bios size.
+ *
+ * HPA tells me bootloaders need to parse mem=, so no new
+ * meaning should be given to mem=  [also see Documentation/i386/boot.txt]
+ */
+static int __init parse_mem(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp(arg, "nopentium") == 0) {
+ clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+ disable_pse = 1;
+ } else {
+ /* If the user specifies memory size, we
+ * limit the BIOS-provided memory map to
+ * that size. exactmap can be used to specify
+ * the exact map. mem=number can be used to
+ * trim the existing memory map.
+ */
+ unsigned long long mem_size;
+
+ mem_size = memparse(arg, &arg);
+ limit_regions(mem_size);
+ user_defined_memmap = 1;
+ }
+ return 0;
+}
+early_param("mem", parse_mem);
+
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel.
+ */
+static int __init parse_elfcorehdr(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ elfcorehdr_addr = memparse(arg, &arg);
+ return 0;
+}
+early_param("elfcorehdr", parse_elfcorehdr);
+#endif /* CONFIG_PROC_VMCORE */
+
+/*
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
+ */
+static int __init parse_highmem(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+ return 0;
+}
+early_param("highmem", parse_highmem);
+
+/*
+ * vmalloc=size forces the vmalloc area to be exactly 'size'
+ * bytes. This can be used to increase (or decrease) the
+ * vmalloc area - the default is 128m.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ __VMALLOC_RESERVE = memparse(arg, &arg);
+ return 0;
+}
+early_param("vmalloc", parse_vmalloc);
+
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+ unsigned long address;
+
+ if (!arg)
+ return -EINVAL;
+
+ address = memparse(arg, &arg);
+ reserve_top_address(address);
+ return 0;
+}
+early_param("reservetop", parse_reservetop);
+
+/*
+ * Determine low and high memory ranges:
+ */
+unsigned long __init find_max_low_pfn(void)
+{
+ unsigned long max_low_pfn;
+
+ max_low_pfn = max_pfn;
+ if (max_low_pfn > MAXMEM_PFN) {
+ if (highmem_pages == -1)
+ highmem_pages = max_pfn - MAXMEM_PFN;
+ if (highmem_pages + MAXMEM_PFN < max_pfn)
+ max_pfn = MAXMEM_PFN + highmem_pages;
+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
+			printk(KERN_WARNING "only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+ max_low_pfn = MAXMEM_PFN;
+#ifndef CONFIG_HIGHMEM
+ /* Maximum memory usable is what is directly addressable */
+ printk(KERN_WARNING "Warning only %ldMB will be used.\n",
+ MAXMEM>>20);
+ if (max_pfn > MAX_NONPAE_PFN)
+ printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
+ else
+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ max_pfn = MAXMEM_PFN;
+#else /* !CONFIG_HIGHMEM */
+#ifndef CONFIG_HIGHMEM64G
+ if (max_pfn > MAX_NONPAE_PFN) {
+ max_pfn = MAX_NONPAE_PFN;
+ printk(KERN_WARNING "Warning only 4GB will be used.\n");
+ printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
+ }
+#endif /* !CONFIG_HIGHMEM64G */
+#endif /* !CONFIG_HIGHMEM */
+ } else {
+ if (highmem_pages == -1)
+ highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+ if (highmem_pages >= max_pfn) {
+		printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
+ highmem_pages = 0;
+ }
+ if (highmem_pages) {
+ if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
+ printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+ max_low_pfn -= highmem_pages;
+ }
+#else
+ if (highmem_pages)
+ printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
+#endif
+ }
+ return max_low_pfn;
+}
+
+/*
+ * workaround for Dell systems that neglect to reserve EBDA
+ */
+static void __init reserve_ebda_region(void)
+{
+ unsigned int addr;
+ addr = get_bios_ebda();
+ if (addr)
+ reserve_bootmem(addr, PAGE_SIZE);
+}
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+void __init setup_bootmem_allocator(void);
+static unsigned long __init setup_memory(void)
+{
+ /*
+ * partially used pages are not usable - thus
+ * we are rounding upwards:
+ */
+ min_low_pfn = PFN_UP(init_pg_tables_end);
+
+ find_max_pfn();
+
+ max_low_pfn = find_max_low_pfn();
+
+#ifdef CONFIG_HIGHMEM
+ highstart_pfn = highend_pfn = max_pfn;
+ if (max_pfn > max_low_pfn) {
+ highstart_pfn = max_low_pfn;
+ }
+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+ pages_to_mb(highend_pfn - highstart_pfn));
+ num_physpages = highend_pfn;
+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+ num_physpages = max_low_pfn;
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+#ifdef CONFIG_FLATMEM
+ max_mapnr = num_physpages;
+#endif
+ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+ pages_to_mb(max_low_pfn));
+
+ setup_bootmem_allocator();
+
+ return max_low_pfn;
+}
+
+void __init zone_sizes_init(void)
+{
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+ max_zone_pfns[ZONE_DMA] =
+ virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
+ add_active_range(0, 0, highend_pfn);
+#else
+ add_active_range(0, 0, max_low_pfn);
+#endif
+
+ free_area_init_nodes(max_zone_pfns);
+}
+#else
+extern unsigned long __init setup_memory(void);
+extern void zone_sizes_init(void);
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+
+void __init setup_bootmem_allocator(void)
+{
+ unsigned long bootmap_size;
+ /*
+ * Initialize the boot-time allocator (with low memory only):
+ */
+ bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
+
+ register_bootmem_low_pages(max_low_pfn);
+
+ /*
+ * Reserve the bootmem bitmap itself as well. We do this in two
+ * steps (first step was init_bootmem()) because this catches
+ * the (very unlikely) case of us accidentally initializing the
+ * bootmem allocator with an invalid RAM area.
+ */
+ reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
+ bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
+
+ /*
+ * reserve physical page 0 - it's a special BIOS page on many boxes,
+ * enabling clean reboots, SMP operation, laptop functions.
+ */
+ reserve_bootmem(0, PAGE_SIZE);
+
+ /* reserve EBDA region, it's a 4K region */
+ reserve_ebda_region();
+
+ /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
+	   PCI prefetch into it (errata #56). Usually the page is reserved anyway,
+ unless you have no PS/2 mouse plugged in. */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+ boot_cpu_data.x86 == 6)
+ reserve_bootmem(0xa0000 - 4096, 4096);
+
+#ifdef CONFIG_SMP
+ /*
+ * But first pinch a few for the stack/trampoline stuff
+ * FIXME: Don't need the extra page at 4K, but need to fix
+ * trampoline before removing it. (see the GDT stuff)
+ */
+ reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
+#endif
+#ifdef CONFIG_ACPI_SLEEP
+ /*
+ * Reserve low memory region for sleep support.
+ */
+ acpi_reserve_bootmem();
+#endif
+#ifdef CONFIG_X86_FIND_SMP_CONFIG
+ /*
+ * Find and reserve possible boot-time SMP configuration:
+ */
+ find_smp_config();
+#endif
+ numa_kva_reserve();
+#ifdef CONFIG_BLK_DEV_INITRD
+ if (LOADER_TYPE && INITRD_START) {
+ if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
+ reserve_bootmem(INITRD_START, INITRD_SIZE);
+ initrd_start = INITRD_START + PAGE_OFFSET;
+ initrd_end = initrd_start+INITRD_SIZE;
+		} else {
+ printk(KERN_ERR "initrd extends beyond end of memory "
+ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+ INITRD_START + INITRD_SIZE,
+ max_low_pfn << PAGE_SHIFT);
+ initrd_start = 0;
+ }
+ }
+#endif
+#ifdef CONFIG_KEXEC
+ if (crashk_res.start != crashk_res.end)
+ reserve_bootmem(crashk_res.start,
+ crashk_res.end - crashk_res.start + 1);
+#endif
+}
+
+/*
+ * The node 0 pgdat is initialized before all of these because
+ * it's needed for bootmem. node>0 pgdats have their virtual
+ * space allocated before the pagetables are in place to access
+ * them, so they can't be cleared then.
+ *
+ * This should all compile down to nothing when NUMA is off.
+ */
+static void __init remapped_pgdat_init(void)
+{
+ int nid;
+
+ for_each_online_node(nid) {
+ if (nid != 0)
+ memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+ }
+}
+
+#ifdef CONFIG_MCA
+static void set_mca_bus(int x)
+{
+ MCA_bus = x;
+}
+#else
+static void set_mca_bus(int x) { }
+#endif
+
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+char * __init __attribute__((weak)) memory_setup(void)
+{
+ return machine_specific_memory_setup();
+}
+
+/*
+ * Determine if we were loaded by an EFI loader. If so, then we have also been
+ * passed the efi memmap, systab, etc., so we should use these data structures
+ * for initialization. Note, the efi init code path is determined by the
+ * global efi_enabled. This allows the same kernel image to be used on existing
+ * systems (with a traditional BIOS) as well as on EFI systems.
+ */
+void __init setup_arch(char **cmdline_p)
+{
+ unsigned long max_low_pfn;
+
+ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+ pre_setup_arch_hook();
+ early_cpu_init();
+
+ /*
+ * FIXME: This isn't an official loader_type right
+ * now but does currently work with elilo.
+ * If we were configured as an EFI kernel, check to make
+ * sure that we were loaded correctly from elilo and that
+ * the system table is valid. If not, then initialize normally.
+ */
+#ifdef CONFIG_EFI
+ if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
+ efi_enabled = 1;
+#endif
+
+ ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
+ screen_info = SCREEN_INFO;
+ edid_info = EDID_INFO;
+ apm_info.bios = APM_BIOS_INFO;
+ ist_info = IST_INFO;
+ saved_videomode = VIDEO_MODE;
+	if (SYS_DESC_TABLE.length != 0) {
+ set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
+ machine_id = SYS_DESC_TABLE.table[0];
+ machine_submodel_id = SYS_DESC_TABLE.table[1];
+ BIOS_revision = SYS_DESC_TABLE.table[2];
+ }
+ bootloader_type = LOADER_TYPE;
+
+#ifdef CONFIG_BLK_DEV_RAM
+ rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
+ rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
+ rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
+#endif
+ ARCH_SETUP
+ if (efi_enabled)
+ efi_init();
+ else {
+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+ print_memory_map(memory_setup());
+ }
+
+ copy_edd();
+
+ if (!MOUNT_ROOT_RDONLY)
+ root_mountflags &= ~MS_RDONLY;
+ init_mm.start_code = (unsigned long) _text;
+ init_mm.end_code = (unsigned long) _etext;
+ init_mm.end_data = (unsigned long) _edata;
+ init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
+
+ code_resource.start = virt_to_phys(_text);
+ code_resource.end = virt_to_phys(_etext)-1;
+ data_resource.start = virt_to_phys(_etext);
+ data_resource.end = virt_to_phys(_edata)-1;
+
+ parse_early_param();
+
+ if (user_defined_memmap) {
+ printk(KERN_INFO "user-defined physical RAM map:\n");
+ print_memory_map("user");
+ }
+
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+ max_low_pfn = setup_memory();
+
+#ifdef CONFIG_VMI
+ /*
+ * Must be after max_low_pfn is determined, and before kernel
+ * pagetables are setup.
+ */
+ vmi_init();
+#endif
+
+ /*
+ * NOTE: before this point _nobody_ is allowed to allocate
+ * any memory using the bootmem allocator. Although the
+	 * allocator is now initialised, only the first 8MB of the kernel
+ * virtual address space has been mapped. All allocations before
+ * paging_init() has completed must use the alloc_bootmem_low_pages()
+ * variant (which allocates DMA'able memory) and care must be taken
+	 * not to exceed the 8MB limit.
+ */
+
+#ifdef CONFIG_SMP
+ smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
+#endif
+ paging_init();
+ remapped_pgdat_init();
+ sparse_init();
+ zone_sizes_init();
+
+ /*
+ * NOTE: at this point the bootmem allocator is fully available.
+ */
+
+ paravirt_post_allocator_init();
+
+ dmi_scan_machine();
+
+#ifdef CONFIG_X86_GENERICARCH
+ generic_apic_probe();
+#endif
+ if (efi_enabled)
+ efi_map_memmap();
+
+#ifdef CONFIG_ACPI
+ /*
+ * Parse the ACPI tables for possible boot-time SMP configuration.
+ */
+ acpi_boot_table_init();
+#endif
+
+#ifdef CONFIG_PCI
+#ifdef CONFIG_X86_IO_APIC
+ check_acpi_pci(); /* Checks more than just ACPI actually */
+#endif
+#endif
+
+#ifdef CONFIG_ACPI
+ acpi_boot_init();
+
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
+ if (def_to_bigsmp)
+ printk(KERN_WARNING "More than 8 CPUs detected and "
+ "CONFIG_X86_PC cannot handle it.\nUse "
+ "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
+#endif
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (smp_found_config)
+ get_smp_config();
+#endif
+
+ e820_register_memory();
+ e820_mark_nosave_regions();
+
+#ifdef CONFIG_VT
+#if defined(CONFIG_VGA_CONSOLE)
+ if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ conswitchp = &vga_con;
+#elif defined(CONFIG_DUMMY_CONSOLE)
+ conswitchp = &dummy_con;
+#endif
+#endif
+}
diff --git a/arch/x86/kernel/sigframe_32.h b/arch/x86/kernel/sigframe_32.h
new file mode 100644
index 000000000000..0b2221711dad
--- /dev/null
+++ b/arch/x86/kernel/sigframe_32.h
@@ -0,0 +1,21 @@
+struct sigframe
+{
+ char __user *pretcode;
+ int sig;
+ struct sigcontext sc;
+ struct _fpstate fpstate;
+ unsigned long extramask[_NSIG_WORDS-1];
+ char retcode[8];
+};
+
+struct rt_sigframe
+{
+ char __user *pretcode;
+ int sig;
+ struct siginfo __user *pinfo;
+ void __user *puc;
+ struct siginfo info;
+ struct ucontext uc;
+ struct _fpstate fpstate;
+ char retcode[8];
+};
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
new file mode 100644
index 000000000000..c03570f7fe8e
--- /dev/null
+++ b/arch/x86/kernel/signal_32.c
@@ -0,0 +1,667 @@
+/*
+ * linux/arch/i386/kernel/signal.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
+ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/personality.h>
+#include <linux/suspend.h>
+#include <linux/ptrace.h>
+#include <linux/elf.h>
+#include <linux/binfmts.h>
+#include <asm/processor.h>
+#include <asm/ucontext.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+#include "sigframe_32.h"
+
+#define DEBUG_SIG 0
+
+#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
+/*
+ * Atomically swap in the new signal mask, and wait for a signal.
+ */
+asmlinkage int
+sys_sigsuspend(int history0, int history1, old_sigset_t mask)
+{
+ mask &= _BLOCKABLE;
+ spin_lock_irq(&current->sighand->siglock);
+ current->saved_sigmask = current->blocked;
+ siginitset(&current->blocked, mask);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ set_thread_flag(TIF_RESTORE_SIGMASK);
+ return -ERESTARTNOHAND;
+}
+
+asmlinkage int
+sys_sigaction(int sig, const struct old_sigaction __user *act,
+ struct old_sigaction __user *oact)
+{
+ struct k_sigaction new_ka, old_ka;
+ int ret;
+
+ if (act) {
+ old_sigset_t mask;
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
+ __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
+ return -EFAULT;
+ __get_user(new_ka.sa.sa_flags, &act->sa_flags);
+ __get_user(mask, &act->sa_mask);
+ siginitset(&new_ka.sa.sa_mask, mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+ if (!ret && oact) {
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
+ __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
+ return -EFAULT;
+ __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
+ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
+ }
+
+ return ret;
+}
+
+asmlinkage int
+sys_sigaltstack(unsigned long ebx)
+{
+ /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
+ struct pt_regs *regs = (struct pt_regs *)&ebx;
+ const stack_t __user *uss = (const stack_t __user *)ebx;
+ stack_t __user *uoss = (stack_t __user *)regs->ecx;
+
+ return do_sigaltstack(uss, uoss, regs->esp);
+}
+
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+
+static int
+restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax)
+{
+ unsigned int err = 0;
+
+ /* Always make any pending restarted system calls return -EINTR */
+ current_thread_info()->restart_block.fn = do_no_restart_syscall;
+
+#define COPY(x) err |= __get_user(regs->x, &sc->x)
+
+#define COPY_SEG(seg) \
+ { unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ regs->x##seg = tmp; }
+
+#define COPY_SEG_STRICT(seg) \
+ { unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ regs->x##seg = tmp|3; }
+
+#define GET_SEG(seg) \
+ { unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ loadsegment(seg,tmp); }
+
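+/* Only these application-visible flag bits may be restored from the
+   sigcontext; privileged bits such as IF and IOPL keep their current
+   values. */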
+#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_RF | \
+ X86_EFLAGS_OF | X86_EFLAGS_DF | \
+ X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
+ X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
+
+ GET_SEG(gs);
+ COPY_SEG(fs);
+ COPY_SEG(es);
+ COPY_SEG(ds);
+ COPY(edi);
+ COPY(esi);
+ COPY(ebp);
+ COPY(esp);
+ COPY(ebx);
+ COPY(edx);
+ COPY(ecx);
+ COPY(eip);
+ COPY_SEG_STRICT(cs);
+ COPY_SEG_STRICT(ss);
+
+ {
+ unsigned int tmpflags;
+ err |= __get_user(tmpflags, &sc->eflags);
+ regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+ regs->orig_eax = -1; /* disable syscall checks */
+ }
+
+ {
+ struct _fpstate __user * buf;
+ err |= __get_user(buf, &sc->fpstate);
+ if (buf) {
+ if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
+ goto badframe;
+ err |= restore_i387(buf);
+ } else {
+ struct task_struct *me = current;
+ if (used_math()) {
+ clear_fpu(me);
+ clear_used_math();
+ }
+ }
+ }
+
+ err |= __get_user(*peax, &sc->eax);
+ return err;
+
+badframe:
+ return 1;
+}
+
+asmlinkage int sys_sigreturn(unsigned long __unused)
+{
+ struct pt_regs *regs = (struct pt_regs *) &__unused;
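+	/* The return address (pretcode) and the signal number have already
+	   been popped by the sigreturn stub, so the frame starts 8 bytes
+	   below the current stack pointer. */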
+ struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8);
+ sigset_t set;
+ int eax;
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(set.sig[0], &frame->sc.oldmask)
+ || (_NSIG_WORDS > 1
+ && __copy_from_user(&set.sig[1], &frame->extramask,
+ sizeof(frame->extramask))))
+ goto badframe;
+
+ sigdelsetmask(&set, ~_BLOCKABLE);
+ spin_lock_irq(&current->sighand->siglock);
+ current->blocked = set;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (restore_sigcontext(regs, &frame->sc, &eax))
+ goto badframe;
+ return eax;
+
+badframe:
+ if (show_unhandled_signals && printk_ratelimit())
+ printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx"
+ " esp:%lx oeax:%lx\n",
+ current->pid > 1 ? KERN_INFO : KERN_EMERG,
+ current->comm, current->pid, frame, regs->eip,
+ regs->esp, regs->orig_eax);
+
+ force_sig(SIGSEGV, current);
+ return 0;
+}
+
+asmlinkage int sys_rt_sigreturn(unsigned long __unused)
+{
+ struct pt_regs *regs = (struct pt_regs *) &__unused;
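+	/* Only pretcode has been popped for rt frames, hence esp - 4. */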
+ struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4);
+ sigset_t set;
+ int eax;
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+ goto badframe;
+
+ sigdelsetmask(&set, ~_BLOCKABLE);
+ spin_lock_irq(&current->sighand->siglock);
+ current->blocked = set;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
+ goto badframe;
+
+ if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT)
+ goto badframe;
+
+ return eax;
+
+badframe:
+ force_sig(SIGSEGV, current);
+ return 0;
+}
+
+/*
+ * Set up a signal frame.
+ */
+
+static int
+setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
+ struct pt_regs *regs, unsigned long mask)
+{
+ int tmp, err = 0;
+
+ err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
+ savesegment(gs, tmp);
+ err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
+
+ err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
+ err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
+ err |= __put_user(regs->edi, &sc->edi);
+ err |= __put_user(regs->esi, &sc->esi);
+ err |= __put_user(regs->ebp, &sc->ebp);
+ err |= __put_user(regs->esp, &sc->esp);
+ err |= __put_user(regs->ebx, &sc->ebx);
+ err |= __put_user(regs->edx, &sc->edx);
+ err |= __put_user(regs->ecx, &sc->ecx);
+ err |= __put_user(regs->eax, &sc->eax);
+ err |= __put_user(current->thread.trap_no, &sc->trapno);
+ err |= __put_user(current->thread.error_code, &sc->err);
+ err |= __put_user(regs->eip, &sc->eip);
+ err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs);
+ err |= __put_user(regs->eflags, &sc->eflags);
+ err |= __put_user(regs->esp, &sc->esp_at_signal);
+ err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss);
+
+ tmp = save_i387(fpstate);
+ if (tmp < 0)
+ err = 1;
+ else
+ err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
+
+ /* non-iBCS2 extensions.. */
+ err |= __put_user(mask, &sc->oldmask);
+ err |= __put_user(current->thread.cr2, &sc->cr2);
+
+ return err;
+}
+
+/*
+ * Determine which stack to use..
+ */
+static inline void __user *
+get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
+{
+ unsigned long esp;
+
+ /* Default to using normal stack */
+ esp = regs->esp;
+
+ /* This is the X/Open sanctioned signal stack switching. */
+ if (ka->sa.sa_flags & SA_ONSTACK) {
+ if (sas_ss_flags(esp) == 0)
+ esp = current->sas_ss_sp + current->sas_ss_size;
+ }
+
+ /* This is the legacy signal stack switching. */
+ else if ((regs->xss & 0xffff) != __USER_DS &&
+ !(ka->sa.sa_flags & SA_RESTORER) &&
+ ka->sa.sa_restorer) {
+ esp = (unsigned long) ka->sa.sa_restorer;
+ }
+
+ esp -= frame_size;
+ /* Align the stack pointer according to the i386 ABI,
+ * i.e. so that on function entry ((sp + 4) & 15) == 0. */
+ esp = ((esp + 4) & -16ul) - 4;
+ return (void __user *) esp;
+}
+
+/* These symbols are defined with the addresses in the vsyscall page.
+ See vsyscall-sigreturn.S. */
+extern void __user __kernel_sigreturn;
+extern void __user __kernel_rt_sigreturn;
+
+static int setup_frame(int sig, struct k_sigaction *ka,
+ sigset_t *set, struct pt_regs * regs)
+{
+ void __user *restorer;
+ struct sigframe __user *frame;
+ int err = 0;
+ int usig;
+
+ frame = get_sigframe(ka, regs, sizeof(*frame));
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ goto give_sigsegv;
+
+ usig = current_thread_info()->exec_domain
+ && current_thread_info()->exec_domain->signal_invmap
+ && sig < 32
+ ? current_thread_info()->exec_domain->signal_invmap[sig]
+ : sig;
+
+ err = __put_user(usig, &frame->sig);
+ if (err)
+ goto give_sigsegv;
+
+ err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
+ if (err)
+ goto give_sigsegv;
+
+ if (_NSIG_WORDS > 1) {
+ err = __copy_to_user(&frame->extramask, &set->sig[1],
+ sizeof(frame->extramask));
+ if (err)
+ goto give_sigsegv;
+ }
+
+ if (current->binfmt->hasvdso)
+ restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
+ else
+ restorer = (void *)&frame->retcode;
+ if (ka->sa.sa_flags & SA_RESTORER)
+ restorer = ka->sa.sa_restorer;
+
+ /* Set up to return from userspace. */
+ err |= __put_user(restorer, &frame->pretcode);
+
+ /*
+ * This is popl %eax ; movl $,%eax ; int $0x80
+ *
+ * WE DO NOT USE IT ANY MORE! It's only left here for historical
+ * reasons and because gdb uses it as a signature to notice
+ * signal handler stack frames.
+ */
+ err |= __put_user(0xb858, (short __user *)(frame->retcode+0));
+ err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
+ err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
+
+ if (err)
+ goto give_sigsegv;
+
+ /* Set up registers for signal handler */
+ regs->esp = (unsigned long) frame;
+ regs->eip = (unsigned long) ka->sa.sa_handler;
+ regs->eax = (unsigned long) sig;
+ regs->edx = (unsigned long) 0;
+ regs->ecx = (unsigned long) 0;
+
+ set_fs(USER_DS);
+ regs->xds = __USER_DS;
+ regs->xes = __USER_DS;
+ regs->xss = __USER_DS;
+ regs->xcs = __USER_CS;
+
+ /*
+ * Clear TF when entering the signal handler, but
+ * notify any tracer that was single-stepping it.
+ * The tracer may want to single-step inside the
+ * handler too.
+ */
+ regs->eflags &= ~TF_MASK;
+ if (test_thread_flag(TIF_SINGLESTEP))
+ ptrace_notify(SIGTRAP);
+
+#if DEBUG_SIG
+ printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+ current->comm, current->pid, frame, regs->eip, frame->pretcode);
+#endif
+
+ return 0;
+
+give_sigsegv:
+ force_sigsegv(sig, current);
+ return -EFAULT;
+}
+
+static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ sigset_t *set, struct pt_regs * regs)
+{
+ void __user *restorer;
+ struct rt_sigframe __user *frame;
+ int err = 0;
+ int usig;
+
+ frame = get_sigframe(ka, regs, sizeof(*frame));
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ goto give_sigsegv;
+
+ usig = current_thread_info()->exec_domain
+ && current_thread_info()->exec_domain->signal_invmap
+ && sig < 32
+ ? current_thread_info()->exec_domain->signal_invmap[sig]
+ : sig;
+
+ err |= __put_user(usig, &frame->sig);
+ err |= __put_user(&frame->info, &frame->pinfo);
+ err |= __put_user(&frame->uc, &frame->puc);
+ err |= copy_siginfo_to_user(&frame->info, info);
+ if (err)
+ goto give_sigsegv;
+
+ /* Create the ucontext. */
+ err |= __put_user(0, &frame->uc.uc_flags);
+ err |= __put_user(0, &frame->uc.uc_link);
+ err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+ err |= __put_user(sas_ss_flags(regs->esp),
+ &frame->uc.uc_stack.ss_flags);
+ err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+ if (err)
+ goto give_sigsegv;
+
+ /* Set up to return from userspace. */
+ restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn);
+ if (ka->sa.sa_flags & SA_RESTORER)
+ restorer = ka->sa.sa_restorer;
+ err |= __put_user(restorer, &frame->pretcode);
+
+ /*
+ * This is movl $,%eax ; int $0x80
+ *
+ * WE DO NOT USE IT ANY MORE! It's only left here for historical
+ * reasons and because gdb uses it as a signature to notice
+ * signal handler stack frames.
+ */
+ err |= __put_user(0xb8, (char __user *)(frame->retcode+0));
+ err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
+ err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
+
+ if (err)
+ goto give_sigsegv;
+
+ /* Set up registers for signal handler */
+ regs->esp = (unsigned long) frame;
+ regs->eip = (unsigned long) ka->sa.sa_handler;
+ regs->eax = (unsigned long) usig;
+ regs->edx = (unsigned long) &frame->info;
+ regs->ecx = (unsigned long) &frame->uc;
+
+ set_fs(USER_DS);
+ regs->xds = __USER_DS;
+ regs->xes = __USER_DS;
+ regs->xss = __USER_DS;
+ regs->xcs = __USER_CS;
+
+ /*
+ * Clear TF when entering the signal handler, but
+ * notify any tracer that was single-stepping it.
+ * The tracer may want to single-step inside the
+ * handler too.
+ */
+ regs->eflags &= ~TF_MASK;
+ if (test_thread_flag(TIF_SINGLESTEP))
+ ptrace_notify(SIGTRAP);
+
+#if DEBUG_SIG
+ printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+ current->comm, current->pid, frame, regs->eip, frame->pretcode);
+#endif
+
+ return 0;
+
+give_sigsegv:
+ force_sigsegv(sig, current);
+ return -EFAULT;
+}
+
+/*
+ * OK, we're invoking a handler
+ */
+
+static int
+handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
+ sigset_t *oldset, struct pt_regs * regs)
+{
+ int ret;
+
+ /* Are we from a system call? */
+ if (regs->orig_eax >= 0) {
+ /* If so, check system call restarting.. */
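+		/* int $0x80 is a two-byte instruction, so rewinding eip
+		   by 2 below re-executes the system call */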
+ switch (regs->eax) {
+ case -ERESTART_RESTARTBLOCK:
+ case -ERESTARTNOHAND:
+ regs->eax = -EINTR;
+ break;
+
+ case -ERESTARTSYS:
+ if (!(ka->sa.sa_flags & SA_RESTART)) {
+ regs->eax = -EINTR;
+ break;
+ }
+ /* fallthrough */
+ case -ERESTARTNOINTR:
+ regs->eax = regs->orig_eax;
+ regs->eip -= 2;
+ }
+ }
+
+ /*
+ * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so
+ * that register information in the sigcontext is correct.
+ */
+ if (unlikely(regs->eflags & TF_MASK)
+ && likely(current->ptrace & PT_DTRACE)) {
+ current->ptrace &= ~PT_DTRACE;
+ regs->eflags &= ~TF_MASK;
+ }
+
+ /* Set up the stack frame */
+ if (ka->sa.sa_flags & SA_SIGINFO)
+ ret = setup_rt_frame(sig, ka, info, oldset, regs);
+ else
+ ret = setup_frame(sig, ka, oldset, regs);
+
+ if (ret == 0) {
+ spin_lock_irq(&current->sighand->siglock);
+ sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
+ if (!(ka->sa.sa_flags & SA_NODEFER))
+ sigaddset(&current->blocked,sig);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ }
+
+ return ret;
+}
+
+/*
+ * Note that 'init' is a special process: it doesn't get signals it doesn't
+ * want to handle. Thus you cannot kill init even with a SIGKILL even by
+ * mistake.
+ */
+static void fastcall do_signal(struct pt_regs *regs)
+{
+ siginfo_t info;
+ int signr;
+ struct k_sigaction ka;
+ sigset_t *oldset;
+
+ /*
+ * We want the common case to go fast, which
+ * is why we may in certain cases get here from
+ * kernel mode. Just return without doing anything
+ * if so. vm86 regs switched out by assembly code
+ * before reaching here, so testing against kernel
+ * CS suffices.
+ */
+ if (!user_mode(regs))
+ return;
+
+ if (test_thread_flag(TIF_RESTORE_SIGMASK))
+ oldset = &current->saved_sigmask;
+ else
+ oldset = &current->blocked;
+
+ signr = get_signal_to_deliver(&info, &ka, regs, NULL);
+ if (signr > 0) {
+ /* Reenable any watchpoints before delivering the
+ * signal to user space. The processor register will
+ * have been cleared if the watchpoint triggered
+ * inside the kernel.
+ */
+ if (unlikely(current->thread.debugreg[7]))
+ set_debugreg(current->thread.debugreg[7], 7);
+
+ /* Whee! Actually deliver the signal. */
+ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
+ /* a signal was successfully delivered; the saved
+ * sigmask will have been stored in the signal frame,
+ * and will be restored by sigreturn, so we can simply
+ * clear the TIF_RESTORE_SIGMASK flag */
+ if (test_thread_flag(TIF_RESTORE_SIGMASK))
+ clear_thread_flag(TIF_RESTORE_SIGMASK);
+ }
+
+ return;
+ }
+
+ /* Did we come from a system call? */
+ if (regs->orig_eax >= 0) {
+ /* Restart the system call - no handlers present */
+ switch (regs->eax) {
+ case -ERESTARTNOHAND:
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ regs->eax = regs->orig_eax;
+ regs->eip -= 2;
+ break;
+
+ case -ERESTART_RESTARTBLOCK:
+ regs->eax = __NR_restart_syscall;
+ regs->eip -= 2;
+ break;
+ }
+ }
+
+ /* if there's no signal to deliver, we just put the saved sigmask
+ * back */
+ if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
+ clear_thread_flag(TIF_RESTORE_SIGMASK);
+ sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
+ }
+}
+
+/*
+ * notification of userspace execution resumption
+ * - triggered by the TIF_WORK_MASK flags
+ */
+__attribute__((regparm(3)))
+void do_notify_resume(struct pt_regs *regs, void *_unused,
+ __u32 thread_info_flags)
+{
+ /* Pending single-step? */
+ if (thread_info_flags & _TIF_SINGLESTEP) {
+ regs->eflags |= TF_MASK;
+ clear_thread_flag(TIF_SINGLESTEP);
+ }
+
+ /* deal with pending signal delivery */
+ if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
+ do_signal(regs);
+
+ clear_thread_flag(TIF_IRET);
+}
diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
new file mode 100644
index 000000000000..2d35d8502029
--- /dev/null
+++ b/arch/x86/kernel/smp_32.c
@@ -0,0 +1,707 @@
+/*
+ * Intel SMP support routines.
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ * This code is released under the GNU General Public License version 2 or
+ * later.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <mach_apic.h>
+
+/*
+ * Some notes on x86 processor bugs affecting SMP operation:
+ *
+ * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
+ * The Linux implications for SMP are handled as follows:
+ *
+ * Pentium III / [Xeon]
+ * None of the E1AP-E3AP errata are visible to the user.
+ *
+ * E1AP. see PII A1AP
+ * E2AP. see PII A2AP
+ * E3AP. see PII A3AP
+ *
+ * Pentium II / [Xeon]
+ * None of the A1AP-A3AP errata are visible to the user.
+ *
+ * A1AP. see PPro 1AP
+ * A2AP. see PPro 2AP
+ * A3AP. see PPro 7AP
+ *
+ * Pentium Pro
+ *		None of the 1AP-9AP errata are visible to the normal user,
+ * except occasional delivery of 'spurious interrupt' as trap #15.
+ * This is very rare and a non-problem.
+ *
+ * 1AP. Linux maps APIC as non-cacheable
+ * 2AP. worked around in hardware
+ * 3AP. fixed in C0 and above steppings microcode update.
+ * Linux does not use excessive STARTUP_IPIs.
+ * 4AP. worked around in hardware
+ * 5AP. symmetric IO mode (normal Linux operation) not affected.
+ * 'noapic' mode has vector 0xf filled out properly.
+ * 6AP. 'noapic' mode might be affected - fixed in later steppings
+ *	7AP.	We do not assume writes to the LVT deassert IRQs
+ * 8AP. We do not enable low power mode (deep sleep) during MP bootup
+ * 9AP. We do not use mixed mode
+ *
+ * Pentium
+ * There is a marginal case where REP MOVS on 100MHz SMP
+ * machines with B stepping processors can fail. XXX should provide
+ * an L1cache=Writethrough or L1cache=off option.
+ *
+ *	B stepping CPUs may hang. There are hardware workarounds
+ *	for this. We warn about it in case your board doesn't have the
+ *	workarounds. Basically that's so I can tell anyone with a B stepping
+ * CPU and SMP problems "tough".
+ *
+ * Specific items [From Pentium Processor Specification Update]
+ *
+ * 1AP. Linux doesn't use remote read
+ * 2AP. Linux doesn't trust APIC errors
+ * 3AP. We work around this
+ * 4AP. Linux never generated 3 interrupts of the same priority
+ * to cause a lost local interrupt.
+ * 5AP. Remote read is never used
+ * 6AP. not affected - worked around in hardware
+ * 7AP. not affected - worked around in hardware
+ * 8AP. worked around in hardware - we get explicit CS errors if not
+ * 9AP. only 'noapic' mode affected. Might generate spurious
+ * interrupts, we log only the first one and count the
+ * rest silently.
+ * 10AP. not affected - worked around in hardware
+ * 11AP. Linux reads the APIC between writes to avoid this, as per
+ * the documentation. Make sure you preserve this as it affects
+ * the C stepping chips too.
+ * 12AP. not affected - worked around in hardware
+ * 13AP. not affected - worked around in hardware
+ * 14AP. we always deassert INIT during bootup
+ * 15AP. not affected - worked around in hardware
+ * 16AP. not affected - worked around in hardware
+ * 17AP. not affected - worked around in hardware
+ * 18AP. not affected - worked around in hardware
+ * 19AP. not affected - worked around in BIOS
+ *
+ * If this sounds worrying, believe me: these bugs are either ___RARE___
+ * or are signal timing bugs worked around in hardware, and there's
+ * nothing of note from the C stepping upwards.
+ */
+
+DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
+
+/*
+ * The following functions deal with sending IPIs between CPUs.
+ *
+ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
+ */
+
+static inline int __prepare_ICR(unsigned int shortcut, int vector)
+{
+ unsigned int icr = shortcut | APIC_DEST_LOGICAL;
+
+ switch (vector) {
+ default:
+ icr |= APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr |= APIC_DM_NMI;
+ break;
+ }
+ return icr;
+}
+
+static inline int __prepare_ICR2(unsigned int mask)
+{
+ return SET_APIC_DEST_FIELD(mask);
+}
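+
+/*
+ * For example, __prepare_ICR(APIC_DEST_ALLBUT, RESCHEDULE_VECTOR)
+ * yields APIC_DEST_ALLBUT | APIC_DEST_LOGICAL | APIC_DM_FIXED |
+ * RESCHEDULE_VECTOR: a fixed-delivery, logical-mode IPI aimed at
+ * every CPU except the sender.
+ */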
+
+void __send_IPI_shortcut(unsigned int shortcut, int vector)
+{
+ /*
+ * Subtle. In the case of the 'never do double writes' workaround
+ * we have to lock out interrupts to be safe. As we don't care
+ * about the value read, we use an atomic rmw access to avoid costly
+ * cli/sti. Otherwise we use an even cheaper single atomic write
+ * to the APIC.
+ */
+ unsigned int cfg;
+
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+
+ /*
+ * No need to touch the target chip field
+ */
+ cfg = __prepare_ICR(shortcut, vector);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ apic_write_around(APIC_ICR, cfg);
+}
+
+void fastcall send_IPI_self(int vector)
+{
+ __send_IPI_shortcut(APIC_DEST_SELF, vector);
+}
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+static inline void __send_IPI_dest_field(unsigned long mask, int vector)
+{
+ unsigned long cfg;
+
+ /*
+ * Wait for idle.
+ */
+ if (unlikely(vector == NMI_VECTOR))
+ safe_apic_wait_icr_idle();
+ else
+ apic_wait_icr_idle();
+
+ /*
+ * prepare target chip field
+ */
+ cfg = __prepare_ICR2(mask);
+ apic_write_around(APIC_ICR2, cfg);
+
+ /*
+ * program the ICR
+ */
+ cfg = __prepare_ICR(0, vector);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ apic_write_around(APIC_ICR, cfg);
+}
+
+/*
+ * This is only used on smaller machines.
+ */
+void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+{
+ unsigned long mask = cpus_addr(cpumask)[0];
+ unsigned long flags;
+
+ local_irq_save(flags);
+ WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
+ __send_IPI_dest_field(mask, vector);
+ local_irq_restore(flags);
+}
+
+void send_IPI_mask_sequence(cpumask_t mask, int vector)
+{
+ unsigned long flags;
+ unsigned int query_cpu;
+
+ /*
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+ * to an arbitrary mask, so I do a unicast to each CPU instead. This
+ * should be modified to do 1 message per cluster ID - mbligh
+ */
+
+ local_irq_save(flags);
+ for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
+ if (cpu_isset(query_cpu, mask)) {
+ __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
+ vector);
+ }
+ }
+ local_irq_restore(flags);
+}
+
+#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
+
+/*
+ * Smarter SMP flushing macros.
+ * c/o Linus Torvalds.
+ *
+ * These mean you can really definitely utterly forget about
+ * writing to user space from interrupts. (It's not allowed anyway.)
+ *
+ * Optimizations Manfred Spraul <manfred@colorfullife.com>
+ */
+
+static cpumask_t flush_cpumask;
+static struct mm_struct * flush_mm;
+static unsigned long flush_va;
+static DEFINE_SPINLOCK(tlbstate_lock);
+
+/*
+ * We cannot call mmdrop() because we are in interrupt context,
+ * instead update mm->cpu_vm_mask.
+ *
+ * We need to reload %cr3 since the page tables may be going
+ * away from under us..
+ */
+void leave_mm(unsigned long cpu)
+{
+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
+ BUG();
+ cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
+ load_cr3(swapper_pg_dir);
+}
+
+/*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
+ * [cpu0: the cpu that switches]
+ * 1) switch_mm() either 1a) or 1b)
+ * 1a) thread switch to a different mm
+ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ * Stop ipi delivery for the old mm. This is not synchronized with
+ * the other cpus, but smp_invalidate_interrupt ignores flush ipis
+ * for the wrong mm, and in the worst case we perform a superfluous
+ * tlb flush.
+ * 1a2) set cpu_tlbstate to TLBSTATE_OK
+ * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+ * was in lazy tlb mode.
+ * 1a3) update cpu_tlbstate[].active_mm
+ * Now cpu0 accepts tlb flushes for the new mm.
+ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+ * Now the other cpus will send tlb flush ipis.
+ * 1a5) change cr3.
+ * 1b) thread switch without mm change
+ * cpu_tlbstate[].active_mm is correct, cpu0 already handles
+ * flush ipis.
+ * 1b1) set cpu_tlbstate to TLBSTATE_OK
+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+ * Atomically set the bit [other cpus will start sending flush ipis],
+ * and test the bit.
+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+ * 2) switch %%esp, i.e. current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %%esp, i.e. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ * runs in kernel space, the cpu could load tlb entries for user space
+ * pages.
+ *
+ * The good news is that cpu_tlbstate is local to each cpu, no
+ * write/read ordering problems.
+ */
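+
+/*
+ * A rough sketch of case 1b above as it might look in switch_mm()
+ * (simplified and illustrative; the real code lives in mmu_context.h):
+ *
+ *	per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;	(1b1)
+ *	if (!cpu_test_and_set(cpu, mm->cpu_vm_mask))	(1b2)
+ *		load_cr3(mm->pgd);	(1b3: the bit was 0, so
+ *					 leave_mm() ran; reload cr3
+ *					 to flush the stale entries)
+ */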
+
+/*
+ * TLB flush IPI:
+ *
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ */
+
+fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
+{
+ unsigned long cpu;
+
+ cpu = get_cpu();
+
+ if (!cpu_isset(cpu, flush_cpumask))
+ goto out;
+ /*
+ * This was a BUG() but until someone can quote me the
+ * line from the intel manual that guarantees an IPI to
+ * multiple CPUs is retried _only_ on the erroring CPUs,
+ * it's staying as a return
+ *
+ * BUG();
+ */
+
+ if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
+ if (flush_va == TLB_FLUSH_ALL)
+ local_flush_tlb();
+ else
+ __flush_tlb_one(flush_va);
+ } else
+ leave_mm(cpu);
+ }
+ ack_APIC_irq();
+ smp_mb__before_clear_bit();
+ cpu_clear(cpu, flush_cpumask);
+ smp_mb__after_clear_bit();
+out:
+ put_cpu_no_resched();
+}
+
+void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
+ unsigned long va)
+{
+ cpumask_t cpumask = *cpumaskp;
+
+ /*
+ * A couple of (to be removed) sanity checks:
+ *
+ * - current CPU must not be in mask
+ * - mask must exist :)
+ */
+ BUG_ON(cpus_empty(cpumask));
+ BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+ BUG_ON(!mm);
+
+#ifdef CONFIG_HOTPLUG_CPU
+ /* If a CPU which we ran on has gone down, OK. */
+ cpus_and(cpumask, cpumask, cpu_online_map);
+ if (unlikely(cpus_empty(cpumask)))
+ return;
+#endif
+
+ /*
+ * I'm not happy about this global shared spinlock in the
+ * MM hot path, but we'll see how contended it is.
+ * AK: x86-64 has a faster method that could be ported.
+ */
+ spin_lock(&tlbstate_lock);
+
+ flush_mm = mm;
+ flush_va = va;
+ cpus_or(flush_cpumask, cpumask, flush_cpumask);
+ /*
+ * We have to send the IPI only to
+ * CPUs affected.
+ */
+ send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+
+ while (!cpus_empty(flush_cpumask))
+ /* nothing. lockup detection does not belong here */
+ cpu_relax();
+
+ flush_mm = NULL;
+ flush_va = 0;
+ spin_unlock(&tlbstate_lock);
+}
+
+void flush_tlb_current_task(void)
+{
+ struct mm_struct *mm = current->mm;
+ cpumask_t cpu_mask;
+
+ preempt_disable();
+ cpu_mask = mm->cpu_vm_mask;
+ cpu_clear(smp_processor_id(), cpu_mask);
+
+ local_flush_tlb();
+ if (!cpus_empty(cpu_mask))
+ flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+ preempt_enable();
+}
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+ cpumask_t cpu_mask;
+
+ preempt_disable();
+ cpu_mask = mm->cpu_vm_mask;
+ cpu_clear(smp_processor_id(), cpu_mask);
+
+ if (current->active_mm == mm) {
+ if (current->mm)
+ local_flush_tlb();
+ else
+ leave_mm(smp_processor_id());
+ }
+ if (!cpus_empty(cpu_mask))
+ flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+
+ preempt_enable();
+}
+
+void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ cpumask_t cpu_mask;
+
+ preempt_disable();
+ cpu_mask = mm->cpu_vm_mask;
+ cpu_clear(smp_processor_id(), cpu_mask);
+
+ if (current->active_mm == mm) {
+ if (current->mm)
+ __flush_tlb_one(va);
+ else
+ leave_mm(smp_processor_id());
+ }
+
+ if (!cpus_empty(cpu_mask))
+ flush_tlb_others(cpu_mask, mm, va);
+
+ preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_page);
+
+static void do_flush_tlb_all(void *info)
+{
+ unsigned long cpu = smp_processor_id();
+
+ __flush_tlb_all();
+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
+ leave_mm(cpu);
+}
+
+void flush_tlb_all(void)
+{
+ on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
+}
+
+/*
+ * This function sends a 'reschedule' IPI to another CPU.
+ * It goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ */
+static void native_smp_send_reschedule(int cpu)
+{
+ WARN_ON(cpu_is_offline(cpu));
+ send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+}
+
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ */
+static DEFINE_SPINLOCK(call_lock);
+
+struct call_data_struct {
+ void (*func) (void *info);
+ void *info;
+ atomic_t started;
+ atomic_t finished;
+ int wait;
+};
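+
+/*
+ * Handshake sketch: the initiator publishes a stack-based
+ * call_data_struct via call_data, fires the IPI, then spins until
+ * ->started equals the number of targets.  Each target bumps
+ * ->started before running ->func and, iff ->wait, bumps ->finished
+ * afterwards, so the initiator knows when the structure may go out
+ * of scope.
+ */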
+
+void lock_ipi_call_lock(void)
+{
+ spin_lock_irq(&call_lock);
+}
+
+void unlock_ipi_call_lock(void)
+{
+ spin_unlock_irq(&call_lock);
+}
+
+static struct call_data_struct *call_data;
+
+static void __smp_call_function(void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ struct call_data_struct data;
+ int cpus = num_online_cpus() - 1;
+
+ if (!cpus)
+ return;
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ mb();
+
+ /* Send a message to all other CPUs and wait for them to respond */
+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus)
+ cpu_relax();
+
+ if (wait)
+ while (atomic_read(&data.finished) != cpus)
+ cpu_relax();
+}
+
+
+/**
+ * smp_call_function_mask(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on. Must not include the current cpu.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+static int
+native_smp_call_function_mask(cpumask_t mask,
+ void (*func)(void *), void *info,
+ int wait)
+{
+ struct call_data_struct data;
+ cpumask_t allbutself;
+ int cpus;
+
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON(irqs_disabled());
+
+ /* Holding any lock stops cpus from going down. */
+ spin_lock(&call_lock);
+
+ allbutself = cpu_online_map;
+ cpu_clear(smp_processor_id(), allbutself);
+
+ cpus_and(mask, mask, allbutself);
+ cpus = cpus_weight(mask);
+
+ if (!cpus) {
+ spin_unlock(&call_lock);
+ return 0;
+ }
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ mb();
+
+ /* Send a message to other CPUs */
+ if (cpus_equal(mask, allbutself))
+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+ else
+ send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus)
+ cpu_relax();
+
+ if (wait)
+ while (atomic_read(&data.finished) != cpus)
+ cpu_relax();
+ spin_unlock(&call_lock);
+
+ return 0;
+}
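+
+/*
+ * A minimal usage sketch (drain_queue() is a hypothetical fast,
+ * non-blocking handler; interrupts must be enabled, preemption
+ * disabled around smp_processor_id()):
+ *
+ *	cpumask_t others = cpu_online_map;
+ *	cpu_clear(smp_processor_id(), others);
+ *	smp_call_function_mask(others, drain_queue, NULL, 1);
+ */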
+
+static void stop_this_cpu(void *dummy)
+{
+ local_irq_disable();
+ /*
+ * Remove this CPU:
+ */
+ cpu_clear(smp_processor_id(), cpu_online_map);
+ disable_local_APIC();
+ if (cpu_data[smp_processor_id()].hlt_works_ok)
+ for(;;) halt();
+ for (;;);
+}
+
+/*
+ * This function calls the 'stop' function on all other CPUs in the system.
+ */
+
+static void native_smp_send_stop(void)
+{
+ /* Don't deadlock on the call lock in panic */
+ int nolock = !spin_trylock(&call_lock);
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __smp_call_function(stop_this_cpu, NULL, 0, 0);
+ if (!nolock)
+ spin_unlock(&call_lock);
+ disable_local_APIC();
+ local_irq_restore(flags);
+}
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */
+fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
+{
+ ack_APIC_irq();
+}
+
+fastcall void smp_call_function_interrupt(struct pt_regs *regs)
+{
+ void (*func) (void *info) = call_data->func;
+ void *info = call_data->info;
+ int wait = call_data->wait;
+
+ ack_APIC_irq();
+ /*
+ * Notify initiating CPU that I've grabbed the data and am
+ * about to execute the function
+ */
+ mb();
+ atomic_inc(&call_data->started);
+ /*
+ * At this point the info structure may be out of scope unless wait==1
+ */
+ irq_enter();
+ (*func)(info);
+ irq_exit();
+
+ if (wait) {
+ mb();
+ atomic_inc(&call_data->finished);
+ }
+}
+
+static int convert_apicid_to_cpu(int apic_id)
+{
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (x86_cpu_to_apicid[i] == apic_id)
+ return i;
+ }
+ return -1;
+}
+
+int safe_smp_processor_id(void)
+{
+ int apicid, cpuid;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC))
+ return 0;
+
+ apicid = hard_smp_processor_id();
+ if (apicid == BAD_APICID)
+ return 0;
+
+ cpuid = convert_apicid_to_cpu(apicid);
+
+ return cpuid >= 0 ? cpuid : 0;
+}
+
+struct smp_ops smp_ops = {
+ .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = native_smp_prepare_cpus,
+ .cpu_up = native_cpu_up,
+ .smp_cpus_done = native_smp_cpus_done,
+
+ .smp_send_stop = native_smp_send_stop,
+ .smp_send_reschedule = native_smp_send_reschedule,
+ .smp_call_function_mask = native_smp_call_function_mask,
+};
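+
+/*
+ * A paravirt backend can replace these hooks (or the whole structure)
+ * early in boot; an illustrative, not verbatim, example:
+ *
+ *	smp_ops.smp_send_reschedule = my_send_reschedule;
+ */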
diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c
new file mode 100644
index 000000000000..e4f61d1c6248
--- /dev/null
+++ b/arch/x86/kernel/smpboot_32.c
@@ -0,0 +1,1322 @@
+/*
+ * x86 SMP booting functions
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ * Much of the core SMP work is based on previous work by Thomas Radke, to
+ * whom a great many thanks are extended.
+ *
+ * Thanks to Intel for making available several different Pentium,
+ * Pentium Pro and Pentium-II/Xeon MP machines.
+ * Original development of Linux SMP code supported by Caldera.
+ *
+ * This code is released under the GNU General Public License version 2 or
+ * later.
+ *
+ * Fixes
+ * Felix Koop : NR_CPUS used properly
+ * Jose Renau : Handle single CPU case.
+ * Alan Cox : By repeated request 8) - Total BogoMIPS report.
+ * Greg Wright : Fix for kernel stacks panic.
+ * Erich Boleyn : MP v1.4 and additional changes.
+ * Matthias Sattler : Changes for 2.1 kernel map.
+ * Michel Lespinasse : Changes for 2.1 kernel map.
+ * Michael Chastain : Change trampoline.S to gnu as.
+ * Alan Cox : Dumb bug: 'B' step PPro's are fine
+ * Ingo Molnar : Added APIC timers, based on code
+ * from Jose Renau
+ * Ingo Molnar : various cleanups and rewrites
+ * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs
+ * Martin J. Bligh : Added support for multi-quad systems
+ * Dave Jones : Report invalid combinations of Athlon CPUs.
+ * Rusty Russell : Hacked into shape for new "hotplug"
+ * boot process.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/percpu.h>
+#include <linux/nmi.h>
+
+#include <linux/delay.h>
+#include <linux/mc146818rtc.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+#include <asm/arch_hooks.h>
+#include <asm/nmi.h>
+
+#include <mach_apic.h>
+#include <mach_wakecpu.h>
+#include <smpboot_hooks.h>
+#include <asm/vmi.h>
+#include <asm/mtrr.h>
+
+/* Set if we find a B stepping CPU */
+static int __devinitdata smp_b_stepping;
+
+/* Number of siblings per CPU package */
+int smp_num_siblings = 1;
+EXPORT_SYMBOL(smp_num_siblings);
+
+/* Last level cache ID of each logical CPU */
+int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+
+/* representing HT siblings of each logical CPU */
+cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(cpu_sibling_map);
+
+/* representing HT and core siblings of each logical CPU */
+cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(cpu_core_map);
+
+/* bitmap of online cpus */
+cpumask_t cpu_online_map __read_mostly;
+EXPORT_SYMBOL(cpu_online_map);
+
+cpumask_t cpu_callin_map;
+cpumask_t cpu_callout_map;
+EXPORT_SYMBOL(cpu_callout_map);
+cpumask_t cpu_possible_map;
+EXPORT_SYMBOL(cpu_possible_map);
+static cpumask_t smp_commenced_mask;
+
+/* Per CPU bogomips and other parameters */
+struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
+EXPORT_SYMBOL(cpu_data);
+
+u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
+ { [0 ... NR_CPUS-1] = 0xff };
+EXPORT_SYMBOL(x86_cpu_to_apicid);
+
+u8 apicid_2_node[MAX_APICID];
+
+/*
+ * Trampoline 80x86 program as an array.
+ */
+
+extern unsigned char trampoline_data[];
+extern unsigned char trampoline_end[];
+static unsigned char *trampoline_base;
+static int trampoline_exec;
+
+static void map_cpu_to_logical_apicid(void);
+
+/* State of each CPU. */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
+/*
+ * Currently trivial. Write the real->protected mode
+ * bootstrap into the page concerned. The caller
+ * has made sure it's suitably aligned.
+ */
+
+static unsigned long __devinit setup_trampoline(void)
+{
+ memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
+ return virt_to_phys(trampoline_base);
+}
+
+/*
+ * We are called very early to get the low memory for the
+ * SMP bootup trampoline page.
+ */
+void __init smp_alloc_memory(void)
+{
+ trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
+ /*
+ * Has to be in very low memory so we can execute
+ * real-mode AP code.
+ */
+ if (__pa(trampoline_base) >= 0x9F000)
+ BUG();
+ /*
+ * Make the SMP trampoline executable:
+ */
+ trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
+}
+
+/*
+ * The bootstrap kernel entry code has set these up. Save them for
+ * a given CPU
+ */
+
+void __cpuinit smp_store_cpu_info(int id)
+{
+ struct cpuinfo_x86 *c = cpu_data + id;
+
+ *c = boot_cpu_data;
+ if (id != 0)
+ identify_secondary_cpu(c);
+ /*
+ * Mask B, Pentium, but not Pentium MMX
+ */
+ if (c->x86_vendor == X86_VENDOR_INTEL &&
+ c->x86 == 5 &&
+ c->x86_mask >= 1 && c->x86_mask <= 4 &&
+ c->x86_model <= 3)
+ /*
+ * Remember we have B step Pentia with bugs
+ */
+ smp_b_stepping = 1;
+
+ /*
+ * Certain Athlons might work (for various values of 'work') in SMP
+ * but they are not certified as MP capable.
+ */
+ if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
+
+ if (num_possible_cpus() == 1)
+ goto valid_k7;
+
+ /* Athlon 660/661 is valid. */
+ if ((c->x86_model == 6) && ((c->x86_mask == 0) || (c->x86_mask == 1)))
+ goto valid_k7;
+
+ /* Duron 670 is valid */
+ if ((c->x86_model == 7) && (c->x86_mask == 0))
+ goto valid_k7;
+
+ /*
+ * Athlon 662, Duron 671, and Athlon >model 7 have the capability bit.
+ * It's worth noting that the A5 stepping (662) of some Athlon XPs
+ * has the MP bit set.
+ * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more.
+ */
+ if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
+ ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+ (c->x86_model > 7))
+ if (cpu_has_mp)
+ goto valid_k7;
+
+ /* If we get here, it's not a certified SMP capable AMD system. */
+ add_taint(TAINT_UNSAFE_SMP);
+ }
+
+valid_k7:
+ ;
+}
+
+extern void calibrate_delay(void);
+
+static atomic_t init_deasserted;
+
+static void __cpuinit smp_callin(void)
+{
+ int cpuid, phys_id;
+ unsigned long timeout;
+
+ /*
+ * If woken up by an INIT in an 82489DX configuration
+ * we may get here before an INIT-deassert IPI reaches
+ * our local APIC. We have to wait for the IPI or we'll
+ * lock up on an APIC access.
+ */
+ wait_for_init_deassert(&init_deasserted);
+
+ /*
+ * (This works even if the APIC is not enabled.)
+ */
+ phys_id = GET_APIC_ID(apic_read(APIC_ID));
+ cpuid = smp_processor_id();
+ if (cpu_isset(cpuid, cpu_callin_map)) {
+ printk("huh, phys CPU#%d, CPU#%d already present??\n",
+ phys_id, cpuid);
+ BUG();
+ }
+ Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
+
+ /*
+ * STARTUP IPIs are fragile beasts as they might sometimes
+ * trigger some glue motherboard logic. We keep the APIC bus
+ * completely silent for 1 second; this overestimates the time the
+ * boot CPU spends sending the up to 2 STARTUP IPIs
+ * by a factor of two. This should be enough.
+ */
+
+ /*
+ * Waiting 2s total for startup (udelay is not yet working)
+ */
+ timeout = jiffies + 2*HZ;
+ while (time_before(jiffies, timeout)) {
+ /*
+ * Has the boot CPU finished its STARTUP sequence?
+ */
+ if (cpu_isset(cpuid, cpu_callout_map))
+ break;
+ rep_nop();
+ }
+
+ if (!time_before(jiffies, timeout)) {
+ printk("BUG: CPU%d started up but did not get a callout!\n",
+ cpuid);
+ BUG();
+ }
+
+ /*
+ * the boot CPU has finished the init stage and is spinning
+ * on callin_map until we finish. We are free to set up this
+ * CPU, first the APIC. (this is probably redundant on most
+ * boards)
+ */
+
+ Dprintk("CALLIN, before setup_local_APIC().\n");
+ smp_callin_clear_local_apic();
+ setup_local_APIC();
+ map_cpu_to_logical_apicid();
+
+ /*
+ * Get our bogomips.
+ */
+ calibrate_delay();
+ Dprintk("Stack at about %p\n",&cpuid);
+
+ /*
+ * Save our processor parameters
+ */
+ smp_store_cpu_info(cpuid);
+
+ /*
+ * Allow the master to continue.
+ */
+ cpu_set(cpuid, cpu_callin_map);
+}
+
+static int cpucount;
+
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+ struct cpuinfo_x86 *c = cpu_data + cpu;
+ /*
+ * For performance, we return the last-level-cache shared map;
+ * for power savings, we return cpu_core_map.
+ */
+ if (sched_mc_power_savings || sched_smt_power_savings)
+ return cpu_core_map[cpu];
+ else
+ return c->llc_shared_map;
+}
+
+/* representing cpus for which sibling maps can be computed */
+static cpumask_t cpu_sibling_setup_map;
+
+void __cpuinit set_cpu_sibling_map(int cpu)
+{
+ int i;
+ struct cpuinfo_x86 *c = cpu_data;
+
+ cpu_set(cpu, cpu_sibling_setup_map);
+
+ if (smp_num_siblings > 1) {
+ for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
+ c[cpu].cpu_core_id == c[i].cpu_core_id) {
+ cpu_set(i, cpu_sibling_map[cpu]);
+ cpu_set(cpu, cpu_sibling_map[i]);
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
+ cpu_set(i, c[cpu].llc_shared_map);
+ cpu_set(cpu, c[i].llc_shared_map);
+ }
+ }
+ } else {
+ cpu_set(cpu, cpu_sibling_map[cpu]);
+ }
+
+ cpu_set(cpu, c[cpu].llc_shared_map);
+
+ if (current_cpu_data.x86_max_cores == 1) {
+ cpu_core_map[cpu] = cpu_sibling_map[cpu];
+ c[cpu].booted_cores = 1;
+ return;
+ }
+
+ for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (cpu_llc_id[cpu] != BAD_APICID &&
+ cpu_llc_id[cpu] == cpu_llc_id[i]) {
+ cpu_set(i, c[cpu].llc_shared_map);
+ cpu_set(cpu, c[i].llc_shared_map);
+ }
+ if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
+ /*
+ * Does this new cpu bring up a new core?
+ */
+ if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
+ /*
+ * for each core in package, increment
+ * the booted_cores for this new cpu
+ */
+ if (first_cpu(cpu_sibling_map[i]) == i)
+ c[cpu].booted_cores++;
+ /*
+ * increment the core count for all
+ * the other cpus in this package
+ */
+ if (i != cpu)
+ c[i].booted_cores++;
+ } else if (i != cpu && !c[cpu].booted_cores)
+ c[cpu].booted_cores = c[i].booted_cores;
+ }
+ }
+}
+
+/*
+ * Activate a secondary processor.
+ */
+static void __cpuinit start_secondary(void *unused)
+{
+ /*
+ * Don't put *anything* before cpu_init(); SMP booting is so
+ * fragile that we want to limit the things done here to the
+ * bare minimum.
+ */
+#ifdef CONFIG_VMI
+ vmi_bringup();
+#endif
+ cpu_init();
+ preempt_disable();
+ smp_callin();
+ while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
+ rep_nop();
+ /*
+ * Check TSC synchronization with the BP:
+ */
+ check_tsc_sync_target();
+
+ setup_secondary_clock();
+ if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
+ enable_NMI_through_LVT0(NULL);
+ enable_8259A_irq(0);
+ }
+ /*
+ * low-memory mappings have been cleared, flush them from
+ * the local TLBs too.
+ */
+ local_flush_tlb();
+
+ /* This must be done before setting cpu_online_map */
+ set_cpu_sibling_map(raw_smp_processor_id());
+ wmb();
+
+ /*
+ * We need to hold call_lock, so there is no inconsistency
+ * between the time smp_call_function() determines the number of
+ * IPI recipients and the time when the determination is made
+ * for which cpus receive the IPI. Holding this
+ * lock helps us to not include this cpu in a currently in progress
+ * smp_call_function().
+ */
+ lock_ipi_call_lock();
+ cpu_set(smp_processor_id(), cpu_online_map);
+ unlock_ipi_call_lock();
+ per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+
+ /* We can take interrupts now: we're officially "up". */
+ local_irq_enable();
+
+ wmb();
+ cpu_idle();
+}
+
+/*
+ * Everything has been set up for the secondary
+ * CPUs - they just need to reload everything
+ * from the task structure
+ * This function must not return.
+ */
+void __devinit initialize_secondary(void)
+{
+ /*
+ * We don't actually need to load the full TSS,
+ * basically just the stack pointer and the eip.
+ */
+
+ asm volatile(
+ "movl %0,%%esp\n\t"
+ "jmp *%1"
+ :
+ :"m" (current->thread.esp),"m" (current->thread.eip));
+}
+
+/* Static state in head.S used to set up a CPU */
+extern struct {
+ void * esp;
+ unsigned short ss;
+} stack_start;
+
+#ifdef CONFIG_NUMA
+
+/* which logical CPUs are on which nodes */
+cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly =
+ { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
+EXPORT_SYMBOL(node_2_cpu_mask);
+/* which node each logical CPU is on */
+int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
+EXPORT_SYMBOL(cpu_2_node);
+
+/* set up a mapping between cpu and node. */
+static inline void map_cpu_to_node(int cpu, int node)
+{
+ printk("Mapping cpu %d to node %d\n", cpu, node);
+ cpu_set(cpu, node_2_cpu_mask[node]);
+ cpu_2_node[cpu] = node;
+}
+
+/* undo a mapping between cpu and node. */
+static inline void unmap_cpu_to_node(int cpu)
+{
+ int node;
+
+ printk("Unmapping cpu %d from all nodes\n", cpu);
+ for (node = 0; node < MAX_NUMNODES; node ++)
+ cpu_clear(cpu, node_2_cpu_mask[node]);
+ cpu_2_node[cpu] = 0;
+}
+#else /* !CONFIG_NUMA */
+
+#define map_cpu_to_node(cpu, node) ({})
+#define unmap_cpu_to_node(cpu) ({})
+
+#endif /* CONFIG_NUMA */
+
+u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+
+static void map_cpu_to_logical_apicid(void)
+{
+ int cpu = smp_processor_id();
+ int apicid = logical_smp_processor_id();
+ int node = apicid_to_node(apicid);
+
+ if (!node_online(node))
+ node = first_online_node;
+
+ cpu_2_logical_apicid[cpu] = apicid;
+ map_cpu_to_node(cpu, node);
+}
+
+static void unmap_cpu_to_logical_apicid(int cpu)
+{
+ cpu_2_logical_apicid[cpu] = BAD_APICID;
+ unmap_cpu_to_node(cpu);
+}
+
+static inline void __inquire_remote_apic(int apicid)
+{
+ int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
+ char *names[] = { "ID", "VERSION", "SPIV" };
+ int timeout;
+ unsigned long status;
+
+ printk("Inquiring remote APIC #%d...\n", apicid);
+
+ for (i = 0; i < ARRAY_SIZE(regs); i++) {
+ printk("... APIC #%d %s: ", apicid, names[i]);
+
+ /*
+ * Wait for idle.
+ */
+ status = safe_apic_wait_icr_idle();
+ if (status)
+ printk("a previous APIC delivery may have failed\n");
+
+ apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
+ apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
+
+ timeout = 0;
+ do {
+ udelay(100);
+ status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
+ } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
+
+ switch (status) {
+ case APIC_ICR_RR_VALID:
+ status = apic_read(APIC_RRR);
+ printk("%lx\n", status);
+ break;
+ default:
+ printk("failed\n");
+ }
+ }
+}
+
+#ifdef WAKE_SECONDARY_VIA_NMI
+/*
+ * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
+ * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
+ * won't ... remember to clear down the APIC, etc later.
+ */
+static int __devinit
+wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
+{
+ unsigned long send_status, accept_status = 0;
+ int maxlvt;
+
+ /* Target chip */
+ apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
+
+ /* Boot on the stack */
+ /* Kick the second */
+ apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
+
+ Dprintk("Waiting for send to finish...\n");
+ send_status = safe_apic_wait_icr_idle();
+
+ /*
+ * Give the other CPU some time to accept the IPI.
+ */
+ udelay(200);
+ /*
+ * Due to the Pentium erratum 3AP.
+ */
+ maxlvt = lapic_get_maxlvt();
+ if (maxlvt > 3) {
+ apic_read_around(APIC_SPIV);
+ apic_write(APIC_ESR, 0);
+ }
+ accept_status = (apic_read(APIC_ESR) & 0xEF);
+ Dprintk("NMI sent.\n");
+
+ if (send_status)
+ printk("APIC never delivered???\n");
+ if (accept_status)
+ printk("APIC delivery error (%lx).\n", accept_status);
+
+ return (send_status | accept_status);
+}
+#endif /* WAKE_SECONDARY_VIA_NMI */
+
+#ifdef WAKE_SECONDARY_VIA_INIT
+static int __devinit
+wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
+{
+ unsigned long send_status, accept_status = 0;
+ int maxlvt, num_starts, j;
+
+ /*
+ * Be paranoid about clearing APIC errors.
+ */
+ if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+ apic_read_around(APIC_SPIV);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
+
+ Dprintk("Asserting INIT.\n");
+
+ /*
+ * Turn INIT on target chip
+ */
+ apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+ /*
+ * Send IPI
+ */
+ apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
+ | APIC_DM_INIT);
+
+ Dprintk("Waiting for send to finish...\n");
+ send_status = safe_apic_wait_icr_idle();
+
+ mdelay(10);
+
+ Dprintk("Deasserting INIT.\n");
+
+ /* Target chip */
+ apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+ /* Send IPI */
+ apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
+
+ Dprintk("Waiting for send to finish...\n");
+ send_status = safe_apic_wait_icr_idle();
+
+ atomic_set(&init_deasserted, 1);
+
+ /*
+ * Should we send STARTUP IPIs ?
+ *
+ * Determine this based on the APIC version.
+ * If we don't have an integrated APIC, don't send the STARTUP IPIs.
+ */
+ if (APIC_INTEGRATED(apic_version[phys_apicid]))
+ num_starts = 2;
+ else
+ num_starts = 0;
+
+ /*
+ * Paravirt / VMI wants a startup IPI hook here to set up the
+ * target processor state.
+ */
+ startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
+ (unsigned long) stack_start.esp);
+
+ /*
+ * Run STARTUP IPI loop.
+ */
+ Dprintk("#startup loops: %d.\n", num_starts);
+
+ maxlvt = lapic_get_maxlvt();
+
+ for (j = 1; j <= num_starts; j++) {
+ Dprintk("Sending STARTUP #%d.\n",j);
+ apic_read_around(APIC_SPIV);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ Dprintk("After apic_write.\n");
+
+ /*
+ * STARTUP IPI
+ */
+
+ /* Target chip */
+ apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+ /* Boot on the stack */
+ /* Kick the second */
+ apic_write_around(APIC_ICR, APIC_DM_STARTUP
+ | (start_eip >> 12));
+
+ /*
+ * Give the other CPU some time to accept the IPI.
+ */
+ udelay(300);
+
+ Dprintk("Startup point 1.\n");
+
+ Dprintk("Waiting for send to finish...\n");
+ send_status = safe_apic_wait_icr_idle();
+
+ /*
+ * Give the other CPU some time to accept the IPI.
+ */
+ udelay(200);
+ /*
+ * Due to the Pentium erratum 3AP.
+ */
+ if (maxlvt > 3) {
+ apic_read_around(APIC_SPIV);
+ apic_write(APIC_ESR, 0);
+ }
+ accept_status = (apic_read(APIC_ESR) & 0xEF);
+ if (send_status || accept_status)
+ break;
+ }
+ Dprintk("After Startup.\n");
+
+ if (send_status)
+ printk("APIC never delivered???\n");
+ if (accept_status)
+ printk("APIC delivery error (%lx).\n", accept_status);
+
+ return (send_status | accept_status);
+}
+#endif /* WAKE_SECONDARY_VIA_INIT */
+
+extern cpumask_t cpu_initialized;
+static inline int alloc_cpu_id(void)
+{
+ cpumask_t tmp_map;
+ int cpu;
+ cpus_complement(tmp_map, cpu_present_map);
+ cpu = first_cpu(tmp_map);
+ if (cpu >= NR_CPUS)
+ return -ENODEV;
+ return cpu;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS];
+static inline struct task_struct * alloc_idle_task(int cpu)
+{
+ struct task_struct *idle;
+
+ if ((idle = cpu_idle_tasks[cpu]) != NULL) {
+ /* initialize thread_struct; we really want to avoid
+ * destroying the idle thread
+ */
+ idle->thread.esp = (unsigned long)task_pt_regs(idle);
+ init_idle(idle, cpu);
+ return idle;
+ }
+ idle = fork_idle(cpu);
+
+ if (!IS_ERR(idle))
+ cpu_idle_tasks[cpu] = idle;
+ return idle;
+}
+#else
+#define alloc_idle_task(cpu) fork_idle(cpu)
+#endif
+
+static int __cpuinit do_boot_cpu(int apicid, int cpu)
+/*
+ * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
+ * (i.e. clustered apic addressing mode), this is a LOGICAL apic ID.
+ * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
+ */
+{
+ struct task_struct *idle;
+ unsigned long boot_error;
+ int timeout;
+ unsigned long start_eip;
+ unsigned short nmi_high = 0, nmi_low = 0;
+
+ /*
+ * Save current MTRR state in case it was changed since early boot
+ * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
+ */
+ mtrr_save_state();
+
+ /*
+ * We can't use kernel_thread since we must avoid
+ * rescheduling the child.
+ */
+ idle = alloc_idle_task(cpu);
+ if (IS_ERR(idle))
+ panic("failed fork for CPU %d", cpu);
+
+ init_gdt(cpu);
+ per_cpu(current_task, cpu) = idle;
+ early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+
+ idle->thread.eip = (unsigned long) start_secondary;
+ /* start_eip had better be page-aligned! */
+ start_eip = setup_trampoline();
+
+ ++cpucount;
+ alternatives_smp_switch(1);
+
+ /* So we see what's up */
+ printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
+ /* Stack for startup_32 can be just as for start_secondary onwards */
+ stack_start.esp = (void *) idle->thread.esp;
+
+ irq_ctx_init(cpu);
+
+ x86_cpu_to_apicid[cpu] = apicid;
+ /*
+ * This grunge runs the startup process for
+ * the targeted processor.
+ */
+
+ atomic_set(&init_deasserted, 0);
+
+ Dprintk("Setting warm reset code and vector.\n");
+
+ store_NMI_vector(&nmi_high, &nmi_low);
+
+ smpboot_setup_warm_reset_vector(start_eip);
+
+ /*
+ * Starting actual IPI sequence...
+ */
+ boot_error = wakeup_secondary_cpu(apicid, start_eip);
+
+ if (!boot_error) {
+ /*
+ * allow APs to start initializing.
+ */
+ Dprintk("Before Callout %d.\n", cpu);
+ cpu_set(cpu, cpu_callout_map);
+ Dprintk("After Callout %d.\n", cpu);
+
+ /*
+ * Wait 5s total for a response
+ */
+ for (timeout = 0; timeout < 50000; timeout++) {
+ if (cpu_isset(cpu, cpu_callin_map))
+ break; /* It has booted */
+ udelay(100);
+ }
+
+ if (cpu_isset(cpu, cpu_callin_map)) {
+ /* number CPUs logically, starting from 1 (BSP is 0) */
+ Dprintk("OK.\n");
+ printk("CPU%d: ", cpu);
+ print_cpu_info(&cpu_data[cpu]);
+ Dprintk("CPU has booted.\n");
+ } else {
+ boot_error = 1;
+ if (*((volatile unsigned char *)trampoline_base)
+ == 0xA5)
+ /* trampoline started but...? */
+ printk("Stuck ??\n");
+ else
+ /* trampoline code not run */
+ printk("Not responding.\n");
+ inquire_remote_apic(apicid);
+ }
+ }
+
+ if (boot_error) {
+ /* Try to put things back the way they were before ... */
+ unmap_cpu_to_logical_apicid(cpu);
+ cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
+ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
+ cpucount--;
+ } else {
+ x86_cpu_to_apicid[cpu] = apicid;
+ cpu_set(cpu, cpu_present_map);
+ }
+
+ /* mark "stuck" area as not stuck */
+ *((volatile unsigned long *)trampoline_base) = 0;
+
+ return boot_error;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+void cpu_exit_clear(void)
+{
+ int cpu = raw_smp_processor_id();
+
+ idle_task_exit();
+
+ cpucount--;
+ cpu_uninit();
+ irq_ctx_exit(cpu);
+
+ cpu_clear(cpu, cpu_callout_map);
+ cpu_clear(cpu, cpu_callin_map);
+
+ cpu_clear(cpu, smp_commenced_mask);
+ unmap_cpu_to_logical_apicid(cpu);
+}
+
+struct warm_boot_cpu_info {
+ struct completion *complete;
+ struct work_struct task;
+ int apicid;
+ int cpu;
+};
+
+static void __cpuinit do_warm_boot_cpu(struct work_struct *work)
+{
+ struct warm_boot_cpu_info *info =
+ container_of(work, struct warm_boot_cpu_info, task);
+ do_boot_cpu(info->apicid, info->cpu);
+ complete(info->complete);
+}
+
+static int __cpuinit __smp_prepare_cpu(int cpu)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct warm_boot_cpu_info info;
+ int apicid, ret;
+
+ apicid = x86_cpu_to_apicid[cpu];
+ if (apicid == BAD_APICID) {
+ ret = -ENODEV;
+ goto exit;
+ }
+
+ info.complete = &done;
+ info.apicid = apicid;
+ info.cpu = cpu;
+ INIT_WORK(&info.task, do_warm_boot_cpu);
+
+ /* init low mem mapping */
+ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+ min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
+ flush_tlb_all();
+ schedule_work(&info.task);
+ wait_for_completion(&done);
+
+ zap_low_mappings();
+ ret = 0;
+exit:
+ return ret;
+}
+#endif
+
+/*
+ * Cycle through the processors sending APIC IPIs to boot each.
+ */
+
+static int boot_cpu_logical_apicid;
+/* Where the IO area was mapped on multiquad, always 0 otherwise */
+void *xquad_portio;
+#ifdef CONFIG_X86_NUMAQ
+EXPORT_SYMBOL(xquad_portio);
+#endif
+
+static void __init smp_boot_cpus(unsigned int max_cpus)
+{
+ int apicid, cpu, bit, kicked;
+ unsigned long bogosum = 0;
+
+ /*
+ * Setup boot CPU information
+ */
+ smp_store_cpu_info(0); /* Final full version of the data */
+ printk("CPU%d: ", 0);
+ print_cpu_info(&cpu_data[0]);
+
+ boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+ boot_cpu_logical_apicid = logical_smp_processor_id();
+ x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+
+ current_thread_info()->cpu = 0;
+
+ set_cpu_sibling_map(0);
+
+ /*
+ * If we couldn't find an SMP configuration at boot time,
+ * get out of here now!
+ */
+ if (!smp_found_config && !acpi_lapic) {
+ printk(KERN_NOTICE "SMP motherboard not detected.\n");
+ smpboot_clear_io_apic_irqs();
+ phys_cpu_present_map = physid_mask_of_physid(0);
+ if (APIC_init_uniprocessor())
+ printk(KERN_NOTICE "Local APIC not detected."
+ " Using dummy APIC emulation.\n");
+ map_cpu_to_logical_apicid();
+ cpu_set(0, cpu_sibling_map[0]);
+ cpu_set(0, cpu_core_map[0]);
+ return;
+ }
+
+ /*
+ * Should not be necessary because the MP table should list the boot
+ * CPU too, but we do it for the sake of robustness anyway.
+ * Makes no sense to do this check in clustered apic mode, so skip it
+ */
+ if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
+ printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
+ boot_cpu_physical_apicid);
+ physid_set(hard_smp_processor_id(), phys_cpu_present_map);
+ }
+
+ /*
+ * If we couldn't find a local APIC, then get out of here now!
+ */
+ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) {
+ printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
+ boot_cpu_physical_apicid);
+ printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
+ smpboot_clear_io_apic_irqs();
+ phys_cpu_present_map = physid_mask_of_physid(0);
+ cpu_set(0, cpu_sibling_map[0]);
+ cpu_set(0, cpu_core_map[0]);
+ return;
+ }
+
+ verify_local_APIC();
+
+ /*
+ * If SMP should be disabled, then really disable it!
+ */
+ if (!max_cpus) {
+ smp_found_config = 0;
+ printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
+ smpboot_clear_io_apic_irqs();
+ phys_cpu_present_map = physid_mask_of_physid(0);
+ cpu_set(0, cpu_sibling_map[0]);
+ cpu_set(0, cpu_core_map[0]);
+ return;
+ }
+
+ connect_bsp_APIC();
+ setup_local_APIC();
+ map_cpu_to_logical_apicid();
+
+
+ setup_portio_remap();
+
+ /*
+ * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
+ *
+ * In clustered apic mode, phys_cpu_present_map is constructed thus:
+ * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the
+ * clustered apic ID.
+ */
+ Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
+
+ kicked = 1;
+ for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
+ apicid = cpu_present_to_apicid(bit);
+ /*
+ * Don't even attempt to start the boot CPU!
+ */
+ if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID))
+ continue;
+
+ if (!check_apicid_present(bit))
+ continue;
+ if (max_cpus <= cpucount+1)
+ continue;
+
+ if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
+ printk("CPU #%d not responding - cannot use it.\n",
+ apicid);
+ else
+ ++kicked;
+ }
+
+ /*
+ * Cleanup possible dangling ends...
+ */
+ smpboot_restore_warm_reset_vector();
+
+ /*
+ * Allow the user to impress friends.
+ */
+ Dprintk("Before bogomips.\n");
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ if (cpu_isset(cpu, cpu_callout_map))
+ bogosum += cpu_data[cpu].loops_per_jiffy;
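+	/*
+	 * loops_per_jiffy counts delay-loop iterations per jiffy, so
+	 * BogoMIPS = loops_per_jiffy * HZ / 500000; the 5000/HZ divisor
+	 * below recovers two decimal places.  E.g. HZ=100 and
+	 * bogosum=4000000 prints "800.00 BogoMIPS".
+	 */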
+ printk(KERN_INFO
+ "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
+ cpucount+1,
+ bogosum/(500000/HZ),
+ (bogosum/(5000/HZ))%100);
+
+ Dprintk("Before bogocount - setting activated=1.\n");
+
+ if (smp_b_stepping)
+ printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
+
+ /*
+ * Don't taint if we are running an SMP kernel on a single non-MP
+ * approved Athlon
+ */
+ if (tainted & TAINT_UNSAFE_SMP) {
+ if (cpucount)
+ printk(KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n");
+ else
+ tainted &= ~TAINT_UNSAFE_SMP;
+ }
+
+ Dprintk("Boot done.\n");
+
+ /*
+ * construct cpu_sibling_map[], so that we can tell sibling CPUs
+ * efficiently.
+ */
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ cpus_clear(cpu_sibling_map[cpu]);
+ cpus_clear(cpu_core_map[cpu]);
+ }
+
+ cpu_set(0, cpu_sibling_map[0]);
+ cpu_set(0, cpu_core_map[0]);
+
+ smpboot_setup_io_apic();
+
+ setup_boot_clock();
+}
+
+/* These are wrappers to interface to the new boot process. Someone
+ who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
+void __init native_smp_prepare_cpus(unsigned int max_cpus)
+{
+ smp_commenced_mask = cpumask_of_cpu(0);
+ cpu_callin_map = cpumask_of_cpu(0);
+ mb();
+ smp_boot_cpus(max_cpus);
+}
+
+void __init native_smp_prepare_boot_cpu(void)
+{
+ unsigned int cpu = smp_processor_id();
+
+ init_gdt(cpu);
+ switch_to_new_gdt();
+
+ cpu_set(cpu, cpu_online_map);
+ cpu_set(cpu, cpu_callout_map);
+ cpu_set(cpu, cpu_present_map);
+ cpu_set(cpu, cpu_possible_map);
+ __get_cpu_var(cpu_state) = CPU_ONLINE;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+void remove_siblinginfo(int cpu)
+{
+ int sibling;
+ struct cpuinfo_x86 *c = cpu_data;
+
+ for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
+ cpu_clear(cpu, cpu_core_map[sibling]);
+ /*
+ * last thread sibling in this cpu core going down
+ */
+ if (cpus_weight(cpu_sibling_map[cpu]) == 1)
+ c[sibling].booted_cores--;
+ }
+
+ for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
+ cpu_clear(cpu, cpu_sibling_map[sibling]);
+ cpus_clear(cpu_sibling_map[cpu]);
+ cpus_clear(cpu_core_map[cpu]);
+ c[cpu].phys_proc_id = 0;
+ c[cpu].cpu_core_id = 0;
+ cpu_clear(cpu, cpu_sibling_setup_map);
+}
+
+int __cpu_disable(void)
+{
+ cpumask_t map = cpu_online_map;
+ int cpu = smp_processor_id();
+
+ /*
+ * Perhaps use cpufreq to drop frequency, but that could go
+ * into generic code.
+ *
+ * We won't take down the boot processor on i386 because some
+ * interrupts can only be serviced by the BSP.
+ * Especially so if we're not using an IOAPIC -zwane
+ */
+ if (cpu == 0)
+ return -EBUSY;
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ stop_apic_nmi_watchdog(NULL);
+ clear_local_APIC();
+ /* Allow any queued timer interrupts to get serviced */
+ local_irq_enable();
+ mdelay(1);
+ local_irq_disable();
+
+ remove_siblinginfo(cpu);
+
+ cpu_clear(cpu, map);
+ fixup_irqs(map);
+ /* It's now safe to remove this processor from the online map */
+ cpu_clear(cpu, cpu_online_map);
+ return 0;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We don't do anything here: idle task is faking death itself. */
+ unsigned int i;
+
+ for (i = 0; i < 10; i++) {
+ /* They ack this in play_dead by setting CPU_DEAD */
+ if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+ printk("CPU %d is now offline\n", cpu);
+ if (1 == num_online_cpus())
+ alternatives_smp_switch(0);
+ return;
+ }
+ msleep(100);
+ }
+ printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+#else /* ... !CONFIG_HOTPLUG_CPU */
+int __cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We said "no" in __cpu_disable */
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __cpuinit native_cpu_up(unsigned int cpu)
+{
+ unsigned long flags;
+#ifdef CONFIG_HOTPLUG_CPU
+ int ret = 0;
+
+ /*
+ * We do warm boot only on cpus that have booted earlier;
+ * otherwise cold boot is all handled from smp_boot_cpus().
+ * cpu_callin_map is set during the AP kickstart process. It's
+ * reset when a cpu is taken offline, from cpu_exit_clear().
+ */
+ if (!cpu_isset(cpu, cpu_callin_map))
+ ret = __smp_prepare_cpu(cpu);
+
+ if (ret)
+ return -EIO;
+#endif
+
+ /* In case one didn't come up */
+ if (!cpu_isset(cpu, cpu_callin_map)) {
+ printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
+ return -EIO;
+ }
+
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+ /* Unleash the CPU! */
+ cpu_set(cpu, smp_commenced_mask);
+
+ /*
+ * Check TSC synchronization with the AP (keep irqs disabled
+ * while doing so):
+ */
+ local_irq_save(flags);
+ check_tsc_sync_source(cpu);
+ local_irq_restore(flags);
+
+ while (!cpu_isset(cpu, cpu_online_map)) {
+ cpu_relax();
+ touch_nmi_watchdog();
+ }
+
+ return 0;
+}
+
+void __init native_smp_cpus_done(unsigned int max_cpus)
+{
+#ifdef CONFIG_X86_IO_APIC
+ setup_ioapic_dest();
+#endif
+ zap_low_mappings();
+#ifndef CONFIG_HOTPLUG_CPU
+ /*
+ * Disable executability of the SMP trampoline:
+ */
+ set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
+#endif
+}
+
+void __init smp_intr_init(void)
+{
+ /*
+ * IRQ0 must be given a fixed assignment and initialized,
+ * because it's used before the IO-APIC is set up.
+ */
+ set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
+
+ /*
+ * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+ * IPI, driven by wakeup.
+ */
+ set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+ /* IPI for invalidation */
+ set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+
+ /* IPI for generic function call */
+ set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+}
+
+/*
+ * If the BIOS enumerates physical processors before logical,
+ * maxcpus=N at enumeration-time can be used to disable HT.
+ */
+static int __init parse_maxcpus(char *arg)
+{
+ extern unsigned int maxcpus;
+
+ maxcpus = simple_strtoul(arg, NULL, 0);
+ return 0;
+}
+early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
new file mode 100644
index 000000000000..bbfe85a0f699
--- /dev/null
+++ b/arch/x86/kernel/smpcommon_32.c
@@ -0,0 +1,81 @@
+/*
+ * SMP stuff which is common to all sub-architectures.
+ */
+#include <linux/module.h>
+#include <asm/smp.h>
+
+DEFINE_PER_CPU(unsigned long, this_cpu_off);
+EXPORT_PER_CPU_SYMBOL(this_cpu_off);
+
+/* Initialize the CPU's GDT. This is either the boot CPU doing it for
+ itself (still using the master per-cpu area), or a CPU doing it for a
+ secondary which will soon come up. */
+__cpuinit void init_gdt(int cpu)
+{
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+
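+	/*
+	 * A note on the magic numbers below: 0x80 is the present bit,
+	 * DESCTYPE_S marks a code/data (non-system) descriptor, 0x2 is
+	 * writable data; the 0x8 flags nibble selects page granularity,
+	 * so limit 0xFFFFF spans the full 4GB.
+	 */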
+ pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a,
+ (u32 *)&gdt[GDT_ENTRY_PERCPU].b,
+ __per_cpu_offset[cpu], 0xFFFFF,
+ 0x80 | DESCTYPE_S | 0x2, 0x8);
+
+ per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
+ per_cpu(cpu_number, cpu) = cpu;
+}
+
+
+/**
+ * smp_call_function(): Run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Unused.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
+ int wait)
+{
+ return smp_call_function_mask(cpu_online_map, func, info, wait);
+}
+EXPORT_SYMBOL(smp_call_function);
+
+/**
+ * smp_call_function_single - Run a function on a specific CPU
+ * @cpu: The target CPU. Cannot be the calling CPU.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Unused.
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ */
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ /* prevent preemption and rescheduling onto another processor */
+ int ret;
+ int me = get_cpu();
+ if (cpu == me) {
+ local_irq_disable();
+ func(info);
+ local_irq_enable();
+ put_cpu();
+ return 0;
+ }
+
+ ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
+
+ put_cpu();
+ return ret;
+}
+EXPORT_SYMBOL(smp_call_function_single);
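+
+/*
+ * A minimal usage sketch (read_counter() is a hypothetical fast,
+ * non-blocking handler; CPU 1 is assumed online):
+ *
+ *	unsigned long val;
+ *	smp_call_function_single(1, read_counter, &val, 0, 1);
+ */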
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
new file mode 100644
index 000000000000..2a8713ec0f9a
--- /dev/null
+++ b/arch/x86/kernel/srat_32.c
@@ -0,0 +1,360 @@
+/*
+ * Some of the code in this file has been gleaned from the 64 bit
+ * discontigmem support code base.
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to Pat Gaughen <gone@us.ibm.com>
+ */
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/acpi.h>
+#include <linux/nodemask.h>
+#include <asm/srat.h>
+#include <asm/topology.h>
+#include <asm/smp.h>
+
+/*
+ * proximity macros and definitions
+ */
+#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
+#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
+#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
+#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
+/* bitmap length; _PXM is at most 255 */
+#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
+static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
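+
+/*
+ * Example: BMAP_SET(pxm_bitmap, 10) sets bit 2 of pxm_bitmap[1]
+ * (10 / 8 == 1, 10 % 8 == 2), and BMAP_TEST(pxm_bitmap, 10) reads
+ * it back; one bit per proximity domain.
+ */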
+
+#define MAX_CHUNKS_PER_NODE 3
+#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
+struct node_memory_chunk_s {
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+ u8 pxm; /* proximity domain of node */
+ u8 nid; /* which cnode contains this chunk? */
+ u8 bank; /* which mem bank on this node */
+};
+static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
+
+static int num_memory_chunks; /* total number of memory chunks */
+static u8 __initdata apicid_to_pxm[MAX_APICID];
+
+extern void * boot_ioremap(unsigned long, unsigned long);
+
+/* Identify CPU proximity domains */
+static void __init parse_cpu_affinity_structure(char *p)
+{
+ struct acpi_srat_cpu_affinity *cpu_affinity =
+ (struct acpi_srat_cpu_affinity *) p;
+
+ if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+ return; /* empty entry */
+
+ /* mark this node as "seen" in node bitmap */
+ BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
+
+ apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
+
+ printk("CPU 0x%02X in proximity domain 0x%02X\n",
+ cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
+}
+
+/*
+ * Identify memory proximity domains and hot-remove capabilities.
+ * Fill node memory chunk list structure.
+ */
+static void __init parse_memory_affinity_structure(char *sratp)
+{
+ unsigned long long paddr, size;
+ unsigned long start_pfn, end_pfn;
+ u8 pxm;
+ struct node_memory_chunk_s *p, *q, *pend;
+ struct acpi_srat_mem_affinity *memory_affinity =
+ (struct acpi_srat_mem_affinity *) sratp;
+
+ if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
+ return; /* empty entry */
+
+ pxm = memory_affinity->proximity_domain & 0xff;
+
+ /* mark this node as "seen" in node bitmap */
+ BMAP_SET(pxm_bitmap, pxm);
+
+ /* calculate info for memory chunk structure */
+ paddr = memory_affinity->base_address;
+ size = memory_affinity->length;
+
+ start_pfn = paddr >> PAGE_SHIFT;
+ end_pfn = (paddr + size) >> PAGE_SHIFT;
+
+ if (num_memory_chunks >= MAXCHUNKS) {
+ printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n",
+ size/(1024*1024), paddr);
+ return;
+ }
+
+ /* Insertion sort based on base address */
+ pend = &node_memory_chunk[num_memory_chunks];
+ for (p = &node_memory_chunk[0]; p < pend; p++) {
+ if (start_pfn < p->start_pfn)
+ break;
+ }
+ if (p < pend) {
+ /* shift the tail up one slot to make room at p */
+ for (q = pend - 1; q >= p; q--)
+ *(q + 1) = *q;
+ }
+ p->start_pfn = start_pfn;
+ p->end_pfn = end_pfn;
+ p->pxm = pxm;
+
+ num_memory_chunks++;
+
+ printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n",
+ start_pfn, end_pfn,
+ memory_affinity->memory_type,
+ pxm,
+ ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
+ "enabled and removable" : "enabled" ) );
+}
+
+/*
+ * The SRAT table always lists ascending addresses, so can always
+ * assume that the first "start" address that you see is the real
+ * start of the node, and that the current "end" address is after
+ * the previous one.
+ */
+static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
+{
+ /*
+ * Only add present memory as told by the e820.
+ * There is no guarantee from the SRAT that the memory it
+ * enumerates is present at boot time because it represents
+ * *possible* memory hotplug areas the same as normal RAM.
+ */
+ if (memory_chunk->start_pfn >= max_pfn) {
+ printk(KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> 0x%08lx\n",
+ memory_chunk->start_pfn, memory_chunk->end_pfn);
+ return;
+ }
+ if (memory_chunk->nid != nid)
+ return;
+
+ if (!node_has_online_mem(nid))
+ node_start_pfn[nid] = memory_chunk->start_pfn;
+
+ if (node_start_pfn[nid] > memory_chunk->start_pfn)
+ node_start_pfn[nid] = memory_chunk->start_pfn;
+
+ if (node_end_pfn[nid] < memory_chunk->end_pfn)
+ node_end_pfn[nid] = memory_chunk->end_pfn;
+}
+
+/* Parse the ACPI Static Resource Affinity Table */
+static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
+{
+ u8 *start, *end, *p;
+ int i, j, nid;
+
+ start = (u8 *)(&(sratp->reserved) + 1); /* skip header */
+ p = start;
+ end = (u8 *)sratp + sratp->header.length;
+
+ memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */
+ memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
+
+ num_memory_chunks = 0;
+	while (p < end) {
+		if (p[1] == 0) {
+			printk("acpi20_parse_srat: Entry length value is zero;"
+				" can't parse any further!\n");
+			break;
+		}
+		switch (*p) {
+		case ACPI_SRAT_TYPE_CPU_AFFINITY:
+			parse_cpu_affinity_structure(p);
+			break;
+		case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
+			parse_memory_affinity_structure(p);
+			break;
+		default:
+			printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
+			break;
+		}
+		p += p[1];
+	}
+
+ if (num_memory_chunks == 0) {
+ printk("could not finy any ACPI SRAT memory areas.\n");
+ goto out_fail;
+ }
+
+ /* Calculate total number of nodes in system from PXM bitmap and create
+ * a set of sequential node IDs starting at zero. (ACPI doesn't seem
+ * to specify the range of _PXM values.)
+ */
+ /*
+ * MCD - we no longer HAVE to number nodes sequentially. PXM domain
+ * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
+ * 32, so we will continue numbering them in this manner until MAX_NUMNODES
+ * approaches MAX_PXM_DOMAINS for i386.
+ */
+ nodes_clear(node_online_map);
+ for (i = 0; i < MAX_PXM_DOMAINS; i++) {
+ if (BMAP_TEST(pxm_bitmap, i)) {
+ int nid = acpi_map_pxm_to_node(i);
+ node_set_online(nid);
+ }
+ }
+ BUG_ON(num_online_nodes() == 0);
+
+ /* set cnode id in memory chunk structure */
+ for (i = 0; i < num_memory_chunks; i++)
+ node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
+
+ printk("pxm bitmap: ");
+ for (i = 0; i < sizeof(pxm_bitmap); i++) {
+ printk("%02X ", pxm_bitmap[i]);
+ }
+ printk("\n");
+ printk("Number of logical nodes in system = %d\n", num_online_nodes());
+ printk("Number of memory chunks in system = %d\n", num_memory_chunks);
+
+ for (i = 0; i < MAX_APICID; i++)
+ apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
+
+ for (j = 0; j < num_memory_chunks; j++){
+ struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
+ printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
+ j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
+ node_read_chunk(chunk->nid, chunk);
+ add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
+ }
+
+ for_each_online_node(nid) {
+ unsigned long start = node_start_pfn[nid];
+ unsigned long end = node_end_pfn[nid];
+
+ memory_present(nid, start, end);
+ node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
+ }
+ return 1;
+out_fail:
+ return 0;
+}
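The loop above walks variable-length subtables whose second byte is the entry length, and the zero-length check is what keeps a malformed table from wedging the parser. A standalone sketch of that pattern, with made-up entry types and layout:

#include <stdio.h>

/* Walking variable-length subtables: each entry starts with a type
 * byte and a length byte.  Checking the length before advancing
 * guards against a zero-length entry looping forever.  The table
 * contents here are illustrative only. */
static void walk_subtables(unsigned char *p, unsigned char *end)
{
	while (p < end) {
		unsigned char type = p[0], len = p[1];

		if (len == 0) {
			printf("zero-length entry; stopping\n");
			break;
		}
		printf("entry type %u, length %u\n", type, len);
		p += len;
	}
}

int main(void)
{
	unsigned char table[] = {
		0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 16-byte entry */
		1, 8,  0, 0, 0, 0, 0, 0,                           /* 8-byte entry */
	};
	walk_subtables(table, table + sizeof(table));
	return 0;
}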
+
+struct acpi_static_rsdt {
+ struct acpi_table_rsdt table;
+ u32 padding[7]; /* Allow for 7 more table entries */
+};
+
+int __init get_memcfg_from_srat(void)
+{
+ struct acpi_table_header *header = NULL;
+ struct acpi_table_rsdp *rsdp = NULL;
+ struct acpi_table_rsdt *rsdt = NULL;
+ acpi_native_uint rsdp_address = 0;
+ struct acpi_static_rsdt saved_rsdt;
+ int tables = 0;
+ int i = 0;
+
+ rsdp_address = acpi_find_rsdp();
+ if (!rsdp_address) {
+ printk("%s: System description tables not found\n",
+ __FUNCTION__);
+ goto out_err;
+ }
+
+ printk("%s: assigning address to rsdp\n", __FUNCTION__);
+ rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
+ if (!rsdp) {
+ printk("%s: Didn't find ACPI root!\n", __FUNCTION__);
+ goto out_err;
+ }
+
+ printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
+ rsdp->oem_id);
+
+	if (strncmp(rsdp->signature, ACPI_SIG_RSDP, strlen(ACPI_SIG_RSDP))) {
+ printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__);
+ goto out_err;
+ }
+
+ rsdt = (struct acpi_table_rsdt *)
+ boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
+
+ if (!rsdt) {
+		printk(KERN_WARNING
+		       "%s: ACPI: Invalid Root System Description Table (RSDT)\n",
+		       __FUNCTION__);
+ goto out_err;
+ }
+
+ header = &rsdt->header;
+
+ if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
+ printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
+ goto out_err;
+ }
+
+	/*
+	 * The number of tables is computed by taking the
+	 * total size of the RSDT minus the size of its
+	 * header, divided by the size of each entry
+	 * (4-byte table pointers).
+	 */
+ tables = (header->length - sizeof(struct acpi_table_header)) / 4;
+
+ if (!tables)
+ goto out_err;
+
+ memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
+
+ if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
+ printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
+ saved_rsdt.table.header.length);
+ goto out_err;
+ }
+
+ printk("Begin SRAT table scan....\n");
+
+ for (i = 0; i < tables; i++) {
+ /* Map in header, then map in full table length. */
+ header = (struct acpi_table_header *)
+ boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
+ if (!header)
+ break;
+ header = (struct acpi_table_header *)
+ boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
+ if (!header)
+ break;
+
+ if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
+ continue;
+
+		/* We've found the SRAT table; no need to look at any more tables. */
+ return acpi20_parse_srat((struct acpi_table_srat *)header);
+ }
+out_err:
+ remove_all_active_ranges();
+ printk("failed to get NUMA memory information from SRAT table\n");
+ return 0;
+}
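The entry-count arithmetic can be checked by hand: an RSDT is a 36-byte ACPI table header followed by 4-byte physical pointers. A minimal sketch (the 36-byte header size is the standard ACPI value; the 7-entry RSDT is just an example):

#include <stdio.h>

/* Illustrative only: an RSDT is a 36-byte ACPI header followed by
 * an array of 32-bit physical pointers to other tables. */
#define ACPI_HEADER_LEN 36

int main(void)
{
	unsigned int rsdt_length = ACPI_HEADER_LEN + 7 * 4;	/* header + 7 entries */
	unsigned int tables = (rsdt_length - ACPI_HEADER_LEN) / 4;

	printf("RSDT of %u bytes holds %u table pointers\n",
	       rsdt_length, tables);	/* prints 7 */
	return 0;
}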
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
new file mode 100644
index 000000000000..d0e01a3acf35
--- /dev/null
+++ b/arch/x86/kernel/summit_32.c
@@ -0,0 +1,180 @@
+/*
+ * arch/i386/kernel/summit.c - IBM Summit-Specific Code
+ *
+ * Written By: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (c) 2003 IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <asm/io.h>
+#include <asm/mach-summit/mach_mpparse.h>
+
+static struct rio_table_hdr *rio_table_hdr __initdata;
+static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
+static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
+
+static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
+{
+ int twister = 0, node = 0;
+ int i, bus, num_buses;
+
+ for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
+ if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id){
+ twister = rio_devs[i]->owner_id;
+ break;
+ }
+ }
+ if (i == rio_table_hdr->num_rio_dev){
+ printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __FUNCTION__);
+ return last_bus;
+ }
+
+ for(i = 0; i < rio_table_hdr->num_scal_dev; i++){
+ if (scal_devs[i]->node_id == twister){
+ node = scal_devs[i]->node_id;
+ break;
+ }
+ }
+ if (i == rio_table_hdr->num_scal_dev){
+ printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __FUNCTION__);
+ return last_bus;
+ }
+
+ switch (rio_devs[wpeg_num]->type){
+ case CompatWPEG:
+		/* The Compatibility Winnipeg controls the 2 legacy buses,
+ * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
+ * a PCI-PCI bridge card is used in either slot: total 5 buses.
+ */
+ num_buses = 5;
+ break;
+ case AltWPEG:
+ /* The Alternate Winnipeg controls the 2 133MHz buses [1 slot
+ * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
+ * the "extra" buses for each of those slots: total 7 buses.
+ */
+ num_buses = 7;
+ break;
+ case LookOutAWPEG:
+ case LookOutBWPEG:
+ /* A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
+ * & the "extra" buses for each of those slots: total 9 buses.
+ */
+ num_buses = 9;
+ break;
+ default:
+ printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __FUNCTION__);
+ return last_bus;
+ }
+
+ for(bus = last_bus; bus < last_bus + num_buses; bus++)
+ mp_bus_id_to_node[bus] = node;
+ return bus;
+}
+
+static int __init build_detail_arrays(void)
+{
+ unsigned long ptr;
+ int i, scal_detail_size, rio_detail_size;
+
+ if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
+ printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __FUNCTION__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
+ return 0;
+ }
+
+ switch (rio_table_hdr->version){
+ default:
+ printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __FUNCTION__, rio_table_hdr->version);
+ return 0;
+ case 2:
+ scal_detail_size = 11;
+ rio_detail_size = 13;
+ break;
+ case 3:
+ scal_detail_size = 12;
+ rio_detail_size = 15;
+ break;
+ }
+
+ ptr = (unsigned long)rio_table_hdr + 3;
+ for(i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
+ scal_devs[i] = (struct scal_detail *)ptr;
+
+ for(i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
+ rio_devs[i] = (struct rio_detail *)ptr;
+
+ return 1;
+}
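build_detail_arrays() walks two runs of packed records whose sizes depend on the table version. A standalone sketch of the same offset arithmetic, using the version 2 sizes above and a made-up buffer:

#include <stdio.h>

/* Sketch of the layout walk above: a 3-byte header is followed by
 * num_scal fixed-size "scal" records, then num_rio "rio" records,
 * with record sizes chosen by table version.  Sizes mirror the
 * version 2 values used above; the buffer itself is dummy data. */
int main(void)
{
	unsigned char table[3 + 2 * 11 + 1 * 13];	/* hdr + 2 scal + 1 rio */
	int scal_size = 11, rio_size = 13;		/* version 2 sizes */
	unsigned long ptr = (unsigned long)table + 3;	/* skip 3-byte header */
	int i;

	for (i = 0; i < 2; i++, ptr += scal_size)
		printf("scal[%d] at offset %lu\n", i, ptr - (unsigned long)table);
	for (i = 0; i < 1; i++, ptr += rio_size)
		printf("rio[%d] at offset %lu\n", i, ptr - (unsigned long)table);
	return 0;
}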
+
+void __init setup_summit(void)
+{
+ unsigned long ptr;
+ unsigned short offset;
+ int i, next_wpeg, next_bus = 0;
+
+ /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
+ ptr = *(unsigned short *)phys_to_virt(0x40Eul);
+ ptr = (unsigned long)phys_to_virt(ptr << 4);
+
+ rio_table_hdr = NULL;
+ offset = 0x180;
+ while (offset){
+ /* The block id is stored in the 2nd word */
+ if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
+ /* set the pointer past the offset & block id */
+ rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
+ break;
+ }
+ /* The next offset is stored in the 1st word. 0 means no more */
+ offset = *((unsigned short *)(ptr + offset));
+ }
+ if (!rio_table_hdr){
+ printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __FUNCTION__);
+ return;
+ }
+
+ if (!build_detail_arrays())
+ return;
+
+ /* The first Winnipeg we're looking for has an index of 0 */
+ next_wpeg = 0;
+ do {
+ for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
+ if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg){
+ /* It's the Winnipeg we're looking for! */
+ next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
+ next_wpeg++;
+ break;
+ }
+ }
+ /*
+ * If we go through all Rio devices and don't find one with
+ * the next index, it means we've found all the Winnipegs,
+ * and thus all the PCI buses.
+ */
+ if (i == rio_table_hdr->num_rio_dev)
+ next_wpeg = 0;
+ } while (next_wpeg != 0);
+}
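The EBDA lookup at the top of setup_summit() is plain real-mode segment arithmetic: the BIOS stores a segment value in the word at physical 0x40E, and the linear address is that segment shifted left by four. A small sketch with an example segment value (0x9FC0 is typical but machine-dependent):

#include <stdio.h>

int main(void)
{
	unsigned short ebda_seg = 0x9FC0;	/* example value only */
	unsigned long ebda_addr = (unsigned long)ebda_seg << 4;

	/* 0x9FC0 << 4 = 0x9FC00, just under the 640K boundary */
	printf("EBDA segment %#x -> physical %#lx\n", ebda_seg, ebda_addr);
	return 0;
}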
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
new file mode 100644
index 000000000000..42147304de88
--- /dev/null
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -0,0 +1,265 @@
+/*
+ * linux/arch/i386/kernel/sys_i386.c
+ *
+ * This file contains various random system calls that
+ * have a non-standard calling sequence on the Linux/i386
+ * platform.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/smp.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/stat.h>
+#include <linux/syscalls.h>
+#include <linux/mman.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/ipc.h>
+
+/*
+ * sys_pipe() is the normal C calling standard for creating
+ * a pipe. It's not the way Unix traditionally does this, though.
+ */
+asmlinkage int sys_pipe(unsigned long __user * fildes)
+{
+ int fd[2];
+ int error;
+
+ error = do_pipe(fd);
+ if (!error) {
+ if (copy_to_user(fildes, fd, 2*sizeof(int)))
+ error = -EFAULT;
+ }
+ return error;
+}
+
+asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long pgoff)
+{
+ int error = -EBADF;
+ struct file *file = NULL;
+ struct mm_struct *mm = current->mm;
+
+ flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+ if (!(flags & MAP_ANONYMOUS)) {
+ file = fget(fd);
+ if (!file)
+ goto out;
+ }
+
+ down_write(&mm->mmap_sem);
+ error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+ up_write(&mm->mmap_sem);
+
+ if (file)
+ fput(file);
+out:
+ return error;
+}
+
+/*
+ * Perform the select(nd, in, out, ex, tv) and mmap() system
+ * calls. Linux/i386 didn't use to be able to handle more than
+ * 4 system call parameters, so these system calls used a memory
+ * block for parameter passing.
+ */
+
+struct mmap_arg_struct {
+ unsigned long addr;
+ unsigned long len;
+ unsigned long prot;
+ unsigned long flags;
+ unsigned long fd;
+ unsigned long offset;
+};
+
+asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
+{
+ struct mmap_arg_struct a;
+ int err = -EFAULT;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ goto out;
+
+ err = -EINVAL;
+ if (a.offset & ~PAGE_MASK)
+ goto out;
+
+ err = sys_mmap2(a.addr, a.len, a.prot, a.flags,
+ a.fd, a.offset >> PAGE_SHIFT);
+out:
+ return err;
+}
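From user space, the single-pointer convention old_mmap() decodes looks like the sketch below. This is 32-bit x86 Linux only (syscall 90 is the old mmap entry there), and modern code should simply call mmap(2); it is shown only to illustrate the memory-block parameter passing described above.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/mman.h>

/* Mirrors the kernel's mmap_arg_struct above. */
struct mmap_arg_struct {
	unsigned long addr, len, prot, flags, fd, offset;
};

int main(void)
{
	struct mmap_arg_struct a;
	void *p;

	memset(&a, 0, sizeof(a));
	a.len = 4096;
	a.prot = PROT_READ | PROT_WRITE;
	a.flags = MAP_PRIVATE | MAP_ANONYMOUS;
	a.fd = (unsigned long)-1;

	p = (void *)syscall(90, &a);	/* __NR_mmap on i386; compile -m32 */
	printf("old_mmap returned %p\n", p);
	return 0;
}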
+
+
+struct sel_arg_struct {
+ unsigned long n;
+ fd_set __user *inp, *outp, *exp;
+ struct timeval __user *tvp;
+};
+
+asmlinkage int old_select(struct sel_arg_struct __user *arg)
+{
+ struct sel_arg_struct a;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+ /* sys_select() does the appropriate kernel locking */
+ return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
+}
+
+/*
+ * sys_ipc() is the de-multiplexer for the SysV IPC calls..
+ *
+ * This is really horribly ugly.
+ */
+asmlinkage int sys_ipc (uint call, int first, int second,
+ int third, void __user *ptr, long fifth)
+{
+ int version, ret;
+
+ version = call >> 16; /* hack for backward compatibility */
+ call &= 0xffff;
+
+ switch (call) {
+ case SEMOP:
+ return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL);
+ case SEMTIMEDOP:
+ return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
+ (const struct timespec __user *)fifth);
+
+ case SEMGET:
+ return sys_semget (first, second, third);
+ case SEMCTL: {
+ union semun fourth;
+ if (!ptr)
+ return -EINVAL;
+ if (get_user(fourth.__pad, (void __user * __user *) ptr))
+ return -EFAULT;
+ return sys_semctl (first, second, third, fourth);
+ }
+
+ case MSGSND:
+ return sys_msgsnd (first, (struct msgbuf __user *) ptr,
+ second, third);
+ case MSGRCV:
+ switch (version) {
+ case 0: {
+ struct ipc_kludge tmp;
+ if (!ptr)
+ return -EINVAL;
+
+ if (copy_from_user(&tmp,
+ (struct ipc_kludge __user *) ptr,
+ sizeof (tmp)))
+ return -EFAULT;
+ return sys_msgrcv (first, tmp.msgp, second,
+ tmp.msgtyp, third);
+ }
+ default:
+ return sys_msgrcv (first,
+ (struct msgbuf __user *) ptr,
+ second, fifth, third);
+ }
+ case MSGGET:
+ return sys_msgget ((key_t) first, second);
+ case MSGCTL:
+ return sys_msgctl (first, second, (struct msqid_ds __user *) ptr);
+
+ case SHMAT:
+ switch (version) {
+ default: {
+ ulong raddr;
+ ret = do_shmat (first, (char __user *) ptr, second, &raddr);
+ if (ret)
+ return ret;
+ return put_user (raddr, (ulong __user *) third);
+ }
+ case 1: /* iBCS2 emulator entry point */
+ if (!segment_eq(get_fs(), get_ds()))
+ return -EINVAL;
+ /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
+ return do_shmat (first, (char __user *) ptr, second, (ulong *) third);
+ }
+ case SHMDT:
+ return sys_shmdt ((char __user *)ptr);
+ case SHMGET:
+ return sys_shmget (first, second, third);
+ case SHMCTL:
+ return sys_shmctl (first, second,
+ (struct shmid_ds __user *) ptr);
+ default:
+ return -ENOSYS;
+ }
+}
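To see the demultiplexer from the other side: on i386, libc funnels every SysV IPC call through syscall 117 (__NR_ipc) with the call number in the first argument. A hedged sketch of a semget() made that way (the call number 2 mirrors SEMGET in the switch above; 32-bit x86 Linux only):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define IPCCALL_SEMGET 2	/* matches SEMGET in the demultiplexer above */

int main(void)
{
	/* sys_ipc(SEMGET, key, nsems, semflg, NULL, 0) */
	long id = syscall(117, IPCCALL_SEMGET, 0x1234 /* key */,
			  1 /* nsems */, 0600 | 01000 /* IPC_CREAT */, NULL);

	printf("semget via sys_ipc -> %ld\n", id);
	return 0;
}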
+
+/*
+ * Old cruft
+ */
+asmlinkage int sys_uname(struct old_utsname __user * name)
+{
+ int err;
+ if (!name)
+ return -EFAULT;
+ down_read(&uts_sem);
+ err = copy_to_user(name, utsname(), sizeof (*name));
+ up_read(&uts_sem);
+ return err?-EFAULT:0;
+}
+
+asmlinkage int sys_olduname(struct oldold_utsname __user * name)
+{
+ int error;
+
+ if (!name)
+ return -EFAULT;
+ if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
+ return -EFAULT;
+
+ down_read(&uts_sem);
+
+ error = __copy_to_user(&name->sysname, &utsname()->sysname,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
+ error |= __copy_to_user(&name->nodename, &utsname()->nodename,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
+ error |= __copy_to_user(&name->release, &utsname()->release,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->release + __OLD_UTS_LEN);
+ error |= __copy_to_user(&name->version, &utsname()->version,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->version + __OLD_UTS_LEN);
+ error |= __copy_to_user(&name->machine, &utsname()->machine,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->machine + __OLD_UTS_LEN);
+
+ up_read(&uts_sem);
+
+ error = error ? -EFAULT : 0;
+
+ return error;
+}
+
+
+/*
+ * Do a system call from kernel instead of calling sys_execve so we
+ * end up with proper pt_regs.
+ */
+int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+{
+ long __res;
+ asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
+ : "=a" (__res)
+ : "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory");
+ return __res;
+}
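kernel_execve() relies on the same int $0x80 register convention user space uses: eax carries the syscall number, and ebx/ecx/edx the first three arguments. A user-space sketch making a raw write(2) that way (__NR_write is 4 on i386; compile with -m32):

#include <unistd.h>

int main(void)
{
	const char msg[] = "hello via int $0x80\n";
	long ret;

	/* eax = syscall number, ebx/ecx/edx = first three arguments */
	asm volatile ("int $0x80"
		      : "=a" (ret)
		      : "0" (4 /* __NR_write */), "b" (1 /* stdout */),
			"c" (msg), "d" (sizeof(msg) - 1)
		      : "memory");
	return ret < 0;
}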
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
new file mode 100644
index 000000000000..8344c70adf61
--- /dev/null
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -0,0 +1,326 @@
+ENTRY(sys_call_table)
+ .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
+ .long sys_exit
+ .long sys_fork
+ .long sys_read
+ .long sys_write
+ .long sys_open /* 5 */
+ .long sys_close
+ .long sys_waitpid
+ .long sys_creat
+ .long sys_link
+ .long sys_unlink /* 10 */
+ .long sys_execve
+ .long sys_chdir
+ .long sys_time
+ .long sys_mknod
+ .long sys_chmod /* 15 */
+ .long sys_lchown16
+ .long sys_ni_syscall /* old break syscall holder */
+ .long sys_stat
+ .long sys_lseek
+ .long sys_getpid /* 20 */
+ .long sys_mount
+ .long sys_oldumount
+ .long sys_setuid16
+ .long sys_getuid16
+ .long sys_stime /* 25 */
+ .long sys_ptrace
+ .long sys_alarm
+ .long sys_fstat
+ .long sys_pause
+ .long sys_utime /* 30 */
+ .long sys_ni_syscall /* old stty syscall holder */
+ .long sys_ni_syscall /* old gtty syscall holder */
+ .long sys_access
+ .long sys_nice
+ .long sys_ni_syscall /* 35 - old ftime syscall holder */
+ .long sys_sync
+ .long sys_kill
+ .long sys_rename
+ .long sys_mkdir
+ .long sys_rmdir /* 40 */
+ .long sys_dup
+ .long sys_pipe
+ .long sys_times
+ .long sys_ni_syscall /* old prof syscall holder */
+ .long sys_brk /* 45 */
+ .long sys_setgid16
+ .long sys_getgid16
+ .long sys_signal
+ .long sys_geteuid16
+ .long sys_getegid16 /* 50 */
+ .long sys_acct
+ .long sys_umount /* recycled never used phys() */
+ .long sys_ni_syscall /* old lock syscall holder */
+ .long sys_ioctl
+ .long sys_fcntl /* 55 */
+ .long sys_ni_syscall /* old mpx syscall holder */
+ .long sys_setpgid
+ .long sys_ni_syscall /* old ulimit syscall holder */
+ .long sys_olduname
+ .long sys_umask /* 60 */
+ .long sys_chroot
+ .long sys_ustat
+ .long sys_dup2
+ .long sys_getppid
+ .long sys_getpgrp /* 65 */
+ .long sys_setsid
+ .long sys_sigaction
+ .long sys_sgetmask
+ .long sys_ssetmask
+ .long sys_setreuid16 /* 70 */
+ .long sys_setregid16
+ .long sys_sigsuspend
+ .long sys_sigpending
+ .long sys_sethostname
+ .long sys_setrlimit /* 75 */
+ .long sys_old_getrlimit
+ .long sys_getrusage
+ .long sys_gettimeofday
+ .long sys_settimeofday
+ .long sys_getgroups16 /* 80 */
+ .long sys_setgroups16
+ .long old_select
+ .long sys_symlink
+ .long sys_lstat
+ .long sys_readlink /* 85 */
+ .long sys_uselib
+ .long sys_swapon
+ .long sys_reboot
+ .long old_readdir
+ .long old_mmap /* 90 */
+ .long sys_munmap
+ .long sys_truncate
+ .long sys_ftruncate
+ .long sys_fchmod
+ .long sys_fchown16 /* 95 */
+ .long sys_getpriority
+ .long sys_setpriority
+ .long sys_ni_syscall /* old profil syscall holder */
+ .long sys_statfs
+ .long sys_fstatfs /* 100 */
+ .long sys_ioperm
+ .long sys_socketcall
+ .long sys_syslog
+ .long sys_setitimer
+ .long sys_getitimer /* 105 */
+ .long sys_newstat
+ .long sys_newlstat
+ .long sys_newfstat
+ .long sys_uname
+ .long sys_iopl /* 110 */
+ .long sys_vhangup
+ .long sys_ni_syscall /* old "idle" system call */
+ .long sys_vm86old
+ .long sys_wait4
+ .long sys_swapoff /* 115 */
+ .long sys_sysinfo
+ .long sys_ipc
+ .long sys_fsync
+ .long sys_sigreturn
+ .long sys_clone /* 120 */
+ .long sys_setdomainname
+ .long sys_newuname
+ .long sys_modify_ldt
+ .long sys_adjtimex
+ .long sys_mprotect /* 125 */
+ .long sys_sigprocmask
+ .long sys_ni_syscall /* old "create_module" */
+ .long sys_init_module
+ .long sys_delete_module
+ .long sys_ni_syscall /* 130: old "get_kernel_syms" */
+ .long sys_quotactl
+ .long sys_getpgid
+ .long sys_fchdir
+ .long sys_bdflush
+ .long sys_sysfs /* 135 */
+ .long sys_personality
+ .long sys_ni_syscall /* reserved for afs_syscall */
+ .long sys_setfsuid16
+ .long sys_setfsgid16
+ .long sys_llseek /* 140 */
+ .long sys_getdents
+ .long sys_select
+ .long sys_flock
+ .long sys_msync
+ .long sys_readv /* 145 */
+ .long sys_writev
+ .long sys_getsid
+ .long sys_fdatasync
+ .long sys_sysctl
+ .long sys_mlock /* 150 */
+ .long sys_munlock
+ .long sys_mlockall
+ .long sys_munlockall
+ .long sys_sched_setparam
+ .long sys_sched_getparam /* 155 */
+ .long sys_sched_setscheduler
+ .long sys_sched_getscheduler
+ .long sys_sched_yield
+ .long sys_sched_get_priority_max
+ .long sys_sched_get_priority_min /* 160 */
+ .long sys_sched_rr_get_interval
+ .long sys_nanosleep
+ .long sys_mremap
+ .long sys_setresuid16
+ .long sys_getresuid16 /* 165 */
+ .long sys_vm86
+ .long sys_ni_syscall /* Old sys_query_module */
+ .long sys_poll
+ .long sys_nfsservctl
+ .long sys_setresgid16 /* 170 */
+ .long sys_getresgid16
+ .long sys_prctl
+ .long sys_rt_sigreturn
+ .long sys_rt_sigaction
+ .long sys_rt_sigprocmask /* 175 */
+ .long sys_rt_sigpending
+ .long sys_rt_sigtimedwait
+ .long sys_rt_sigqueueinfo
+ .long sys_rt_sigsuspend
+ .long sys_pread64 /* 180 */
+ .long sys_pwrite64
+ .long sys_chown16
+ .long sys_getcwd
+ .long sys_capget
+ .long sys_capset /* 185 */
+ .long sys_sigaltstack
+ .long sys_sendfile
+ .long sys_ni_syscall /* reserved for streams1 */
+ .long sys_ni_syscall /* reserved for streams2 */
+ .long sys_vfork /* 190 */
+ .long sys_getrlimit
+ .long sys_mmap2
+ .long sys_truncate64
+ .long sys_ftruncate64
+ .long sys_stat64 /* 195 */
+ .long sys_lstat64
+ .long sys_fstat64
+ .long sys_lchown
+ .long sys_getuid
+ .long sys_getgid /* 200 */
+ .long sys_geteuid
+ .long sys_getegid
+ .long sys_setreuid
+ .long sys_setregid
+ .long sys_getgroups /* 205 */
+ .long sys_setgroups
+ .long sys_fchown
+ .long sys_setresuid
+ .long sys_getresuid
+ .long sys_setresgid /* 210 */
+ .long sys_getresgid
+ .long sys_chown
+ .long sys_setuid
+ .long sys_setgid
+ .long sys_setfsuid /* 215 */
+ .long sys_setfsgid
+ .long sys_pivot_root
+ .long sys_mincore
+ .long sys_madvise
+ .long sys_getdents64 /* 220 */
+ .long sys_fcntl64
+ .long sys_ni_syscall /* reserved for TUX */
+ .long sys_ni_syscall
+ .long sys_gettid
+ .long sys_readahead /* 225 */
+ .long sys_setxattr
+ .long sys_lsetxattr
+ .long sys_fsetxattr
+ .long sys_getxattr
+ .long sys_lgetxattr /* 230 */
+ .long sys_fgetxattr
+ .long sys_listxattr
+ .long sys_llistxattr
+ .long sys_flistxattr
+ .long sys_removexattr /* 235 */
+ .long sys_lremovexattr
+ .long sys_fremovexattr
+ .long sys_tkill
+ .long sys_sendfile64
+ .long sys_futex /* 240 */
+ .long sys_sched_setaffinity
+ .long sys_sched_getaffinity
+ .long sys_set_thread_area
+ .long sys_get_thread_area
+ .long sys_io_setup /* 245 */
+ .long sys_io_destroy
+ .long sys_io_getevents
+ .long sys_io_submit
+ .long sys_io_cancel
+ .long sys_fadvise64 /* 250 */
+ .long sys_ni_syscall
+ .long sys_exit_group
+ .long sys_lookup_dcookie
+ .long sys_epoll_create
+ .long sys_epoll_ctl /* 255 */
+ .long sys_epoll_wait
+ .long sys_remap_file_pages
+ .long sys_set_tid_address
+ .long sys_timer_create
+ .long sys_timer_settime /* 260 */
+ .long sys_timer_gettime
+ .long sys_timer_getoverrun
+ .long sys_timer_delete
+ .long sys_clock_settime
+ .long sys_clock_gettime /* 265 */
+ .long sys_clock_getres
+ .long sys_clock_nanosleep
+ .long sys_statfs64
+ .long sys_fstatfs64
+ .long sys_tgkill /* 270 */
+ .long sys_utimes
+ .long sys_fadvise64_64
+ .long sys_ni_syscall /* sys_vserver */
+ .long sys_mbind
+	.long sys_get_mempolicy	/* 275 */
+ .long sys_set_mempolicy
+ .long sys_mq_open
+ .long sys_mq_unlink
+ .long sys_mq_timedsend
+ .long sys_mq_timedreceive /* 280 */
+ .long sys_mq_notify
+ .long sys_mq_getsetattr
+ .long sys_kexec_load
+ .long sys_waitid
+ .long sys_ni_syscall /* 285 */ /* available */
+ .long sys_add_key
+ .long sys_request_key
+ .long sys_keyctl
+ .long sys_ioprio_set
+ .long sys_ioprio_get /* 290 */
+ .long sys_inotify_init
+ .long sys_inotify_add_watch
+ .long sys_inotify_rm_watch
+ .long sys_migrate_pages
+ .long sys_openat /* 295 */
+ .long sys_mkdirat
+ .long sys_mknodat
+ .long sys_fchownat
+ .long sys_futimesat
+ .long sys_fstatat64 /* 300 */
+ .long sys_unlinkat
+ .long sys_renameat
+ .long sys_linkat
+ .long sys_symlinkat
+ .long sys_readlinkat /* 305 */
+ .long sys_fchmodat
+ .long sys_faccessat
+ .long sys_pselect6
+ .long sys_ppoll
+ .long sys_unshare /* 310 */
+ .long sys_set_robust_list
+ .long sys_get_robust_list
+ .long sys_splice
+ .long sys_sync_file_range
+ .long sys_tee /* 315 */
+ .long sys_vmsplice
+ .long sys_move_pages
+ .long sys_getcpu
+ .long sys_epoll_pwait
+ .long sys_utimensat /* 320 */
+ .long sys_signalfd
+ .long sys_timerfd
+ .long sys_eventfd
+ .long sys_fallocate
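This table is consumed by the syscall entry path as a plain array of function pointers indexed by the syscall number in %eax (the assembly form is `call *sys_call_table(,%eax,4)`). A C analogue of that dispatch, with stub handlers standing in for the real ones:

#include <stdio.h>

typedef long (*syscall_fn)(long, long, long);

static long stub_exit(long a, long b, long c)  { printf("exit(%ld)\n", a); return 0; }
static long stub_write(long a, long b, long c) { printf("write(%ld)\n", a); return c; }

/* Sparse table indexed by syscall number, like sys_call_table. */
static syscall_fn table[] = {
	[1] = stub_exit,	/* sys_exit */
	[4] = stub_write,	/* sys_write */
};

int main(void)
{
	long nr = 4;	/* "call *table(,%eax,4)" in C clothing */

	if (nr < (long)(sizeof(table) / sizeof(table[0])) && table[nr])
		table[nr](1, 0, 5);
	return 0;
}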
diff --git a/arch/x86/kernel/sysenter_32.c b/arch/x86/kernel/sysenter_32.c
new file mode 100644
index 000000000000..4eb2e408764f
--- /dev/null
+++ b/arch/x86/kernel/sysenter_32.c
@@ -0,0 +1,348 @@
+/*
+ * linux/arch/i386/kernel/sysenter.c
+ *
+ * (C) Copyright 2002 Linus Torvalds
+ * Portions based on the vdso-randomization code from exec-shield:
+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
+ *
+ * This file contains the needed initializations to support sysenter.
+ */
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/thread_info.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/module.h>
+
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+#include <asm/pgtable.h>
+#include <asm/unistd.h>
+#include <asm/elf.h>
+#include <asm/tlbflush.h>
+
+enum {
+ VDSO_DISABLED = 0,
+ VDSO_ENABLED = 1,
+ VDSO_COMPAT = 2,
+};
+
+#ifdef CONFIG_COMPAT_VDSO
+#define VDSO_DEFAULT VDSO_COMPAT
+#else
+#define VDSO_DEFAULT VDSO_ENABLED
+#endif
+
+/*
+ * Should the kernel map a VDSO page into processes and pass its
+ * address down to glibc upon exec()?
+ */
+unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
+
+EXPORT_SYMBOL_GPL(vdso_enabled);
+
+static int __init vdso_setup(char *s)
+{
+ vdso_enabled = simple_strtoul(s, NULL, 0);
+
+ return 1;
+}
+
+__setup("vdso=", vdso_setup);
+
+extern asmlinkage void sysenter_entry(void);
+
+static __init void reloc_symtab(Elf32_Ehdr *ehdr,
+ unsigned offset, unsigned size)
+{
+ Elf32_Sym *sym = (void *)ehdr + offset;
+ unsigned nsym = size / sizeof(*sym);
+ unsigned i;
+
+ for(i = 0; i < nsym; i++, sym++) {
+ if (sym->st_shndx == SHN_UNDEF ||
+ sym->st_shndx == SHN_ABS)
+ continue; /* skip */
+
+ if (sym->st_shndx > SHN_LORESERVE) {
+ printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
+ sym->st_shndx);
+ continue;
+ }
+
+ switch(ELF_ST_TYPE(sym->st_info)) {
+ case STT_OBJECT:
+ case STT_FUNC:
+ case STT_SECTION:
+ case STT_FILE:
+ sym->st_value += VDSO_HIGH_BASE;
+ }
+ }
+}
+
+static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
+{
+ Elf32_Dyn *dyn = (void *)ehdr + offset;
+
+ for(; dyn->d_tag != DT_NULL; dyn++)
+ switch(dyn->d_tag) {
+ case DT_PLTGOT:
+ case DT_HASH:
+ case DT_STRTAB:
+ case DT_SYMTAB:
+ case DT_RELA:
+ case DT_INIT:
+ case DT_FINI:
+ case DT_REL:
+ case DT_DEBUG:
+ case DT_JMPREL:
+ case DT_VERSYM:
+ case DT_VERDEF:
+ case DT_VERNEED:
+ case DT_ADDRRNGLO ... DT_ADDRRNGHI:
+ /* definitely pointers needing relocation */
+ dyn->d_un.d_ptr += VDSO_HIGH_BASE;
+ break;
+
+ case DT_ENCODING ... OLD_DT_LOOS-1:
+ case DT_LOOS ... DT_HIOS-1:
+ /* Tags above DT_ENCODING are pointers if
+ they're even */
+ if (dyn->d_tag >= DT_ENCODING &&
+ (dyn->d_tag & 1) == 0)
+ dyn->d_un.d_ptr += VDSO_HIGH_BASE;
+ break;
+
+ case DT_VERDEFNUM:
+ case DT_VERNEEDNUM:
+ case DT_FLAGS_1:
+ case DT_RELACOUNT:
+ case DT_RELCOUNT:
+ case DT_VALRNGLO ... DT_VALRNGHI:
+ /* definitely not pointers */
+ break;
+
+ case OLD_DT_LOOS ... DT_LOOS-1:
+ case DT_HIOS ... DT_VALRNGLO-1:
+ default:
+ if (dyn->d_tag > DT_ENCODING)
+ printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
+ dyn->d_tag);
+ break;
+ }
+}
+
+static __init void relocate_vdso(Elf32_Ehdr *ehdr)
+{
+ Elf32_Phdr *phdr;
+ Elf32_Shdr *shdr;
+ int i;
+
+ BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
+ !elf_check_arch(ehdr) ||
+ ehdr->e_type != ET_DYN);
+
+ ehdr->e_entry += VDSO_HIGH_BASE;
+
+ /* rebase phdrs */
+ phdr = (void *)ehdr + ehdr->e_phoff;
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ phdr[i].p_vaddr += VDSO_HIGH_BASE;
+
+ /* relocate dynamic stuff */
+ if (phdr[i].p_type == PT_DYNAMIC)
+ reloc_dyn(ehdr, phdr[i].p_offset);
+ }
+
+ /* rebase sections */
+ shdr = (void *)ehdr + ehdr->e_shoff;
+ for(i = 0; i < ehdr->e_shnum; i++) {
+ if (!(shdr[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ shdr[i].sh_addr += VDSO_HIGH_BASE;
+
+ if (shdr[i].sh_type == SHT_SYMTAB ||
+ shdr[i].sh_type == SHT_DYNSYM)
+ reloc_symtab(ehdr, shdr[i].sh_offset,
+ shdr[i].sh_size);
+ }
+}
+
+void enable_sep_cpu(void)
+{
+ int cpu = get_cpu();
+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
+
+ if (!boot_cpu_has(X86_FEATURE_SEP)) {
+ put_cpu();
+ return;
+ }
+
+ tss->x86_tss.ss1 = __KERNEL_CS;
+ tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
+ wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+ wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0);
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
+ put_cpu();
+}
+
+static struct vm_area_struct gate_vma;
+
+static int __init gate_vma_init(void)
+{
+ gate_vma.vm_mm = NULL;
+ gate_vma.vm_start = FIXADDR_USER_START;
+ gate_vma.vm_end = FIXADDR_USER_END;
+ gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+ gate_vma.vm_page_prot = __P101;
+ /*
+ * Make sure the vDSO gets into every core dump.
+ * Dumping its contents makes post-mortem fully interpretable later
+ * without matching up the same kernel and hardware config to see
+ * what PC values meant.
+ */
+ gate_vma.vm_flags |= VM_ALWAYSDUMP;
+ return 0;
+}
+
+/*
+ * These symbols are defined by vsyscall.o to mark the bounds
+ * of the ELF DSO images included therein.
+ */
+extern const char vsyscall_int80_start, vsyscall_int80_end;
+extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
+static struct page *syscall_pages[1];
+
+static void map_compat_vdso(int map)
+{
+ static int vdso_mapped;
+
+ if (map == vdso_mapped)
+ return;
+
+ vdso_mapped = map;
+
+ __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT,
+ map ? PAGE_READONLY_EXEC : PAGE_NONE);
+
+ /* flush stray tlbs */
+ flush_tlb_all();
+}
+
+int __init sysenter_setup(void)
+{
+ void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
+ const void *vsyscall;
+ size_t vsyscall_len;
+
+ syscall_pages[0] = virt_to_page(syscall_page);
+
+ gate_vma_init();
+
+ printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
+
+ if (!boot_cpu_has(X86_FEATURE_SEP)) {
+ vsyscall = &vsyscall_int80_start;
+ vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start;
+ } else {
+ vsyscall = &vsyscall_sysenter_start;
+ vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start;
+ }
+
+ memcpy(syscall_page, vsyscall, vsyscall_len);
+ relocate_vdso(syscall_page);
+
+ return 0;
+}
+
+/* Defined in vsyscall-sysenter.S */
+extern void SYSENTER_RETURN;
+
+/* Setup a VMA at program startup for the vsyscall page */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr;
+ int ret = 0;
+ bool compat;
+
+ down_write(&mm->mmap_sem);
+
+ /* Test compat mode once here, in case someone
+ changes it via sysctl */
+ compat = (vdso_enabled == VDSO_COMPAT);
+
+ map_compat_vdso(compat);
+
+ if (compat)
+ addr = VDSO_HIGH_BASE;
+ else {
+ addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+ if (IS_ERR_VALUE(addr)) {
+ ret = addr;
+ goto up_fail;
+ }
+
+ /*
+ * MAYWRITE to allow gdb to COW and set breakpoints
+ *
+ * Make sure the vDSO gets into every core dump.
+ * Dumping its contents makes post-mortem fully
+ * interpretable later without matching up the same
+ * kernel and hardware config to see what PC values
+ * meant.
+ */
+ ret = install_special_mapping(mm, addr, PAGE_SIZE,
+ VM_READ|VM_EXEC|
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
+ VM_ALWAYSDUMP,
+ syscall_pages);
+
+ if (ret)
+ goto up_fail;
+ }
+
+ current->mm->context.vdso = (void *)addr;
+ current_thread_info()->sysenter_return =
+ (void *)VDSO_SYM(&SYSENTER_RETURN);
+
+ up_fail:
+ up_write(&mm->mmap_sem);
+
+ return ret;
+}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
+ return "[vdso]";
+ return NULL;
+}
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+ struct mm_struct *mm = tsk->mm;
+
+ /* Check to see if this task was created in compat vdso mode */
+ if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
+ return &gate_vma;
+ return NULL;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+ const struct vm_area_struct *vma = get_gate_vma(task);
+
+ return vma && addr >= vma->vm_start && addr < vma->vm_end;
+}
+
+int in_gate_area_no_task(unsigned long addr)
+{
+ return 0;
+}
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
new file mode 100644
index 000000000000..19a6c678d02e
--- /dev/null
+++ b/arch/x86/kernel/time_32.c
@@ -0,0 +1,236 @@
+/*
+ * linux/arch/i386/kernel/time.c
+ *
+ * Copyright (C) 1991, 1992, 1995 Linus Torvalds
+ *
+ * This file contains the PC-specific time handling details:
+ * reading the RTC at bootup, etc..
+ * 1994-07-02 Alan Modra
+ * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
+ * 1995-03-26 Markus Kuhn
+ * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
+ * precision CMOS clock update
+ * 1996-05-03 Ingo Molnar
+ * fixed time warps in do_[slow|fast]_gettimeoffset()
+ * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
+ * "A Kernel Model for Precision Timekeeping" by Dave Mills
+ * 1998-09-05 (Various)
+ * More robust do_fast_gettimeoffset() algorithm implemented
+ * (works with APM, Cyrix 6x86MX and Centaur C6),
+ * monotonic gettimeofday() with fast_get_timeoffset(),
+ * drift-proof precision TSC calibration on boot
+ * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
+ * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
+ * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
+ * 1998-12-16 Andrea Arcangeli
+ * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
+ * because was not accounting lost_ticks.
+ * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
+ * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ * serialize accesses to xtime/lost_ticks).
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/time.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/sysdev.h>
+#include <linux/bcd.h>
+#include <linux/efi.h>
+#include <linux/mca.h>
+
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/irq.h>
+#include <asm/msr.h>
+#include <asm/delay.h>
+#include <asm/mpspec.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/timer.h>
+#include <asm/time.h>
+
+#include "mach_time.h"
+
+#include <linux/timex.h>
+
+#include <asm/hpet.h>
+
+#include <asm/arch_hooks.h>
+
+#include "io_ports.h"
+
+#include <asm/i8259.h>
+
+#include "do_timer.h"
+
+unsigned int cpu_khz; /* Detected as we calibrate the TSC */
+EXPORT_SYMBOL(cpu_khz);
+
+DEFINE_SPINLOCK(rtc_lock);
+EXPORT_SYMBOL(rtc_lock);
+
+/*
+ * This is a special lock that is owned by the CPU and holds the index
+ * register we are working with. It is required for NMI access to the
+ * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
+ */
+volatile unsigned long cmos_lock = 0;
+EXPORT_SYMBOL(cmos_lock);
+
+/* Routines for accessing the CMOS RAM/RTC. */
+unsigned char rtc_cmos_read(unsigned char addr)
+{
+ unsigned char val;
+ lock_cmos_prefix(addr);
+ outb_p(addr, RTC_PORT(0));
+ val = inb_p(RTC_PORT(1));
+ lock_cmos_suffix(addr);
+ return val;
+}
+EXPORT_SYMBOL(rtc_cmos_read);
+
+void rtc_cmos_write(unsigned char val, unsigned char addr)
+{
+ lock_cmos_prefix(addr);
+ outb_p(addr, RTC_PORT(0));
+ outb_p(val, RTC_PORT(1));
+ lock_cmos_suffix(addr);
+}
+EXPORT_SYMBOL(rtc_cmos_write);
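The two helpers above are the classic CMOS index/data idiom: write the register index to port 0x70, then read or write the value through port 0x71. A user-space analogue for experimentation (needs root for ioperm(), x86 Linux only, and it deliberately omits the NMI/locking care the kernel version takes; register 0x00 is RTC seconds on the MC146818):

#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	unsigned char sec;

	if (ioperm(0x70, 2, 1))		/* grant access to ports 0x70-0x71 */
		return 1;
	outb(0x00, 0x70);		/* select register 0 (RTC seconds) */
	sec = inb(0x71);		/* read its value */
	printf("RTC seconds register: %02x (BCD)\n", sec);
	return 0;
}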
+
+static int set_rtc_mmss(unsigned long nowtime)
+{
+ int retval;
+ unsigned long flags;
+
+ /* gets recalled with irq locally disabled */
+ /* XXX - does irqsave resolve this? -johnstul */
+ spin_lock_irqsave(&rtc_lock, flags);
+ retval = set_wallclock(nowtime);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ return retval;
+}
+
+
+int timer_ack;
+
+unsigned long profile_pc(struct pt_regs *regs)
+{
+ unsigned long pc = instruction_pointer(regs);
+
+#ifdef CONFIG_SMP
+ if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
+ in_lock_functions(pc)) {
+#ifdef CONFIG_FRAME_POINTER
+ return *(unsigned long *)(regs->ebp + 4);
+#else
+ unsigned long *sp = (unsigned long *)&regs->esp;
+
+ /* Return address is either directly at stack pointer
+ or above a saved eflags. Eflags has bits 22-31 zero,
+ kernel addresses don't. */
+ if (sp[0] >> 22)
+ return sp[0];
+ if (sp[1] >> 22)
+ return sp[1];
+#endif
+ }
+#endif
+ return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+
+/*
+ * The timer interrupt handler. We also save the current Time Stamp
+ * Counter value at the time of the timer interrupt, so that we
+ * can later estimate the time of day more exactly.
+ */
+irqreturn_t timer_interrupt(int irq, void *dev_id)
+{
+#ifdef CONFIG_X86_IO_APIC
+ if (timer_ack) {
+ /*
+ * Subtle, when I/O APICs are used we have to ack timer IRQ
+ * manually to reset the IRR bit for do_slow_gettimeoffset().
+ * This will also deassert NMI lines for the watchdog if run
+ * on an 82489DX-based system.
+ */
+ spin_lock(&i8259A_lock);
+ outb(0x0c, PIC_MASTER_OCW3);
+ /* Ack the IRQ; AEOI will end it automatically. */
+ inb(PIC_MASTER_POLL);
+ spin_unlock(&i8259A_lock);
+ }
+#endif
+
+ do_timer_interrupt_hook();
+
+ if (MCA_bus) {
+ /* The PS/2 uses level-triggered interrupts. You can't
+ turn them off, nor would you want to (any attempt to
+ enable edge-triggered interrupts usually gets intercepted by a
+ special hardware circuit). Hence we have to acknowledge
+ the timer interrupt. Through some incredibly stupid
+ design idea, the reset for IRQ 0 is done by setting the
+ high bit of the PPI port B (0x61). Note that some PS/2s,
+ notably the 55SX, work fine if this is removed. */
+
+ u8 irq_v = inb_p( 0x61 ); /* read the current state */
+ outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
+ }
+
+ return IRQ_HANDLED;
+}
+
+/* not static: needed by APM */
+unsigned long read_persistent_clock(void)
+{
+ unsigned long retval;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+
+ retval = get_wallclock();
+
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ return retval;
+}
+
+int update_persistent_clock(struct timespec now)
+{
+ return set_rtc_mmss(now.tv_sec);
+}
+
+extern void (*late_time_init)(void);
+/* Duplicate of time_init() below, with hpet_enable part added */
+void __init hpet_time_init(void)
+{
+ if (!hpet_enable())
+ setup_pit_timer();
+ time_init_hook();
+}
+
+/*
+ * This is called directly from init code; we must delay timer setup in the
+ * HPET case as we can't make the decision to turn on HPET this early in the
+ * boot process.
+ *
+ * The chosen time_init function will usually be hpet_time_init, above, but
+ * in the case of virtual hardware, an alternative function may be substituted.
+ */
+void __init time_init(void)
+{
+ tsc_init();
+ late_time_init = choose_time_init();
+}
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
new file mode 100644
index 000000000000..45782356a618
--- /dev/null
+++ b/arch/x86/kernel/topology.c
@@ -0,0 +1,77 @@
+/*
+ * arch/i386/kernel/topology.c - Populate sysfs with topology information
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/nodemask.h>
+#include <linux/mmzone.h>
+#include <asm/cpu.h>
+
+static struct i386_cpu cpu_devices[NR_CPUS];
+
+int arch_register_cpu(int num)
+{
+	/*
+	 * CPU0 cannot be offlined due to several
+	 * restrictions and assumptions in the kernel. This basically
+	 * doesn't add a control file, so one cannot attempt to offline
+	 * the BSP.
+	 *
+	 * Also, certain PCI quirks require not enabling hotplug control
+	 * for all CPUs.
+	 */
+ if (num && enable_cpu_hotplug)
+ cpu_devices[num].cpu.hotpluggable = 1;
+
+ return register_cpu(&cpu_devices[num].cpu, num);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+int enable_cpu_hotplug = 1;
+
+void arch_unregister_cpu(int num)
+{
+	unregister_cpu(&cpu_devices[num].cpu);
+}
+EXPORT_SYMBOL(arch_register_cpu);
+EXPORT_SYMBOL(arch_unregister_cpu);
+#endif /*CONFIG_HOTPLUG_CPU*/
+
+static int __init topology_init(void)
+{
+ int i;
+
+#ifdef CONFIG_NUMA
+ for_each_online_node(i)
+ register_one_node(i);
+#endif /* CONFIG_NUMA */
+
+ for_each_present_cpu(i)
+ arch_register_cpu(i);
+ return 0;
+}
+
+subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
new file mode 100644
index 000000000000..f62815f8d06a
--- /dev/null
+++ b/arch/x86/kernel/trampoline_32.S
@@ -0,0 +1,85 @@
+/*
+ *
+ * Trampoline.S Derived from Setup.S by Linus Torvalds
+ *
+ * 4 Jan 1997 Michael Chastain: changed to gnu as.
+ *
+ * This is only used for booting secondary CPUs in an SMP machine.
+ *
+ * Entry: CS:IP point to the start of our code; we are
+ * in real mode with no stack, but with the rest of the
+ * trampoline page available to make our stack. Everything else
+ * is a mystery.
+ *
+ * In fact we don't actually need a stack so we don't
+ * set one up.
+ *
+ * We jump into the boot/compressed/head.S code. So you'd
+ * better be running a compressed kernel image or you
+ * won't get very far.
+ *
+ * On entry to trampoline_data, the processor is in real mode
+ * with 16-bit addressing and 16-bit data. CS has some value
+ * and IP is zero. Thus, data addresses need to be absolute
+ * (no relocation) and are taken with regard to r_base.
+ *
+ * If you work on this file, check the object module with
+ * objdump --reloc to make sure there are no relocation
+ * entries except for:
+ *
+ * TYPE VALUE
+ * R_386_32 startup_32_smp
+ * R_386_32 boot_gdt
+ */
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+
+.data
+
+/* We can free up trampoline after bootup if cpu hotplug is not supported. */
+#ifndef CONFIG_HOTPLUG_CPU
+.section ".init.data","aw",@progbits
+#endif
+
+.code16
+
+ENTRY(trampoline_data)
+r_base = .
+ wbinvd # Needed for NUMA-Q should be harmless for others
+ mov %cs, %ax # Code and data in the same place
+ mov %ax, %ds
+
+ cli # We should be safe anyway
+
+ movl $0xA5A5A5A5, trampoline_data - r_base
+ # write marker for master knows we're running
+
+ /* GDT tables in non default location kernel can be beyond 16MB and
+ * lgdt will not be able to load the address as in real mode default
+ * operand size is 16bit. Use lgdtl instead to force operand size
+ * to 32 bit.
+ */
+
+ lidtl boot_idt_descr - r_base # load idt with 0, 0
+ lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate
+
+ xor %ax, %ax
+ inc %ax # protected mode (PE) bit
+ lmsw %ax # into protected mode
+ # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
+ ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
+
+ # These need to be in the same 64K segment as the above;
+ # hence we don't use the boot_gdt_descr defined in head.S
+boot_gdt_descr:
+ .word __BOOT_DS + 7 # gdt limit
+ .long boot_gdt - __PAGE_OFFSET # gdt base
+
+boot_idt_descr:
+ .word 0 # idt limit = 0
+ .long 0 # idt base = 0L
+
+.globl trampoline_end
+trampoline_end:
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
new file mode 100644
index 000000000000..47b0bef335bd
--- /dev/null
+++ b/arch/x86/kernel/traps_32.c
@@ -0,0 +1,1250 @@
+/*
+ * linux/arch/i386/traps.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * 'Traps.c' handles hardware traps and faults after we have saved some
+ * state in 'asm.s'.
+ */
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/highmem.h>
+#include <linux/kallsyms.h>
+#include <linux/ptrace.h>
+#include <linux/utsname.h>
+#include <linux/kprobes.h>
+#include <linux/kexec.h>
+#include <linux/unwind.h>
+#include <linux/uaccess.h>
+#include <linux/nmi.h>
+#include <linux/bug.h>
+
+#ifdef CONFIG_EISA
+#include <linux/ioport.h>
+#include <linux/eisa.h>
+#endif
+
+#ifdef CONFIG_MCA
+#include <linux/mca.h>
+#endif
+
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/atomic.h>
+#include <asm/debugreg.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/nmi.h>
+#include <asm/unwind.h>
+#include <asm/smp.h>
+#include <asm/arch_hooks.h>
+#include <linux/kdebug.h>
+#include <asm/stacktrace.h>
+
+#include <linux/module.h>
+
+#include "mach_traps.h"
+
+int panic_on_unrecovered_nmi;
+
+asmlinkage int system_call(void);
+
+/* Do we ignore FPU interrupts? */
+char ignore_fpu_irq = 0;
+
+/*
+ * The IDT has to be page-aligned to simplify the Pentium
+ * F0 0F bug workaround.. We have a special link segment
+ * for this.
+ */
+struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
+
+asmlinkage void divide_error(void);
+asmlinkage void debug(void);
+asmlinkage void nmi(void);
+asmlinkage void int3(void);
+asmlinkage void overflow(void);
+asmlinkage void bounds(void);
+asmlinkage void invalid_op(void);
+asmlinkage void device_not_available(void);
+asmlinkage void coprocessor_segment_overrun(void);
+asmlinkage void invalid_TSS(void);
+asmlinkage void segment_not_present(void);
+asmlinkage void stack_segment(void);
+asmlinkage void general_protection(void);
+asmlinkage void page_fault(void);
+asmlinkage void coprocessor_error(void);
+asmlinkage void simd_coprocessor_error(void);
+asmlinkage void alignment_check(void);
+asmlinkage void spurious_interrupt_bug(void);
+asmlinkage void machine_check(void);
+
+int kstack_depth_to_print = 24;
+static unsigned int code_bytes = 64;
+
+static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
+{
+ return p > (void *)tinfo &&
+ p <= (void *)tinfo + THREAD_SIZE - size;
+}
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+ struct stack_frame *next_frame;
+ unsigned long return_address;
+};
+
+static inline unsigned long print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long ebp,
+ struct stacktrace_ops *ops, void *data)
+{
+#ifdef CONFIG_FRAME_POINTER
+ struct stack_frame *frame = (struct stack_frame *)ebp;
+ while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
+ struct stack_frame *next;
+ unsigned long addr;
+
+ addr = frame->return_address;
+ ops->address(data, addr);
+ /*
+ * break out of recursive entries (such as
+ * end_of_stack_stop_unwind_function). Also,
+ * we can never allow a frame pointer to
+ * move downwards!
+ */
+ next = frame->next_frame;
+ if (next <= frame)
+ break;
+ frame = next;
+ }
+#else
+ while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
+ unsigned long addr;
+
+ addr = *stack++;
+ if (__kernel_text_address(addr))
+ ops->address(data, addr);
+ }
+#endif
+ return ebp;
+}
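The CONFIG_FRAME_POINTER branch above walks a chain in which each frame begins with the saved frame pointer followed by the return address, refusing to ever move downward. The same walk can be sketched in user space (compile with -fno-omit-frame-pointer; this is inherently best-effort and x86-specific):

#include <stdio.h>

/* Same shape as the kernel's struct stack_frame above. */
struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

static void backtrace_here(void)
{
	struct stack_frame *frame = __builtin_frame_address(0);
	int depth;

	for (depth = 0; frame && depth < 8; depth++) {
		struct stack_frame *next = frame->next_frame;

		printf("frame %d: return address %#lx\n",
		       depth, frame->return_address);
		if (next <= frame)	/* never allow the chain to move downward */
			break;
		frame = next;
	}
}

static void leaf(void) { backtrace_here(); }

int main(void)
{
	leaf();
	return 0;
}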
+
+#define MSG(msg) ops->warning(data, msg)
+
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack,
+ struct stacktrace_ops *ops, void *data)
+{
+ unsigned long ebp = 0;
+
+ if (!task)
+ task = current;
+
+ if (!stack) {
+ unsigned long dummy;
+ stack = &dummy;
+ if (task != current)
+ stack = (unsigned long *)task->thread.esp;
+ }
+
+#ifdef CONFIG_FRAME_POINTER
+ if (!ebp) {
+ if (task == current) {
+ /* Grab ebp right from our regs */
+ asm ("movl %%ebp, %0" : "=r" (ebp) : );
+ } else {
+ /* ebp is the last reg pushed by switch_to */
+ ebp = *(unsigned long *) task->thread.esp;
+ }
+ }
+#endif
+
+ while (1) {
+ struct thread_info *context;
+ context = (struct thread_info *)
+ ((unsigned long)stack & (~(THREAD_SIZE - 1)));
+ ebp = print_context_stack(context, stack, ebp, ops, data);
+ /* Should be after the line below, but somewhere
+ in early boot context comes out corrupted and we
+ can't reference it -AK */
+ if (ops->stack(data, "IRQ") < 0)
+ break;
+ stack = (unsigned long*)context->previous_esp;
+ if (!stack)
+ break;
+ touch_nmi_watchdog();
+ }
+}
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ printk(data);
+ print_symbol(msg, symbol);
+ printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+ printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+ return 0;
+}
+
+/*
+ * Print one address/symbol entry per line.
+ */
+static void print_trace_address(void *data, unsigned long addr)
+{
+ printk("%s [<%08lx>] ", (char *)data, addr);
+ print_symbol("%s\n", addr);
+ touch_nmi_watchdog();
+}
+
+static struct stacktrace_ops print_trace_ops = {
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
+};
+
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long * stack, char *log_lvl)
+{
+ dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
+ printk("%s =======================\n", log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long * stack)
+{
+ show_trace_log_lvl(task, regs, stack, "");
+}
+
+static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *esp, char *log_lvl)
+{
+ unsigned long *stack;
+ int i;
+
+ if (esp == NULL) {
+ if (task)
+ esp = (unsigned long*)task->thread.esp;
+ else
+ esp = (unsigned long *)&esp;
+ }
+
+ stack = esp;
+ for(i = 0; i < kstack_depth_to_print; i++) {
+ if (kstack_end(stack))
+ break;
+ if (i && ((i % 8) == 0))
+ printk("\n%s ", log_lvl);
+ printk("%08lx ", *stack++);
+ }
+ printk("\n%sCall Trace:\n", log_lvl);
+ show_trace_log_lvl(task, regs, esp, log_lvl);
+}
+
+void show_stack(struct task_struct *task, unsigned long *esp)
+{
+ printk(" ");
+ show_stack_log_lvl(task, NULL, esp, "");
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+ unsigned long stack;
+
+ show_trace(current, NULL, &stack);
+}
+
+EXPORT_SYMBOL(dump_stack);
+
+void show_registers(struct pt_regs *regs)
+{
+ int i;
+ int in_kernel = 1;
+ unsigned long esp;
+ unsigned short ss, gs;
+
+ esp = (unsigned long) (&regs->esp);
+ savesegment(ss, ss);
+ savesegment(gs, gs);
+ if (user_mode_vm(regs)) {
+ in_kernel = 0;
+ esp = regs->esp;
+ ss = regs->xss & 0xffff;
+ }
+ print_modules();
+ printk(KERN_EMERG "CPU: %d\n"
+ KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n"
+ KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n",
+ smp_processor_id(), 0xffff & regs->xcs, regs->eip,
+ print_tainted(), regs->eflags, init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
+ printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
+ regs->eax, regs->ebx, regs->ecx, regs->edx);
+ printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
+ regs->esi, regs->edi, regs->ebp, esp);
+ printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n",
+ regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
+ printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
+ TASK_COMM_LEN, current->comm, current->pid,
+ current_thread_info(), current, task_thread_info(current));
+ /*
+ * When in-kernel, we also print out the stack and code at the
+ * time of the fault..
+ */
+ if (in_kernel) {
+ u8 *eip;
+ unsigned int code_prologue = code_bytes * 43 / 64;
+ unsigned int code_len = code_bytes;
+ unsigned char c;
+
+ printk("\n" KERN_EMERG "Stack: ");
+ show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
+
+ printk(KERN_EMERG "Code: ");
+
+ eip = (u8 *)regs->eip - code_prologue;
+ if (eip < (u8 *)PAGE_OFFSET ||
+ probe_kernel_address(eip, c)) {
+ /* try starting at EIP */
+ eip = (u8 *)regs->eip;
+ code_len = code_len - code_prologue + 1;
+ }
+ for (i = 0; i < code_len; i++, eip++) {
+ if (eip < (u8 *)PAGE_OFFSET ||
+ probe_kernel_address(eip, c)) {
+ printk(" Bad EIP value.");
+ break;
+ }
+ if (eip == (u8 *)regs->eip)
+ printk("<%02x> ", c);
+ else
+ printk("%02x ", c);
+ }
+ }
+ printk("\n");
+}
+
+int is_valid_bugaddr(unsigned long eip)
+{
+ unsigned short ud2;
+
+ if (eip < PAGE_OFFSET)
+ return 0;
+ if (probe_kernel_address((unsigned short *)eip, ud2))
+ return 0;
+
+ return ud2 == 0x0b0f;
+}
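The magic constant above is just the two bytes of the ud2 instruction (0x0f 0x0b) read as a little-endian 16-bit load:

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char ud2_bytes[2] = { 0x0f, 0x0b };
	unsigned short v;

	memcpy(&v, ud2_bytes, sizeof(v));
	printf("ud2 as u16 on little-endian x86: %#06x\n", v);	/* 0x0b0f */
	return 0;
}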
+
+/*
+ * This is gone through when something in the kernel has done something bad and
+ * is about to be terminated.
+ */
+void die(const char * str, struct pt_regs * regs, long err)
+{
+ static struct {
+ spinlock_t lock;
+ u32 lock_owner;
+ int lock_owner_depth;
+ } die = {
+ .lock = __SPIN_LOCK_UNLOCKED(die.lock),
+ .lock_owner = -1,
+ .lock_owner_depth = 0
+ };
+ static int die_counter;
+ unsigned long flags;
+
+ oops_enter();
+
+ if (die.lock_owner != raw_smp_processor_id()) {
+ console_verbose();
+ spin_lock_irqsave(&die.lock, flags);
+ die.lock_owner = smp_processor_id();
+ die.lock_owner_depth = 0;
+ bust_spinlocks(1);
+ }
+ else
+ local_save_flags(flags);
+
+ if (++die.lock_owner_depth < 3) {
+ int nl = 0;
+ unsigned long esp;
+ unsigned short ss;
+
+ report_bug(regs->eip, regs);
+
+ printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+ printk(KERN_EMERG "PREEMPT ");
+ nl = 1;
+#endif
+#ifdef CONFIG_SMP
+ if (!nl)
+ printk(KERN_EMERG);
+ printk("SMP ");
+ nl = 1;
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (!nl)
+ printk(KERN_EMERG);
+ printk("DEBUG_PAGEALLOC");
+ nl = 1;
+#endif
+ if (nl)
+ printk("\n");
+ if (notify_die(DIE_OOPS, str, regs, err,
+ current->thread.trap_no, SIGSEGV) !=
+ NOTIFY_STOP) {
+ show_registers(regs);
+ /* Executive summary in case the oops scrolled away */
+ esp = (unsigned long) (&regs->esp);
+ savesegment(ss, ss);
+ if (user_mode(regs)) {
+ esp = regs->esp;
+ ss = regs->xss & 0xffff;
+ }
+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
+ print_symbol("%s", regs->eip);
+ printk(" SS:ESP %04x:%08lx\n", ss, esp);
+ }
+ else
+ regs = NULL;
+ } else
+ printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
+
+ bust_spinlocks(0);
+ die.lock_owner = -1;
+ add_taint(TAINT_DIE);
+ spin_unlock_irqrestore(&die.lock, flags);
+
+ if (!regs)
+ return;
+
+ if (kexec_should_crash(current))
+ crash_kexec(regs);
+
+ if (in_interrupt())
+ panic("Fatal exception in interrupt");
+
+ if (panic_on_oops)
+ panic("Fatal exception");
+
+ oops_exit();
+ do_exit(SIGSEGV);
+}
+
+static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
+{
+ if (!user_mode_vm(regs))
+ die(str, regs, err);
+}
+
+static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
+ struct pt_regs * regs, long error_code,
+ siginfo_t *info)
+{
+ struct task_struct *tsk = current;
+
+ if (regs->eflags & VM_MASK) {
+ if (vm86)
+ goto vm86_trap;
+ goto trap_signal;
+ }
+
+ if (!user_mode(regs))
+ goto kernel_trap;
+
+ trap_signal: {
+ /*
+ * We want error_code and trap_no set for userspace faults and
+ * kernelspace faults which result in die(), but not
+ * kernelspace faults which are fixed up. die() gives the
+ * process no chance to handle the signal and notice the
+ * kernel fault information, so that won't result in polluting
+ * the information about previously queued, but not yet
+ * delivered, faults. See also do_general_protection below.
+ */
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
+
+ if (info)
+ force_sig_info(signr, info, tsk);
+ else
+ force_sig(signr, tsk);
+ return;
+ }
+
+ kernel_trap: {
+ if (!fixup_exception(regs)) {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
+ die(str, regs, error_code);
+ }
+ return;
+ }
+
+ vm86_trap: {
+ int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
+		if (ret)
+			goto trap_signal;
+ return;
+ }
+}
+
+#define DO_ERROR(trapnr, signr, str, name) \
+fastcall void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
+}
+
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
+fastcall void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+ siginfo_t info; \
+ if (irq) \
+ local_irq_enable(); \
+ info.si_signo = signr; \
+ info.si_errno = 0; \
+ info.si_code = sicode; \
+ info.si_addr = (void __user *)siaddr; \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
+}
+
+#define DO_VM86_ERROR(trapnr, signr, str, name) \
+fastcall void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
+}
+
+#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+fastcall void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+ siginfo_t info; \
+ info.si_signo = signr; \
+ info.si_errno = 0; \
+ info.si_code = sicode; \
+ info.si_addr = (void __user *)siaddr; \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
+}
+
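+/*
+ * For illustration: DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
+ * below expands to a fastcall do_invalid_TSS() that runs the DIE_TRAP
+ * notifier chain and, unless that returns NOTIFY_STOP, calls
+ * do_trap(10, SIGSEGV, "invalid TSS", 0, regs, error_code, NULL).
+ */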
+DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
+#ifndef CONFIG_KPROBES
+DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
+#endif
+DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
+DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
+DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
+DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
+
+fastcall void __kprobes do_general_protection(struct pt_regs * regs,
+ long error_code)
+{
+ int cpu = get_cpu();
+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ struct thread_struct *thread = &current->thread;
+
+ /*
+ * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
+ * invalid offset set (the LAZY one) and the faulting thread has
+ * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
+	 * and we set the offset field correctly. Then we let the CPU
+	 * restart the faulting instruction.
+ */
+ if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
+ thread->io_bitmap_ptr) {
+ memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
+ thread->io_bitmap_max);
+ /*
+		 * If the previously set map extended to higher ports
+ * than the current one, pad extra space with 0xff (no access).
+ */
+ if (thread->io_bitmap_max < tss->io_bitmap_max)
+ memset((char *) tss->io_bitmap +
+ thread->io_bitmap_max, 0xff,
+ tss->io_bitmap_max - thread->io_bitmap_max);
+ tss->io_bitmap_max = thread->io_bitmap_max;
+ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
+ tss->io_bitmap_owner = thread;
+ put_cpu();
+ return;
+ }
+ put_cpu();
+
+ if (regs->eflags & VM_MASK)
+ goto gp_in_vm86;
+
+ if (!user_mode(regs))
+ goto gp_in_kernel;
+
+ current->thread.error_code = error_code;
+ current->thread.trap_no = 13;
+ if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
+ printk_ratelimit())
+ printk(KERN_INFO
+ "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
+ current->comm, current->pid,
+ regs->eip, regs->esp, error_code);
+
+ force_sig(SIGSEGV, current);
+ return;
+
+gp_in_vm86:
+ local_irq_enable();
+ handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+ return;
+
+gp_in_kernel:
+ if (!fixup_exception(regs)) {
+ current->thread.error_code = error_code;
+ current->thread.trap_no = 13;
+ if (notify_die(DIE_GPF, "general protection fault", regs,
+ error_code, 13, SIGSEGV) == NOTIFY_STOP)
+ return;
+ die("general protection fault", regs, error_code);
+ }
+}
+
+static __kprobes void
+mem_parity_error(unsigned char reason, struct pt_regs * regs)
+{
+ printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+ "CPU %d.\n", reason, smp_processor_id());
+ printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
+
+#if defined(CONFIG_EDAC)
+	if (edac_handler_set()) {
+ edac_atomic_assert_error();
+ return;
+ }
+#endif
+
+ if (panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+
+ /* Clear and disable the memory parity error line. */
+ clear_mem_error(reason);
+}
+
+static __kprobes void
+io_check_error(unsigned char reason, struct pt_regs * regs)
+{
+ unsigned long i;
+
+ printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
+ show_registers(regs);
+
+ /* Re-enable the IOCK line, wait for a few seconds */
+ reason = (reason & 0xf) | 8;
+ outb(reason, 0x61);
+ i = 2000;
+ while (--i) udelay(1000);
+ reason &= ~8;
+ outb(reason, 0x61);
+}
+
+static __kprobes void
+unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+{
+#ifdef CONFIG_MCA
+	/*
+	 * Might actually be able to figure out what the guilty party is.
+	 */
+	if (MCA_bus) {
+ mca_handle_nmi();
+ return;
+ }
+#endif
+ printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+ "CPU %d.\n", reason, smp_processor_id());
+ printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+ if (panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+}
+
+static DEFINE_SPINLOCK(nmi_print_lock);
+
+void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
+{
+ if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
+ NOTIFY_STOP)
+ return;
+
+ spin_lock(&nmi_print_lock);
+ /*
+	 * We are in trouble anyway, let's at least try
+	 * to get a message out.
+ */
+ bust_spinlocks(1);
+ printk(KERN_EMERG "%s", msg);
+ printk(" on CPU%d, eip %08lx, registers:\n",
+ smp_processor_id(), regs->eip);
+ show_registers(regs);
+ console_silent();
+ spin_unlock(&nmi_print_lock);
+ bust_spinlocks(0);
+
+	/* If we are in the kernel we are probably nested up pretty badly
+	 * and might as well get out now while we still can.
+	 */
+ if (!user_mode_vm(regs)) {
+ current->thread.trap_no = 2;
+ crash_kexec(regs);
+ }
+
+ do_exit(SIGSEGV);
+}
+
+static __kprobes void default_do_nmi(struct pt_regs * regs)
+{
+ unsigned char reason = 0;
+
+ /* Only the BSP gets external NMIs from the system. */
+ if (!smp_processor_id())
+ reason = get_nmi_reason();
+
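+	/* Port 0x61 status: bit 7 = memory parity error, bit 6 = IOCHK */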
+ if (!(reason & 0xc0)) {
+ if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
+ == NOTIFY_STOP)
+ return;
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * Ok, so this is none of the documented NMI sources,
+ * so it must be the NMI watchdog.
+ */
+ if (nmi_watchdog_tick(regs, reason))
+ return;
+ if (!do_nmi_callback(regs, smp_processor_id()))
+#endif
+ unknown_nmi_error(reason, regs);
+
+ return;
+ }
+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ return;
+ if (reason & 0x80)
+ mem_parity_error(reason, regs);
+ if (reason & 0x40)
+ io_check_error(reason, regs);
+ /*
+ * Reassert NMI in case it became active meanwhile
+ * as it's edge-triggered.
+ */
+ reassert_nmi();
+}
+
+static int ignore_nmis;
+
+fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
+{
+ int cpu;
+
+ nmi_enter();
+
+ cpu = smp_processor_id();
+
+ ++nmi_count(cpu);
+
+ if (!ignore_nmis)
+ default_do_nmi(regs);
+
+ nmi_exit();
+}
+
+void stop_nmi(void)
+{
+ acpi_nmi_disable();
+ ignore_nmis++;
+}
+
+void restart_nmi(void)
+{
+ ignore_nmis--;
+ acpi_nmi_enable();
+}
+
+#ifdef CONFIG_KPROBES
+fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
+{
+ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
+ == NOTIFY_STOP)
+ return;
+ /* This is an interrupt gate, because kprobes wants interrupts
+ disabled. Normal trap handlers don't. */
+ restore_interrupts(regs);
+ do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
+}
+#endif
+
+/*
+ * Our handling of the processor debug registers is non-trivial.
+ * We do not clear them on entry and exit from the kernel. Therefore
+ * it is possible to get a watchpoint trap here from inside the kernel.
+ * However, the code in ./ptrace.c has ensured that the user can
+ * only set watchpoints on userspace addresses. Therefore the in-kernel
+ * watchpoint trap can only occur in code which is reading/writing
+ * from user space. Such code must not hold kernel locks (since it
+ * can equally take a page fault), therefore it is safe to call
+ * force_sig_info even though that claims and releases locks.
+ *
+ * Code in ./signal.c ensures that the debug control register
+ * is restored before we deliver any signal, and therefore that
+ * user code runs with the correct debug control register even though
+ * we clear it here.
+ *
+ * Being careful here means that we don't have to be as careful in a
+ * lot of more complicated places (task switching can be a bit lazy
+ * about restoring all the debug state, and ptrace doesn't have to
+ * find every occurrence of the TF bit that could be saved away even
+ * by user code)
+ */
+fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
+{
+ unsigned int condition;
+ struct task_struct *tsk = current;
+
+ get_debugreg(condition, 6);
+
+ if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
+ SIGTRAP) == NOTIFY_STOP)
+ return;
+ /* It's safe to allow irq's after DR6 has been saved */
+ if (regs->eflags & X86_EFLAGS_IF)
+ local_irq_enable();
+
+ /* Mask out spurious debug traps due to lazy DR7 setting */
+ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
+ if (!tsk->thread.debugreg[7])
+ goto clear_dr7;
+ }
+
+ if (regs->eflags & VM_MASK)
+ goto debug_vm86;
+
+ /* Save debug status register where ptrace can see it */
+ tsk->thread.debugreg[6] = condition;
+
+ /*
+ * Single-stepping through TF: make sure we ignore any events in
+ * kernel space (but re-enable TF when returning to user mode).
+ */
+ if (condition & DR_STEP) {
+ /*
+ * We already checked v86 mode above, so we can
+ * check for kernel mode by just checking the CPL
+ * of CS.
+ */
+ if (!user_mode(regs))
+ goto clear_TF_reenable;
+ }
+
+ /* Ok, finally something we can handle */
+ send_sigtrap(tsk, regs, error_code);
+
+ /* Disable additional traps. They'll be re-enabled when
+ * the signal is delivered.
+ */
+clear_dr7:
+ set_debugreg(0, 7);
+ return;
+
+debug_vm86:
+ handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
+ return;
+
+clear_TF_reenable:
+ set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+ regs->eflags &= ~TF_MASK;
+ return;
+}
+
+/*
+ * Note that we play around with the 'TS' bit in an attempt to get
+ * the correct behaviour even in the presence of the asynchronous
+ * IRQ13 behaviour
+ */
+void math_error(void __user *eip)
+{
+ struct task_struct * task;
+ siginfo_t info;
+ unsigned short cwd, swd;
+
+ /*
+ * Save the info for the exception handler and clear the error.
+ */
+ task = current;
+ save_init_fpu(task);
+ task->thread.trap_no = 16;
+ task->thread.error_code = 0;
+ info.si_signo = SIGFPE;
+ info.si_errno = 0;
+ info.si_code = __SI_FAULT;
+ info.si_addr = eip;
+ /*
+ * (~cwd & swd) will mask out exceptions that are not set to unmasked
+ * status. 0x3f is the exception bits in these regs, 0x200 is the
+ * C1 reg you need in case of a stack fault, 0x040 is the stack
+ * fault bit. We should only be taking one exception at a time,
+ * so if this combination doesn't produce any single exception,
+	 * then we have a bad program that isn't synchronizing its FPU usage
+ * and it will suffer the consequences since we won't be able to
+ * fully reproduce the context of the exception
+ */
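+	/*
+	 * Example (hypothetical values): cwd = 0x037b unmasks only the
+	 * zero-divide exception (ZM, bit 2, cleared); with the ZE status
+	 * bit set in swd, swd & ~cwd & 0x3f = 0x004 and the switch below
+	 * picks FPE_FLTDIV.
+	 */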
+ cwd = get_fpu_cwd(task);
+ swd = get_fpu_swd(task);
+ switch (swd & ~cwd & 0x3f) {
+ case 0x000: /* No unmasked exception */
+ return;
+ default: /* Multiple exceptions */
+ break;
+ case 0x001: /* Invalid Op */
+ /*
+ * swd & 0x240 == 0x040: Stack Underflow
+ * swd & 0x240 == 0x240: Stack Overflow
+ * User must clear the SF bit (0x40) if set
+ */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
+ }
+ force_sig_info(SIGFPE, &info, task);
+}
+
+fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
+{
+ ignore_fpu_irq = 1;
+ math_error((void __user *)regs->eip);
+}
+
+static void simd_math_error(void __user *eip)
+{
+ struct task_struct * task;
+ siginfo_t info;
+ unsigned short mxcsr;
+
+ /*
+ * Save the info for the exception handler and clear the error.
+ */
+ task = current;
+ save_init_fpu(task);
+ task->thread.trap_no = 19;
+ task->thread.error_code = 0;
+ info.si_signo = SIGFPE;
+ info.si_errno = 0;
+ info.si_code = __SI_FAULT;
+ info.si_addr = eip;
+ /*
+ * The SIMD FPU exceptions are handled a little differently, as there
+ * is only a single status/control register. Thus, to determine which
+ * unmasked exception was caught we must mask the exception mask bits
+ * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+ */
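+	/*
+	 * Example (hypothetical value): mxcsr = 0x1d84 leaves only the
+	 * zero-divide exception unmasked and has its status flag set. The
+	 * mask field (mxcsr & 0x1f80) >> 7 is 0x3b, the status bits are
+	 * 0x04, and ~0x3b & 0x04 = 0x04, so the switch below picks
+	 * FPE_FLTDIV.
+	 */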
+ mxcsr = get_fpu_mxcsr(task);
+ switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
+ case 0x000:
+ default:
+ break;
+ case 0x001: /* Invalid Op */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
+ }
+ force_sig_info(SIGFPE, &info, task);
+}
+
+fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
+ long error_code)
+{
+ if (cpu_has_xmm) {
+ /* Handle SIMD FPU exceptions on PIII+ processors. */
+ ignore_fpu_irq = 1;
+ simd_math_error((void __user *)regs->eip);
+ } else {
+ /*
+ * Handle strange cache flush from user space exception
+ * in all other cases. This is undocumented behaviour.
+ */
+ if (regs->eflags & VM_MASK) {
+ handle_vm86_fault((struct kernel_vm86_regs *)regs,
+ error_code);
+ return;
+ }
+ current->thread.trap_no = 19;
+ current->thread.error_code = error_code;
+ die_if_kernel("cache flush denied", regs, error_code);
+ force_sig(SIGSEGV, current);
+ }
+}
+
+fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
+ long error_code)
+{
+#if 0
+ /* No need to warn about this any longer. */
+ printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
+#endif
+}
+
+fastcall unsigned long patch_espfix_desc(unsigned long uesp,
+ unsigned long kesp)
+{
+ struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
+ unsigned long base = (kesp - uesp) & -THREAD_SIZE;
+ unsigned long new_kesp = kesp - base;
+ unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
+ __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
+ /* Set up base for espfix segment */
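+	/*
+	 * Standard x86 descriptor layout: base[23:0] occupies bits 39:16
+	 * and base[31:24] bits 63:56; limit[15:0] sits in bits 15:0 and
+	 * limit[19:16] in bits 51:48. The mask below keeps only the type
+	 * and flag bits of the existing ESPFIX descriptor.
+	 */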
+ desc &= 0x00f0ff0000000000ULL;
+ desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
+ ((((__u64)base) << 32) & 0xff00000000000000ULL) |
+ ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
+ (lim_pages & 0xffff);
+ *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
+ return new_kesp;
+}
+
+/*
+ * 'math_state_restore()' saves the current math information in the
+ * old math state array, and gets the new ones from the current task
+ *
+ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
+ * Don't touch unless you *really* know how it works.
+ *
+ * Must be called with kernel preemption disabled (in this case,
+ * local interrupts are disabled at the call-site in entry.S).
+ */
+asmlinkage void math_state_restore(void)
+{
+ struct thread_info *thread = current_thread_info();
+ struct task_struct *tsk = thread->task;
+
+ clts(); /* Allow maths ops (or we recurse) */
+ if (!tsk_used_math(tsk))
+ init_fpu(tsk);
+ restore_fpu(tsk);
+ thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
+ tsk->fpu_counter++;
+}
+EXPORT_SYMBOL_GPL(math_state_restore);
+
+#ifndef CONFIG_MATH_EMULATION
+
+asmlinkage void math_emulate(long arg)
+{
+ printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
+ printk(KERN_EMERG "killing %s.\n",current->comm);
+ force_sig(SIGFPE,current);
+ schedule();
+}
+
+#endif /* CONFIG_MATH_EMULATION */
+
+#ifdef CONFIG_X86_F00F_BUG
+void __init trap_init_f00f_bug(void)
+{
+ __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
+
+ /*
+ * Update the IDT descriptor and reload the IDT so that
+ * it uses the read-only mapped virtual address.
+ */
+ idt_descr.address = fix_to_virt(FIX_F00F_IDT);
+ load_idt(&idt_descr);
+}
+#endif
+
+/*
+ * This needs to use 'idt_table' rather than 'idt', and
+ * thus use the _nonmapped_ version of the IDT, as the
+ * Pentium F0 0F bugfix can have resulted in the mapped
+ * IDT being write-protected.
+ */
+void set_intr_gate(unsigned int n, void *addr)
+{
+ _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
+}
+
+/*
+ * This routine sets up an interrupt gate at descriptor privilege level 3.
+ */
+static inline void set_system_intr_gate(unsigned int n, void *addr)
+{
+ _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
+}
+
+static void __init set_trap_gate(unsigned int n, void *addr)
+{
+ _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
+}
+
+static void __init set_system_gate(unsigned int n, void *addr)
+{
+ _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
+}
+
+static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
+{
+ _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
+}
+
+
+void __init trap_init(void)
+{
+#ifdef CONFIG_EISA
+ void __iomem *p = ioremap(0x0FFFD9, 4);
+ if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
+ EISA_bus = 1;
+ }
+ iounmap(p);
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ init_apic_mappings();
+#endif
+
+ set_trap_gate(0,&divide_error);
+ set_intr_gate(1,&debug);
+ set_intr_gate(2,&nmi);
+ set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
+ set_system_gate(4,&overflow);
+ set_trap_gate(5,&bounds);
+ set_trap_gate(6,&invalid_op);
+ set_trap_gate(7,&device_not_available);
+ set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
+ set_trap_gate(9,&coprocessor_segment_overrun);
+ set_trap_gate(10,&invalid_TSS);
+ set_trap_gate(11,&segment_not_present);
+ set_trap_gate(12,&stack_segment);
+ set_trap_gate(13,&general_protection);
+ set_intr_gate(14,&page_fault);
+ set_trap_gate(15,&spurious_interrupt_bug);
+ set_trap_gate(16,&coprocessor_error);
+ set_trap_gate(17,&alignment_check);
+#ifdef CONFIG_X86_MCE
+ set_trap_gate(18,&machine_check);
+#endif
+ set_trap_gate(19,&simd_coprocessor_error);
+
+ if (cpu_has_fxsr) {
+ /*
+ * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
+ * Generates a compile-time "error: zero width for bit-field" if
+ * the alignment is wrong.
+ */
+ struct fxsrAlignAssert {
+ int _:!(offsetof(struct task_struct,
+ thread.i387.fxsave) & 15);
+ };
+
+ printk(KERN_INFO "Enabling fast FPU save and restore... ");
+ set_in_cr4(X86_CR4_OSFXSR);
+ printk("done.\n");
+ }
+ if (cpu_has_xmm) {
+ printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
+ "support... ");
+ set_in_cr4(X86_CR4_OSXMMEXCPT);
+ printk("done.\n");
+ }
+
+ set_system_gate(SYSCALL_VECTOR,&system_call);
+
+ /*
+ * Should be a barrier for any external CPU state.
+ */
+ cpu_init();
+
+ trap_init_hook();
+}
+
+static int __init kstack_setup(char *s)
+{
+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+ return 1;
+}
+__setup("kstack=", kstack_setup);
+
+static int __init code_bytes_setup(char *s)
+{
+ code_bytes = simple_strtoul(s, NULL, 0);
+ if (code_bytes > 8192)
+ code_bytes = 8192;
+
+ return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
new file mode 100644
index 000000000000..a39280b4dd3a
--- /dev/null
+++ b/arch/x86/kernel/tsc_32.c
@@ -0,0 +1,413 @@
+/*
+ * This code largely moved from arch/i386/kernel/timer/timer_tsc.c
+ * which was originally moved from arch/i386/kernel/time.c.
+ * See comments there for proper credits.
+ */
+
+#include <linux/sched.h>
+#include <linux/clocksource.h>
+#include <linux/workqueue.h>
+#include <linux/cpufreq.h>
+#include <linux/jiffies.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+
+#include <asm/delay.h>
+#include <asm/tsc.h>
+#include <asm/io.h>
+#include <asm/timer.h>
+
+#include "mach_timer.h"
+
+static int tsc_enabled;
+
+/*
+ * On some systems the TSC frequency does not change with the CPU
+ * frequency, so we need an extra value to store the TSC frequency.
+ */
+unsigned int tsc_khz;
+EXPORT_SYMBOL_GPL(tsc_khz);
+
+int tsc_disable;
+
+#ifdef CONFIG_X86_TSC
+static int __init tsc_setup(char *str)
+{
+ printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
+ "cannot disable TSC.\n");
+ return 1;
+}
+#else
+/*
+ * Disable flag for the TSC. Takes effect by clearing the TSC cpu flag
+ * in cpu/common.c.
+ */
+static int __init tsc_setup(char *str)
+{
+ tsc_disable = 1;
+
+ return 1;
+}
+#endif
+
+__setup("notsc", tsc_setup);
+
+/*
+ * code to mark and check if the TSC is unstable
+ * due to cpufreq or due to unsynced TSCs
+ */
+static int tsc_unstable;
+
+int check_tsc_unstable(void)
+{
+ return tsc_unstable;
+}
+EXPORT_SYMBOL_GPL(check_tsc_unstable);
+
+/* Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ * ns = cycles / (freq / ns_per_sec)
+ * ns = cycles * (ns_per_sec / freq)
+ * ns = cycles * (10^9 / (cpu_khz * 10^3))
+ * ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ * ns = cycles * (10^6 * SC / cpu_khz) / SC
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift.
+ *
+ * We can use a khz divisor instead of mhz to keep better precision, since
+ * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ * -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+unsigned long cyc2ns_scale __read_mostly;
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static inline void set_cyc2ns_scale(unsigned long cpu_khz)
+{
+ cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
+}
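+
+/*
+ * Worked example (hypothetical figures): for a 2 GHz CPU, cpu_khz is
+ * 2000000, so cyc2ns_scale = (1000000 << 10) / 2000000 = 512, and
+ * cycles_2_ns() then yields cycles * 512 >> 10 = cycles / 2, i.e.
+ * 0.5 ns per cycle as expected.
+ */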
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ */
+unsigned long long native_sched_clock(void)
+{
+ unsigned long long this_offset;
+
+ /*
+ * Fall back to jiffies if there's no TSC available:
+ * ( But note that we still use it if the TSC is marked
+ * unstable. We do this because unlike Time Of Day,
+ * the scheduler clock tolerates small errors and it's
+ * very important for it to be as fast as the platform
+ *   can achieve it. )
+ */
+ if (unlikely(!tsc_enabled && !tsc_unstable))
+ /* No locking but a rare wrong value is not a big deal: */
+ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+
+ /* read the Time Stamp Counter: */
+ rdtscll(this_offset);
+
+ /* return the value in ns */
+ return cycles_2_ns(this_offset);
+}
+
+/* We need to define a real function for sched_clock, to override the
+ weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+ return paravirt_sched_clock();
+}
+#else
+unsigned long long sched_clock(void)
+ __attribute__((alias("native_sched_clock")));
+#endif
+
+unsigned long native_calculate_cpu_khz(void)
+{
+ unsigned long long start, end;
+ unsigned long count;
+ u64 delta64;
+ int i;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ /* run 3 times to ensure the cache is warm */
+ for (i = 0; i < 3; i++) {
+ mach_prepare_counter();
+ rdtscll(start);
+ mach_countup(&count);
+ rdtscll(end);
+ }
+ /*
+ * Error: ECTCNEVERSET
+ * The CTC wasn't reliable: we got a hit on the very first read,
+ * or the CPU was so fast/slow that the quotient wouldn't fit in
+ * 32 bits..
+ */
+ if (count <= 1)
+ goto err;
+
+ delta64 = end - start;
+
+ /* cpu freq too fast: */
+ if (delta64 > (1ULL<<32))
+ goto err;
+
+ /* cpu freq too slow: */
+ if (delta64 <= CALIBRATE_TIME_MSEC)
+ goto err;
+
+ delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
+ do_div(delta64,CALIBRATE_TIME_MSEC);
+
+ local_irq_restore(flags);
+ return (unsigned long)delta64;
+err:
+ local_irq_restore(flags);
+ return 0;
+}
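+
+/*
+ * Example of the arithmetic above (hypothetical figures, assuming the
+ * usual 30 ms CALIBRATE_TIME_MSEC window from mach_timer.h): a 2 GHz
+ * CPU accumulates about 60,000,000 TSC cycles during calibration, and
+ * 60000000 / 30 = 2000000, i.e. cpu_khz.
+ */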
+
+int recalibrate_cpu_khz(void)
+{
+#ifndef CONFIG_SMP
+ unsigned long cpu_khz_old = cpu_khz;
+
+ if (cpu_has_tsc) {
+ cpu_khz = calculate_cpu_khz();
+ tsc_khz = cpu_khz;
+ cpu_data[0].loops_per_jiffy =
+ cpufreq_scale(cpu_data[0].loops_per_jiffy,
+ cpu_khz_old, cpu_khz);
+ return 0;
+ } else
+ return -ENODEV;
+#else
+ return -ENODEV;
+#endif
+}
+
+EXPORT_SYMBOL(recalibrate_cpu_khz);
+
+#ifdef CONFIG_CPU_FREQ
+
+/*
+ * if the CPU frequency is scaled, TSC-based delays will need a different
+ * loops_per_jiffy value to function properly.
+ */
+static unsigned int ref_freq;
+static unsigned long loops_per_jiffy_ref;
+static unsigned long cpu_khz_ref;
+
+static int
+time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = data;
+
+ if (!ref_freq) {
+ if (!freq->old){
+ ref_freq = freq->new;
+ return 0;
+ }
+ ref_freq = freq->old;
+ loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
+ cpu_khz_ref = cpu_khz;
+ }
+
+ if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
+ (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+ (val == CPUFREQ_RESUMECHANGE)) {
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+ cpu_data[freq->cpu].loops_per_jiffy =
+ cpufreq_scale(loops_per_jiffy_ref,
+ ref_freq, freq->new);
+
+ if (cpu_khz) {
+
+ if (num_online_cpus() == 1)
+ cpu_khz = cpufreq_scale(cpu_khz_ref,
+ ref_freq, freq->new);
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
+ tsc_khz = cpu_khz;
+ set_cyc2ns_scale(cpu_khz);
+ /*
+ * TSC based sched_clock turns
+ * to junk w/ cpufreq
+ */
+ mark_tsc_unstable("cpufreq changes");
+ }
+ }
+ }
+
+ return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+ .notifier_call = time_cpufreq_notifier
+};
+
+static int __init cpufreq_tsc(void)
+{
+ return cpufreq_register_notifier(&time_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+}
+core_initcall(cpufreq_tsc);
+
+#endif
+
+/* clock source code */
+
+static unsigned long current_tsc_khz;
+
+static cycle_t read_tsc(void)
+{
+ cycle_t ret;
+
+ rdtscll(ret);
+
+ return ret;
+}
+
+static struct clocksource clocksource_tsc = {
+ .name = "tsc",
+ .rating = 300,
+ .read = read_tsc,
+ .mask = CLOCKSOURCE_MASK(64),
+ .mult = 0, /* to be set */
+ .shift = 22,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS |
+ CLOCK_SOURCE_MUST_VERIFY,
+};
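+
+/*
+ * The clocksource core computes ns = (cycles * mult) >> shift. With
+ * shift = 22 and a hypothetical 2000000 kHz TSC, clocksource_khz2mult()
+ * in tsc_init() yields mult = (10^6 << 22) / 2000000 = 0x200000, i.e.
+ * 0.5 ns per cycle.
+ */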
+
+void mark_tsc_unstable(char *reason)
+{
+ if (!tsc_unstable) {
+ tsc_unstable = 1;
+ tsc_enabled = 0;
+ printk("Marking TSC unstable due to: %s.\n", reason);
+ /* Can be called before registration */
+ if (clocksource_tsc.mult)
+ clocksource_change_rating(&clocksource_tsc, 0);
+ else
+ clocksource_tsc.rating = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d)
+{
+ printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
+ d->ident);
+ tsc_unstable = 1;
+ return 0;
+}
+
+/* List of systems that have known TSC problems */
+static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
+ {
+ .callback = dmi_mark_tsc_unstable,
+ .ident = "IBM Thinkpad 380XD",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
+ },
+ },
+ {}
+};
+
+/*
+ * Make an educated guess if the TSC is trustworthy and synchronized
+ * over all CPUs.
+ */
+__cpuinit int unsynchronized_tsc(void)
+{
+ if (!cpu_has_tsc || tsc_unstable)
+ return 1;
+ /*
+ * Intel systems are normally all synchronized.
+ * Exceptions must mark TSC as unstable:
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ /* assume multi socket systems are not synchronized: */
+ if (num_possible_cpus() > 1)
+ tsc_unstable = 1;
+ }
+ return tsc_unstable;
+}
+
+/*
+ * Geode_LX - the OLPC CPU has a possibly very reliable TSC
+ */
+#ifdef CONFIG_MGEODE_LX
+/* RTSC counts during suspend */
+#define RTSC_SUSP 0x100
+
+static void __init check_geode_tsc_reliable(void)
+{
+ unsigned long val;
+
+ rdmsrl(MSR_GEODE_BUSCONT_CONF0, val);
+ if ((val & RTSC_SUSP))
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+#else
+static inline void check_geode_tsc_reliable(void) { }
+#endif
+
+
+void __init tsc_init(void)
+{
+ if (!cpu_has_tsc || tsc_disable)
+ goto out_no_tsc;
+
+ cpu_khz = calculate_cpu_khz();
+ tsc_khz = cpu_khz;
+
+ if (!cpu_khz)
+ goto out_no_tsc;
+
+ printk("Detected %lu.%03lu MHz processor.\n",
+ (unsigned long)cpu_khz / 1000,
+ (unsigned long)cpu_khz % 1000);
+
+ set_cyc2ns_scale(cpu_khz);
+ use_tsc_delay();
+
+ /* Check and install the TSC clocksource */
+ dmi_check_system(bad_tsc_dmi_table);
+
+ unsynchronized_tsc();
+ check_geode_tsc_reliable();
+ current_tsc_khz = tsc_khz;
+ clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
+ clocksource_tsc.shift);
+	/* lower the rating if we already know it's unstable: */
+ if (check_tsc_unstable()) {
+ clocksource_tsc.rating = 0;
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
+ } else
+ tsc_enabled = 1;
+
+ clocksource_register(&clocksource_tsc);
+
+ return;
+
+out_no_tsc:
+ /*
+ * Set the tsc_disable flag if there's no TSC support, this
+ * makes it a fast flag for the kernel to see whether it
+ * should be using the TSC.
+ */
+ tsc_disable = 1;
+}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
new file mode 100644
index 000000000000..12424629af87
--- /dev/null
+++ b/arch/x86/kernel/tsc_sync.c
@@ -0,0 +1 @@
+#include "../../x86_64/kernel/tsc_sync.c"
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
new file mode 100644
index 000000000000..f2dcd1d27c0a
--- /dev/null
+++ b/arch/x86/kernel/vm86_32.c
@@ -0,0 +1,843 @@
+/*
+ * linux/kernel/vm86.c
+ *
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * 29 dec 2001 - Fixed oopses caused by unchecked access to the vm86
+ * stack - Manfred Spraul <manfred@colorfullife.com>
+ *
+ * 22 mar 2002 - Manfred detected the stackfaults, but didn't handle
+ * them correctly. Now the emulation will be in a
+ * consistent state after stackfaults - Kasper Dupont
+ * <kasperd@daimi.au.dk>
+ *
+ * 22 mar 2002 - Added missing clear_IF in set_vflags_* Kasper Dupont
+ * <kasperd@daimi.au.dk>
+ *
+ * ?? ??? 2002 - Fixed premature returns from handle_vm86_fault
+ * caused by Kasper Dupont's changes - Stas Sergeev
+ *
+ * 4 apr 2002 - Fixed CHECK_IF_IN_TRAP broken by Stas' changes.
+ * Kasper Dupont <kasperd@daimi.au.dk>
+ *
+ * 9 apr 2002 - Changed syntax of macros in handle_vm86_fault.
+ * Kasper Dupont <kasperd@daimi.au.dk>
+ *
+ * 9 apr 2002 - Changed stack access macros to jump to a label
+ * instead of returning to userspace. This simplifies
+ *                  do_int, and is needed by handle_vm86_fault. Kasper
+ * Dupont <kasperd@daimi.au.dk>
+ *
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/ptrace.h>
+#include <linux/audit.h>
+#include <linux/stddef.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/tlbflush.h>
+#include <asm/irq.h>
+
+/*
+ * Known problems:
+ *
+ * Interrupt handling is not guaranteed:
+ * - a real x86 will disable all interrupts for one instruction
+ * after a "mov ss,xx" to make stack handling atomic even without
+ * the 'lss' instruction. We can't guarantee this in v86 mode,
+ * as the next instruction might result in a page fault or similar.
+ * - a real x86 will have interrupts disabled for one instruction
+ * past the 'sti' that enables them. We don't bother with all the
+ * details yet.
+ *
+ * Let's hope these problems do not actually matter for anything.
+ */
+
+
+#define KVM86 ((struct kernel_vm86_struct *)regs)
+#define VMPI KVM86->vm86plus
+
+
+/*
+ * 8- and 16-bit register defines..
+ */
+#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0])
+#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1])
+#define IP(regs) (*(unsigned short *)&((regs)->pt.eip))
+#define SP(regs) (*(unsigned short *)&((regs)->pt.esp))
+
+/*
+ * virtual flags (16 and 32-bit versions)
+ */
+#define VFLAGS (*(unsigned short *)&(current->thread.v86flags))
+#define VEFLAGS (current->thread.v86flags)
+
+#define set_flags(X,new,mask) \
+((X) = ((X) & ~(mask)) | ((new) & (mask)))
+
+#define SAFE_MASK (0xDD5)
+#define RETURN_MASK (0xDFF)
+
+/* convert kernel_vm86_regs to vm86_regs */
+static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
+ const struct kernel_vm86_regs *regs)
+{
+ int ret = 0;
+
+ /* kernel_vm86_regs is missing xgs, so copy everything up to
+ (but not including) orig_eax, and then rest including orig_eax. */
+ ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax));
+ ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
+ sizeof(struct kernel_vm86_regs) -
+ offsetof(struct kernel_vm86_regs, pt.orig_eax));
+
+ return ret;
+}
+
+/* convert vm86_regs to kernel_vm86_regs */
+static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
+ const struct vm86_regs __user *user,
+ unsigned extra)
+{
+ int ret = 0;
+
+ /* copy eax-xfs inclusive */
+ ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax));
+ /* copy orig_eax-__gsh+extra */
+ ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
+ sizeof(struct kernel_vm86_regs) -
+ offsetof(struct kernel_vm86_regs, pt.orig_eax) +
+ extra);
+ return ret;
+}
+
+struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
+struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
+{
+ struct tss_struct *tss;
+ struct pt_regs *ret;
+ unsigned long tmp;
+
+ /*
+ * This gets called from entry.S with interrupts disabled, but
+ * from process context. Enable interrupts here, before trying
+ * to access user space.
+ */
+ local_irq_enable();
+
+ if (!current->thread.vm86_info) {
+ printk("no vm86_info: BAD\n");
+ do_exit(SIGSEGV);
+ }
+ set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
+ tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
+ tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
+ if (tmp) {
+ printk("vm86: could not access userspace vm86_info\n");
+ do_exit(SIGSEGV);
+ }
+
+ tss = &per_cpu(init_tss, get_cpu());
+ current->thread.esp0 = current->thread.saved_esp0;
+ current->thread.sysenter_cs = __KERNEL_CS;
+ load_esp0(tss, &current->thread);
+ current->thread.saved_esp0 = 0;
+ put_cpu();
+
+ ret = KVM86->regs32;
+
+ ret->xfs = current->thread.saved_fs;
+ loadsegment(gs, current->thread.saved_gs);
+
+ return ret;
+}
+
+static void mark_screen_rdonly(struct mm_struct *mm)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ spinlock_t *ptl;
+ int i;
+
+ pgd = pgd_offset(mm, 0xA0000);
+ if (pgd_none_or_clear_bad(pgd))
+ goto out;
+ pud = pud_offset(pgd, 0xA0000);
+ if (pud_none_or_clear_bad(pud))
+ goto out;
+ pmd = pmd_offset(pud, 0xA0000);
+ if (pmd_none_or_clear_bad(pmd))
+ goto out;
+ pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
+ for (i = 0; i < 32; i++) {
+ if (pte_present(*pte))
+ set_pte(pte, pte_wrprotect(*pte));
+ pte++;
+ }
+ pte_unmap_unlock(pte, ptl);
+out:
+ flush_tlb();
+}
+
+
+
+static int do_vm86_irq_handling(int subfunction, int irqnumber);
+static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
+
+asmlinkage int sys_vm86old(struct pt_regs regs)
+{
+ struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx;
+ struct kernel_vm86_struct info; /* declare this _on top_,
+					 * this avoids wasting stack space.
+ * This remains on the stack until we
+ * return to 32 bit user space.
+ */
+ struct task_struct *tsk;
+ int tmp, ret = -EPERM;
+
+ tsk = current;
+ if (tsk->thread.saved_esp0)
+ goto out;
+ tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
+ offsetof(struct kernel_vm86_struct, vm86plus) -
+ sizeof(info.regs));
+ ret = -EFAULT;
+ if (tmp)
+ goto out;
+ memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
+ info.regs32 = &regs;
+ tsk->thread.vm86_info = v86;
+ do_sys_vm86(&info, tsk);
+ ret = 0; /* we never return here */
+out:
+ return ret;
+}
+
+
+asmlinkage int sys_vm86(struct pt_regs regs)
+{
+ struct kernel_vm86_struct info; /* declare this _on top_,
+					 * this avoids wasting stack space.
+ * This remains on the stack until we
+ * return to 32 bit user space.
+ */
+ struct task_struct *tsk;
+ int tmp, ret;
+ struct vm86plus_struct __user *v86;
+
+ tsk = current;
+ switch (regs.ebx) {
+ case VM86_REQUEST_IRQ:
+ case VM86_FREE_IRQ:
+ case VM86_GET_IRQ_BITS:
+ case VM86_GET_AND_RESET_IRQ:
+ ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx);
+ goto out;
+ case VM86_PLUS_INSTALL_CHECK:
+ /* NOTE: on old vm86 stuff this will return the error
+ from access_ok(), because the subfunction is
+ interpreted as (invalid) address to vm86_struct.
+ So the installation check works.
+ */
+ ret = 0;
+ goto out;
+ }
+
+ /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
+ ret = -EPERM;
+ if (tsk->thread.saved_esp0)
+ goto out;
+ v86 = (struct vm86plus_struct __user *)regs.ecx;
+ tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
+ offsetof(struct kernel_vm86_struct, regs32) -
+ sizeof(info.regs));
+ ret = -EFAULT;
+ if (tmp)
+ goto out;
+ info.regs32 = &regs;
+ info.vm86plus.is_vm86pus = 1;
+ tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
+ do_sys_vm86(&info, tsk);
+ ret = 0; /* we never return here */
+out:
+ return ret;
+}
+
+
+static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
+{
+ struct tss_struct *tss;
+/*
+ * make sure the vm86() system call doesn't try to do anything silly
+ */
+ info->regs.pt.xds = 0;
+ info->regs.pt.xes = 0;
+ info->regs.pt.xfs = 0;
+
+/* we are clearing gs later just before "jmp resume_userspace",
+ * because it is not saved/restored.
+ */
+
+/*
+ * The eflags register is also special: we cannot trust that the user
+ * has set it up safely, so this makes sure interrupt etc flags are
+ * inherited from protected mode.
+ */
+ VEFLAGS = info->regs.pt.eflags;
+ info->regs.pt.eflags &= SAFE_MASK;
+ info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
+ info->regs.pt.eflags |= VM_MASK;
+
+ switch (info->cpu_type) {
+ case CPU_286:
+ tsk->thread.v86mask = 0;
+ break;
+ case CPU_386:
+ tsk->thread.v86mask = NT_MASK | IOPL_MASK;
+ break;
+ case CPU_486:
+ tsk->thread.v86mask = AC_MASK | NT_MASK | IOPL_MASK;
+ break;
+ default:
+ tsk->thread.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK;
+ break;
+ }
+
+/*
+ * Save old state, set default return value (%eax) to 0
+ */
+ info->regs32->eax = 0;
+ tsk->thread.saved_esp0 = tsk->thread.esp0;
+ tsk->thread.saved_fs = info->regs32->xfs;
+ savesegment(gs, tsk->thread.saved_gs);
+
+ tss = &per_cpu(init_tss, get_cpu());
+ tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
+ if (cpu_has_sep)
+ tsk->thread.sysenter_cs = 0;
+ load_esp0(tss, &tsk->thread);
+ put_cpu();
+
+ tsk->thread.screen_bitmap = info->screen_bitmap;
+ if (info->flags & VM86_SCREEN_BITMAP)
+ mark_screen_rdonly(tsk->mm);
+
+ /*call audit_syscall_exit since we do not exit via the normal paths */
+ if (unlikely(current->audit_context))
+ audit_syscall_exit(AUDITSC_RESULT(0), 0);
+
+ __asm__ __volatile__(
+ "movl %0,%%esp\n\t"
+ "movl %1,%%ebp\n\t"
+ "mov %2, %%gs\n\t"
+ "jmp resume_userspace"
+ : /* no outputs */
+ :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
+ /* we never return here */
+}
+
+static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval)
+{
+ struct pt_regs * regs32;
+
+ regs32 = save_v86_state(regs16);
+ regs32->eax = retval;
+ __asm__ __volatile__("movl %0,%%esp\n\t"
+ "movl %1,%%ebp\n\t"
+ "jmp resume_userspace"
+ : : "r" (regs32), "r" (current_thread_info()));
+}
+
+static inline void set_IF(struct kernel_vm86_regs * regs)
+{
+ VEFLAGS |= VIF_MASK;
+ if (VEFLAGS & VIP_MASK)
+ return_to_32bit(regs, VM86_STI);
+}
+
+static inline void clear_IF(struct kernel_vm86_regs * regs)
+{
+ VEFLAGS &= ~VIF_MASK;
+}
+
+static inline void clear_TF(struct kernel_vm86_regs * regs)
+{
+ regs->pt.eflags &= ~TF_MASK;
+}
+
+static inline void clear_AC(struct kernel_vm86_regs * regs)
+{
+ regs->pt.eflags &= ~AC_MASK;
+}
+
+/* It is correct to call set_IF(regs) from the set_vflags_*
+ * functions. However someone forgot to call clear_IF(regs)
+ * in the opposite case.
+ * After the command sequence CLI PUSHF STI POPF you should
+ * end up with interrupts disabled, but you ended up with
+ * interrupts enabled.
+ * ( I was testing my own changes, but the only bug I
+ * could find was in a function I had not changed. )
+ * [KD]
+ */
+
+static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
+{
+ set_flags(VEFLAGS, eflags, current->thread.v86mask);
+ set_flags(regs->pt.eflags, eflags, SAFE_MASK);
+ if (eflags & IF_MASK)
+ set_IF(regs);
+ else
+ clear_IF(regs);
+}
+
+static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
+{
+ set_flags(VFLAGS, flags, current->thread.v86mask);
+ set_flags(regs->pt.eflags, flags, SAFE_MASK);
+ if (flags & IF_MASK)
+ set_IF(regs);
+ else
+ clear_IF(regs);
+}
+
+static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
+{
+ unsigned long flags = regs->pt.eflags & RETURN_MASK;
+
+ if (VEFLAGS & VIF_MASK)
+ flags |= IF_MASK;
+ flags |= IOPL_MASK;
+ return flags | (VEFLAGS & current->thread.v86mask);
+}
+
+static inline int is_revectored(int nr, struct revectored_struct * bitmap)
+{
+ __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
+ :"=r" (nr)
+ :"m" (*bitmap),"r" (nr));
+ return nr;
+}
+
+#define val_byte(val, n) (((__u8 *)&val)[n])
+
+#define pushb(base, ptr, val, err_label) \
+ do { \
+ __u8 __val = val; \
+ ptr--; \
+ if (put_user(__val, base + ptr) < 0) \
+ goto err_label; \
+ } while(0)
+
+#define pushw(base, ptr, val, err_label) \
+ do { \
+ __u16 __val = val; \
+ ptr--; \
+ if (put_user(val_byte(__val, 1), base + ptr) < 0) \
+ goto err_label; \
+ ptr--; \
+ if (put_user(val_byte(__val, 0), base + ptr) < 0) \
+ goto err_label; \
+ } while(0)
+
+#define pushl(base, ptr, val, err_label) \
+ do { \
+ __u32 __val = val; \
+ ptr--; \
+ if (put_user(val_byte(__val, 3), base + ptr) < 0) \
+ goto err_label; \
+ ptr--; \
+ if (put_user(val_byte(__val, 2), base + ptr) < 0) \
+ goto err_label; \
+ ptr--; \
+ if (put_user(val_byte(__val, 1), base + ptr) < 0) \
+ goto err_label; \
+ ptr--; \
+ if (put_user(val_byte(__val, 0), base + ptr) < 0) \
+ goto err_label; \
+ } while(0)
+
+#define popb(base, ptr, err_label) \
+ ({ \
+ __u8 __res; \
+ if (get_user(__res, base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ __res; \
+ })
+
+#define popw(base, ptr, err_label) \
+ ({ \
+ __u16 __res; \
+ if (get_user(val_byte(__res, 0), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ if (get_user(val_byte(__res, 1), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ __res; \
+ })
+
+#define popl(base, ptr, err_label) \
+ ({ \
+ __u32 __res; \
+ if (get_user(val_byte(__res, 0), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ if (get_user(val_byte(__res, 1), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ if (get_user(val_byte(__res, 2), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ if (get_user(val_byte(__res, 3), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ __res; \
+ })
+
+/* There are so many possible reasons for this function to return
+ * VM86_INTx, so adding another doesn't bother me. We can expect
+ * userspace programs to be able to handle it. (Getting a problem
+ * in userspace is always better than an Oops anyway.) [KD]
+ */
+static void do_int(struct kernel_vm86_regs *regs, int i,
+ unsigned char __user * ssp, unsigned short sp)
+{
+ unsigned long __user *intr_ptr;
+ unsigned long segoffs;
+
+ if (regs->pt.xcs == BIOSSEG)
+ goto cannot_handle;
+ if (is_revectored(i, &KVM86->int_revectored))
+ goto cannot_handle;
+ if (i==0x21 && is_revectored(AH(regs),&KVM86->int21_revectored))
+ goto cannot_handle;
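+	/* Real-mode IVT: vector i is a segment:offset pair at linear address i * 4 */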
+ intr_ptr = (unsigned long __user *) (i << 2);
+ if (get_user(segoffs, intr_ptr))
+ goto cannot_handle;
+ if ((segoffs >> 16) == BIOSSEG)
+ goto cannot_handle;
+ pushw(ssp, sp, get_vflags(regs), cannot_handle);
+ pushw(ssp, sp, regs->pt.xcs, cannot_handle);
+ pushw(ssp, sp, IP(regs), cannot_handle);
+ regs->pt.xcs = segoffs >> 16;
+ SP(regs) -= 6;
+ IP(regs) = segoffs & 0xffff;
+ clear_TF(regs);
+ clear_IF(regs);
+ clear_AC(regs);
+ return;
+
+cannot_handle:
+ return_to_32bit(regs, VM86_INTx + (i << 8));
+}
+
+int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno)
+{
+ if (VMPI.is_vm86pus) {
+		if (trapno == 3 || trapno == 1)
+ return_to_32bit(regs, VM86_TRAP + (trapno << 8));
+ do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
+ return 0;
+ }
+	if (trapno != 1)
+		return 1; /* we let this be handled by the calling routine */
+ if (current->ptrace & PT_PTRACED) {
+ unsigned long flags;
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ sigdelset(&current->blocked, SIGTRAP);
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+ }
+ send_sig(SIGTRAP, current, 1);
+ current->thread.trap_no = trapno;
+ current->thread.error_code = error_code;
+ return 0;
+}
+
+void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
+{
+ unsigned char opcode;
+ unsigned char __user *csp;
+ unsigned char __user *ssp;
+ unsigned short ip, sp, orig_flags;
+ int data32, pref_done;
+
+#define CHECK_IF_IN_TRAP \
+ if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
+ newflags |= TF_MASK
+#define VM86_FAULT_RETURN do { \
+ if (VMPI.force_return_for_pic && (VEFLAGS & (IF_MASK | VIF_MASK))) \
+ return_to_32bit(regs, VM86_PICRETURN); \
+ if (orig_flags & TF_MASK) \
+ handle_vm86_trap(regs, 0, 1); \
+ return; } while (0)
+
+ orig_flags = *(unsigned short *)&regs->pt.eflags;
+
+ csp = (unsigned char __user *) (regs->pt.xcs << 4);
+ ssp = (unsigned char __user *) (regs->pt.xss << 4);
+ sp = SP(regs);
+ ip = IP(regs);
+
+ data32 = 0;
+ pref_done = 0;
+ do {
+ switch (opcode = popb(csp, ip, simulate_sigsegv)) {
+ case 0x66: /* 32-bit data */ data32=1; break;
+ case 0x67: /* 32-bit address */ break;
+ case 0x2e: /* CS */ break;
+ case 0x3e: /* DS */ break;
+ case 0x26: /* ES */ break;
+ case 0x36: /* SS */ break;
+ case 0x65: /* GS */ break;
+ case 0x64: /* FS */ break;
+ case 0xf2: /* repnz */ break;
+ case 0xf3: /* rep */ break;
+ default: pref_done = 1;
+ }
+ } while (!pref_done);
+
+ switch (opcode) {
+
+ /* pushf */
+ case 0x9c:
+ if (data32) {
+ pushl(ssp, sp, get_vflags(regs), simulate_sigsegv);
+ SP(regs) -= 4;
+ } else {
+ pushw(ssp, sp, get_vflags(regs), simulate_sigsegv);
+ SP(regs) -= 2;
+ }
+ IP(regs) = ip;
+ VM86_FAULT_RETURN;
+
+ /* popf */
+ case 0x9d:
+ {
+ unsigned long newflags;
+ if (data32) {
+ newflags=popl(ssp, sp, simulate_sigsegv);
+ SP(regs) += 4;
+ } else {
+ newflags = popw(ssp, sp, simulate_sigsegv);
+ SP(regs) += 2;
+ }
+ IP(regs) = ip;
+ CHECK_IF_IN_TRAP;
+ if (data32) {
+ set_vflags_long(newflags, regs);
+ } else {
+ set_vflags_short(newflags, regs);
+ }
+ VM86_FAULT_RETURN;
+ }
+
+ /* int xx */
+ case 0xcd: {
+ int intno=popb(csp, ip, simulate_sigsegv);
+ IP(regs) = ip;
+ if (VMPI.vm86dbg_active) {
+ if ( (1 << (intno &7)) & VMPI.vm86dbg_intxxtab[intno >> 3] )
+ return_to_32bit(regs, VM86_INTx + (intno << 8));
+ }
+ do_int(regs, intno, ssp, sp);
+ return;
+ }
+
+ /* iret */
+ case 0xcf:
+ {
+ unsigned long newip;
+ unsigned long newcs;
+ unsigned long newflags;
+ if (data32) {
+ newip=popl(ssp, sp, simulate_sigsegv);
+ newcs=popl(ssp, sp, simulate_sigsegv);
+ newflags=popl(ssp, sp, simulate_sigsegv);
+ SP(regs) += 12;
+ } else {
+ newip = popw(ssp, sp, simulate_sigsegv);
+ newcs = popw(ssp, sp, simulate_sigsegv);
+ newflags = popw(ssp, sp, simulate_sigsegv);
+ SP(regs) += 6;
+ }
+ IP(regs) = newip;
+ regs->pt.xcs = newcs;
+ CHECK_IF_IN_TRAP;
+ if (data32) {
+ set_vflags_long(newflags, regs);
+ } else {
+ set_vflags_short(newflags, regs);
+ }
+ VM86_FAULT_RETURN;
+ }
+
+ /* cli */
+ case 0xfa:
+ IP(regs) = ip;
+ clear_IF(regs);
+ VM86_FAULT_RETURN;
+
+ /* sti */
+ /*
+ * Damn. This is incorrect: the 'sti' instruction should actually
+ * enable interrupts after the /next/ instruction. Not good.
+ *
+ * Probably needs some horsing around with the TF flag. Aiee..
+ */
+ case 0xfb:
+ IP(regs) = ip;
+ set_IF(regs);
+ VM86_FAULT_RETURN;
+
+ default:
+ return_to_32bit(regs, VM86_UNKNOWN);
+ }
+
+ return;
+
+simulate_sigsegv:
+ /* FIXME: After a long discussion with Stas we finally
+ * agreed, that this is wrong. Here we should
+ * really send a SIGSEGV to the user program.
+ * But how do we create the correct context? We
+ * are inside a general protection fault handler
+	 * and have just returned from a page fault handler.
+ * The correct context for the signal handler
+ * should be a mixture of the two, but how do we
+ * get the information? [KD]
+ */
+ return_to_32bit(regs, VM86_UNKNOWN);
+}
+
+/* ---------------- vm86 special IRQ passing stuff ----------------- */
+
+#define VM86_IRQNAME "vm86irq"
+
+static struct vm86_irqs {
+ struct task_struct *tsk;
+ int sig;
+} vm86_irqs[16];
+
+static DEFINE_SPINLOCK(irqbits_lock);
+static int irqbits;
+
+#define ALLOWED_SIGS ( 1 /* 0 = don't send a signal */ \
+ | (1 << SIGUSR1) | (1 << SIGUSR2) | (1 << SIGIO) | (1 << SIGURG) \
+ | (1 << SIGUNUSED) )
+
+static irqreturn_t irq_handler(int intno, void *dev_id)
+{
+ int irq_bit;
+ unsigned long flags;
+
+ spin_lock_irqsave(&irqbits_lock, flags);
+ irq_bit = 1 << intno;
+ if ((irqbits & irq_bit) || ! vm86_irqs[intno].tsk)
+ goto out;
+ irqbits |= irq_bit;
+ if (vm86_irqs[intno].sig)
+ send_sig(vm86_irqs[intno].sig, vm86_irqs[intno].tsk, 1);
+ /*
+ * IRQ will be re-enabled when user asks for the irq (whether
+ * polling or as a result of the signal)
+ */
+ disable_irq_nosync(intno);
+ spin_unlock_irqrestore(&irqbits_lock, flags);
+ return IRQ_HANDLED;
+
+out:
+ spin_unlock_irqrestore(&irqbits_lock, flags);
+ return IRQ_NONE;
+}
+
+static inline void free_vm86_irq(int irqnumber)
+{
+ unsigned long flags;
+
+ free_irq(irqnumber, NULL);
+ vm86_irqs[irqnumber].tsk = NULL;
+
+ spin_lock_irqsave(&irqbits_lock, flags);
+ irqbits &= ~(1 << irqnumber);
+ spin_unlock_irqrestore(&irqbits_lock, flags);
+}
+
+void release_vm86_irqs(struct task_struct *task)
+{
+ int i;
+ for (i = FIRST_VM86_IRQ ; i <= LAST_VM86_IRQ; i++)
+ if (vm86_irqs[i].tsk == task)
+ free_vm86_irq(i);
+}
+
+static inline int get_and_reset_irq(int irqnumber)
+{
+ int bit;
+ unsigned long flags;
+ int ret = 0;
+
+ if (invalid_vm86_irq(irqnumber)) return 0;
+ if (vm86_irqs[irqnumber].tsk != current) return 0;
+ spin_lock_irqsave(&irqbits_lock, flags);
+ bit = irqbits & (1 << irqnumber);
+ irqbits &= ~bit;
+ if (bit) {
+ enable_irq(irqnumber);
+ ret = 1;
+ }
+
+ spin_unlock_irqrestore(&irqbits_lock, flags);
+ return ret;
+}
+
+
+static int do_vm86_irq_handling(int subfunction, int irqnumber)
+{
+ int ret;
+ switch (subfunction) {
+ case VM86_GET_AND_RESET_IRQ: {
+ return get_and_reset_irq(irqnumber);
+ }
+ case VM86_GET_IRQ_BITS: {
+ return irqbits;
+ }
+ case VM86_REQUEST_IRQ: {
+ int sig = irqnumber >> 8;
+ int irq = irqnumber & 255;
+ if (!capable(CAP_SYS_ADMIN)) return -EPERM;
+ if (!((1 << sig) & ALLOWED_SIGS)) return -EPERM;
+ if (invalid_vm86_irq(irq)) return -EPERM;
+ if (vm86_irqs[irq].tsk) return -EPERM;
+ ret = request_irq(irq, &irq_handler, 0, VM86_IRQNAME, NULL);
+ if (ret) return ret;
+ vm86_irqs[irq].sig = sig;
+ vm86_irqs[irq].tsk = current;
+ return irq;
+ }
+ case VM86_FREE_IRQ: {
+ if (invalid_vm86_irq(irqnumber)) return -EPERM;
+ if (!vm86_irqs[irqnumber].tsk) return 0;
+ if (vm86_irqs[irqnumber].tsk != current) return -EPERM;
+ free_vm86_irq(irqnumber);
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
new file mode 100644
index 000000000000..18673e0f193b
--- /dev/null
+++ b/arch/x86/kernel/vmi_32.c
@@ -0,0 +1,981 @@
+/*
+ * VMI specific paravirt-ops implementation
+ *
+ * Copyright (C) 2005, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to zach@vmware.com
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <asm/vmi.h>
+#include <asm/io.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/processor.h>
+#include <asm/timer.h>
+#include <asm/vmi_time.h>
+#include <asm/kmap_types.h>
+
+/* Convenient for calling VMI functions indirectly in the ROM */
+typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
+typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
+
+#define call_vrom_func(rom,func) \
+ (((VROMFUNC *)(rom->func))())
+
+#define call_vrom_long_func(rom,func,arg) \
+ (((VROMLONGFUNC *)(rom->func)) (arg))
+
+static struct vrom_header *vmi_rom;
+static int disable_pge;
+static int disable_pse;
+static int disable_sep;
+static int disable_tsc;
+static int disable_mtrr;
+static int disable_noidle;
+static int disable_vmi_timer;
+
+/* Cached VMI operations */
+static struct {
+ void (*cpuid)(void /* non-c */);
+ void (*_set_ldt)(u32 selector);
+ void (*set_tr)(u32 selector);
+ void (*set_kernel_stack)(u32 selector, u32 esp0);
+ void (*allocate_page)(u32, u32, u32, u32, u32);
+ void (*release_page)(u32, u32);
+ void (*set_pte)(pte_t, pte_t *, unsigned);
+ void (*update_pte)(pte_t *, unsigned);
+ void (*set_linear_mapping)(int, void *, u32, u32);
+ void (*_flush_tlb)(int);
+ void (*set_initial_ap_state)(int, int);
+ void (*halt)(void);
+ void (*set_lazy_mode)(int mode);
+} vmi_ops;
+
+/* Cached VMI operations */
+struct vmi_timer_ops vmi_timer_ops;
+
+/*
+ * VMI patching routines.
+ */
+#define MNEM_CALL 0xe8
+#define MNEM_JMP 0xe9
+#define MNEM_RET 0xc3
+
+#define IRQ_PATCH_INT_MASK 0
+#define IRQ_PATCH_DISABLE 5
+
+static inline void patch_offset(void *insnbuf,
+ unsigned long eip, unsigned long dest)
+{
+ *(unsigned long *)(insnbuf+1) = dest-eip-5;
+}
+
+static unsigned patch_internal(int call, unsigned len, void *insnbuf,
+ unsigned long eip)
+{
+ u64 reloc;
+ struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
+ reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
+ switch(rel->type) {
+ case VMI_RELOCATION_CALL_REL:
+ BUG_ON(len < 5);
+ *(char *)insnbuf = MNEM_CALL;
+ patch_offset(insnbuf, eip, (unsigned long)rel->eip);
+ return 5;
+
+ case VMI_RELOCATION_JUMP_REL:
+ BUG_ON(len < 5);
+ *(char *)insnbuf = MNEM_JMP;
+ patch_offset(insnbuf, eip, (unsigned long)rel->eip);
+ return 5;
+
+ case VMI_RELOCATION_NOP:
+ /* obliterate the whole thing */
+ return 0;
+
+ case VMI_RELOCATION_NONE:
+ /* leave native code in place */
+ break;
+
+ default:
+ BUG();
+ }
+ return len;
+}
+
+/*
+ * Apply patch if appropriate, return length of new instruction
+ * sequence. The callee does nop padding for us.
+ */
+static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
+ unsigned long eip, unsigned len)
+{
+ switch (type) {
+ case PARAVIRT_PATCH(irq_disable):
+ return patch_internal(VMI_CALL_DisableInterrupts, len,
+ insns, eip);
+ case PARAVIRT_PATCH(irq_enable):
+ return patch_internal(VMI_CALL_EnableInterrupts, len,
+ insns, eip);
+ case PARAVIRT_PATCH(restore_fl):
+ return patch_internal(VMI_CALL_SetInterruptMask, len,
+ insns, eip);
+ case PARAVIRT_PATCH(save_fl):
+ return patch_internal(VMI_CALL_GetInterruptMask, len,
+ insns, eip);
+ case PARAVIRT_PATCH(iret):
+ return patch_internal(VMI_CALL_IRET, len, insns, eip);
+ case PARAVIRT_PATCH(irq_enable_sysexit):
+ return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip);
+ default:
+ break;
+ }
+ return len;
+}
+
+/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
+static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ int override = 0;
+ if (*eax == 1)
+ override = 1;
+ asm volatile ("call *%6"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));
+ if (override) {
+ /* the X86_FEATURE_* macros are bit numbers, not masks */
+ if (disable_pse)
+ *edx &= ~(1 << X86_FEATURE_PSE);
+ if (disable_pge)
+ *edx &= ~(1 << X86_FEATURE_PGE);
+ if (disable_sep)
+ *edx &= ~(1 << X86_FEATURE_SEP);
+ if (disable_tsc)
+ *edx &= ~(1 << X86_FEATURE_TSC);
+ if (disable_mtrr)
+ *edx &= ~(1 << X86_FEATURE_MTRR);
+ }
+}
+
+static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
+{
+ if (gdt[nr].a != new->a || gdt[nr].b != new->b)
+ write_gdt_entry(gdt, nr, new->a, new->b);
+}
+
+static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+ vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
+ vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
+ vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
+}
+
+static void vmi_set_ldt(const void *addr, unsigned entries)
+{
+ unsigned cpu = smp_processor_id();
+ u32 low, high;
+
+ pack_descriptor(&low, &high, (unsigned long)addr,
+ entries * sizeof(struct desc_struct) - 1,
+ DESCTYPE_LDT, 0);
+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
+ vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
+}
+
+static void vmi_set_tr(void)
+{
+ vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
+}
+
+static void vmi_load_esp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ tss->x86_tss.esp0 = thread->esp0;
+
+ /* This can only happen when SEP is enabled, no need to test "SEP"arately */
+ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
+ tss->x86_tss.ss1 = thread->sysenter_cs;
+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+ }
+ vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0);
+}
+
+static void vmi_flush_tlb_user(void)
+{
+ vmi_ops._flush_tlb(VMI_FLUSH_TLB);
+}
+
+static void vmi_flush_tlb_kernel(void)
+{
+ vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
+}
+
+/* Stub to do nothing at all; used for delays and unimplemented calls */
+static void vmi_nop(void)
+{
+}
+
+#ifdef CONFIG_DEBUG_PAGE_TYPE
+
+#ifdef CONFIG_X86_PAE
+#define MAX_BOOT_PTS (2048+4+1)
+#else
+#define MAX_BOOT_PTS (1024+1)
+#endif
+
+/*
+ * During boot, mem_map is not yet available in paging_init, so stash
+ * all the boot page allocations here.
+ */
+static struct {
+ u32 pfn;
+ int type;
+} boot_page_allocations[MAX_BOOT_PTS];
+static int num_boot_page_allocations;
+static int boot_allocations_applied;
+
+void vmi_apply_boot_page_allocations(void)
+{
+ int i;
+ BUG_ON(!mem_map);
+ for (i = 0; i < num_boot_page_allocations; i++) {
+ struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
+ page->type = boot_page_allocations[i].type &
+ ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+ }
+ boot_allocations_applied = 1;
+}
+
+static void record_page_type(u32 pfn, int type)
+{
+ BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
+ boot_page_allocations[num_boot_page_allocations].pfn = pfn;
+ boot_page_allocations[num_boot_page_allocations].type = type;
+ num_boot_page_allocations++;
+}
+
+static void check_zeroed_page(u32 pfn, int type, struct page *page)
+{
+ u32 *ptr;
+ int i;
+ int limit = PAGE_SIZE / sizeof(int);
+
+ if (page_address(page))
+ ptr = (u32 *)page_address(page);
+ else
+ ptr = (u32 *)__va(pfn << PAGE_SHIFT);
+ /*
+ * When cloning the root in non-PAE mode, only the userspace
+ * pdes need to be zeroed.
+ */
+ if (type & VMI_PAGE_CLONE)
+ limit = USER_PTRS_PER_PGD;
+ for (i = 0; i < limit; i++)
+ BUG_ON(ptr[i]);
+}
+
+/*
+ * We stash the page type into struct page so we can verify the page
+ * types are used properly.
+ */
+static void vmi_set_page_type(u32 pfn, int type)
+{
+ /* PAE can have multiple roots per page - don't track */
+ if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
+ return;
+
+ if (boot_allocations_applied) {
+ struct page *page = pfn_to_page(pfn);
+ if (type != VMI_PAGE_NORMAL)
+ BUG_ON(page->type);
+ else
+ BUG_ON(page->type == VMI_PAGE_NORMAL);
+ page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+ if (type & VMI_PAGE_ZEROED)
+ check_zeroed_page(pfn, type, page);
+ } else {
+ record_page_type(pfn, type);
+ }
+}
+
+static void vmi_check_page_type(u32 pfn, int type)
+{
+ /* PAE can have multiple roots per page - skip checks */
+ if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
+ return;
+
+ type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+ if (boot_allocations_applied) {
+ struct page *page = pfn_to_page(pfn);
+ BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
+ BUG_ON(type == VMI_PAGE_NORMAL && page->type);
+ BUG_ON((type & page->type) == 0);
+ }
+}
+#else
+#define vmi_set_page_type(p,t) do { } while (0)
+#define vmi_check_page_type(p,t) do { } while (0)
+#endif
+
+#ifdef CONFIG_HIGHPTE
+static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
+{
+ void *va = kmap_atomic(page, type);
+
+ /*
+ * Internally, the VMI ROM must map virtual addresses to physical
+ * addresses for processing MMU updates. By the time MMU updates
+ * are issued, this information is typically already lost.
+ * Fortunately, the VMI provides a cache of mapping slots for active
+ * page tables.
+ *
+ * We use slot zero for the linear mapping of physical memory, and
+ * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
+ *
+ * args: SLOT VA COUNT PFN
+ */
+ BUG_ON(type != KM_PTE0 && type != KM_PTE1);
+ vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
+
+ return va;
+}
+#endif
+
+static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
+{
+ vmi_set_page_type(pfn, VMI_PAGE_L1);
+ vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
+}
+
+static void vmi_allocate_pd(u32 pfn)
+{
+ /*
+ * This call comes in very early, before mem_map is setup.
+ * It is called only for swapper_pg_dir, which already has
+ * data on it.
+ */
+ vmi_set_page_type(pfn, VMI_PAGE_L2);
+ vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
+}
+
+static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
+{
+ vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
+ vmi_check_page_type(clonepfn, VMI_PAGE_L2);
+ vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
+}
+
+static void vmi_release_pt(u32 pfn)
+{
+ vmi_ops.release_page(pfn, VMI_PAGE_L1);
+ vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
+}
+
+static void vmi_release_pd(u32 pfn)
+{
+ vmi_ops.release_page(pfn, VMI_PAGE_L2);
+ vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
+}
+
+/*
+ * Helper macros for MMU update flags. We can defer updates until a flush
+ * or page invalidation only if the update is to the current address space
+ * (otherwise, there is no flush). We must check against init_mm, since
+ * this could be a kernel update, which usually passes init_mm, although
+ * sometimes this check can be skipped if we know the particular function
+ * is only called on user mode PTEs. We could change the kernel to pass
+ * current->active_mm here, but I was unsure whether changing
+ * mm/highmem.c to do this would still be correct on other architectures.
+ */
+#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
+ (!mustbeuser && (mm) == &init_mm))
+#define vmi_flags_addr(mm, addr, level, user) \
+ ((level) | (is_current_as(mm, user) ? \
+ (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
+#define vmi_flags_addr_defer(mm, addr, level, user) \
+ ((level) | (is_current_as(mm, user) ? \
+ (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
+
+static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+ vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+ vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_set_pte(pte_t *ptep, pte_t pte)
+{
+ /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
+ vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
+ vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
+}
+
+static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+ vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+ vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+#ifdef CONFIG_X86_PAE
+ const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
+ vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
+#else
+ const pte_t pte = { pmdval.pud.pgd.pgd };
+ vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
+#endif
+ vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
+}
+
+#ifdef CONFIG_X86_PAE
+
+static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+ /*
+ * XXX This is called from set_pmd_pte, but at both PT
+ * and PD layers so the VMI_PAGE_PT flag is wrong. But
+ * it is only called for large page mapping changes;
+ * the Xen backend doesn't support large pages, and the
+ * ESX backend doesn't depend on the flag.
+ */
+ set_64bit((unsigned long long *)ptep, pte_val(pteval));
+ vmi_ops.update_pte(ptep, VMI_PAGE_PT);
+}
+
+static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+ vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+ vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
+}
+
+static void vmi_set_pud(pud_t *pudp, pud_t pudval)
+{
+ /* Um, eww */
+ const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
+ vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
+ vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
+}
+
+static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ const pte_t pte = { 0 };
+ vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+ vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_pmd_clear(pmd_t *pmd)
+{
+ const pte_t pte = { 0 };
+ vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
+ vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
+}
+#endif
+
+#ifdef CONFIG_SMP
+static void __devinit
+vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
+ unsigned long start_esp)
+{
+ struct vmi_ap_state ap;
+
+ /* Default everything to zero. This is fine for most GPRs. */
+ memset(&ap, 0, sizeof(struct vmi_ap_state));
+
+ ap.gdtr_limit = GDT_SIZE - 1;
+ ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
+
+ ap.idtr_limit = IDT_ENTRIES * 8 - 1;
+ ap.idtr_base = (unsigned long) idt_table;
+
+ ap.ldtr = 0;
+
+ ap.cs = __KERNEL_CS;
+ ap.eip = (unsigned long) start_eip;
+ ap.ss = __KERNEL_DS;
+ ap.esp = (unsigned long) start_esp;
+
+ ap.ds = __USER_DS;
+ ap.es = __USER_DS;
+ ap.fs = __KERNEL_PERCPU;
+ ap.gs = 0;
+
+ ap.eflags = 0;
+
+#ifdef CONFIG_X86_PAE
+ /* efer should match BSP efer. */
+ if (cpu_has_nx) {
+ unsigned l, h;
+ rdmsr(MSR_EFER, l, h);
+ ap.efer = (unsigned long long) h << 32 | l;
+ }
+#endif
+
+ ap.cr3 = __pa(swapper_pg_dir);
+ /* Protected mode, paging, AM, WP, NE, MP. */
+ ap.cr0 = 0x80050023;
+ ap.cr4 = mmu_cr4_features;
+ vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
+}
+#endif
+
+static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode)
+{
+ static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode);
+
+ if (!vmi_ops.set_lazy_mode)
+ return;
+
+ /* Modes should never nest or overlap */
+ BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE ||
+ mode == PARAVIRT_LAZY_FLUSH));
+
+ if (mode == PARAVIRT_LAZY_FLUSH) {
+ vmi_ops.set_lazy_mode(0);
+ vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode));
+ } else {
+ vmi_ops.set_lazy_mode(mode);
+ __get_cpu_var(lazy_mode) = mode;
+ }
+}
+
+static inline int __init check_vmi_rom(struct vrom_header *rom)
+{
+ struct pci_header *pci;
+ struct pnp_header *pnp;
+ const char *manufacturer = "UNKNOWN";
+ const char *product = "UNKNOWN";
+ const char *license = "unspecified";
+
+ if (rom->rom_signature != 0xaa55)
+ return 0;
+ if (rom->vrom_signature != VMI_SIGNATURE)
+ return 0;
+ if (rom->api_version_maj != VMI_API_REV_MAJOR ||
+ rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
+ printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
+ rom->api_version_maj,
+ rom->api_version_min);
+ return 0;
+ }
+
+ /*
+ * Relying on the VMI_SIGNATURE field is not 100% safe, so check
+ * the PCI header and device type to make sure this is really a
+ * VMI device.
+ */
+ if (!rom->pci_header_offs) {
+ printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
+ return 0;
+ }
+
+ pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
+ if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
+ pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
+ /* Allow it to run anyway, but warn */
+ printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
+ }
+
+ if (rom->pnp_header_offs) {
+ pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
+ if (pnp->manufacturer_offset)
+ manufacturer = (const char *)rom+pnp->manufacturer_offset;
+ if (pnp->product_offset)
+ product = (const char *)rom+pnp->product_offset;
+ }
+
+ if (rom->license_offs)
+ license = (char *)rom+rom->license_offs;
+
+ printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
+ manufacturer, product,
+ rom->api_version_maj, rom->api_version_min,
+ pci->rom_version_maj, pci->rom_version_min);
+
+ /* Don't allow BSD/MIT here for now because we don't want to end up
+ with any binary only shim layers */
+ if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
+ printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
+ license);
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Probe for the VMI option ROM
+ */
+static inline int __init probe_vmi_rom(void)
+{
+ unsigned long base;
+
+ /* VMI ROM is in option ROM area, check signature */
+ for (base = 0xC0000; base < 0xE0000; base += 2048) {
+ struct vrom_header *romstart;
+ romstart = (struct vrom_header *)isa_bus_to_virt(base);
+ if (check_vmi_rom(romstart)) {
+ vmi_rom = romstart;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * VMI setup common to all processors
+ */
+void vmi_bringup(void)
+{
+ /* We must establish the lowmem mapping for MMU ops to work */
+ if (vmi_ops.set_linear_mapping)
+ vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0);
+}
+
+/*
+ * Return a pointer to a VMI function or NULL if unimplemented
+ */
+static void *vmi_get_function(int vmicall)
+{
+ u64 reloc;
+ const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
+ reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
+ BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
+ if (rel->type == VMI_RELOCATION_CALL_REL)
+ return (void *)rel->eip;
+ else
+ return NULL;
+}
+
+/*
+ * Helper macro for making the VMI paravirt-ops fill code readable.
+ * For unimplemented operations, fall back to default, unless nop
+ * is returned by the ROM.
+ */
+#define para_fill(opname, vmicall) \
+do { \
+ reloc = call_vrom_long_func(vmi_rom, get_reloc, \
+ VMI_CALL_##vmicall); \
+ if (rel->type == VMI_RELOCATION_CALL_REL) \
+ paravirt_ops.opname = (void *)rel->eip; \
+ else if (rel->type == VMI_RELOCATION_NOP) \
+ paravirt_ops.opname = (void *)vmi_nop; \
+ else if (rel->type != VMI_RELOCATION_NONE) \
+ printk(KERN_WARNING "VMI: Unknown relocation " \
+ "type %d for " #vmicall"\n",\
+ rel->type); \
+} while (0)
+
+/*
+ * Helper macro for making the VMI paravirt-ops fill code readable.
+ * For cached operations which do not match the VMI ROM ABI and must
+ * go through a translation stub. Ignore NOPs, since it is not clear
+ * that a NOP VMI function corresponds to a NOP paravirt-op when the
+ * functions are not in 1-1 correspondence.
+ */
+#define para_wrap(opname, wrapper, cache, vmicall) \
+do { \
+ reloc = call_vrom_long_func(vmi_rom, get_reloc, \
+ VMI_CALL_##vmicall); \
+ BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
+ if (rel->type == VMI_RELOCATION_CALL_REL) { \
+ paravirt_ops.opname = wrapper; \
+ vmi_ops.cache = (void *)rel->eip; \
+ } \
+} while (0)
+
+/*
+ * Activate the VMI interface and switch into paravirtualized mode
+ */
+static inline int __init activate_vmi(void)
+{
+ short kernel_cs;
+ u64 reloc;
+ const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
+
+ if (call_vrom_func(vmi_rom, vmi_init) != 0) {
+ printk(KERN_ERR "VMI ROM failed to initialize!");
+ return 0;
+ }
+ savesegment(cs, kernel_cs);
+
+ paravirt_ops.paravirt_enabled = 1;
+ paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
+
+ paravirt_ops.patch = vmi_patch;
+ paravirt_ops.name = "vmi";
+
+ /*
+ * Many of these operations are ABI compatible with VMI.
+ * This means we can fill in the paravirt-ops with direct
+ * pointers into the VMI ROM. If the calling convention for
+ * these operations changes, this code needs to be updated.
+ *
+ * Exceptions
+ * CPUID paravirt-op uses pointers, not the native ISA
+ * halt has no VMI equivalent; all VMI halts are "safe"
+ * no MSR support yet - just trap and emulate. VMI uses the
+ * same ABI as the native ISA, but Linux wants exceptions
+ * from bogus MSR read / write handled
+ * rdpmc is not yet used in Linux
+ */
+
+ /* CPUID is special, so very special it gets wrapped like a present */
+ para_wrap(cpuid, vmi_cpuid, cpuid, CPUID);
+
+ para_fill(clts, CLTS);
+ para_fill(get_debugreg, GetDR);
+ para_fill(set_debugreg, SetDR);
+ para_fill(read_cr0, GetCR0);
+ para_fill(read_cr2, GetCR2);
+ para_fill(read_cr3, GetCR3);
+ para_fill(read_cr4, GetCR4);
+ para_fill(write_cr0, SetCR0);
+ para_fill(write_cr2, SetCR2);
+ para_fill(write_cr3, SetCR3);
+ para_fill(write_cr4, SetCR4);
+ para_fill(save_fl, GetInterruptMask);
+ para_fill(restore_fl, SetInterruptMask);
+ para_fill(irq_disable, DisableInterrupts);
+ para_fill(irq_enable, EnableInterrupts);
+
+ para_fill(wbinvd, WBINVD);
+ para_fill(read_tsc, RDTSC);
+
+ /* The following we emulate with trap and emulate for now */
+ /* paravirt_ops.read_msr = vmi_rdmsr */
+ /* paravirt_ops.write_msr = vmi_wrmsr */
+ /* paravirt_ops.rdpmc = vmi_rdpmc */
+
+ /* TR interface doesn't pass TR value, wrap */
+ para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR);
+
+ /* LDT is special, too */
+ para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
+
+ para_fill(load_gdt, SetGDT);
+ para_fill(load_idt, SetIDT);
+ para_fill(store_gdt, GetGDT);
+ para_fill(store_idt, GetIDT);
+ para_fill(store_tr, GetTR);
+ paravirt_ops.load_tls = vmi_load_tls;
+ para_fill(write_ldt_entry, WriteLDTEntry);
+ para_fill(write_gdt_entry, WriteGDTEntry);
+ para_fill(write_idt_entry, WriteIDTEntry);
+ para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
+ para_fill(set_iopl_mask, SetIOPLMask);
+ para_fill(io_delay, IODelay);
+ para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode);
+
+ /* user and kernel flush are just handled with different flags to FlushTLB */
+ para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
+ para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
+ para_fill(flush_tlb_single, InvalPage);
+
+ /*
+ * Until a standard flag format can be agreed on, we need to
+ * implement these as wrappers in Linux. Get the VMI ROM
+ * function pointers for the two backend calls.
+ */
+#ifdef CONFIG_X86_PAE
+ vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
+ vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
+#else
+ vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
+ vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
+#endif
+
+ if (vmi_ops.set_pte) {
+ paravirt_ops.set_pte = vmi_set_pte;
+ paravirt_ops.set_pte_at = vmi_set_pte_at;
+ paravirt_ops.set_pmd = vmi_set_pmd;
+#ifdef CONFIG_X86_PAE
+ paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
+ paravirt_ops.set_pte_present = vmi_set_pte_present;
+ paravirt_ops.set_pud = vmi_set_pud;
+ paravirt_ops.pte_clear = vmi_pte_clear;
+ paravirt_ops.pmd_clear = vmi_pmd_clear;
+#endif
+ }
+
+ if (vmi_ops.update_pte) {
+ paravirt_ops.pte_update = vmi_update_pte;
+ paravirt_ops.pte_update_defer = vmi_update_pte_defer;
+ }
+
+ vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
+ if (vmi_ops.allocate_page) {
+ paravirt_ops.alloc_pt = vmi_allocate_pt;
+ paravirt_ops.alloc_pd = vmi_allocate_pd;
+ paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
+ }
+
+ vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
+ if (vmi_ops.release_page) {
+ paravirt_ops.release_pt = vmi_release_pt;
+ paravirt_ops.release_pd = vmi_release_pd;
+ }
+
+ /* Set linear is needed in all cases */
+ vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
+#ifdef CONFIG_HIGHPTE
+ if (vmi_ops.set_linear_mapping)
+ paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
+#endif
+
+ /*
+ * These MUST always be patched. Don't support indirect jumps
+ * through these operations, as the VMI interface may use either
+ * a jump or a call to get to these operations, depending on
+ * the backend. They are performance critical anyway, so requiring
+ * a patch is not a big problem.
+ */
+ paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
+ paravirt_ops.iret = (void *)0xbadbab0;
+
+#ifdef CONFIG_SMP
+ para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ para_fill(apic_read, APICRead);
+ para_fill(apic_write, APICWrite);
+ para_fill(apic_write_atomic, APICWrite);
+#endif
+
+ /*
+ * Check for VMI timer functionality by probing for a cycle frequency method
+ */
+ reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
+ if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
+ vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
+ vmi_timer_ops.get_cycle_counter =
+ vmi_get_function(VMI_CALL_GetCycleCounter);
+ vmi_timer_ops.get_wallclock =
+ vmi_get_function(VMI_CALL_GetWallclockTime);
+ vmi_timer_ops.wallclock_updated =
+ vmi_get_function(VMI_CALL_WallclockUpdated);
+ vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
+ vmi_timer_ops.cancel_alarm =
+ vmi_get_function(VMI_CALL_CancelAlarm);
+ paravirt_ops.time_init = vmi_time_init;
+ paravirt_ops.get_wallclock = vmi_get_wallclock;
+ paravirt_ops.set_wallclock = vmi_set_wallclock;
+#ifdef CONFIG_X86_LOCAL_APIC
+ paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
+ paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
+#endif
+ paravirt_ops.sched_clock = vmi_sched_clock;
+ paravirt_ops.get_cpu_khz = vmi_cpu_khz;
+
+ /* We have true wallclock functions; disable CMOS clock sync */
+ no_sync_cmos_clock = 1;
+ } else {
+ disable_noidle = 1;
+ disable_vmi_timer = 1;
+ }
+
+ para_fill(safe_halt, Halt);
+
+ /*
+ * Alternative instruction rewriting doesn't happen soon enough
+ * to convert VMI_IRET to a call instead of a jump; so we have
+ * to do this before IRQs get reenabled. Fortunately, it is
+ * idempotent.
+ */
+ apply_paravirt(__parainstructions, __parainstructions_end);
+
+ vmi_bringup();
+
+ return 1;
+}
+
+#undef para_fill
+
+void __init vmi_init(void)
+{
+ unsigned long flags;
+
+ if (!vmi_rom)
+ probe_vmi_rom();
+ else
+ check_vmi_rom(vmi_rom);
+
+ /* In case probing for or validating the ROM failed, bail out */
+ if (!vmi_rom)
+ return;
+
+ reserve_top_address(-vmi_rom->virtual_top);
+
+ local_irq_save(flags);
+ activate_vmi();
+
+#ifdef CONFIG_X86_IO_APIC
+ /* This is virtual hardware; timer routing is wired correctly */
+ no_timer_check = 1;
+#endif
+ local_irq_restore(flags & X86_EFLAGS_IF);
+}
+
+static int __init parse_vmi(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "disable_pge")) {
+ clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ disable_pge = 1;
+ } else if (!strcmp(arg, "disable_pse")) {
+ clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+ disable_pse = 1;
+ } else if (!strcmp(arg, "disable_sep")) {
+ clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
+ disable_sep = 1;
+ } else if (!strcmp(arg, "disable_tsc")) {
+ clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
+ disable_tsc = 1;
+ } else if (!strcmp(arg, "disable_mtrr")) {
+ clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
+ disable_mtrr = 1;
+ } else if (!strcmp(arg, "disable_timer")) {
+ disable_vmi_timer = 1;
+ disable_noidle = 1;
+ } else if (!strcmp(arg, "disable_noidle"))
+ disable_noidle = 1;
+ return 0;
+}
+
+early_param("vmi", parse_vmi);
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
new file mode 100644
index 000000000000..b1b5ab08b26e
--- /dev/null
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -0,0 +1,320 @@
+/*
+ * VMI paravirtual timer support routines.
+ *
+ * Copyright (C) 2007, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/cpumask.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+
+#include <asm/vmi.h>
+#include <asm/vmi_time.h>
+#include <asm/arch_hooks.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/timer.h>
+#include <asm/i8253.h>
+
+#include <irq_vectors.h>
+#include "io_ports.h"
+
+#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
+#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
+
+static DEFINE_PER_CPU(struct clock_event_device, local_events);
+
+static inline u32 vmi_counter(u32 flags)
+{
+ /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
+ * cycle counter. */
+ return flags & VMI_ALARM_COUNTER_MASK;
+}
+
+/* paravirt_ops.get_wallclock = vmi_get_wallclock */
+unsigned long vmi_get_wallclock(void)
+{
+ unsigned long long wallclock;
+ wallclock = vmi_timer_ops.get_wallclock(); /* nsec */
+ (void)do_div(wallclock, 1000000000); /* sec */
+
+ return wallclock;
+}
+
+/* paravirt_ops.set_wallclock = vmi_set_wallclock */
+int vmi_set_wallclock(unsigned long now)
+{
+ return 0;
+}
+
+/* paravirt_ops.sched_clock = vmi_sched_clock */
+unsigned long long vmi_sched_clock(void)
+{
+ return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
+}
+
+/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
+unsigned long vmi_cpu_khz(void)
+{
+ unsigned long long khz;
+ khz = vmi_timer_ops.get_cycle_frequency();
+ (void)do_div(khz, 1000);
+ return khz;
+}
+
+static inline unsigned int vmi_get_timer_vector(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ return FIRST_DEVICE_VECTOR;
+#else
+ return FIRST_EXTERNAL_VECTOR;
+#endif
+}
+
+/** vmi clockchip */
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned int startup_timer_irq(unsigned int irq)
+{
+ unsigned long val = apic_read(APIC_LVTT);
+ apic_write(APIC_LVTT, vmi_get_timer_vector());
+
+ return (val & APIC_SEND_PENDING);
+}
+
+static void mask_timer_irq(unsigned int irq)
+{
+ unsigned long val = apic_read(APIC_LVTT);
+ apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
+}
+
+static void unmask_timer_irq(unsigned int irq)
+{
+ unsigned long val = apic_read(APIC_LVTT);
+ apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
+}
+
+static void ack_timer_irq(unsigned int irq)
+{
+ ack_APIC_irq();
+}
+
+static struct irq_chip vmi_chip __read_mostly = {
+ .name = "VMI-LOCAL",
+ .startup = startup_timer_irq,
+ .mask = mask_timer_irq,
+ .unmask = unmask_timer_irq,
+ .ack = ack_timer_irq
+};
+#endif
+
+/** vmi clockevent */
+#define VMI_ALARM_WIRED_IRQ0 0x00000000
+#define VMI_ALARM_WIRED_LVTT 0x00010000
+static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
+
+static inline int vmi_get_alarm_wiring(void)
+{
+ return vmi_wiring;
+}
+
+static void vmi_timer_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ cycle_t now, cycles_per_hz;
+ BUG_ON(!irqs_disabled());
+
+ switch (mode) {
+ case CLOCK_EVT_MODE_ONESHOT:
+ case CLOCK_EVT_MODE_RESUME:
+ break;
+ case CLOCK_EVT_MODE_PERIODIC:
+ cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
+ (void)do_div(cycles_per_hz, HZ);
+ now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
+ vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
+ break;
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ switch (evt->mode) {
+ case CLOCK_EVT_MODE_ONESHOT:
+ vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
+ break;
+ case CLOCK_EVT_MODE_PERIODIC:
+ vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
+ break;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+static int vmi_timer_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ /* Unfortunately, the set_next_event interface only passes a relative
+ * expiry, but we want an absolute one, since a bunch of time may
+ * have been stolen between the time the delta is computed and
+ * when we set the alarm below. */
+ cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
+
+ BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+ vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
+ return 0;
+}
+
+static struct clock_event_device vmi_clockevent = {
+ .name = "vmi-timer",
+ .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+ .shift = 22,
+ .set_mode = vmi_timer_set_mode,
+ .set_next_event = vmi_timer_next_event,
+ .rating = 1000,
+ .irq = 0,
+};
+
+static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
+{
+ struct clock_event_device *evt = &__get_cpu_var(local_events);
+ evt->event_handler(evt);
+ return IRQ_HANDLED;
+}
+
+static struct irqaction vmi_clock_action = {
+ .name = "vmi-timer",
+ .handler = vmi_timer_interrupt,
+ .flags = IRQF_DISABLED | IRQF_NOBALANCING,
+ .mask = CPU_MASK_ALL,
+};
+
+static void __devinit vmi_time_init_clockevent(void)
+{
+ cycle_t cycles_per_msec;
+ struct clock_event_device *evt;
+
+ int cpu = smp_processor_id();
+ evt = &__get_cpu_var(local_events);
+
+ /* Use cycles_per_msec since div_sc params are 32-bits. */
+ cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
+ (void)do_div(cycles_per_msec, 1000);
+
+ memcpy(evt, &vmi_clockevent, sizeof(*evt));
+ /* Must pick .shift such that .mult fits in 32 bits. Choosing
+ * .shift to be 22 allows up to 2^(32-22) cycles per nanosecond
+ * before .mult overflows. */
+ evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
+ /* Upper bound is clockevent's use of ulong for cycle deltas. */
+ evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
+ evt->min_delta_ns = clockevent_delta2ns(1, evt);
+ evt->cpumask = cpumask_of_cpu(cpu);
+
+ printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
+ evt->name, evt->mult, evt->shift);
+ clockevents_register_device(evt);
+}
+
+void __init vmi_time_init(void)
+{
+ /* Disable the PIT: BIOSes start PIT CH0 with an 18.2 Hz periodic timer. */
+ outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
+
+ vmi_time_init_clockevent();
+ setup_irq(0, &vmi_clock_action);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+void __devinit vmi_time_bsp_init(void)
+{
+ /*
+ * On APIC systems, we want local timers to fire on each cpu. We do
+ * this by programming LVTT to deliver timer events to the IRQ handler
+ * for IRQ-0, since we can't re-use the APIC local timer handler
+ * without interfering with that code.
+ */
+ clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+ local_irq_disable();
+#ifdef CONFIG_X86_SMP
+ /*
+ * XXX handle_percpu_irq only defined for SMP; we need to switch over
+ * to using it, since this is a local interrupt, which each CPU must
+ * handle individually without locking out or dropping simultaneous
+ * local timers on other CPUs. We also don't want to trigger the
+ * quirk workaround code for interrupts which gets invoked from
+ * handle_percpu_irq via eoi, so we use our own IRQ chip.
+ */
+ set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
+#else
+ set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
+#endif
+ vmi_wiring = VMI_ALARM_WIRED_LVTT;
+ apic_write(APIC_LVTT, vmi_get_timer_vector());
+ local_irq_enable();
+ clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+}
+
+void __devinit vmi_time_ap_init(void)
+{
+ vmi_time_init_clockevent();
+ apic_write(APIC_LVTT, vmi_get_timer_vector());
+}
+#endif
+
+/** vmi clocksource */
+
+static cycle_t read_real_cycles(void)
+{
+ return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
+}
+
+static struct clocksource clocksource_vmi = {
+ .name = "vmi-timer",
+ .rating = 450,
+ .read = read_real_cycles,
+ .mask = CLOCKSOURCE_MASK(64),
+ .mult = 0, /* to be set */
+ .shift = 22,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static int __init init_vmi_clocksource(void)
+{
+ cycle_t cycles_per_msec;
+
+ if (!vmi_timer_ops.get_cycle_frequency)
+ return 0;
+ /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
+ cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
+ (void)do_div(cycles_per_msec, 1000);
+
+ /* Note that clocksource.{mult, shift} converts in the opposite
+ * direction from clockevents. */
+ clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
+ clocksource_vmi.shift);
+
+ printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
+ return clocksource_register(&clocksource_vmi);
+
+}
+module_init(init_vmi_clocksource);
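Both the clockevent and the clocksource registered above rely on the kernel's mult/shift fixed-point conversion, with .shift chosen as 22 in each case; the clockevent's mult converts nanoseconds to cycles, while the clocksource's mult converts cycles to nanoseconds. A sketch of the underlying arithmetic, using illustrative helpers rather than the kernel's div_sc()/clocksource_khz2mult():

#include <stdint.h>

#define VMI_SHIFT 22	/* matches the .shift used by both devices above */

/* Clockevent direction: mult is cycles-per-ns in 10.22 fixed point, so
 * an alarm 'ns' nanoseconds out is (ns * mult) >> shift cycles away. */
static uint64_t ns_to_cycles(uint64_t ns, uint32_t mult)
{
	return (ns * mult) >> VMI_SHIFT;
}

/* Clocksource direction: mult is ns-per-cycle in the same format. */
static uint64_t cycles_to_ns(uint64_t cycles, uint32_t mult)
{
	return (cycles * mult) >> VMI_SHIFT;
}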
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
new file mode 100644
index 000000000000..849ee611f013
--- /dev/null
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "vmlinux_32.lds.S"
+#else
+# include "vmlinux_64.lds.S"
+#endif
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
new file mode 100644
index 000000000000..7d72cce00529
--- /dev/null
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -0,0 +1,213 @@
+/* ld script to make i386 Linux kernel
+ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
+ *
+ * Don't define absolute symbols until and unless you know that the
+ * symbol's value should remain constant even if the kernel image is
+ * relocated at run time. Absolute symbols are not relocated. If a
+ * symbol's value should change when the kernel is relocated, make the
+ * symbol section-relative and put it inside the section definition.
+ */
+
+#define LOAD_OFFSET __PAGE_OFFSET
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/thread_info.h>
+#include <asm/page.h>
+#include <asm/cache.h>
+#include <asm/boot.h>
+
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+ENTRY(phys_startup_32)
+jiffies = jiffies_64;
+
+PHDRS {
+ text PT_LOAD FLAGS(5); /* R_E */
+ data PT_LOAD FLAGS(7); /* RWE */
+ note PT_NOTE FLAGS(0); /* ___ */
+}
+SECTIONS
+{
+ . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+ phys_startup_32 = startup_32 - LOAD_OFFSET;
+
+ .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
+ _text = .; /* Text and read-only data */
+ *(.text.head)
+ } :text = 0x9090
+
+ /* read-only */
+ .text : AT(ADDR(.text) - LOAD_OFFSET) {
+ TEXT_TEXT
+ SCHED_TEXT
+ LOCK_TEXT
+ KPROBES_TEXT
+ *(.fixup)
+ *(.gnu.warning)
+ _etext = .; /* End of text section */
+ } :text = 0x9090
+
+ . = ALIGN(16); /* Exception table */
+ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+ __start___ex_table = .;
+ *(__ex_table)
+ __stop___ex_table = .;
+ }
+
+ NOTES :text :note
+
+ BUG_TABLE :text
+
+ . = ALIGN(4);
+ .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
+ __tracedata_start = .;
+ *(.tracedata)
+ __tracedata_end = .;
+ }
+
+ RODATA
+
+ /* writeable */
+ . = ALIGN(4096);
+ .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
+ DATA_DATA
+ CONSTRUCTORS
+ } :data
+
+ . = ALIGN(4096);
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+ __nosave_begin = .;
+ *(.data.nosave)
+ . = ALIGN(4096);
+ __nosave_end = .;
+ }
+
+ . = ALIGN(4096);
+ .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+ *(.data.page_aligned)
+ *(.data.idt)
+ }
+
+ . = ALIGN(32);
+ .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+ *(.data.cacheline_aligned)
+ }
+
+ /* rarely changed data like cpu maps */
+ . = ALIGN(32);
+ .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+ *(.data.read_mostly)
+ _edata = .; /* End of data section */
+ }
+
+ . = ALIGN(THREAD_SIZE); /* init_task */
+ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+ *(.data.init_task)
+ }
+
+ /* might get freed after init */
+ . = ALIGN(4096);
+ .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+ __smp_locks = .;
+ *(.smp_locks)
+ __smp_locks_end = .;
+ }
+ /* will be freed after init
+ * Following ALIGN() is required to make sure no other data falls on the
+ * same page where __smp_alt_end is pointing as that page might be freed
+ * after boot. Always make sure that ALIGN() directive is present after
+ * the section which contains __smp_alt_end.
+ */
+ . = ALIGN(4096);
+
+ /* will be freed after init */
+ . = ALIGN(4096); /* Init code and data */
+ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+ __init_begin = .;
+ _sinittext = .;
+ *(.init.text)
+ _einittext = .;
+ }
+ .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
+ . = ALIGN(16);
+ .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+ __setup_start = .;
+ *(.init.setup)
+ __setup_end = .;
+ }
+ .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+ __initcall_start = .;
+ INITCALLS
+ __initcall_end = .;
+ }
+ .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+ __con_initcall_start = .;
+ *(.con_initcall.init)
+ __con_initcall_end = .;
+ }
+ SECURITY_INIT
+ . = ALIGN(4);
+ .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+ __alt_instructions = .;
+ *(.altinstructions)
+ __alt_instructions_end = .;
+ }
+ .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+ *(.altinstr_replacement)
+ }
+ . = ALIGN(4);
+ .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+ __parainstructions = .;
+ *(.parainstructions)
+ __parainstructions_end = .;
+ }
+ /* .exit.text is discarded at run time, not link time, to deal with
+ references from .altinstructions and .eh_frame */
+ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
+ .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
+#if defined(CONFIG_BLK_DEV_INITRD)
+ . = ALIGN(4096);
+ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+ __initramfs_start = .;
+ *(.init.ramfs)
+ __initramfs_end = .;
+ }
+#endif
+ . = ALIGN(4096);
+ .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
+ __per_cpu_start = .;
+ *(.data.percpu)
+ *(.data.percpu.shared_aligned)
+ __per_cpu_end = .;
+ }
+ . = ALIGN(4096);
+ /* freed after init ends here */
+
+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+ __init_end = .;
+ __bss_start = .; /* BSS */
+ *(.bss.page_aligned)
+ *(.bss)
+ . = ALIGN(4);
+ __bss_stop = .;
+ _end = . ;
+ /* This is where the kernel creates the early boot page tables */
+ . = ALIGN(4096);
+ pg0 = . ;
+ }
+
+ /* Sections to be discarded */
+ /DISCARD/ : {
+ *(.exitcall.exit)
+ }
+
+ STABS_DEBUG
+
+ DWARF_DEBUG
+}
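Each output section above is assigned its load address with AT(ADDR(section) - LOAD_OFFSET): the kernel is linked at virtual addresses above __PAGE_OFFSET but loaded at the matching physical address. A one-function sketch of that translation, assuming the default 3G/1G split (PAGE_OFFSET = 0xC0000000):

/* Illustrative only: the virtual-to-load-address mapping performed by
 * the AT() directives in the linker script above. */
#define PAGE_OFFSET_DEFAULT 0xC0000000UL

static inline unsigned long virt_to_load_addr(unsigned long vaddr)
{
	return vaddr - PAGE_OFFSET_DEFAULT;	/* 0xC0100000 -> 0x00100000 */
}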
diff --git a/arch/x86/kernel/vsyscall-int80_32.S b/arch/x86/kernel/vsyscall-int80_32.S
new file mode 100644
index 000000000000..103cab6aa7c0
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-int80_32.S
@@ -0,0 +1,53 @@
+/*
+ * Code for the vsyscall page. This version uses the old int $0x80 method.
+ *
+ * NOTE:
+ * 1) __kernel_vsyscall _must_ be first in this page.
+ * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
+ * for details.
+ */
+
+ .text
+ .globl __kernel_vsyscall
+ .type __kernel_vsyscall,@function
+__kernel_vsyscall:
+.LSTART_vsyscall:
+ int $0x80
+ ret
+.LEND_vsyscall:
+ .size __kernel_vsyscall,.-.LSTART_vsyscall
+ .previous
+
+ .section .eh_frame,"a",@progbits
+.LSTARTFRAMEDLSI:
+ .long .LENDCIEDLSI-.LSTARTCIEDLSI
+.LSTARTCIEDLSI:
+ .long 0 /* CIE ID */
+ .byte 1 /* Version number */
+ .string "zR" /* NUL-terminated augmentation string */
+ .uleb128 1 /* Code alignment factor */
+ .sleb128 -4 /* Data alignment factor */
+ .byte 8 /* Return address register column */
+ .uleb128 1 /* Augmentation value length */
+ .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
+ .byte 0x0c /* DW_CFA_def_cfa */
+ .uleb128 4
+ .uleb128 4
+ .byte 0x88 /* DW_CFA_offset, column 0x8 */
+ .uleb128 1
+ .align 4
+.LENDCIEDLSI:
+ .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
+.LSTARTFDEDLSI:
+ .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
+ .long .LSTART_vsyscall-. /* PC-relative start address */
+ .long .LEND_vsyscall-.LSTART_vsyscall
+ .uleb128 0
+ .align 4
+.LENDFDEDLSI:
+ .previous
+
+/*
+ * Get the common code for the sigreturn entry points.
+ */
+#include "vsyscall-sigreturn_32.S"
diff --git a/arch/x86/kernel/vsyscall-note_32.S b/arch/x86/kernel/vsyscall-note_32.S
new file mode 100644
index 000000000000..fcf376a37f79
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-note_32.S
@@ -0,0 +1,45 @@
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+/* Ideally this would use UTS_NAME, but using a quoted string here
+ doesn't work. Remember to change this when changing the
+ kernel's name. */
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
+
+#ifdef CONFIG_XEN
+/*
+ * Add a special note telling glibc's dynamic linker a fake hardware
+ * flavor that it will use to choose the search path for libraries in the
+ * same way it uses real hardware capabilities like "mmx".
+ * We supply "nosegneg" as the fake capability, to indicate that we
+ * do not like negative offsets in instructions using segment overrides,
+ * since we implement those inefficiently. This makes it possible to
+ * install libraries optimized to avoid those access patterns in someplace
+ * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file
+ * corresponding to the bits here is needed to make ldconfig work right.
+ * It should contain:
+ * hwcap 1 nosegneg
+ * to match the mapping of bit to name that we give here.
+ *
+ * At runtime, the fake hardware feature will be considered to be present
+ * if its bit is set in the mask word. So, we start with the mask 0, and
+ * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
+ */
+
+#include "../../x86/xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
+
+ .globl VDSO_NOTE_MASK
+ELFNOTE_START(GNU, 2, "a")
+ .long 1 /* ncaps */
+VDSO_NOTE_MASK:
+ .long 0 /* mask */
+ .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
+ELFNOTE_END
+#endif
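The note emitted above follows the standard ELF note layout: 4-byte name size, descriptor size, and type, then the padded name and descriptor. A hedged C view of the resulting bytes, with an illustrative struct name; NT_GNU_HWCAP is note type 2:

#include <stdint.h>

/* Sketch of the "nosegneg" note produced by ELFNOTE_START(GNU, 2, "a").
 * The dynamic linker treats fake capability bits set in 'mask' as
 * present hardware capabilities. */
struct gnu_hwcap_note {
	uint32_t namesz;	/* 4: strlen("GNU") + NUL */
	uint32_t descsz;	/* size of the fields below */
	uint32_t type;		/* 2: NT_GNU_HWCAP */
	char     name[4];	/* "GNU\0" */
	uint32_t ncaps;		/* 1 */
	uint32_t mask;		/* VDSO_NOTE_MASK, set at boot under Xen */
	/* followed by one (uint8_t bit, "nosegneg\0") pair */
};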
diff --git a/arch/x86/kernel/vsyscall-sigreturn_32.S b/arch/x86/kernel/vsyscall-sigreturn_32.S
new file mode 100644
index 000000000000..a92262f41659
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-sigreturn_32.S
@@ -0,0 +1,143 @@
+/*
+ * Common code for the sigreturn entry points on the vsyscall page.
+ * So far this code is the same for both int80 and sysenter versions.
+ * This file is #include'd by vsyscall-*.S to define them after the
+ * vsyscall entry point. The kernel assumes that the addresses of these
+ * routines are constant for all vsyscall implementations.
+ */
+
+#include <asm/unistd.h>
+#include <asm/asm-offsets.h>
+
+
+/* XXX
+ Should these be named "_sigtramp" or something?
+*/
+
+ .text
+ .org __kernel_vsyscall+32,0x90
+ .globl __kernel_sigreturn
+ .type __kernel_sigreturn,@function
+__kernel_sigreturn:
+.LSTART_sigreturn:
+ popl %eax /* XXX does this mean it needs unwind info? */
+ movl $__NR_sigreturn, %eax
+ int $0x80
+.LEND_sigreturn:
+ .size __kernel_sigreturn,.-.LSTART_sigreturn
+
+ .balign 32
+ .globl __kernel_rt_sigreturn
+ .type __kernel_rt_sigreturn,@function
+__kernel_rt_sigreturn:
+.LSTART_rt_sigreturn:
+ movl $__NR_rt_sigreturn, %eax
+ int $0x80
+.LEND_rt_sigreturn:
+ .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
+ .balign 32
+ .previous
+
+ .section .eh_frame,"a",@progbits
+.LSTARTFRAMEDLSI1:
+ .long .LENDCIEDLSI1-.LSTARTCIEDLSI1
+.LSTARTCIEDLSI1:
+ .long 0 /* CIE ID */
+ .byte 1 /* Version number */
+ .string "zRS" /* NUL-terminated augmentation string */
+ .uleb128 1 /* Code alignment factor */
+ .sleb128 -4 /* Data alignment factor */
+ .byte 8 /* Return address register column */
+ .uleb128 1 /* Augmentation value length */
+ .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
+ .byte 0 /* DW_CFA_nop */
+ .align 4
+.LENDCIEDLSI1:
+ .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
+.LSTARTFDEDLSI1:
+ .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
+ /* HACK: The dwarf2 unwind routines will subtract 1 from the
+ return address to get an address in the middle of the
+ presumed call instruction. Since we didn't get here via
+ a call, we need to include the nop before the real start
+ to make up for it. */
+ .long .LSTART_sigreturn-1-. /* PC-relative start address */
+ .long .LEND_sigreturn-.LSTART_sigreturn+1
+ .uleb128 0 /* Augmentation */
+ /* What follows are the instructions for the table generation.
+ We record the locations of each register saved. This is
+ complicated by the fact that the "CFA" is always assumed to
+ be the value of the stack pointer in the caller. This means
+ that we must define the CFA of this body of code to be the
+ saved value of the stack pointer in the sigcontext. Which
+ also means that there is no fixed relation to the other
+ saved registers, which means that we must use DW_CFA_expression
+ to compute their addresses. It also means that when we
+ adjust the stack with the popl, we have to do it all over again. */
+
+#define do_cfa_expr(offset) \
+ .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
+ .uleb128 1f-0f; /* length */ \
+0: .byte 0x74; /* DW_OP_breg4 */ \
+ .sleb128 offset; /* offset */ \
+ .byte 0x06; /* DW_OP_deref */ \
+1:
+
+#define do_expr(regno, offset) \
+ .byte 0x10; /* DW_CFA_expression */ \
+ .uleb128 regno; /* regno */ \
+ .uleb128 1f-0f; /* length */ \
+0: .byte 0x74; /* DW_OP_breg4 */ \
+ .sleb128 offset; /* offset */ \
+1:
+
+ do_cfa_expr(SIGCONTEXT_esp+4)
+ do_expr(0, SIGCONTEXT_eax+4)
+ do_expr(1, SIGCONTEXT_ecx+4)
+ do_expr(2, SIGCONTEXT_edx+4)
+ do_expr(3, SIGCONTEXT_ebx+4)
+ do_expr(5, SIGCONTEXT_ebp+4)
+ do_expr(6, SIGCONTEXT_esi+4)
+ do_expr(7, SIGCONTEXT_edi+4)
+ do_expr(8, SIGCONTEXT_eip+4)
+
+ .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
+
+ do_cfa_expr(SIGCONTEXT_esp)
+ do_expr(0, SIGCONTEXT_eax)
+ do_expr(1, SIGCONTEXT_ecx)
+ do_expr(2, SIGCONTEXT_edx)
+ do_expr(3, SIGCONTEXT_ebx)
+ do_expr(5, SIGCONTEXT_ebp)
+ do_expr(6, SIGCONTEXT_esi)
+ do_expr(7, SIGCONTEXT_edi)
+ do_expr(8, SIGCONTEXT_eip)
+
+ .align 4
+.LENDFDEDLSI1:
+
+ .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
+.LSTARTFDEDLSI2:
+ .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
+ /* HACK: See above wrt unwind library assumptions. */
+ .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
+ .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
+ .uleb128 0 /* Augmentation */
+ /* What follows are the instructions for the table generation.
+ We record the locations of each register saved. This is
+ slightly less complicated than the above, since we don't
+ modify the stack pointer in the process. */
+
+ do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp)
+ do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax)
+ do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx)
+ do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx)
+ do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx)
+ do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp)
+ do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi)
+ do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi)
+ do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip)
+
+ .align 4
+.LENDFDEDLSI2:
+ .previous
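All of the unwind expressions above share the shape spelled out in the comments: the unwinder evaluates DW_OP_breg4 (the user %esp at the trampoline) plus an offset into the saved sigcontext, and DW_CFA_def_cfa_expression additionally dereferences to recover the saved stack pointer. A C sketch of how an unwinder evaluates the two macros, assuming hypothetical target-access callbacks:

#include <stdint.h>

/* Hypothetical unwinder callbacks, for illustration only. */
extern uint32_t target_reg(int regno);		/* regno 4 is %esp */
extern uint32_t target_mem(uint32_t addr);	/* 32-bit load */

/* do_cfa_expr(off) = DW_OP_breg4 off; DW_OP_deref:
 * the CFA is the saved %esp fetched from the sigcontext. */
static uint32_t eval_cfa_expr(int32_t off)
{
	return target_mem(target_reg(4) + off);
}

/* do_expr(regno, off) = DW_OP_breg4 off: the expression yields the
 * address where that register was saved; its value is the load. */
static uint32_t eval_saved_reg(int32_t off)
{
	return target_mem(target_reg(4) + off);
}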
diff --git a/arch/x86/kernel/vsyscall-sysenter_32.S b/arch/x86/kernel/vsyscall-sysenter_32.S
new file mode 100644
index 000000000000..ed879bf42995
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-sysenter_32.S
@@ -0,0 +1,122 @@
+/*
+ * Code for the vsyscall page. This version uses the sysenter instruction.
+ *
+ * NOTE:
+ * 1) __kernel_vsyscall _must_ be first in this page.
+ * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
+ * for details.
+ */
+
+/*
+ * The caller puts arg2 in %ecx, which gets pushed. The kernel will use
+ * %ecx itself for arg2. The pushing is because the sysexit instruction
+ * (found in entry.S) requires that we clobber %ecx with the desired %esp.
+ * User code might expect that %ecx is unclobbered though, as it would be
+ * for returning via the iret instruction, so we must push and pop.
+ *
+ * The caller puts arg3 in %edx, which the sysexit instruction requires
+ * for %eip. Thus, exactly as for arg2, we must push and pop.
+ *
+ * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter
+ * instruction clobbers %esp, the user's %esp won't even survive entry
+ * into the kernel. We store %esp in %ebp. Code in entry.S must fetch
+ * arg6 from the stack.
+ *
+ * You cannot use this vsyscall for the clone() syscall because the
+ * three dwords on the parent stack do not get copied to the child.
+ */
+ .text
+ .globl __kernel_vsyscall
+ .type __kernel_vsyscall,@function
+__kernel_vsyscall:
+.LSTART_vsyscall:
+ push %ecx
+.Lpush_ecx:
+ push %edx
+.Lpush_edx:
+ push %ebp
+.Lenter_kernel:
+ movl %esp,%ebp
+ sysenter
+
+ /* 7: align return point with nops to make disassembly easier */
+ .space 7,0x90
+
+ /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
+ jmp .Lenter_kernel
+ /* 16: System call normal return point is here! */
+ .globl SYSENTER_RETURN /* Symbol used by sysenter.c */
+SYSENTER_RETURN:
+ pop %ebp
+.Lpop_ebp:
+ pop %edx
+.Lpop_edx:
+ pop %ecx
+.Lpop_ecx:
+ ret
+.LEND_vsyscall:
+ .size __kernel_vsyscall,.-.LSTART_vsyscall
+ .previous
+
+ .section .eh_frame,"a",@progbits
+.LSTARTFRAMEDLSI:
+ .long .LENDCIEDLSI-.LSTARTCIEDLSI
+.LSTARTCIEDLSI:
+ .long 0 /* CIE ID */
+ .byte 1 /* Version number */
+ .string "zR" /* NUL-terminated augmentation string */
+ .uleb128 1 /* Code alignment factor */
+ .sleb128 -4 /* Data alignment factor */
+ .byte 8 /* Return address register column */
+ .uleb128 1 /* Augmentation value length */
+ .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
+ .byte 0x0c /* DW_CFA_def_cfa */
+ .uleb128 4
+ .uleb128 4
+ .byte 0x88 /* DW_CFA_offset, column 0x8 */
+ .uleb128 1
+ .align 4
+.LENDCIEDLSI:
+ .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
+.LSTARTFDEDLSI:
+ .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
+ .long .LSTART_vsyscall-. /* PC-relative start address */
+ .long .LEND_vsyscall-.LSTART_vsyscall
+ .uleb128 0
+ /* What follows are the instructions for the table generation.
+ We have to record all changes of the stack pointer. */
+ .byte 0x04 /* DW_CFA_advance_loc4 */
+ .long .Lpush_ecx-.LSTART_vsyscall
+ .byte 0x0e /* DW_CFA_def_cfa_offset */
+ .byte 0x08 /* RA at offset 8 now */
+ .byte 0x04 /* DW_CFA_advance_loc4 */
+ .long .Lpush_edx-.Lpush_ecx
+ .byte 0x0e /* DW_CFA_def_cfa_offset */
+ .byte 0x0c /* RA at offset 12 now */
+ .byte 0x04 /* DW_CFA_advance_loc4 */
+ .long .Lenter_kernel-.Lpush_edx
+ .byte 0x0e /* DW_CFA_def_cfa_offset */
+ .byte 0x10 /* RA at offset 16 now */
+ .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
+ /* Finally the epilogue. */
+ .byte 0x04 /* DW_CFA_advance_loc4 */
+ .long .Lpop_ebp-.Lenter_kernel
+ .byte 0x0e /* DW_CFA_def_cfa_offset */
+ .byte 0x0c /* RA at offset 12 now */
+ .byte 0xc5 /* DW_CFA_restore %ebp */
+ .byte 0x04 /* DW_CFA_advance_loc4 */
+ .long .Lpop_edx-.Lpop_ebp
+ .byte 0x0e /* DW_CFA_def_cfa_offset */
+ .byte 0x08 /* RA at offset 8 now */
+ .byte 0x04 /* DW_CFA_advance_loc4 */
+ .long .Lpop_ecx-.Lpop_edx
+ .byte 0x0e /* DW_CFA_def_cfa_offset */
+ .byte 0x04 /* RA at offset 4 now */
+ .align 4
+.LENDFDEDLSI:
+ .previous
+
+/*
+ * Get the common code for the sigreturn entry points.
+ */
+#include "vsyscall-sigreturn_32.S"
diff --git a/arch/x86/kernel/vsyscall_32.S b/arch/x86/kernel/vsyscall_32.S
new file mode 100644
index 000000000000..a5ab3dc4fd25
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_32.S
@@ -0,0 +1,15 @@
+#include <linux/init.h>
+
+__INITDATA
+
+ .globl vsyscall_int80_start, vsyscall_int80_end
+vsyscall_int80_start:
+ .incbin "arch/x86/kernel/vsyscall-int80_32.so"
+vsyscall_int80_end:
+
+ .globl vsyscall_sysenter_start, vsyscall_sysenter_end
+vsyscall_sysenter_start:
+ .incbin "arch/x86/kernel/vsyscall-sysenter_32.so"
+vsyscall_sysenter_end:
+
+__FINIT
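The two .incbin directives above embed complete, prebuilt vDSO images into the kernel; boot code elsewhere in this series (sysenter_32.c) chooses which blob to expose depending on whether the CPU supports sysenter. A hedged sketch of that selection, with a hypothetical helper name:

#include <string.h>

extern char vsyscall_int80_start[], vsyscall_int80_end[];
extern char vsyscall_sysenter_start[], vsyscall_sysenter_end[];

/* Illustrative only: copy the appropriate prebuilt vDSO image into the
 * page mapped at the fixed vsyscall address. The real selection logic
 * lives in sysenter_32.c. */
static void install_vsyscall_page(void *page, int have_sep)
{
	const char *start = have_sep ? vsyscall_sysenter_start
				     : vsyscall_int80_start;
	const char *end = have_sep ? vsyscall_sysenter_end
				   : vsyscall_int80_end;

	memcpy(page, start, end - start);	/* must fit in one page */
}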
diff --git a/arch/x86/kernel/vsyscall_32.lds.S b/arch/x86/kernel/vsyscall_32.lds.S
new file mode 100644
index 000000000000..4a8b0ed9b8fb
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_32.lds.S
@@ -0,0 +1,67 @@
+/*
+ * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
+ * object prelinked to its virtual address, and with only one read-only
+ * segment (that fits in one page). This script controls its layout.
+ */
+#include <asm/asm-offsets.h>
+
+SECTIONS
+{
+ . = VDSO_PRELINK_asm + SIZEOF_HEADERS;
+
+ .hash : { *(.hash) } :text
+ .gnu.hash : { *(.gnu.hash) }
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ /* This linker script is used both with -r and with -shared.
+ For the layouts to match, we need to skip more than enough
+ space for the dynamic symbol table et al. If this amount
+ is insufficient, ld -shared will barf. Just increase it here. */
+ . = VDSO_PRELINK_asm + 0x400;
+
+ .text : { *(.text) } :text =0x90909090
+ .note : { *(.note.*) } :text :note
+ .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
+ .eh_frame : { KEEP (*(.eh_frame)) } :text
+ .dynamic : { *(.dynamic) } :text :dynamic
+ .useless : {
+ *(.got.plt) *(.got)
+ *(.data .data.* .gnu.linkonce.d.*)
+ *(.dynbss)
+ *(.bss .bss.* .gnu.linkonce.b.*)
+ } :text
+}
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+ text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ note PT_NOTE FLAGS(4); /* PF_R */
+ eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+ LINUX_2.5 {
+ global:
+ __kernel_vsyscall;
+ __kernel_sigreturn;
+ __kernel_rt_sigreturn;
+
+ local: *;
+ };
+}
+
+/* The ELF entry point can be used to set the AT_SYSINFO value. */
+ENTRY(__kernel_vsyscall);