aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/RCU/RTFP.txt77
-rw-r--r--Documentation/RCU/UP.txt34
-rw-r--r--Documentation/RCU/checklist.txt20
-rw-r--r--Documentation/RCU/rcu.txt10
-rw-r--r--Documentation/RCU/rcubarrier.txt7
-rw-r--r--Documentation/RCU/torture.txt23
-rw-r--r--Documentation/RCU/trace.txt7
-rw-r--r--Documentation/RCU/whatisRCU.txt22
-rw-r--r--Documentation/feature-removal-schedule.txt27
-rw-r--r--Documentation/kernel-parameters.txt5
-rw-r--r--Documentation/trace/events.txt9
-rw-r--r--Documentation/trace/ftrace.txt68
-rw-r--r--Documentation/trace/function-graph-fold.vim42
-rw-r--r--Documentation/trace/ring-buffer-design.txt955
-rw-r--r--arch/Kconfig12
-rw-r--r--arch/ia64/include/asm/dma-mapping.h19
-rw-r--r--arch/ia64/xen/time.c3
-rw-r--r--arch/m68k/include/asm/entry_mm.h4
-rw-r--r--arch/m68k/include/asm/entry_no.h8
-rw-r--r--arch/m68k/include/asm/math-emu.h20
-rw-r--r--arch/m68k/include/asm/thread_info_mm.h11
-rw-r--r--arch/m68k/kernel/asm-offsets.c39
-rw-r--r--arch/m68k/kernel/entry.S22
-rw-r--r--arch/m68k/math-emu/fp_entry.S38
-rw-r--r--arch/powerpc/include/asm/dma-mapping.h23
-rw-r--r--arch/powerpc/include/asm/pgtable.h6
-rw-r--r--arch/powerpc/include/asm/spinlock.h20
-rw-r--r--arch/powerpc/kernel/Makefile2
-rw-r--r--arch/powerpc/kernel/asm-offsets.c2
-rw-r--r--arch/powerpc/kernel/dma-swiotlb.c48
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S19
-rw-r--r--arch/powerpc/kernel/perf_callchain.c527
-rw-r--r--arch/powerpc/mm/slb.c37
-rw-r--r--arch/powerpc/mm/stab.c11
-rw-r--r--arch/s390/Kconfig2
-rw-r--r--arch/s390/defconfig2
-rw-r--r--arch/s390/include/asm/spinlock.h29
-rw-r--r--arch/s390/include/asm/thread_info.h4
-rw-r--r--arch/s390/kernel/entry.S2
-rw-r--r--arch/s390/kernel/entry64.S2
-rw-r--r--arch/s390/kernel/ftrace.c36
-rw-r--r--arch/s390/kernel/ptrace.c11
-rw-r--r--arch/sparc/Kconfig2
-rw-r--r--arch/sparc/include/asm/dma-mapping.h145
-rw-r--r--arch/sparc/include/asm/irq_64.h4
-rw-r--r--arch/sparc/include/asm/pci.h3
-rw-r--r--arch/sparc/include/asm/pci_32.h105
-rw-r--r--arch/sparc/include/asm/pci_64.h88
-rw-r--r--arch/sparc/include/asm/spinlock_32.h12
-rw-r--r--arch/sparc/include/asm/spinlock_64.h28
-rw-r--r--arch/sparc/kernel/Makefile2
-rw-r--r--arch/sparc/kernel/dma.c175
-rw-r--r--arch/sparc/kernel/dma.h14
-rw-r--r--arch/sparc/kernel/iommu.c20
-rw-r--r--arch/sparc/kernel/ioport.c190
-rw-r--r--arch/sparc/kernel/pci.c2
-rw-r--r--arch/sparc/kernel/pci_sun4v.c30
-rw-r--r--arch/sparc/kernel/process_64.c4
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/configs/i386_defconfig2
-rw-r--r--arch/x86/configs/x86_64_defconfig2
-rw-r--r--arch/x86/include/asm/amd_iommu.h1
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h50
-rw-r--r--arch/x86/include/asm/dma-mapping.h18
-rw-r--r--arch/x86/include/asm/ftrace.h7
-rw-r--r--arch/x86/include/asm/nmi.h4
-rw-r--r--arch/x86/include/asm/perf_counter.h10
-rw-r--r--arch/x86/include/asm/thread_info.h13
-rw-r--r--arch/x86/include/asm/topology.h47
-rw-r--r--arch/x86/include/asm/unistd_32.h2
-rw-r--r--arch/x86/include/asm/unistd_64.h6
-rw-r--r--arch/x86/kernel/amd_iommu.c489
-rw-r--r--arch/x86/kernel/amd_iommu_init.c42
-rw-r--r--arch/x86/kernel/apic/nmi.c20
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c329
-rw-r--r--arch/x86/kernel/ftrace.c51
-rw-r--r--arch/x86/kernel/pci-dma.c11
-rw-r--r--arch/x86/kernel/pci-gart_64.c5
-rw-r--r--arch/x86/kernel/pci-nommu.c29
-rw-r--r--arch/x86/kernel/pci-swiotlb.c25
-rw-r--r--arch/x86/kernel/ptrace.c13
-rw-r--r--arch/x86/kernel/sys_x86_64.c8
-rw-r--r--arch/x86/oprofile/nmi_int.c404
-rw-r--r--arch/x86/oprofile/op_counter.h2
-rw-r--r--arch/x86/oprofile/op_model_amd.c372
-rw-r--r--arch/x86/oprofile/op_model_p4.c72
-rw-r--r--arch/x86/oprofile/op_model_ppro.c101
-rw-r--r--arch/x86/oprofile/op_x86_model.h59
-rw-r--r--arch/x86/pci/direct.c5
-rw-r--r--drivers/acpi/blacklist.c5
-rw-r--r--drivers/ata/Kconfig21
-rw-r--r--drivers/ata/Makefile1
-rw-r--r--drivers/ata/ahci.c143
-rw-r--r--drivers/ata/libata-acpi.c7
-rw-r--r--drivers/ata/libata-core.c44
-rw-r--r--drivers/ata/libata-eh.c146
-rw-r--r--drivers/ata/libata-pmp.c2
-rw-r--r--drivers/ata/libata-scsi.c159
-rw-r--r--drivers/ata/libata.h1
-rw-r--r--drivers/ata/pata_atiixp.c1
-rw-r--r--drivers/ata/pata_cs5535.c3
-rw-r--r--drivers/ata/pata_octeon_cf.c4
-rw-r--r--drivers/ata/pata_platform.c8
-rw-r--r--drivers/ata/pata_rb532_cf.c2
-rw-r--r--drivers/ata/pata_rdc.c400
-rw-r--r--drivers/ata/pata_rz1000.c4
-rw-r--r--drivers/ata/sata_fsl.c1
-rw-r--r--drivers/ata/sata_inic162x.c2
-rw-r--r--drivers/ata/sata_mv.c2
-rw-r--r--drivers/ata/sata_sil.c13
-rw-r--r--drivers/ata/sata_sil24.c11
-rw-r--r--drivers/ata/sata_sis.c75
-rw-r--r--drivers/char/sysrq.c19
-rw-r--r--drivers/firmware/dmi_scan.c77
-rw-r--r--drivers/ide/atiixp.c1
-rw-r--r--drivers/oprofile/cpu_buffer.c16
-rw-r--r--drivers/oprofile/oprof.c71
-rw-r--r--drivers/oprofile/oprof.h3
-rw-r--r--drivers/oprofile/oprofile_files.c46
-rw-r--r--drivers/oprofile/oprofile_stats.c5
-rw-r--r--drivers/oprofile/oprofile_stats.h1
-rw-r--r--drivers/pci/intr_remapping.c14
-rw-r--r--drivers/pci/quirks.c4
-rw-r--r--fs/dcache.c1
-rw-r--r--fs/locks.c2
-rw-r--r--include/asm-generic/dma-mapping-common.h6
-rw-r--r--include/linux/ata.h36
-rw-r--r--include/linux/cpu.h17
-rw-r--r--include/linux/dma-mapping.h5
-rw-r--r--include/linux/dmi.h13
-rw-r--r--include/linux/ftrace_event.h51
-rw-r--r--include/linux/hardirq.h10
-rw-r--r--include/linux/init_task.h11
-rw-r--r--include/linux/interrupt.h4
-rw-r--r--include/linux/irq.h18
-rw-r--r--include/linux/irqnr.h6
-rw-r--r--include/linux/kernel.h5
-rw-r--r--include/linux/libata.h3
-rw-r--r--include/linux/lockdep.h18
-rw-r--r--include/linux/module.h14
-rw-r--r--include/linux/nmi.h19
-rw-r--r--include/linux/oprofile.h5
-rw-r--r--include/linux/pagemap.h4
-rw-r--r--include/linux/pci_ids.h4
-rw-r--r--include/linux/perf_counter.h7
-rw-r--r--include/linux/rcuclassic.h178
-rw-r--r--include/linux/rcupdate.h98
-rw-r--r--include/linux/rcupreempt.h127
-rw-r--r--include/linux/rcupreempt_trace.h97
-rw-r--r--include/linux/rcutree.h262
-rw-r--r--include/linux/ring_buffer.h24
-rw-r--r--include/linux/sched.h126
-rw-r--r--include/linux/spinlock.h64
-rw-r--r--include/linux/spinlock_api_smp.h394
-rw-r--r--include/linux/swiotlb.h11
-rw-r--r--include/linux/syscalls.h131
-rw-r--r--include/linux/topology.h168
-rw-r--r--include/linux/tracepoint.h29
-rw-r--r--include/trace/define_trace.h7
-rw-r--r--include/trace/events/module.h126
-rw-r--r--include/trace/events/sched.h107
-rw-r--r--include/trace/events/syscalls.h70
-rw-r--r--include/trace/ftrace.h93
-rw-r--r--include/trace/syscall.h48
-rw-r--r--init/Kconfig46
-rw-r--r--init/main.c4
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c5
-rw-r--r--kernel/futex.c47
-rw-r--r--kernel/irq/chip.c74
-rw-r--r--kernel/irq/handle.c5
-rw-r--r--kernel/irq/internals.h13
-rw-r--r--kernel/irq/manage.c102
-rw-r--r--kernel/irq/pm.c8
-rw-r--r--kernel/irq/resend.c3
-rw-r--r--kernel/irq/spurious.c1
-rw-r--r--kernel/kmod.c4
-rw-r--r--kernel/kprobes.c30
-rw-r--r--kernel/kthread.c4
-rw-r--r--kernel/lockdep.c792
-rw-r--r--kernel/lockdep_internals.h2
-rw-r--r--kernel/lockdep_proc.c128
-rw-r--r--kernel/module.c11
-rw-r--r--kernel/perf_counter.c173
-rw-r--r--kernel/printk.c175
-rw-r--r--kernel/rcuclassic.c807
-rw-r--r--kernel/rcupdate.c44
-rw-r--r--kernel/rcupreempt.c1539
-rw-r--r--kernel/rcupreempt_trace.c334
-rw-r--r--kernel/rcutorture.c202
-rw-r--r--kernel/rcutree.c280
-rw-r--r--kernel/rcutree.h253
-rw-r--r--kernel/rcutree_plugin.h532
-rw-r--r--kernel/rcutree_trace.c88
-rw-r--r--kernel/sched.c1232
-rw-r--r--kernel/sched_cpupri.c30
-rw-r--r--kernel/sched_debug.c4
-rw-r--r--kernel/sched_fair.c84
-rw-r--r--kernel/sched_features.h2
-rw-r--r--kernel/sched_rt.c62
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/spinlock.c230
-rw-r--r--kernel/sysctl.c24
-rw-r--r--kernel/timer.c3
-rw-r--r--kernel/trace/Kconfig13
-rw-r--r--kernel/trace/blktrace.c12
-rw-r--r--kernel/trace/ftrace.c107
-rw-r--r--kernel/trace/kmemtrace.c149
-rw-r--r--kernel/trace/ring_buffer.c1112
-rw-r--r--kernel/trace/trace.c679
-rw-r--r--kernel/trace/trace.h76
-rw-r--r--kernel/trace/trace_boot.c16
-rw-r--r--kernel/trace/trace_events.c146
-rw-r--r--kernel/trace/trace_events_filter.c261
-rw-r--r--kernel/trace/trace_export.c28
-rw-r--r--kernel/trace/trace_functions.c4
-rw-r--r--kernel/trace/trace_functions_graph.c166
-rw-r--r--kernel/trace/trace_irqsoff.c3
-rw-r--r--kernel/trace/trace_mmiotrace.c10
-rw-r--r--kernel/trace/trace_power.c22
-rw-r--r--kernel/trace/trace_sched_switch.c59
-rw-r--r--kernel/trace/trace_sched_wakeup.c7
-rw-r--r--kernel/trace/trace_selftest.c1
-rw-r--r--kernel/trace/trace_stack.c43
-rw-r--r--kernel/trace/trace_stat.c17
-rw-r--r--kernel/trace/trace_stat.h2
-rw-r--r--kernel/trace/trace_syscalls.c471
-rw-r--r--kernel/trace/trace_workqueue.c32
-rw-r--r--kernel/tracepoint.c50
-rw-r--r--kernel/workqueue.c9
-rw-r--r--lib/Kconfig.debug2
-rw-r--r--lib/swiotlb.c124
-rwxr-xr-xscripts/recordmcount.pl1
-rw-r--r--tools/perf/Documentation/perf-record.txt4
-rw-r--r--tools/perf/Documentation/perf-report.txt13
-rw-r--r--tools/perf/Makefile39
-rw-r--r--tools/perf/builtin-annotate.c472
-rw-r--r--tools/perf/builtin-help.c1
-rw-r--r--tools/perf/builtin-record.c38
-rw-r--r--tools/perf/builtin-report.c719
-rw-r--r--tools/perf/builtin-stat.c239
-rw-r--r--tools/perf/builtin-top.c66
-rw-r--r--tools/perf/builtin-trace.c297
-rw-r--r--tools/perf/builtin.h1
-rw-r--r--tools/perf/perf.c1
-rw-r--r--tools/perf/util/abspath.c3
-rw-r--r--tools/perf/util/cache.h1
-rw-r--r--tools/perf/util/callchain.c2
-rw-r--r--tools/perf/util/callchain.h1
-rw-r--r--tools/perf/util/color.c16
-rw-r--r--tools/perf/util/color.h3
-rw-r--r--tools/perf/util/config.c22
-rw-r--r--tools/perf/util/debug.c95
-rw-r--r--tools/perf/util/debug.h8
-rw-r--r--tools/perf/util/event.h96
-rw-r--r--tools/perf/util/exec_cmd.c1
-rw-r--r--tools/perf/util/header.c37
-rw-r--r--tools/perf/util/header.h4
-rw-r--r--tools/perf/util/map.c97
-rw-r--r--tools/perf/util/module.c4
-rw-r--r--tools/perf/util/parse-events.c147
-rw-r--r--tools/perf/util/parse-events.h17
-rw-r--r--tools/perf/util/parse-options.c22
-rw-r--r--tools/perf/util/path.c25
-rw-r--r--tools/perf/util/run-command.c6
-rw-r--r--tools/perf/util/symbol.c199
-rw-r--r--tools/perf/util/symbol.h14
-rw-r--r--tools/perf/util/thread.c175
-rw-r--r--tools/perf/util/thread.h21
-rw-r--r--tools/perf/util/trace-event-info.c539
-rw-r--r--tools/perf/util/trace-event-parse.c2942
-rw-r--r--tools/perf/util/trace-event-read.c512
-rw-r--r--tools/perf/util/trace-event.h240
-rw-r--r--tools/perf/util/util.h6
-rw-r--r--tools/perf/util/values.c230
-rw-r--r--tools/perf/util/values.h27
278 files changed, 18033 insertions, 9400 deletions
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt
index 9f711d2df91..d2b85237c76 100644
--- a/Documentation/RCU/RTFP.txt
+++ b/Documentation/RCU/RTFP.txt
@@ -743,3 +743,80 @@ Revised:
RCU, realtime RCU, sleepable RCU, performance.
"
}
+
+@article{PaulEMcKenney2008RCUOSR
+,author="Paul E. McKenney and Jonathan Walpole"
+,title="Introducing technology into the {Linux} kernel: a case study"
+,Year="2008"
+,journal="SIGOPS Oper. Syst. Rev."
+,volume="42"
+,number="5"
+,pages="4--17"
+,issn="0163-5980"
+,doi={http://doi.acm.org/10.1145/1400097.1400099}
+,publisher="ACM"
+,address="New York, NY, USA"
+,annotation={
+ Linux changed RCU to a far greater degree than RCU has changed Linux.
+}
+}
+
+@unpublished{PaulEMcKenney2008HierarchicalRCU
+,Author="Paul E. McKenney"
+,Title="Hierarchical {RCU}"
+,month="November"
+,day="3"
+,year="2008"
+,note="Available:
+\url{http://lwn.net/Articles/305782/}
+[Viewed November 6, 2008]"
+,annotation="
+ RCU with combining-tree-based grace-period detection,
+ permitting it to handle thousands of CPUs.
+"
+}
+
+@conference{PaulEMcKenney2009MaliciousURCU
+,Author="Paul E. McKenney"
+,Title="Using a Malicious User-Level {RCU} to Torture {RCU}-Based Algorithms"
+,Booktitle="linux.conf.au 2009"
+,month="January"
+,year="2009"
+,address="Hobart, Australia"
+,note="Available:
+\url{http://www.rdrop.com/users/paulmck/RCU/urcutorture.2009.01.22a.pdf}
+[Viewed February 2, 2009]"
+,annotation="
+ Realtime RCU and torture-testing RCU uses.
+"
+}
+
+@unpublished{MathieuDesnoyers2009URCU
+,Author="Mathieu Desnoyers"
+,Title="[{RFC} git tree] Userspace {RCU} (urcu) for {Linux}"
+,month="February"
+,day="5"
+,year="2009"
+,note="Available:
+\url{http://lkml.org/lkml/2009/2/5/572}
+\url{git://lttng.org/userspace-rcu.git}
+[Viewed February 20, 2009]"
+,annotation="
+ Mathieu Desnoyers's user-space RCU implementation.
+ git://lttng.org/userspace-rcu.git
+"
+}
+
+@unpublished{PaulEMcKenney2009BloatWatchRCU
+,Author="Paul E. McKenney"
+,Title="{RCU}: The {Bloatwatch} Edition"
+,month="March"
+,day="17"
+,year="2009"
+,note="Available:
+\url{http://lwn.net/Articles/323929/}
+[Viewed March 20, 2009]"
+,annotation="
+ Uniprocessor assumptions allow simplified RCU implementation.
+"
+}
diff --git a/Documentation/RCU/UP.txt b/Documentation/RCU/UP.txt
index aab4a9ec393..90ec5341ee9 100644
--- a/Documentation/RCU/UP.txt
+++ b/Documentation/RCU/UP.txt
@@ -2,14 +2,13 @@ RCU on Uniprocessor Systems
A common misconception is that, on UP systems, the call_rcu() primitive
-may immediately invoke its function, and that the synchronize_rcu()
-primitive may return immediately. The basis of this misconception
+may immediately invoke its function. The basis of this misconception
is that since there is only one CPU, it should not be necessary to
wait for anything else to get done, since there are no other CPUs for
anything else to be happening on. Although this approach will -sort- -of-
work a surprising amount of the time, it is a very bad idea in general.
-This document presents three examples that demonstrate exactly how bad an
-idea this is.
+This document presents three examples that demonstrate exactly how bad
+an idea this is.
Example 1: softirq Suicide
@@ -82,11 +81,18 @@ Quick Quiz #2: What locking restriction must RCU callbacks respect?
Summary
-Permitting call_rcu() to immediately invoke its arguments or permitting
-synchronize_rcu() to immediately return breaks RCU, even on a UP system.
-So do not do it! Even on a UP system, the RCU infrastructure -must-
-respect grace periods, and -must- invoke callbacks from a known environment
-in which no locks are held.
+Permitting call_rcu() to immediately invoke its arguments breaks RCU,
+even on a UP system. So do not do it! Even on a UP system, the RCU
+infrastructure -must- respect grace periods, and -must- invoke callbacks
+from a known environment in which no locks are held.
+
+It -is- safe for synchronize_sched() and synchronize_rcu_bh() to return
+immediately on an UP system. It is also safe for synchronize_rcu()
+to return immediately on UP systems, except when running preemptable
+RCU.
+
+Quick Quiz #3: Why can't synchronize_rcu() return immediately on
+ UP systems running preemptable RCU?
Answer to Quick Quiz #1:
@@ -117,3 +123,13 @@ Answer to Quick Quiz #2:
callbacks acquire locks directly. However, a great many RCU
callbacks do acquire locks -indirectly-, for example, via
the kfree() primitive.
+
+Answer to Quick Quiz #3:
+ Why can't synchronize_rcu() return immediately on UP systems
+ running preemptable RCU?
+
+ Because some other task might have been preempted in the middle
+ of an RCU read-side critical section. If synchronize_rcu()
+ simply immediately returned, it would prematurely signal the
+ end of the grace period, which would come as a nasty shock to
+ that other thread when it started running again.
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index accfe2f5247..51525a30e8b 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -11,7 +11,10 @@ over a rather long period of time, but improvements are always welcome!
structure is updated more than about 10% of the time, then
you should strongly consider some other approach, unless
detailed performance measurements show that RCU is nonetheless
- the right tool for the job.
+ the right tool for the job. Yes, you might think of RCU
+ as simply cutting overhead off of the readers and imposing it
+ on the writers. That is exactly why normal uses of RCU will
+ do much more reading than updating.
Another exception is where performance is not an issue, and RCU
provides a simpler implementation. An example of this situation
@@ -240,10 +243,11 @@ over a rather long period of time, but improvements are always welcome!
instead need to use synchronize_irq() or synchronize_sched().
12. Any lock acquired by an RCU callback must be acquired elsewhere
- with irq disabled, e.g., via spin_lock_irqsave(). Failing to
- disable irq on a given acquisition of that lock will result in
- deadlock as soon as the RCU callback happens to interrupt that
- acquisition's critical section.
+ with softirq disabled, e.g., via spin_lock_irqsave(),
+ spin_lock_bh(), etc. Failing to disable irq on a given
+ acquisition of that lock will result in deadlock as soon as the
+ RCU callback happens to interrupt that acquisition's critical
+ section.
13. RCU callbacks can be and are executed in parallel. In many cases,
the callback code simply wrappers around kfree(), so that this
@@ -310,3 +314,9 @@ over a rather long period of time, but improvements are always welcome!
Because these primitives only wait for pre-existing readers,
it is the caller's responsibility to guarantee safety to
any subsequent readers.
+
+16. The various RCU read-side primitives do -not- contain memory
+ barriers. The CPU (and in some cases, the compiler) is free
+ to reorder code into and out of RCU read-side critical sections.
+ It is the responsibility of the RCU update-side primitives to
+ deal with this.
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt
index 7aa2002ade7..2a23523ce47 100644
--- a/Documentation/RCU/rcu.txt
+++ b/Documentation/RCU/rcu.txt
@@ -36,7 +36,7 @@ o How can the updater tell when a grace period has completed
executed in user mode, or executed in the idle loop, we can
safely free up that item.
- Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the
+ Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the
same effect, but require that the readers manipulate CPU-local
counters. These counters allow limited types of blocking
within RCU read-side critical sections. SRCU also uses
@@ -79,10 +79,10 @@ o I hear that RCU is patented? What is with that?
o I hear that RCU needs work in order to support realtime kernels?
This work is largely completed. Realtime-friendly RCU can be
- enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter.
- However, work is in progress for enabling priority boosting of
- preempted RCU read-side critical sections. This is needed if you
- have CPU-bound realtime threads.
+ enabled via the CONFIG_TREE_PREEMPT_RCU kernel configuration
+ parameter. However, work is in progress for enabling priority
+ boosting of preempted RCU read-side critical sections. This is
+ needed if you have CPU-bound realtime threads.
o Where can I find more information on RCU?
diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt
index 909602d409b..e439a0edee2 100644
--- a/Documentation/RCU/rcubarrier.txt
+++ b/Documentation/RCU/rcubarrier.txt
@@ -170,6 +170,13 @@ module invokes call_rcu() from timers, you will need to first cancel all
the timers, and only then invoke rcu_barrier() to wait for any remaining
RCU callbacks to complete.
+Of course, if you module uses call_rcu_bh(), you will need to invoke
+rcu_barrier_bh() before unloading. Similarly, if your module uses
+call_rcu_sched(), you will need to invoke rcu_barrier_sched() before
+unloading. If your module uses call_rcu(), call_rcu_bh(), -and-
+call_rcu_sched(), then you will need to invoke each of rcu_barrier(),
+rcu_barrier_bh(), and rcu_barrier_sched().
+
Implementing rcu_barrier()
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index a342b6e1cc1..9dba3bb90e6 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -76,8 +76,10 @@ torture_type The type of RCU to test: "rcu" for the rcu_read_lock() API,
"rcu_sync" for rcu_read_lock() with synchronous reclamation,
"rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for
rcu_read_lock_bh() with synchronous reclamation, "srcu" for
- the "srcu_read_lock()" API, and "sched" for the use of
- preempt_disable() together with synchronize_sched().
+ the "srcu_read_lock()" API, "sched" for the use of
+ preempt_disable() together with synchronize_sched(),
+ and "sched_expedited" for the use of preempt_disable()
+ with synchronize_sched_expedited().
verbose Enable debug printk()s. Default is disabled.
@@ -162,6 +164,23 @@ of the "old" and "current" counters for the corresponding CPU. The
"idx" value maps the "old" and "current" values to the underlying array,
and is useful for debugging.
+Similarly, sched_expedited RCU provides the following:
+
+ sched_expedited-torture: rtc: d0000000016c1880 ver: 1090796 tfle: 0 rta: 1090796 rtaf: 0 rtf: 1090787 rtmbe: 0 nt: 27713319
+ sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0
+ sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0
+ sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
+ state: -1 / 0:0 3:0 4:0
+
+As before, the first four lines are similar to those for RCU.
+The last line shows the task-migration state. The first number is
+-1 if synchronize_sched_expedited() is idle, -2 if in the process of
+posting wakeups to the migration kthreads, and N when waiting on CPU N.
+Each of the colon-separated fields following the "/" is a CPU:state pair.
+Valid states are "0" for idle, "1" for waiting for quiescent state,
+"2" for passed through quiescent state, and "3" when a race with a
+CPU-hotplug event forces use of the synchronize_sched() primitive.
+
USAGE
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 02cced183b2..187bbf10c92 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -191,8 +191,7 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy).
The output of "cat rcu/rcudata" looks as follows:
-rcu:
-rcu:
+rcu_sched:
0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10
1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10
2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10
@@ -306,7 +305,7 @@ comma-separated-variable spreadsheet format.
The output of "cat rcu/rcugp" looks as follows:
-rcu: completed=33062 gpnum=33063
+rcu_sched: completed=33062 gpnum=33063
rcu_bh: completed=464 gpnum=464
Again, this output is for both "rcu" and "rcu_bh". The fields are
@@ -413,7 +412,7 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
The output of "cat rcu/rcu_pending" looks as follows:
-rcu:
+rcu_sched:
0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 96170824a71..e41a7fecf0d 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -136,10 +136,10 @@ rcu_read_lock()
Used by a reader to inform the reclaimer that the reader is
entering an RCU read-side critical section. It is illegal
to block while in an RCU read-side critical section, though
- kernels built with CONFIG_PREEMPT_RCU can preempt RCU read-side
- critical sections. Any RCU-protected data structure accessed
- during an RCU read-side critical section is guaranteed to remain
- unreclaimed for the full duration of that critical section.
+ kernels built with CONFIG_TREE_PREEMPT_RCU can preempt RCU
+ read-side critical sections. Any RCU-protected data structure
+ accessed during an RCU read-side critical section is guaranteed to
+ remain unreclaimed for the full duration of that critical section.
Reference counts may be used in conjunction with RCU to maintain
longer-term references to data structures.
@@ -785,6 +785,7 @@ RCU pointer/list traversal:
rcu_dereference
list_for_each_entry_rcu
hlist_for_each_entry_rcu
+ hlist_nulls_for_each_entry_rcu
list_for_each_continue_rcu (to be deprecated in favor of new
list_for_each_entry_continue_rcu)
@@ -807,19 +808,23 @@ RCU: Critical sections Grace period Barrier
rcu_read_lock synchronize_net rcu_barrier
rcu_read_unlock synchronize_rcu
+ synchronize_rcu_expedited
call_rcu
bh: Critical sections Grace period Barrier
rcu_read_lock_bh call_rcu_bh rcu_barrier_bh
- rcu_read_unlock_bh
+ rcu_read_unlock_bh synchronize_rcu_bh
+ synchronize_rcu_bh_expedited
sched: Critical sections Grace period Barrier
- [preempt_disable] synchronize_sched rcu_barrier_sched
- [and friends] call_rcu_sched
+ rcu_read_lock_sched synchronize_sched rcu_barrier_sched
+ rcu_read_unlock_sched call_rcu_sched
+ [preempt_disable] synchronize_sched_expedited
+ [and friends]
SRCU: Critical sections Grace period Barrier
@@ -827,6 +832,9 @@ SRCU: Critical sections Grace period Barrier
srcu_read_lock synchronize_srcu N/A
srcu_read_unlock
+SRCU: Initialization/cleanup
+ init_srcu_struct
+ cleanup_srcu_struct
See the comment headers in the source code (or the docbook generated
from them) for more information.
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index f0690bbbd73..bb3a53cdfbc 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -206,24 +206,6 @@ Who: Len Brown <len.brown@intel.com>
---------------------------
-What: libata spindown skipping and warning
-When: Dec 2008
-Why: Some halt(8) implementations synchronize caches for and spin
- down libata disks because libata didn't use to spin down disk on
- system halt (only synchronized caches).
- Spin down on system halt is now implemented. sysfs node
- /sys/class/scsi_disk/h:c:i:l/manage_start_stop is present if
- spin down support is available.
- Because issuing spin down command to an already spun down disk
- makes some disks spin up just to spin down again, libata tracks
- device spindown status to skip the extra spindown command and
- warn about it.
- This is to give userspace tools the time to get updated and will
- be removed after userspace is reasonably updated.
-Who: Tejun Heo <htejun@gmail.com>
-
----------------------------
-
What: i386/x86_64 bzImage symlinks
When: April 2010
@@ -394,15 +376,6 @@ Who: Thomas Gleixner <tglx@linutronix.de>
-----------------------------
-What: obsolete generic irq defines and typedefs
-When: 2.6.30
-Why: The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t)
- have been kept around for migration reasons. After more than two years
- it's time to remove them finally
-Who: Thomas Gleixner <tglx@linutronix.de>
-
----------------------------
-
What: fakephp and associated sysfs files in /sys/bus/pci/slots/
When: 2011
Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index ce885375581..5d4427d1728 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2509,6 +2509,11 @@ and is between 256 and 4096 characters. It is defined in the file
trace_buf_size=nn[KMG]
[FTRACE] will set tracing buffer size.
+ trace_event=[event-list]
+ [FTRACE] Set and start specified trace events in order
+ to facilitate early boot debugging.
+ See also Documentation/trace/events.txt
+
trix= [HW,OSS] MediaTrix AudioTrix Pro
Format:
<io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index f157d7594ea..2bcc8d4dea2 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -83,6 +83,15 @@ When reading one of these enable files, there are four results:
X - there is a mixture of events enabled and disabled
? - this file does not affect any event
+2.3 Boot option
+---------------
+
+In order to facilitate early boot debugging, use boot option:
+
+ trace_event=[event-list]
+
+The format of this boot option is the same as described in section 2.1.
+
3. Defining an event-enabled tracepoint
=======================================
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index a39b3c749de..355d0f1f8c5 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -85,26 +85,19 @@ of ftrace. Here is a list of some of the key files:
This file holds the output of the trace in a human
readable format (described below).
- latency_trace:
-
- This file shows the same trace but the information
- is organized more to display possible latencies
- in the system (described below).
-
trace_pipe:
The output is the same as the "trace" file but this
file is meant to be streamed with live tracing.
- Reads from this file will block until new data
- is retrieved. Unlike the "trace" and "latency_trace"
- files, this file is a consumer. This means reading
- from this file causes sequential reads to display
- more current data. Once data is read from this
- file, it is consumed, and will not be read
- again with a sequential read. The "trace" and
- "latency_trace" files are static, and if the
- tracer is not adding more data, they will display
- the same information every time they are read.
+ Reads from this file will block until new data is
+ retrieved. Unlike the "trace" file, this file is a
+ consumer. This means reading from this file causes
+ sequential reads to display more current data. Once
+ data is read from this file, it is consumed, and
+ will not be read again with a sequential read. The
+ "trace" file is static, and if the tracer is not
+ adding more data,they will display the same
+ information every time they are read.
trace_options:
@@ -117,10 +110,10 @@ of ftrace. Here is a list of some of the key files:
Some of the tracers record the max latency.
For example, the time interrupts are disabled.
This time is saved in this file. The max trace
- will also be stored, and displayed by either
- "trace" or "latency_trace". A new max trace will
- only be recorded if the latency is greater than
- the value in this file. (in microseconds)
+ will also be stored, and displayed by "trace".
+ A new max trace will only be recorded if the
+ latency is greater than the value in this
+ file. (in microseconds)
buffer_size_kb:
@@ -210,7 +203,7 @@ Here is the list of current tracers that may be configured.
the trace with the longest max latency.
See tracing_max_latency. When a new max is recorded,
it replaces the old trace. It is best to view this
- trace via the latency_trace file.
+ trace with the latency-format option enabled.
"preemptoff"
@@ -307,8 +300,8 @@ the lowest priority thread (pid 0).
Latency trace format
--------------------
-For traces that display latency times, the latency_trace file
-gives somewhat more information to see why a latency happened.
+When the latency-format option is enabled, the trace file gives
+somewhat more information to see why a latency happened.
Here is a typical trace.
# tracer: irqsoff
@@ -380,9 +373,10 @@ explains which is which.
The above is mostly meaningful for kernel developers.
- time: This differs from the trace file output. The trace file output
- includes an absolute timestamp. The timestamp used by the
- latency_trace file is relative to the start of the trace.
+ time: When the latency-format option is enabled, the trace file
+ output includes a timestamp relative to the start of the
+ trace. This differs from the output when latency-format
+ is disabled, which includes an absolute timestamp.
delay: This is just to help catch your eye a bit better. And
needs to be fixed to be only relative to the same CPU.
@@ -440,7 +434,8 @@ Here are the available options:
sym-addr:
bash-4000 [01] 1477.606694: simple_strtoul <c0339346>
- verbose - This deals with the latency_trace file.
+ verbose - This deals with the trace file when the
+ latency-format option is enabled.
bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \
(+0.000ms): simple_strtoul (strict_strtoul)
@@ -472,7 +467,7 @@ Here are the available options:
the app is no longer running
The lookup is performed when you read
- trace,trace_pipe,latency_trace. Example:
+ trace,trace_pipe. Example:
a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0
x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
@@ -481,6 +476,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
every scheduling event. Will add overhead if
there's a lot of tasks running at once.
+ latency-format - This option changes the trace. When
+ it is enabled, the trace displays
+ additional information about the
+ latencies, as described in "Latency
+ trace format".
sched_switch
------------
@@ -596,12 +596,13 @@ To reset the maximum, echo 0 into tracing_max_latency. Here is
an example:
# echo irqsoff > current_tracer
+ # echo latency-format > trace_options
# echo 0 > tracing_max_latency
# echo 1 > tracing_enabled
# ls -ltr
[...]
# echo 0 > tracing_enabled
- # cat latency_trace
+ # cat trace
# tracer: irqsoff
#
irqsoff latency trace v1.1.5 on 2.6.26
@@ -703,12 +704,13 @@ which preemption was disabled. The control of preemptoff tracer
is much like the irqsoff tracer.
# echo preemptoff > current_tracer
+ # echo latency-format > trace_options
# echo 0 > tracing_max_latency
# echo 1 > tracing_enabled
# ls -ltr
[...]
# echo 0 > tracing_enabled
- # cat latency_trace
+ # cat trace
# tracer: preemptoff
#
preemptoff latency trace v1.1.5 on 2.6.26-rc8
@@ -850,12 +852,13 @@ Again, using this trace is much like the irqsoff and preemptoff
tracers.
# echo preemptirqsoff > current_tracer
+ # echo latency-format > trace_options
# echo 0 > tracing_max_latency
# echo 1 > tracing_enabled
# ls -ltr
[...]
# echo 0 > tracing_enabled
- # cat latency_trace
+ # cat trace
# tracer: preemptirqsoff
#
preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
@@ -1012,11 +1015,12 @@ Instead of performing an 'ls', we will run 'sleep 1' under
'chrt' which changes the priority of the task.
# echo wakeup > current_tracer
+ # echo latency-format > trace_options
# echo 0 > tracing_max_latency
# echo 1 > tracing_enabled
# chrt -f 5 sleep 1
# echo 0 > tracing_enabled
- # cat latency_trace
+ # cat trace
# tracer: wakeup
#
wakeup latency trace v1.1.5 on 2.6.26-rc8
diff --git a/Documentation/trace/function-graph-fold.vim b/Documentation/trace/function-graph-fold.vim
new file mode 100644
index 00000000000..0544b504c8b
--- /dev/null
+++ b/Documentation/trace/function-graph-fold.vim
@@ -0,0 +1,42 @@
+" Enable folding for ftrace function_graph traces.
+"
+" To use, :source this file while viewing a function_graph trace, or use vim's
+" -S option to load from the command-line together with a trace. You can then
+" use the usual vim fold commands, such as "za", to open and close nested
+" functions. While closed, a fold will show the total time taken for a call,
+" as would normally appear on the line with the closing brace. Folded
+" functions will not include finish_task_switch(), so folding should remain
+" relatively sane even through a context switch.
+"
+" Note that this will almost certainly only work well with a
+" single-CPU trace (e.g. trace-cmd report --cpu 1).
+
+function! FunctionGraphFoldExpr(lnum)
+ let line = getline(a:lnum)
+ if line[-1:] == '{'
+ if line =~ 'finish_task_switch() {$'
+ return '>1'
+ endif
+ return 'a1'
+ elseif line[-1:] == '}'
+ return 's1'
+ else
+ return '='
+ endif
+endfunction
+
+function! FunctionGraphFoldText()
+ let s = split(getline(v:foldstart), '|', 1)
+ if getline(v:foldend+1) =~ 'finish_task_switch() {$'
+ let s[2] = ' task switch '
+ else
+ let e = split(getline(v:foldend), '|', 1)
+ let s[2] = e[2]
+ endif
+ return join(s, '|')
+endfunction
+
+setlocal foldexpr=FunctionGraphFoldExpr(v:lnum)
+setlocal foldtext=FunctionGraphFoldText()
+setlocal foldcolumn=12
+setlocal foldmethod=expr
diff --git a/Documentation/trace/ring-buffer-design.txt b/Documentation/trace/ring-buffer-design.txt
new file mode 100644
index 00000000000..5b1d23d604c
--- /dev/null
+++ b/Documentation/trace/ring-buffer-design.txt
@@ -0,0 +1,955 @@
+ Lockless Ring Buffer Design
+ ===========================
+
+Copyright 2009 Red Hat Inc.
+ Author: Steven Rostedt <srostedt@redhat.com>
+ License: The GNU Free Documentation License, Version 1.2
+ (dual licensed under the GPL v2)
+Reviewers: Mathieu Desnoyers, Huang Ying, Hidetoshi Seto,
+ and Frederic Weisbecker.
+
+
+Written for: 2.6.31
+
+Terminology used in this Document
+---------------------------------
+
+tail - where new writes happen in the ring buffer.
+
+head - where new reads happen in the ring buffer.
+
+producer - the task that writes into the ring buffer (same as writer)
+
+writer - same as producer
+
+consumer - the task that reads from the buffer (same as reader)
+
+reader - same as consumer.
+
+reader_page - A page outside the ring buffer used solely (for the most part)
+ by the reader.
+
+head_page - a pointer to the page that the reader will use next
+
+tail_page - a pointer to the page that will be written to next
+
+commit_page - a pointer to the page with the last finished non nested write.
+
+cmpxchg - hardware assisted atomic transaction that performs the following:
+
+ A = B iff previous A == C
+
+ R = cmpxchg(A, C, B) is saying that we replace A with B if and only if
+ current A is equal to C, and we put the old (current) A into R
+
+ R gets the previous A regardless if A is updated with B or not.
+
+ To see if the update was successful a compare of R == C may be used.
+
+The Generic Ring Buffer
+-----------------------
+
+The ring buffer can be used in either an overwrite mode or in
+producer/consumer mode.
+
+Producer/consumer mode is where the producer were to fill up the
+buffer before the consumer could free up anything, the producer
+will stop writing to the buffer. This will lose most recent events.
+
+Overwrite mode is where the produce were to fill up the buffer
+before the consumer could free up anything, the producer will
+overwrite the older data. This will lose the oldest events.
+
+No two writers can write at the same time (on the same per cpu buffer),
+but a writer may interrupt another writer, but it must finish writing
+before the previous writer may continue. This is very important to the
+algorithm. The writers act like a "stack". The way interrupts works
+enforces this behavior.
+
+
+ writer1 start
+ <preempted> writer2 start
+ <preempted> writer3 start
+ writer3 finishes
+ writer2 finishes
+ writer1 finishes
+
+This is very much like a writer being preempted by an interrupt and
+the interrupt doing a write as well.
+
+Readers can happen at any time. But no two readers may run at the
+same time, nor can a reader preempt/interrupt another reader. A reader
+can not preempt/interrupt a writer, but it may read/consume from the
+buffer at the same time as a writer is writing, but the reader must be
+on another processor to do so. A reader may read on its own processor
+and can be preempted by a writer.
+
+A writer can preempt a reader, but a reader can not preempt a writer.
+But a reader can read the buffer at the same time (on another processor)
+as a writer.
+
+The ring buffer is made up of a list of pages held together by a link list.
+
+At initialization a reader page is allocated for the reader that is not
+part of the ring buffer.
+
+The head_page, tail_page and commit_page are all initialized to point
+to the same page.
+
+The reader page is initialized to have its next pointer pointing to
+the head page, and its previous pointer pointing to a page before
+the head page.
+
+The reader has its own page to use. At start up time, this page is
+allocated but is not attached to the list. When the reader wants
+to read from the buffer, if its page is empty (like it is on start up)
+it will swap its page with the head_page. The old reader page will
+become part of the ring buffer and the head_page will be removed.
+The page after the inserted page (old reader_page) will become the
+new head page.
+
+Once the new page is given to the reader, the reader could do what
+it wants with it, as long as a writer has left that page.
+
+A sample of how the reader page is swapped: Note this does not
+show the head page in the buffer, it is for demonstrating a swap
+only.
+
+ +------+
+ |reader| RING BUFFER
+ |page |
+ +------+
+ +---+ +---+ +---+
+ | |-->| |-->| |
+ | |<--| |<--| |
+ +---+ +---+ +---+
+ ^ | ^ |
+ | +-------------+ |
+ +-----------------+
+
+
+ +------+
+ |reader| RING BUFFER
+ |page |-------------------+
+ +------+ v
+ | +---+ +---+ +---+
+ | | |-->| |-->| |
+ | | |<--| |<--| |<-+
+ | +---+ +---+ +---+ |
+ | ^ | ^ | |
+ | | +-------------+ | |
+ | +-----------------+ |
+ +------------------------------------+
+
+ +------+
+ |reader| RING BUFFER
+ |page |-------------------+
+ +------+ <---------------+ v
+ | ^ +---+ +---+ +---+
+ | | | |-->| |-->| |
+ | | | | | |<--| |<-+
+ | | +---+ +---+ +---+ |
+ | | | ^ | |
+ | | +-------------+ | |
+ | +-----------------------------+ |
+ +------------------------------------+
+
+ +------+
+ |buffer| RING BUFFER
+ |page |-------------------+
+ +------+ <---------------+ v
+ | ^ +---+ +---+ +---+
+ | | | | | |-->| |
+ | | New | | | |<--| |<-+
+ | | Reader +---+ +---+ +---+ |
+ | | page ----^ | |
+ | | | |
+ | +-----------------------------+ |
+ +------------------------------------+
+
+
+
+It is possible that the page swapped is the commit page and the tail page,
+if what is in the ring buffer is less than what is held in a buffer page.
+
+
+ reader page commit page tail page
+ | | |
+ v | |
+ +---+ | |
+ | |<----------+ |
+ | |<------------------------+
+ | |------+
+ +---+ |
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |--->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+This case is still valid for this algorithm.
+When the writer leaves the page, it simply goes into the ring buffer
+since the reader page still points to the next location in the ring
+buffer.
+
+
+The main pointers:
+
+ reader page - The page used solely by the reader and is not part
+ of the ring buffer (may be swapped in)
+
+ head page - the next page in the ring buffer that will be swapped
+ with the reader page.
+
+ tail page - the page where the next write will take place.
+
+ commit page - the page that last finished a write.
+
+The commit page only is updated by the outer most writer in the
+writer stack. A writer that preempts another writer will not move the
+commit page.
+
+When data is written into the ring buffer, a position is reserved
+in the ring buffer and passed back to the writer. When the writer
+is finished writing data into that position, it commits the write.
+
+Another write (or a read) may take place at anytime during this
+transaction. If another write happens it must finish before continuing
+with the previous write.
+
+
+ Write reserve:
+
+ Buffer page
+ +---------+
+ |written |
+ +---------+ <--- given back to writer (current commit)
+ |reserved |
+ +---------+ <--- tail pointer
+ | empty |
+ +---------+
+
+ Write commit:
+
+ Buffer page
+ +---------+
+ |written |
+ +---------+
+ |written |
+ +---------+ <--- next positon for write (current commit)
+ | empty |
+ +---------+
+
+
+ If a write happens after the first reserve:
+
+ Buffer page
+ +---------+
+ |written |
+ +---------+ <-- current commit
+ |reserved |
+ +---------+ <--- given back to second writer
+ |reserved |
+ +---------+ <--- tail pointer
+
+ After second writer commits:
+
+
+ Buffer page
+ +---------+
+ |written |
+ +---------+ <--(last full commit)
+ |reserved |
+ +---------+
+ |pending |
+ |commit |
+ +---------+ <--- tail pointer
+
+ When the first writer commits:
+
+ Buffer page
+ +---------+
+ |written |
+ +---------+
+ |written |
+ +---------+
+ |written |
+ +---------+ <--(last full commit and tail pointer)
+
+
+The commit pointer points to the last write location that was
+committed without preempting another write. When a write that
+preempted another write is committed, it only becomes a pending commit
+and will not be a full commit till all writes have been committed.
+
+The commit page points to the page that has the last full commit.
+The tail page points to the page with the last write (before
+committing).
+
+The tail page is always equal to or after the commit page. It may
+be several pages ahead. If the tail page catches up to the commit
+page then no more writes may take place (regardless of the mode
+of the ring buffer: overwrite and produce/consumer).
+
+The order of pages are:
+
+ head page
+ commit page
+ tail page
+
+Possible scenario:
+ tail page
+ head page commit page |
+ | | |
+ v v v
+ +---+ +---+ +---+ +---+
+<---| |--->| |--->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+There is a special case that the head page is after either the commit page
+and possibly the tail page. That is when the commit (and tail) page has been
+swapped with the reader page. This is because the head page is always
+part of the ring buffer, but the reader page is not. When ever there
+has been less than a full page that has been committed inside the ring buffer,
+and a reader swaps out a page, it will be swapping out the commit page.
+
+
+ reader page commit page tail page
+ | | |
+ v | |
+ +---+ | |
+ | |<----------+ |
+ | |<------------------------+
+ | |------+
+ +---+ |
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |--->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+ ^
+ |
+ head page
+
+
+In this case, the head page will not move when the tail and commit
+move back into the ring buffer.
+
+The reader can not swap a page into the ring buffer if the commit page
+is still on that page. If the read meets the last commit (real commit
+not pending or reserved), then there is nothing more to read.
+The buffer is considered empty until another full commit finishes.
+
+When the tail meets the head page, if the buffer is in overwrite mode,
+the head page will be pushed ahead one. If the buffer is in producer/consumer
+mode, the write will fail.
+
+Overwrite mode:
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |--->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+ ^
+ |
+ head page
+
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |--->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+ ^
+ |
+ head page
+
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |--->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+ ^
+ |
+ head page
+
+Note, the reader page will still point to the previous head page.
+But when a swap takes place, it will use the most recent head page.
+
+
+Making the Ring Buffer Lockless:
+--------------------------------
+
+The main idea behind the lockless algorithm is to combine the moving
+of the head_page pointer with the swapping of pages with the reader.
+State flags are placed inside the pointer to the page. To do this,
+each page must be aligned in memory by 4 bytes. This will allow the 2
+least significant bits of the address to be used as flags. Since
+they will always be zero for the address. To get the address,
+simply mask out the flags.
+
+ MASK = ~3
+
+ address & MASK
+
+Two flags will be kept by these two bits:
+
+ HEADER - the page being pointed to is a head page
+
+ UPDATE - the page being pointed to is being updated by a writer
+ and was or is about to be a head page.
+
+
+ reader page
+ |
+ v
+ +---+
+ | |------+
+ +---+ |
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-H->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+
+The above pointer "-H->" would have the HEADER flag set. That is
+the next page is the next page to be swapped out by the reader.
+This pointer means the next page is the head page.
+
+When the tail page meets the head pointer, it will use cmpxchg to
+change the pointer to the UPDATE state:
+
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-H->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+"-U->" represents a pointer in the UPDATE state.
+
+Any access to the reader will need to take some sort of lock to serialize
+the readers. But the writers will never take a lock to write to the
+ring buffer. This means we only need to worry about a single reader,
+and writes only preempt in "stack" formation.
+
+When the reader tries to swap the page with the ring buffer, it
+will also use cmpxchg. If the flag bit in the pointer to the
+head page does not have the HEADER flag set, the compare will fail
+and the reader will need to look for the new head page and try again.
+Note, the flag UPDATE and HEADER are never set at the same time.
+
+The reader swaps the reader page as follows:
+
+ +------+
+ |reader| RING BUFFER
+ |page |
+ +------+
+ +---+ +---+ +---+
+ | |--->| |--->| |
+ | |<---| |<---| |
+ +---+ +---+ +---+
+ ^ | ^ |
+ | +---------------+ |
+ +-----H-------------+
+
+The reader sets the reader page next pointer as HEADER to the page after
+the head page.
+
+
+ +------+
+ |reader| RING BUFFER
+ |page |-------H-----------+
+ +------+ v
+ | +---+ +---+ +---+
+ | | |--->| |--->| |
+ | | |<---| |<---| |<-+
+ | +---+ +---+ +---+ |
+ | ^ | ^ | |
+ | | +---------------+ | |
+ | +-----H-------------+ |
+ +--------------------------------------+
+
+It does a cmpxchg with the pointer to the previous head page to make it
+point to the reader page. Note that the new pointer does not have the HEADER
+flag set. This action atomically moves the head page forward.
+
+ +------+
+ |reader| RING BUFFER
+ |page |-------H-----------+
+ +------+ v
+ | ^ +---+ +---+ +---+
+ | | | |-->| |-->| |
+ | | | |<--| |<--| |<-+
+ | | +---+ +---+ +---+ |
+ | | | ^ | |
+ | | +-------------+ | |
+ | +-----------------------------+ |
+ +------------------------------------+
+
+After the new head page is set, the previous pointer of the head page is
+updated to the reader page.
+
+ +------+
+ |reader| RING BUFFER
+ |page |-------H-----------+
+ +------+ <---------------+ v
+ | ^ +---+ +---+ +---+
+ | | | |-->| |-->| |
+ | | | | | |<--| |<-+
+ | | +---+ +---+ +---+ |
+ | | | ^ | |
+ | | +-------------+ | |
+ | +-----------------------------+ |
+ +------------------------------------+
+
+ +------+
+ |buffer| RING BUFFER
+ |page |-------H-----------+ <--- New head page
+ +------+ <---------------+ v
+ | ^ +---+ +---+ +---+
+ | | | | | |-->| |
+ | | New | | | |<--| |<-+
+ | | Reader +---+ +---+ +---+ |
+ | | page ----^ | |
+ | | | |
+ | +-----------------------------+ |
+ +------------------------------------+
+
+Another important point. The page that the reader page points back to
+by its previous pointer (the one that now points to the new head page)
+never points back to the reader page. That is because the reader page is
+not part of the ring buffer. Traversing the ring buffer via the next pointers
+will always stay in the ring buffer. Traversing the ring buffer via the
+prev pointers may not.
+
+Note, the way to determine a reader page is simply by examining the previous
+pointer of the page. If the next pointer of the previous page does not
+point back to the original page, then the original page is a reader page:
+
+
+ +--------+
+ | reader | next +----+
+ | page |-------->| |<====== (buffer page)
+ +--------+ +----+
+ | | ^
+ | v | next
+ prev | +----+
+ +------------->| |
+ +----+
+
+The way the head page moves forward:
+
+When the tail page meets the head page and the buffer is in overwrite mode
+and more writes take place, the head page must be moved forward before the
+writer may move the tail page. The way this is done is that the writer
+performs a cmpxchg to convert the pointer to the head page from the HEADER
+flag to have the UPDATE flag set. Once this is done, the reader will
+not be able to swap the head page from the buffer, nor will it be able to
+move the head page, until the writer is finished with the move.
+
+This eliminates any races that the reader can have on the writer. The reader
+must spin, and this is why the reader can not preempt the writer.
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-H->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+The following page will be made into the new head page.
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |-H->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+After the new head page has been set, we can set the old head page
+pointer back to NORMAL.
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |--->| |-H->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+After the head page has been moved, the tail page may now move forward.
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |--->| |-H->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+
+The above are the trivial updates. Now for the more complex scenarios.
+
+
+As stated before, if enough writes preempt the first write, the
+tail page may make it all the way around the buffer and meet the commit
+page. At this time, we must start dropping writes (usually with some kind
+of warning to the user). But what happens if the commit was still on the
+reader page? The commit page is not part of the ring buffer. The tail page
+must account for this.
+
+
+ reader page commit page
+ | |
+ v |
+ +---+ |
+ | |<----------+
+ | |
+ | |------+
+ +---+ |
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-H->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+ ^
+ |
+ tail page
+
+If the tail page were to simply push the head page forward, the commit when
+leaving the reader page would not be pointing to the correct page.
+
+The solution to this is to test if the commit page is on the reader page
+before pushing the head page. If it is, then it can be assumed that the
+tail page wrapped the buffer, and we must drop new writes.
+
+This is not a race condition, because the commit page can only be moved
+by the outter most writer (the writer that was preempted).
+This means that the commit will not move while a writer is moving the
+tail page. The reader can not swap the reader page if it is also being
+used as the commit page. The reader can simply check that the commit
+is off the reader page. Once the commit page leaves the reader page
+it will never go back on it unless a reader does another swap with the
+buffer page that is also the commit page.
+
+
+Nested writes
+-------------
+
+In the pushing forward of the tail page we must first push forward
+the head page if the head page is the next page. If the head page
+is not the next page, the tail page is simply updated with a cmpxchg.
+
+Only writers move the tail page. This must be done atomically to protect
+against nested writers.
+
+ temp_page = tail_page
+ next_page = temp_page->next
+ cmpxchg(tail_page, temp_page, next_page)
+
+The above will update the tail page if it is still pointing to the expected
+page. If this fails, a nested write pushed it forward, the the current write
+does not need to push it.
+
+
+ temp page
+ |
+ v
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |--->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+Nested write comes in and moves the tail page forward:
+
+ tail page (moved by nested writer)
+ temp page |
+ | |
+ v v
+ +---+ +---+ +---+ +---+
+<---| |--->| |--->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+The above would fail the cmpxchg, but since the tail page has already
+been moved forward, the writer will just try again to reserve storage
+on the new tail page.
+
+But the moving of the head page is a bit more complex.
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-H->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+The write converts the head page pointer to UPDATE.
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+But if a nested writer preempts here. It will see that the next
+page is a head page, but it is also nested. It will detect that
+it is nested and will save that information. The detection is the
+fact that it sees the UPDATE flag instead of a HEADER or NORMAL
+pointer.
+
+The nested writer will set the new head page pointer.
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |-H->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+But it will not reset the update back to normal. Only the writer
+that converted a pointer from HEAD to UPDATE will convert it back
+to NORMAL.
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |-H->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+After the nested writer finishes, the outer most writer will convert
+the UPDATE pointer to NORMAL.
+
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |--->| |-H->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+
+It can be even more complex if several nested writes came in and moved
+the tail page ahead several pages:
+
+
+(first writer)
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-H->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+The write converts the head page pointer to UPDATE.
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |--->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+Next writer comes in, and sees the update and sets up the new
+head page.
+
+(second writer)
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |-H->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+The nested writer moves the tail page forward. But does not set the old
+update page to NORMAL because it is not the outer most writer.
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |-H->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+Another writer preempts and sees the page after the tail page is a head page.
+It changes it from HEAD to UPDATE.
+
+(third writer)
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |-U->| |--->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+The writer will move the head page forward:
+
+
+(third writer)
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |-U->| |-H->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+But now that the third writer did change the HEAD flag to UPDATE it
+will convert it to normal:
+
+
+(third writer)
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |--->| |-H->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+
+Then it will move the tail page, and return back to the second writer.
+
+
+(second writer)
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |--->| |-H->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+
+The second writer will fail to move the tail page because it was already
+moved, so it will try again and add its data to the new tail page.
+It will return to the first writer.
+
+
+(first writer)
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |--->| |-H->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+The first writer can not know atomically test if the tail page moved
+while it updates the HEAD page. It will then update the head page to
+what it thinks is the new head page.
+
+
+(first writer)
+
+ tail page
+ |
+ v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |-H->| |-H->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+Since the cmpxchg returns the old value of the pointer the first writer
+will see it succeeded in updating the pointer from NORMAL to HEAD.
+But as we can see, this is not good enough. It must also check to see
+if the tail page is either where it use to be or on the next page:
+
+
+(first writer)
+
+ A B tail page
+ | | |
+ v v v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |-H->| |-H->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+If tail page != A and tail page does not equal B, then it must reset the
+pointer back to NORMAL. The fact that it only needs to worry about
+nested writers, it only needs to check this after setting the HEAD page.
+
+
+(first writer)
+
+ A B tail page
+ | | |
+ v v v
+ +---+ +---+ +---+ +---+
+<---| |--->| |-U->| |--->| |-H->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
+Now the writer can update the head page. This is also why the head page must
+remain in UPDATE and only reset by the outer most writer. This prevents
+the reader from seeing the incorrect head page.
+
+
+(first writer)
+
+ A B tail page
+ | | |
+ v v v
+ +---+ +---+ +---+ +---+
+<---| |--->| |--->| |--->| |-H->
+--->| |<---| |<---| |<---| |<---
+ +---+ +---+ +---+ +---+
+
diff --git a/arch/Kconfig b/arch/Kconfig
index 99193b16023..beea3ccebb5 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -30,6 +30,18 @@ config OPROFILE_IBS
If unsure, say N.
+config OPROFILE_EVENT_MULTIPLEX
+ bool "OProfile multiplexing support (EXPERIMENTAL)"
+ default n
+ depends on OPROFILE && X86
+ help
+ The number of hardware counters is limited. The multiplexing
+ feature enables OProfile to gather more events than counters
+ are provided by the hardware. This is realized by switching
+ between events at an user specified time interval.
+
+ If unsure, say N.
+
config HAVE_OPROFILE
bool
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index 5a61b5c2e18..8d3c79cd81e 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -44,7 +44,6 @@ static inline void dma_free_coherent(struct device *dev, size_t size,
#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
#define get_dma_ops(dev) platform_dma_get_ops(dev)
-#define flush_write_buffers()
#include <asm-generic/dma-mapping-common.h>
@@ -69,6 +68,24 @@ dma_set_mask (struct device *dev, u64 mask)
return 0;
}
+static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+{
+ if (!dev->dma_mask)
+ return 0;
+
+ return addr + size <= *dev->dma_mask;
+}
+
+static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+ return paddr;
+}
+
+static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
+{
+ return daddr;
+}
+
extern int dma_get_cache_alignment(void);
static inline void
diff --git a/arch/ia64/xen/time.c b/arch/ia64/xen/time.c
index fb833269017..dbeadb9c8e2 100644
--- a/arch/ia64/xen/time.c
+++ b/arch/ia64/xen/time.c
@@ -133,8 +133,7 @@ consider_steal_time(unsigned long new_itm)
account_idle_ticks(blocked);
run_local_timers();
- if (rcu_pending(cpu))
- rcu_check_callbacks(cpu, user_mode(get_irq_regs()));
+ rcu_check_callbacks(cpu, user_mode(get_irq_regs()));
scheduler_tick();
run_posix_cpu_timers(p);
diff --git a/arch/m68k/include/asm/entry_mm.h b/arch/m68k/include/asm/entry_mm.h
index 5202f5a5b42..47412588621 100644
--- a/arch/m68k/include/asm/entry_mm.h
+++ b/arch/m68k/include/asm/entry_mm.h
@@ -46,7 +46,6 @@
#define curptr a2
LFLUSH_I_AND_D = 0x00000808
-LSIGTRAP = 5
/* process bits for task_struct.ptrace */
PT_TRACESYS_OFF = 3
@@ -118,9 +117,6 @@ PT_DTRACE_BIT = 2
#define STR(X) STR1(X)
#define STR1(X) #X
-#define PT_OFF_ORIG_D0 0x24
-#define PT_OFF_FORMATVEC 0x32
-#define PT_OFF_SR 0x2C
#define SAVE_ALL_INT \
"clrl %%sp@-;" /* stk_adj */ \
"pea -1:w;" /* orig d0 = -1 */ \
diff --git a/arch/m68k/include/asm/entry_no.h b/arch/m68k/include/asm/entry_no.h
index c2553d26273..907ed03d792 100644
--- a/arch/m68k/include/asm/entry_no.h
+++ b/arch/m68k/include/asm/entry_no.h
@@ -72,8 +72,8 @@ LENOSYS = 38
lea %sp@(-32),%sp /* space for 8 regs */
moveml %d1-%d5/%a0-%a2,%sp@
movel sw_usp,%a0 /* get usp */
- movel %a0@-,%sp@(PT_PC) /* copy exception program counter */
- movel %a0@-,%sp@(PT_FORMATVEC)/* copy exception format/vector/sr */
+ movel %a0@-,%sp@(PT_OFF_PC) /* copy exception program counter */
+ movel %a0@-,%sp@(PT_OFF_FORMATVEC)/*copy exception format/vector/sr */
bra 7f
6:
clrl %sp@- /* stkadj */
@@ -89,8 +89,8 @@ LENOSYS = 38
bnes 8f /* no, skip */
move #0x2700,%sr /* disable intrs */
movel sw_usp,%a0 /* get usp */
- movel %sp@(PT_PC),%a0@- /* copy exception program counter */
- movel %sp@(PT_FORMATVEC),%a0@-/* copy exception format/vector/sr */
+ movel %sp@(PT_OFF_PC),%a0@- /* copy exception program counter */
+ movel %sp@(PT_OFF_FORMATVEC),%a0@-/*copy exception format/vector/sr */
moveml %sp@,%d1-%d5/%a0-%a2
lea %sp@(32),%sp /* space for 8 regs */
movel %sp@+,%d0
diff --git a/arch/m68k/include/asm/math-emu.h b/arch/m68k/include/asm/math-emu.h
index ddfab96403c..5e9249b0014 100644
--- a/arch/m68k/include/asm/math-emu.h
+++ b/arch/m68k/include/asm/math-emu.h
@@ -145,16 +145,16 @@ extern unsigned int fp_debugprint;
* these are only used during instruction decoding
* where we always know how deep we're on the stack.
*/
-#define FPS_DO (PT_D0)
-#define FPS_D1 (PT_D1)
-#define FPS_D2 (PT_D2)
-#define FPS_A0 (PT_A0)
-#define FPS_A1 (PT_A1)
-#define FPS_A2 (PT_A2)
-#define FPS_SR (PT_SR)
-#define FPS_PC (PT_PC)
-#define FPS_EA (PT_PC+6)
-#define FPS_PC2 (PT_PC+10)
+#define FPS_DO (PT_OFF_D0)
+#define FPS_D1 (PT_OFF_D1)
+#define FPS_D2 (PT_OFF_D2)
+#define FPS_A0 (PT_OFF_A0)
+#define FPS_A1 (PT_OFF_A1)
+#define FPS_A2 (PT_OFF_A2)
+#define FPS_SR (PT_OFF_SR)
+#define FPS_PC (PT_OFF_PC)
+#define FPS_EA (PT_OFF_PC+6)
+#define FPS_PC2 (PT_OFF_PC+10)
.macro fp_get_fp_reg
lea (FPD_FPREG,FPDATA,%d0.w*4),%a0
diff --git a/arch/m68k/include/asm/thread_info_mm.h b/arch/m68k/include/asm/thread_info_mm.h
index 6ea5c33b3c5..b6da3882be9 100644
--- a/arch/m68k/include/asm/thread_info_mm.h
+++ b/arch/m68k/include/asm/thread_info_mm.h
@@ -1,6 +1,10 @@
#ifndef _ASM_M68K_THREAD_INFO_H
#define _ASM_M68K_THREAD_INFO_H
+#ifndef ASM_OFFSETS_C
+#include <asm/asm-offsets.h>
+#endif
+#include <asm/current.h>
#include <asm/types.h>
#include <asm/page.h>
@@ -31,7 +35,12 @@ struct thread_info {
#define init_thread_info (init_task.thread.info)
#define init_stack (init_thread_union.stack)
-#define task_thread_info(tsk) (&(tsk)->thread.info)
+#ifdef ASM_OFFSETS_C
+#define task_thread_info(tsk) ((struct thread_info *) NULL)
+#else
+#define task_thread_info(tsk) ((struct thread_info *)((char *)tsk+TASK_TINFO))
+#endif
+
#define task_stack_page(tsk) ((tsk)->stack)
#define current_thread_info() task_thread_info(current)
diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c
index b1f012f6c49..73e5e581245 100644
--- a/arch/m68k/kernel/asm-offsets.c
+++ b/arch/m68k/kernel/asm-offsets.c
@@ -8,6 +8,8 @@
* #defines from the assembly-language output.
*/
+#define ASM_OFFSETS_C
+
#include <linux/stddef.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
@@ -27,6 +29,9 @@ int main(void)
DEFINE(TASK_INFO, offsetof(struct task_struct, thread.info));
DEFINE(TASK_MM, offsetof(struct task_struct, mm));
DEFINE(TASK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
+#ifdef CONFIG_MMU
+ DEFINE(TASK_TINFO, offsetof(struct task_struct, thread.info));
+#endif
/* offsets into the thread struct */
DEFINE(THREAD_KSP, offsetof(struct thread_struct, ksp));
@@ -44,20 +49,20 @@ int main(void)
DEFINE(TINFO_FLAGS, offsetof(struct thread_info, flags));
/* offsets into the pt_regs */
- DEFINE(PT_D0, offsetof(struct pt_regs, d0));
- DEFINE(PT_ORIG_D0, offsetof(struct pt_regs, orig_d0));
- DEFINE(PT_D1, offsetof(struct pt_regs, d1));
- DEFINE(PT_D2, offsetof(struct pt_regs, d2));
- DEFINE(PT_D3, offsetof(struct pt_regs, d3));
- DEFINE(PT_D4, offsetof(struct pt_regs, d4));
- DEFINE(PT_D5, offsetof(struct pt_regs, d5));
- DEFINE(PT_A0, offsetof(struct pt_regs, a0));
- DEFINE(PT_A1, offsetof(struct pt_regs, a1));
- DEFINE(PT_A2, offsetof(struct pt_regs, a2));
- DEFINE(PT_PC, offsetof(struct pt_regs, pc));
- DEFINE(PT_SR, offsetof(struct pt_regs, sr));
+ DEFINE(PT_OFF_D0, offsetof(struct pt_regs, d0));
+ DEFINE(PT_OFF_ORIG_D0, offsetof(struct pt_regs, orig_d0));
+ DEFINE(PT_OFF_D1, offsetof(struct pt_regs, d1));
+ DEFINE(PT_OFF_D2, offsetof(struct pt_regs, d2));
+ DEFINE(PT_OFF_D3, offsetof(struct pt_regs, d3));
+ DEFINE(PT_OFF_D4, offsetof(struct pt_regs, d4));
+ DEFINE(PT_OFF_D5, offsetof(struct pt_regs, d5));
+ DEFINE(PT_OFF_A0, offsetof(struct pt_regs, a0));
+ DEFINE(PT_OFF_A1, offsetof(struct pt_regs, a1));
+ DEFINE(PT_OFF_A2, offsetof(struct pt_regs, a2));
+ DEFINE(PT_OFF_PC, offsetof(struct pt_regs, pc));
+ DEFINE(PT_OFF_SR, offsetof(struct pt_regs, sr));
/* bitfields are a bit difficult */
- DEFINE(PT_VECTOR, offsetof(struct pt_regs, pc) + 4);
+ DEFINE(PT_OFF_FORMATVEC, offsetof(struct pt_regs, pc) + 4);
/* offsets into the irq_handler struct */
DEFINE(IRQ_HANDLER, offsetof(struct irq_node, handler));
@@ -84,10 +89,10 @@ int main(void)
DEFINE(FONT_DESC_PREF, offsetof(struct font_desc, pref));
/* signal defines */
- DEFINE(SIGSEGV, SIGSEGV);
- DEFINE(SEGV_MAPERR, SEGV_MAPERR);
- DEFINE(SIGTRAP, SIGTRAP);
- DEFINE(TRAP_TRACE, TRAP_TRACE);
+ DEFINE(LSIGSEGV, SIGSEGV);
+ DEFINE(LSEGV_MAPERR, SEGV_MAPERR);
+ DEFINE(LSIGTRAP, SIGTRAP);
+ DEFINE(LTRAP_TRACE, TRAP_TRACE);
/* offsets into the custom struct */
DEFINE(CUSTOMBASE, &amiga_custom);
diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S
index c3735cd6207..922f52e7ed1 100644
--- a/arch/m68k/kernel/entry.S
+++ b/arch/m68k/kernel/entry.S
@@ -77,17 +77,17 @@ ENTRY(ret_from_fork)
jra .Lret_from_exception
do_trace_entry:
- movel #-ENOSYS,%sp@(PT_D0) | needed for strace
+ movel #-ENOSYS,%sp@(PT_OFF_D0)| needed for strace
subql #4,%sp
SAVE_SWITCH_STACK
jbsr syscall_trace
RESTORE_SWITCH_STACK
addql #4,%sp
- movel %sp@(PT_ORIG_D0),%d0
+ movel %sp@(PT_OFF_ORIG_D0),%d0
cmpl #NR_syscalls,%d0
jcs syscall
badsys:
- movel #-ENOSYS,%sp@(PT_D0)
+ movel #-ENOSYS,%sp@(PT_OFF_D0)
jra ret_from_syscall
do_trace_exit:
@@ -103,7 +103,7 @@ ENTRY(ret_from_signal)
addql #4,%sp
/* on 68040 complete pending writebacks if any */
#ifdef CONFIG_M68040
- bfextu %sp@(PT_VECTOR){#0,#4},%d0
+ bfextu %sp@(PT_OFF_FORMATVEC){#0,#4},%d0
subql #7,%d0 | bus error frame ?
jbne 1f
movel %sp,%sp@-
@@ -127,7 +127,7 @@ ENTRY(system_call)
jcc badsys
syscall:
jbsr @(sys_call_table,%d0:l:4)@(0)
- movel %d0,%sp@(PT_D0) | save the return value
+ movel %d0,%sp@(PT_OFF_D0) | save the return value
ret_from_syscall:
|oriw #0x0700,%sr
movew %curptr@(TASK_INFO+TINFO_FLAGS+2),%d0
@@ -135,7 +135,7 @@ ret_from_syscall:
1: RESTORE_ALL
syscall_exit_work:
- btst #5,%sp@(PT_SR) | check if returning to kernel
+ btst #5,%sp@(PT_OFF_SR) | check if returning to kernel
bnes 1b | if so, skip resched, signals
lslw #1,%d0
jcs do_trace_exit
@@ -148,7 +148,7 @@ syscall_exit_work:
ENTRY(ret_from_exception)
.Lret_from_exception:
- btst #5,%sp@(PT_SR) | check if returning to kernel
+ btst #5,%sp@(PT_OFF_SR) | check if returning to kernel
bnes 1f | if so, skip resched, signals
| only allow interrupts when we are really the last one on the
| kernel stack, otherwise stack overflow can occur during
@@ -182,7 +182,7 @@ do_signal_return:
jbra resume_userspace
do_delayed_trace:
- bclr #7,%sp@(PT_SR) | clear trace bit in SR
+ bclr #7,%sp@(PT_OFF_SR) | clear trace bit in SR
pea 1 | send SIGTRAP
movel %curptr,%sp@-
pea LSIGTRAP
@@ -199,7 +199,7 @@ ENTRY(auto_inthandler)
GET_CURRENT(%d0)
addqb #1,%curptr@(TASK_INFO+TINFO_PREEMPT+1)
| put exception # in d0
- bfextu %sp@(PT_VECTOR){#4,#10},%d0
+ bfextu %sp@(PT_OFF_FORMATVEC){#4,#10},%d0
subw #VEC_SPUR,%d0
movel %sp,%sp@-
@@ -216,7 +216,7 @@ ret_from_interrupt:
ALIGN
ret_from_last_interrupt:
moveq #(~ALLOWINT>>8)&0xff,%d0
- andb %sp@(PT_SR),%d0
+ andb %sp@(PT_OFF_SR),%d0
jne 2b
/* check if we need to do software interrupts */
@@ -232,7 +232,7 @@ ENTRY(user_inthandler)
GET_CURRENT(%d0)
addqb #1,%curptr@(TASK_INFO+TINFO_PREEMPT+1)
| put exception # in d0
- bfextu %sp@(PT_VECTOR){#4,#10},%d0
+ bfextu %sp@(PT_OFF_FORMATVEC){#4,#10},%d0
user_irqvec_fixup = . + 2
subw #VEC_USER,%d0
diff --git a/arch/m68k/math-emu/fp_entry.S b/arch/m68k/math-emu/fp_entry.S
index 954b4f304a7..a3fe1f348df 100644
--- a/arch/m68k/math-emu/fp_entry.S
+++ b/arch/m68k/math-emu/fp_entry.S
@@ -85,8 +85,8 @@ fp_err_ua2:
fp_err_ua1:
addq.l #4,%sp
move.l %a0,-(%sp)
- pea SEGV_MAPERR
- pea SIGSEGV
+ pea LSEGV_MAPERR
+ pea LSIGSEGV
jsr fpemu_signal
add.w #12,%sp
jra ret_from_exception
@@ -96,8 +96,8 @@ fp_err_ua1:
| it does not really belong here, but...
fp_sendtrace060:
move.l (FPS_PC,%sp),-(%sp)
- pea TRAP_TRACE
- pea SIGTRAP
+ pea LTRAP_TRACE
+ pea LSIGTRAP
jsr fpemu_signal
add.w #12,%sp
jra ret_from_exception
@@ -122,17 +122,17 @@ fp_get_data_reg:
.long fp_get_d6, fp_get_d7
fp_get_d0:
- move.l (PT_D0+8,%sp),%d0
+ move.l (PT_OFF_D0+8,%sp),%d0
printf PREGISTER,"{d0->%08x}",1,%d0
rts
fp_get_d1:
- move.l (PT_D1+8,%sp),%d0
+ move.l (PT_OFF_D1+8,%sp),%d0
printf PREGISTER,"{d1->%08x}",1,%d0
rts
fp_get_d2:
- move.l (PT_D2+8,%sp),%d0
+ move.l (PT_OFF_D2+8,%sp),%d0
printf PREGISTER,"{d2->%08x}",1,%d0
rts
@@ -173,35 +173,35 @@ fp_put_data_reg:
fp_put_d0:
printf PREGISTER,"{d0<-%08x}",1,%d0
- move.l %d0,(PT_D0+8,%sp)
+ move.l %d0,(PT_OFF_D0+8,%sp)
rts
fp_put_d1:
printf PREGISTER,"{d1<-%08x}",1,%d0
- move.l %d0,(PT_D1+8,%sp)
+ move.l %d0,(PT_OFF_D1+8,%sp)
rts
fp_put_d2:
printf PREGISTER,"{d2<-%08x}",1,%d0
- move.l %d0,(PT_D2+8,%sp)
+ move.l %d0,(PT_OFF_D2+8,%sp)
rts
fp_put_d3:
printf PREGISTER,"{d3<-%08x}",1,%d0
| move.l %d0,%d3
- move.l %d0,(PT_D3+8,%sp)
+ move.l %d0,(PT_OFF_D3+8,%sp)
rts
fp_put_d4:
printf PREGISTER,"{d4<-%08x}",1,%d0
| move.l %d0,%d4
- move.l %d0,(PT_D4+8,%sp)
+ move.l %d0,(PT_OFF_D4+8,%sp)
rts
fp_put_d5:
printf PREGISTER,"{d5<-%08x}",1,%d0
| move.l %d0,%d5
- move.l %d0,(PT_D5+8,%sp)
+ move.l %d0,(PT_OFF_D5+8,%sp)
rts
fp_put_d6:
@@ -225,17 +225,17 @@ fp_get_addr_reg:
.long fp_get_a6, fp_get_a7
fp_get_a0:
- move.l (PT_A0+8,%sp),%a0
+ move.l (PT_OFF_A0+8,%sp),%a0
printf PREGISTER,"{a0->%08x}",1,%a0
rts
fp_get_a1:
- move.l (PT_A1+8,%sp),%a0
+ move.l (PT_OFF_A1+8,%sp),%a0
printf PREGISTER,"{a1->%08x}",1,%a0
rts
fp_get_a2:
- move.l (PT_A2+8,%sp),%a0
+ move.l (PT_OFF_A2+8,%sp),%a0
printf PREGISTER,"{a2->%08x}",1,%a0
rts
@@ -276,17 +276,17 @@ fp_put_addr_reg:
fp_put_a0:
printf PREGISTER,"{a0<-%08x}",1,%a0
- move.l %a0,(PT_A0+8,%sp)
+ move.l %a0,(PT_OFF_A0+8,%sp)
rts
fp_put_a1:
printf PREGISTER,"{a1<-%08x}",1,%a0
- move.l %a0,(PT_A1+8,%sp)
+ move.l %a0,(PT_OFF_A1+8,%sp)
rts
fp_put_a2:
printf PREGISTER,"{a2<-%08x}",1,%a0
- move.l %a0,(PT_A2+8,%sp)
+ move.l %a0,(PT_OFF_A2+8,%sp)
rts
fp_put_a3:
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index b44aaabdd1a..0c34371ec49 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -424,6 +424,29 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
#endif
}
+static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
+ if (ops->addr_needs_map && ops->addr_needs_map(dev, addr, size))
+ return 0;
+
+ if (!dev->dma_mask)
+ return 0;
+
+ return addr + size <= *dev->dma_mask;
+}
+
+static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+ return paddr + get_dma_direct_offset(dev);
+}
+
+static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
+{
+ return daddr - get_dma_direct_offset(dev);
+}
+
#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
#ifdef CONFIG_NOT_COHERENT_CACHE
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index eb17da78112..2a5da069714 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -104,8 +104,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
else
pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte));
-#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP)
- /* Second case is 32-bit with 64-bit PTE in SMP mode. In this case, we
+#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT)
+ /* Second case is 32-bit with 64-bit PTE. In this case, we
* can just store as long as we do the two halves in the right order
* with a barrier in between. This is possible because we take care,
* in the hash code, to pre-invalidate if the PTE was already hashed,
@@ -140,7 +140,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
#else
/* Anything else just stores the PTE normally. That covers all 64-bit
- * cases, and 32-bit non-hash with 64-bit PTEs in UP mode
+ * cases, and 32-bit non-hash with 32-bit PTEs.
*/
*ptep = pte;
#endif
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index c3b193121f8..198266cf9e2 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -54,7 +54,7 @@
* This returns the old value in the lock, so we succeeded
* in getting the lock if the return value is 0.
*/
-static inline unsigned long __spin_trylock(raw_spinlock_t *lock)
+static inline unsigned long arch_spin_trylock(raw_spinlock_t *lock)
{
unsigned long tmp, token;
@@ -76,7 +76,7 @@ static inline unsigned long __spin_trylock(raw_spinlock_t *lock)
static inline int __raw_spin_trylock(raw_spinlock_t *lock)
{
CLEAR_IO_SYNC;
- return __spin_trylock(lock) == 0;
+ return arch_spin_trylock(lock) == 0;
}
/*
@@ -108,7 +108,7 @@ static inline void __raw_spin_lock(raw_spinlock_t *lock)
{
CLEAR_IO_SYNC;
while (1) {
- if (likely(__spin_trylock(lock) == 0))
+ if (likely(arch_spin_trylock(lock) == 0))
break;
do {
HMT_low();
@@ -126,7 +126,7 @@ void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
CLEAR_IO_SYNC;
while (1) {
- if (likely(__spin_trylock(lock) == 0))
+ if (likely(arch_spin_trylock(lock) == 0))
break;
local_save_flags(flags_dis);
local_irq_restore(flags);
@@ -181,7 +181,7 @@ extern void __raw_spin_unlock_wait(raw_spinlock_t *lock);
* This returns the old value in the lock + 1,
* so we got a read lock if the return value is > 0.
*/
-static inline long __read_trylock(raw_rwlock_t *rw)
+static inline long arch_read_trylock(raw_rwlock_t *rw)
{
long tmp;
@@ -205,7 +205,7 @@ static inline long __read_trylock(raw_rwlock_t *rw)
* This returns the old value in the lock,
* so we got the write lock if the return value is 0.
*/
-static inline long __write_trylock(raw_rwlock_t *rw)
+static inline long arch_write_trylock(raw_rwlock_t *rw)
{
long tmp, token;
@@ -228,7 +228,7 @@ static inline long __write_trylock(raw_rwlock_t *rw)
static inline void __raw_read_lock(raw_rwlock_t *rw)
{
while (1) {
- if (likely(__read_trylock(rw) > 0))
+ if (likely(arch_read_trylock(rw) > 0))
break;
do {
HMT_low();
@@ -242,7 +242,7 @@ static inline void __raw_read_lock(raw_rwlock_t *rw)
static inline void __raw_write_lock(raw_rwlock_t *rw)
{
while (1) {
- if (likely(__write_trylock(rw) == 0))
+ if (likely(arch_write_trylock(rw) == 0))
break;
do {
HMT_low();
@@ -255,12 +255,12 @@ static inline void __raw_write_lock(raw_rwlock_t *rw)
static inline int __raw_read_trylock(raw_rwlock_t *rw)
{
- return __read_trylock(rw) > 0;
+ return arch_read_trylock(rw) > 0;
}
static inline int __raw_write_trylock(raw_rwlock_t *rw)
{
- return __write_trylock(rw) == 0;
+ return arch_write_trylock(rw) == 0;
}
static inline void __raw_read_unlock(raw_rwlock_t *rw)
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index b73396b9390..9619285f64e 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -97,7 +97,7 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
-obj-$(CONFIG_PPC_PERF_CTRS) += perf_counter.o
+obj-$(CONFIG_PPC_PERF_CTRS) += perf_counter.o perf_callchain.o
obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \
power5+-pmu.o power6-pmu.o power7-pmu.o
obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 561b6465231..197b15646ee 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -67,6 +67,8 @@ int main(void)
DEFINE(MMCONTEXTID, offsetof(struct mm_struct, context.id));
#ifdef CONFIG_PPC64
DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context));
+ DEFINE(SIGSEGV, SIGSEGV);
+ DEFINE(NMI_MASK, NMI_MASK);
#else
DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
index 68ccf11e4f1..e8a57de85bc 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -24,50 +24,12 @@
int swiotlb __read_mostly;
unsigned int ppc_swiotlb_enable;
-void *swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t addr)
-{
- unsigned long pfn = PFN_DOWN(swiotlb_bus_to_phys(hwdev, addr));
- void *pageaddr = page_address(pfn_to_page(pfn));
-
- if (pageaddr != NULL)
- return pageaddr + (addr % PAGE_SIZE);
- return NULL;
-}
-
-dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
-{
- return paddr + get_dma_direct_offset(hwdev);
-}
-
-phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
-
-{
- return baddr - get_dma_direct_offset(hwdev);
-}
-
-/*
- * Determine if an address needs bounce buffering via swiotlb.
- * Going forward I expect the swiotlb code to generalize on using
- * a dma_ops->addr_needs_map, and this function will move from here to the
- * generic swiotlb code.
- */
-int
-swiotlb_arch_address_needs_mapping(struct device *hwdev, dma_addr_t addr,
- size_t size)
-{
- struct dma_mapping_ops *dma_ops = get_dma_ops(hwdev);
-
- BUG_ON(!dma_ops);
- return dma_ops->addr_needs_map(hwdev, addr, size);
-}
-
/*
* Determine if an address is reachable by a pci device, or if we must bounce.
*/
static int
swiotlb_pci_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size)
{
- u64 mask = dma_get_mask(hwdev);
dma_addr_t max;
struct pci_controller *hose;
struct pci_dev *pdev = to_pci_dev(hwdev);
@@ -79,16 +41,9 @@ swiotlb_pci_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size)
if ((addr + size > max) | (addr < hose->dma_window_base_cur))
return 1;
- return !is_buffer_dma_capable(mask, addr, size);
-}
-
-static int
-swiotlb_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size)
-{
- return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
+ return 0;
}
-
/*
* At the moment, all platforms that use this code only require
* swiotlb to be used if we're operating on HIGHMEM. Since
@@ -104,7 +59,6 @@ struct dma_mapping_ops swiotlb_dma_ops = {
.dma_supported = swiotlb_dma_supported,
.map_page = swiotlb_map_page,
.unmap_page = swiotlb_unmap_page,
- .addr_needs_map = swiotlb_addr_needs_map,
.sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
.sync_single_range_for_device = swiotlb_sync_single_range_for_device,
.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index eb898112e57..8ac85e08ffa 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -729,6 +729,11 @@ BEGIN_FTR_SECTION
bne- do_ste_alloc /* If so handle it */
END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
+ clrrdi r11,r1,THREAD_SHIFT
+ lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */
+ andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */
+ bne 77f /* then don't call hash_page now */
+
/*
* On iSeries, we soft-disable interrupts here, then
* hard-enable interrupts so that the hash_page code can spin on
@@ -833,6 +838,20 @@ handle_page_fault:
bl .low_hash_fault
b .ret_from_except
+/*
+ * We come here as a result of a DSI at a point where we don't want
+ * to call hash_page, such as when we are accessing memory (possibly
+ * user memory) inside a PMU interrupt that occurred while interrupts
+ * were soft-disabled. We want to invoke the exception handler for
+ * the access, or panic if there isn't a handler.
+ */
+77: bl .save_nvgprs
+ mr r4,r3
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ li r5,SIGSEGV
+ bl .bad_page_fault
+ b .ret_from_except
+
/* here we have a segment miss */
do_ste_alloc:
bl .ste_allocate /* try to insert stab entry */
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c
new file mode 100644
index 00000000000..f74b62c6751
--- /dev/null
+++ b/arch/powerpc/kernel/perf_callchain.c
@@ -0,0 +1,527 @@
+/*
+ * Performance counter callchain support - powerpc architecture code
+ *
+ * Copyright © 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_counter.h>
+#include <linux/percpu.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <asm/ptrace.h>
+#include <asm/pgtable.h>
+#include <asm/sigcontext.h>
+#include <asm/ucontext.h>
+#include <asm/vdso.h>
+#ifdef CONFIG_PPC64
+#include "ppc32.h"
+#endif
+
+/*
+ * Store another value in a callchain_entry.
+ */
+static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
+{
+ unsigned int nr = entry->nr;
+
+ if (nr < PERF_MAX_STACK_DEPTH) {
+ entry->ip[nr] = ip;
+ entry->nr = nr + 1;
+ }
+}
+
+/*
+ * Is sp valid as the address of the next kernel stack frame after prev_sp?
+ * The next frame may be in a different stack area but should not go
+ * back down in the same stack area.
+ */
+static int valid_next_sp(unsigned long sp, unsigned long prev_sp)
+{
+ if (sp & 0xf)
+ return 0; /* must be 16-byte aligned */
+ if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
+ return 0;
+ if (sp >= prev_sp + STACK_FRAME_OVERHEAD)
+ return 1;
+ /*
+ * sp could decrease when we jump off an interrupt stack
+ * back to the regular process stack.
+ */
+ if ((sp & ~(THREAD_SIZE - 1)) != (prev_sp & ~(THREAD_SIZE - 1)))
+ return 1;
+ return 0;
+}
+
+static void perf_callchain_kernel(struct pt_regs *regs,
+ struct perf_callchain_entry *entry)
+{
+ unsigned long sp, next_sp;
+ unsigned long next_ip;
+ unsigned long lr;
+ long level = 0;
+ unsigned long *fp;
+
+ lr = regs->link;
+ sp = regs->gpr[1];
+ callchain_store(entry, PERF_CONTEXT_KERNEL);
+ callchain_store(entry, regs->nip);
+
+ if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
+ return;
+
+ for (;;) {
+ fp = (unsigned long *) sp;
+ next_sp = fp[0];
+
+ if (next_sp == sp + STACK_INT_FRAME_SIZE &&
+ fp[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
+ /*
+ * This looks like an interrupt frame for an
+ * interrupt that occurred in the kernel
+ */
+ regs = (struct pt_regs *)(sp + STACK_FRAME_OVERHEAD);
+ next_ip = regs->nip;
+ lr = regs->link;
+ level = 0;
+ callchain_store(entry, PERF_CONTEXT_KERNEL);
+
+ } else {
+ if (level == 0)
+ next_ip = lr;
+ else
+ next_ip = fp[STACK_FRAME_LR_SAVE];
+
+ /*
+ * We can't tell which of the first two addresses
+ * we get are valid, but we can filter out the
+ * obviously bogus ones here. We replace them
+ * with 0 rather than removing them entirely so
+ * that userspace can tell which is which.
+ */
+ if ((level == 1 && next_ip == lr) ||
+ (level <= 1 && !kernel_text_address(next_ip)))
+ next_ip = 0;
+
+ ++level;
+ }
+
+ callchain_store(entry, next_ip);
+ if (!valid_next_sp(next_sp, sp))
+ return;
+ sp = next_sp;
+ }
+}
+
+#ifdef CONFIG_PPC64
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define is_huge_psize(pagesize) (HPAGE_SHIFT && mmu_huge_psizes[pagesize])
+#else
+#define is_huge_psize(pagesize) 0
+#endif
+
+/*
+ * On 64-bit we don't want to invoke hash_page on user addresses from
+ * interrupt context, so if the access faults, we read the page tables
+ * to find which page (if any) is mapped and access it directly.
+ */
+static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
+{
+ pgd_t *pgdir;
+ pte_t *ptep, pte;
+ int pagesize;
+ unsigned long addr = (unsigned long) ptr;
+ unsigned long offset;
+ unsigned long pfn;
+ void *kaddr;
+
+ pgdir = current->mm->pgd;
+ if (!pgdir)
+ return -EFAULT;
+
+ pagesize = get_slice_psize(current->mm, addr);
+
+ /* align address to page boundary */
+ offset = addr & ((1ul << mmu_psize_defs[pagesize].shift) - 1);
+ addr -= offset;
+
+ if (is_huge_psize(pagesize))
+ ptep = huge_pte_offset(current->mm, addr);
+ else
+ ptep = find_linux_pte(pgdir, addr);
+
+ if (ptep == NULL)
+ return -EFAULT;
+ pte = *ptep;
+ if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER))
+ return -EFAULT;
+ pfn = pte_pfn(pte);
+ if (!page_is_ram(pfn))
+ return -EFAULT;
+
+ /* no highmem to worry about here */
+ kaddr = pfn_to_kaddr(pfn);
+ memcpy(ret, kaddr + offset, nb);
+ return 0;
+}
+
+static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
+{
+ if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) ||
+ ((unsigned long)ptr & 7))
+ return -EFAULT;
+
+ if (!__get_user_inatomic(*ret, ptr))
+ return 0;
+
+ return read_user_stack_slow(ptr, ret, 8);
+}
+
+static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
+{
+ if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
+ ((unsigned long)ptr & 3))
+ return -EFAULT;
+
+ if (!__get_user_inatomic(*ret, ptr))
+ return 0;
+
+ return read_user_stack_slow(ptr, ret, 4);
+}
+
+static inline int valid_user_sp(unsigned long sp, int is_64)
+{
+ if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32)
+ return 0;
+ return 1;
+}
+
+/*
+ * 64-bit user processes use the same stack frame for RT and non-RT signals.
+ */
+struct signal_frame_64 {
+ char dummy[__SIGNAL_FRAMESIZE];
+ struct ucontext uc;
+ unsigned long unused[2];
+ unsigned int tramp[6];
+ struct siginfo *pinfo;
+ void *puc;
+ struct siginfo info;
+ char abigap[288];
+};
+
+static int is_sigreturn_64_address(unsigned long nip, unsigned long fp)
+{
+ if (nip == fp + offsetof(struct signal_frame_64, tramp))
+ return 1;
+ if (vdso64_rt_sigtramp && current->mm->context.vdso_base &&
+ nip == current->mm->context.vdso_base + vdso64_rt_sigtramp)
+ return 1;
+ return 0;
+}
+
+/*
+ * Do some sanity checking on the signal frame pointed to by sp.
+ * We check the pinfo and puc pointers in the frame.
+ */
+static int sane_signal_64_frame(unsigned long sp)
+{
+ struct signal_frame_64 __user *sf;
+ unsigned long pinfo, puc;
+
+ sf = (struct signal_frame_64 __user *) sp;
+ if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) ||
+ read_user_stack_64((unsigned long __user *) &sf->puc, &puc))
+ return 0;
+ return pinfo == (unsigned long) &sf->info &&
+ puc == (unsigned long) &sf->uc;
+}
+
+static void perf_callchain_user_64(struct pt_regs *regs,
+ struct perf_callchain_entry *entry)
+{
+ unsigned long sp, next_sp;
+ unsigned long next_ip;
+ unsigned long lr;
+ long level = 0;
+ struct signal_frame_64 __user *sigframe;
+ unsigned long __user *fp, *uregs;
+
+ next_ip = regs->nip;
+ lr = regs->link;
+ sp = regs->gpr[1];
+ callchain_store(entry, PERF_CONTEXT_USER);
+ callchain_store(entry, next_ip);
+
+ for (;;) {
+ fp = (unsigned long __user *) sp;
+ if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
+ return;
+ if (level > 0 && read_user_stack_64(&fp[2], &next_ip))
+ return;
+
+ /*
+ * Note: the next_sp - sp >= signal frame size check
+ * is true when next_sp < sp, which can happen when
+ * transitioning from an alternate signal stack to the
+ * normal stack.
+ */
+ if (next_sp - sp >= sizeof(struct signal_frame_64) &&
+ (is_sigreturn_64_address(next_ip, sp) ||
+ (level <= 1 && is_sigreturn_64_address(lr, sp))) &&
+ sane_signal_64_frame(sp)) {
+ /*
+ * This looks like an signal frame
+ */
+ sigframe = (struct signal_frame_64 __user *) sp;
+ uregs = sigframe->uc.uc_mcontext.gp_regs;
+ if (read_user_stack_64(&uregs[PT_NIP], &next_ip) ||
+ read_user_stack_64(&uregs[PT_LNK], &lr) ||
+ read_user_stack_64(&uregs[PT_R1], &sp))
+ return;
+ level = 0;
+ callchain_store(entry, PERF_CONTEXT_USER);
+ callchain_store(entry, next_ip);
+ continue;
+ }
+
+ if (level == 0)
+ next_ip = lr;
+ callchain_store(entry, next_ip);
+ ++level;
+ sp = next_sp;
+ }
+}
+
+static inline int current_is_64bit(void)
+{
+ /*
+ * We can't use test_thread_flag() here because we may be on an
+ * interrupt stack, and the thread flags don't get copied over
+ * from the thread_info on the main stack to the interrupt stack.
+ */
+ return !test_ti_thread_flag(task_thread_info(current), TIF_32BIT);
+}
+
+#else /* CONFIG_PPC64 */
+/*
+ * On 32-bit we just access the address and let hash_page create a
+ * HPTE if necessary, so there is no need to fall back to reading
+ * the page tables. Since this is called at interrupt level,
+ * do_page_fault() won't treat a DSI as a page fault.
+ */
+static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
+{
+ if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
+ ((unsigned long)ptr & 3))
+ return -EFAULT;
+
+ return __get_user_inatomic(*ret, ptr);
+}
+
+static inline void perf_callchain_user_64(struct pt_regs *regs,
+ struct perf_callchain_entry *entry)
+{
+}
+
+static inline int current_is_64bit(void)
+{
+ return 0;
+}
+
+static inline int valid_user_sp(unsigned long sp, int is_64)
+{
+ if (!sp || (sp & 7) || sp > TASK_SIZE - 32)
+ return 0;
+ return 1;
+}
+
+#define __SIGNAL_FRAMESIZE32 __SIGNAL_FRAMESIZE
+#define sigcontext32 sigcontext
+#define mcontext32 mcontext
+#define ucontext32 ucontext
+#define compat_siginfo_t struct siginfo
+
+#endif /* CONFIG_PPC64 */
+
+/*
+ * Layout for non-RT signal frames
+ */
+struct signal_frame_32 {
+ char dummy[__SIGNAL_FRAMESIZE32];
+ struct sigcontext32 sctx;
+ struct mcontext32 mctx;
+ int abigap[56];
+};
+
+/*
+ * Layout for RT signal frames
+ */
+struct rt_signal_frame_32 {
+ char dummy[__SIGNAL_FRAMESIZE32 + 16];
+ compat_siginfo_t info;
+ struct ucontext32 uc;
+ int abigap[56];
+};
+
+static int is_sigreturn_32_address(unsigned int nip, unsigned int fp)
+{
+ if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad))
+ return 1;
+ if (vdso32_sigtramp && current->mm->context.vdso_base &&
+ nip == current->mm->context.vdso_base + vdso32_sigtramp)
+ return 1;
+ return 0;
+}
+
+static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp)
+{
+ if (nip == fp + offsetof(struct rt_signal_frame_32,
+ uc.uc_mcontext.mc_pad))
+ return 1;
+ if (vdso32_rt_sigtramp && current->mm->context.vdso_base &&
+ nip == current->mm->context.vdso_base + vdso32_rt_sigtramp)
+ return 1;
+ return 0;
+}
+
+static int sane_signal_32_frame(unsigned int sp)
+{
+ struct signal_frame_32 __user *sf;
+ unsigned int regs;
+
+ sf = (struct signal_frame_32 __user *) (unsigned long) sp;
+ if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, &regs))
+ return 0;
+ return regs == (unsigned long) &sf->mctx;
+}
+
+static int sane_rt_signal_32_frame(unsigned int sp)
+{
+ struct rt_signal_frame_32 __user *sf;
+ unsigned int regs;
+
+ sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
+ if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, &regs))
+ return 0;
+ return regs == (unsigned long) &sf->uc.uc_mcontext;
+}
+
+static unsigned int __user *signal_frame_32_regs(unsigned int sp,
+ unsigned int next_sp, unsigned int next_ip)
+{
+ struct mcontext32 __user *mctx = NULL;
+ struct signal_frame_32 __user *sf;
+ struct rt_signal_frame_32 __user *rt_sf;
+
+ /*
+ * Note: the next_sp - sp >= signal frame size check
+ * is true when next_sp < sp, for example, when
+ * transitioning from an alternate signal stack to the
+ * normal stack.
+ */
+ if (next_sp - sp >= sizeof(struct signal_frame_32) &&
+ is_sigreturn_32_address(next_ip, sp) &&
+ sane_signal_32_frame(sp)) {
+ sf = (struct signal_frame_32 __user *) (unsigned long) sp;
+ mctx = &sf->mctx;
+ }
+
+ if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) &&
+ is_rt_sigreturn_32_address(next_ip, sp) &&
+ sane_rt_signal_32_frame(sp)) {
+ rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
+ mctx = &rt_sf->uc.uc_mcontext;
+ }
+
+ if (!mctx)
+ return NULL;
+ return mctx->mc_gregs;
+}
+
+static void perf_callchain_user_32(struct pt_regs *regs,
+ struct perf_callchain_entry *entry)
+{
+ unsigned int sp, next_sp;
+ unsigned int next_ip;
+ unsigned int lr;
+ long level = 0;
+ unsigned int __user *fp, *uregs;
+
+ next_ip = regs->nip;
+ lr = regs->link;
+ sp = regs->gpr[1];
+ callchain_store(entry, PERF_CONTEXT_USER);
+ callchain_store(entry, next_ip);
+
+ while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ fp = (unsigned int __user *) (unsigned long) sp;
+ if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
+ return;
+ if (level > 0 && read_user_stack_32(&fp[1], &next_ip))
+ return;
+
+ uregs = signal_frame_32_regs(sp, next_sp, next_ip);
+ if (!uregs && level <= 1)
+ uregs = signal_frame_32_regs(sp, next_sp, lr);
+ if (uregs) {
+ /*
+ * This looks like an signal frame, so restart
+ * the stack trace with the values in it.
+ */
+ if (read_user_stack_32(&uregs[PT_NIP], &next_ip) ||
+ read_user_stack_32(&uregs[PT_LNK], &lr) ||
+ read_user_stack_32(&uregs[PT_R1], &sp))
+ return;
+ level = 0;
+ callchain_store(entry, PERF_CONTEXT_USER);
+ callchain_store(entry, next_ip);
+ continue;
+ }
+
+ if (level == 0)
+ next_ip = lr;
+ callchain_store(entry, next_ip);
+ ++level;
+ sp = next_sp;
+ }
+}
+
+/*
+ * Since we can't get PMU interrupts inside a PMU interrupt handler,
+ * we don't need separate irq and nmi entries here.
+ */
+static DEFINE_PER_CPU(struct perf_callchain_entry, callchain);
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+ struct perf_callchain_entry *entry = &__get_cpu_var(callchain);
+
+ entry->nr = 0;
+
+ if (current->pid == 0) /* idle task? */
+ return entry;
+
+ if (!user_mode(regs)) {
+ perf_callchain_kernel(regs, entry);
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ if (current_is_64bit())
+ perf_callchain_user_64(regs, entry);
+ else
+ perf_callchain_user_32(regs, entry);
+ }
+
+ return entry;
+}
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 5b7038f248b..a685652effe 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -92,15 +92,13 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,
: "memory" );
}
-void slb_flush_and_rebolt(void)
+static void __slb_flush_and_rebolt(void)
{
/* If you change this make sure you change SLB_NUM_BOLTED
* appropriately too. */
unsigned long linear_llp, vmalloc_llp, lflags, vflags;
unsigned long ksp_esid_data, ksp_vsid_data;
- WARN_ON(!irqs_disabled());
-
linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
lflags = SLB_VSID_KERNEL | linear_llp;
@@ -117,12 +115,6 @@ void slb_flush_and_rebolt(void)
ksp_vsid_data = get_slb_shadow()->save_area[2].vsid;
}
- /*
- * We can't take a PMU exception in the following code, so hard
- * disable interrupts.
- */
- hard_irq_disable();
-
/* We need to do this all in asm, so we're sure we don't touch
* the stack between the slbia and rebolting it. */
asm volatile("isync\n"
@@ -139,6 +131,21 @@ void slb_flush_and_rebolt(void)
: "memory");
}
+void slb_flush_and_rebolt(void)
+{
+
+ WARN_ON(!irqs_disabled());
+
+ /*
+ * We can't take a PMU exception in the following code, so hard
+ * disable interrupts.
+ */
+ hard_irq_disable();
+
+ __slb_flush_and_rebolt();
+ get_paca()->slb_cache_ptr = 0;
+}
+
void slb_vmalloc_update(void)
{
unsigned long vflags;
@@ -180,12 +187,20 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2)
/* Flush all user entries from the segment table of the current processor. */
void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
{
- unsigned long offset = get_paca()->slb_cache_ptr;
+ unsigned long offset;
unsigned long slbie_data = 0;
unsigned long pc = KSTK_EIP(tsk);
unsigned long stack = KSTK_ESP(tsk);
unsigned long unmapped_base;
+ /*
+ * We need interrupts hard-disabled here, not just soft-disabled,
+ * so that a PMU interrupt can't occur, which might try to access
+ * user memory (to get a stack trace) and possible cause an SLB miss
+ * which would update the slb_cache/slb_cache_ptr fields in the PACA.
+ */
+ hard_irq_disable();
+ offset = get_paca()->slb_cache_ptr;
if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) &&
offset <= SLB_CACHE_ENTRIES) {
int i;
@@ -200,7 +215,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
}
asm volatile("isync" : : : "memory");
} else {
- slb_flush_and_rebolt();
+ __slb_flush_and_rebolt();
}
/* Workaround POWER5 < DD2.1 issue */
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
index 98cd1dc2ae7..ab5fb48b3e9 100644
--- a/arch/powerpc/mm/stab.c
+++ b/arch/powerpc/mm/stab.c
@@ -164,7 +164,7 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
{
struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
struct stab_entry *ste;
- unsigned long offset = __get_cpu_var(stab_cache_ptr);
+ unsigned long offset;
unsigned long pc = KSTK_EIP(tsk);
unsigned long stack = KSTK_ESP(tsk);
unsigned long unmapped_base;
@@ -172,6 +172,15 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
/* Force previous translations to complete. DRENG */
asm volatile("isync" : : : "memory");
+ /*
+ * We need interrupts hard-disabled here, not just soft-disabled,
+ * so that a PMU interrupt can't occur, which might try to access
+ * user memory (to get a stack trace) and possible cause an STAB miss
+ * which would update the stab_cache/stab_cache_ptr per-cpu variables.
+ */
+ hard_irq_disable();
+
+ offset = __get_cpu_var(stab_cache_ptr);
if (offset <= NR_STAB_CACHE_ENTRIES) {
int i;
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index e030e86ff6a..1c866efd217 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -84,7 +84,7 @@ config S390
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_TRACE_MCOUNT_TEST
select HAVE_FTRACE_MCOUNT_RECORD
- select HAVE_FTRACE_SYSCALLS
+ select HAVE_SYSCALL_TRACEPOINTS
select HAVE_DYNAMIC_FTRACE
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_DEFAULT_NO_SPIN_MUTEXES
diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index fcba206529f..4e91a2573cc 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -900,7 +900,7 @@ CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
CONFIG_HAVE_DYNAMIC_FTRACE=y
CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
-CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
CONFIG_TRACING_SUPPORT=y
CONFIG_FTRACE=y
# CONFIG_FUNCTION_TRACER is not set
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index c9af0d19c7a..41ce6861174 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -191,4 +191,33 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
#define _raw_read_relax(lock) cpu_relax()
#define _raw_write_relax(lock) cpu_relax()
+#define __always_inline__spin_lock
+#define __always_inline__read_lock
+#define __always_inline__write_lock
+#define __always_inline__spin_lock_bh
+#define __always_inline__read_lock_bh
+#define __always_inline__write_lock_bh
+#define __always_inline__spin_lock_irq
+#define __always_inline__read_lock_irq
+#define __always_inline__write_lock_irq
+#define __always_inline__spin_lock_irqsave
+#define __always_inline__read_lock_irqsave
+#define __always_inline__write_lock_irqsave
+#define __always_inline__spin_trylock
+#define __always_inline__read_trylock
+#define __always_inline__write_trylock
+#define __always_inline__spin_trylock_bh
+#define __always_inline__spin_unlock
+#define __always_inline__read_unlock
+#define __always_inline__write_unlock
+#define __always_inline__spin_unlock_bh
+#define __always_inline__read_unlock_bh
+#define __always_inline__write_unlock_bh
+#define __always_inline__spin_unlock_irq
+#define __always_inline__read_unlock_irq
+#define __always_inline__write_unlock_irq
+#define __always_inline__spin_unlock_irqrestore
+#define __always_inline__read_unlock_irqrestore
+#define __always_inline__write_unlock_irqrestore
+
#endif /* __ASM_SPINLOCK_H */
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index ba1cab9fc1f..07eb61b2fb3 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -92,7 +92,7 @@ static inline struct thread_info *current_thread_info(void)
#define TIF_SYSCALL_TRACE 8 /* syscall trace active */
#define TIF_SYSCALL_AUDIT 9 /* syscall auditing active */
#define TIF_SECCOMP 10 /* secure computing */
-#define TIF_SYSCALL_FTRACE 11 /* ftrace syscall instrumentation */
+#define TIF_SYSCALL_TRACEPOINT 11 /* syscall tracepoint instrumentation */
#define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */
#define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling
TIF_NEED_RESCHED */
@@ -111,7 +111,7 @@ static inline struct thread_info *current_thread_info(void)
#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
#define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1<<TIF_SECCOMP)
-#define _TIF_SYSCALL_FTRACE (1<<TIF_SYSCALL_FTRACE)
+#define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
#define _TIF_USEDFPU (1<<TIF_USEDFPU)
#define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
#define _TIF_31BIT (1<<TIF_31BIT)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index f78580a7403..f43d2ee5446 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -54,7 +54,7 @@ _TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
_TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
_TIF_MCCK_PENDING)
_TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \
- _TIF_SECCOMP>>8 | _TIF_SYSCALL_FTRACE>>8)
+ _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8)
STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER
STACK_SIZE = 1 << STACK_SHIFT
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index 009ca6175db..a6f7b20df61 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -57,7 +57,7 @@ _TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
_TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
_TIF_MCCK_PENDING)
_TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \
- _TIF_SECCOMP>>8 | _TIF_SYSCALL_FTRACE>>8)
+ _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8)
#define BASED(name) name-system_call(%r13)
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index 3e298e64f0d..57bdcb1e3cd 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -220,6 +220,29 @@ struct syscall_metadata *syscall_nr_to_meta(int nr)
return syscalls_metadata[nr];
}
+int syscall_name_to_nr(char *name)
+{
+ int i;
+
+ if (!syscalls_metadata)
+ return -1;
+ for (i = 0; i < NR_syscalls; i++)
+ if (syscalls_metadata[i])
+ if (!strcmp(syscalls_metadata[i]->name, name))
+ return i;
+ return -1;
+}
+
+void set_syscall_enter_id(int num, int id)
+{
+ syscalls_metadata[num]->enter_id = id;
+}
+
+void set_syscall_exit_id(int num, int id)
+{
+ syscalls_metadata[num]->exit_id = id;
+}
+
static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
{
struct syscall_metadata *start;
@@ -237,24 +260,19 @@ static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
return NULL;
}
-void arch_init_ftrace_syscalls(void)
+static int __init arch_init_ftrace_syscalls(void)
{
struct syscall_metadata *meta;
int i;
- static atomic_t refs;
-
- if (atomic_inc_return(&refs) != 1)
- goto out;
syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * NR_syscalls,
GFP_KERNEL);
if (!syscalls_metadata)
- goto out;
+ return -ENOMEM;
for (i = 0; i < NR_syscalls; i++) {
meta = find_syscall_meta((unsigned long)sys_call_table[i]);
syscalls_metadata[i] = meta;
}
- return;
-out:
- atomic_dec(&refs);
+ return 0;
}
+arch_initcall(arch_init_ftrace_syscalls);
#endif
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 43acd73105b..f3ddd7ac06c 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -51,6 +51,9 @@
#include "compat_ptrace.h"
#endif
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
enum s390_regset {
REGSET_GENERAL,
REGSET_FP,
@@ -661,8 +664,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
ret = -1;
}
- if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
- ftrace_syscall_enter(regs);
+ if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+ trace_sys_enter(regs, regs->gprs[2]);
if (unlikely(current->audit_context))
audit_syscall_entry(is_compat_task() ?
@@ -679,8 +682,8 @@ asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]),
regs->gprs[2]);
- if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
- ftrace_syscall_exit(regs);
+ if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+ trace_sys_exit(regs, regs->gprs[2]);
if (test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall_exit(regs, 0);
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 3f8b6a92eab..233cff53a62 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -25,6 +25,8 @@ config SPARC
select ARCH_WANT_OPTIONAL_GPIOLIB
select RTC_CLASS
select RTC_DRV_M48T59
+ select HAVE_DMA_ATTRS
+ select HAVE_DMA_API_DEBUG
config SPARC32
def_bool !64BIT
diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index 204e4bf6443..5a8c308e2b5 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -3,6 +3,7 @@
#include <linux/scatterlist.h>
#include <linux/mm.h>
+#include <linux/dma-debug.h>
#define DMA_ERROR_CODE (~(dma_addr_t)0x0)
@@ -13,142 +14,40 @@ extern int dma_set_mask(struct device *dev, u64 dma_mask);
#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
#define dma_is_consistent(d, h) (1)
-struct dma_ops {
- void *(*alloc_coherent)(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag);
- void (*free_coherent)(struct device *dev, size_t size,
- void *cpu_addr, dma_addr_t dma_handle);
- dma_addr_t (*map_page)(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction direction);
- void (*unmap_page)(struct device *dev, dma_addr_t dma_addr,
- size_t size,
- enum dma_data_direction direction);
- int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents,
- enum dma_data_direction direction);
- void (*unmap_sg)(struct device *dev, struct scatterlist *sg,
- int nhwentries,
- enum dma_data_direction direction);
- void (*sync_single_for_cpu)(struct device *dev,
- dma_addr_t dma_handle, size_t size,
- enum dma_data_direction direction);
- void (*sync_single_for_device)(struct device *dev,
- dma_addr_t dma_handle, size_t size,
- enum dma_data_direction direction);
- void (*sync_sg_for_cpu)(struct device *dev, struct scatterlist *sg,
- int nelems,
- enum dma_data_direction direction);
- void (*sync_sg_for_device)(struct device *dev,
- struct scatterlist *sg, int nents,
- enum dma_data_direction dir);
-};
-extern const struct dma_ops *dma_ops;
+extern struct dma_map_ops *dma_ops, pci32_dma_ops;
+extern struct bus_type pci_bus_type;
-static inline void *dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag)
-{
- return dma_ops->alloc_coherent(dev, size, dma_handle, flag);
-}
-
-static inline void dma_free_coherent(struct device *dev, size_t size,
- void *cpu_addr, dma_addr_t dma_handle)
-{
- dma_ops->free_coherent(dev, size, cpu_addr, dma_handle);
-}
-
-static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
- size_t size,
- enum dma_data_direction direction)
-{
- return dma_ops->map_page(dev, virt_to_page(cpu_addr),
- (unsigned long)cpu_addr & ~PAGE_MASK, size,
- direction);
-}
-
-static inline void dma_unmap_single(struct device *dev, dma_addr_t dma_addr,
- size_t size,
- enum dma_data_direction direction)
-{
- dma_ops->unmap_page(dev, dma_addr, size, direction);
-}
-
-static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction direction)
-{
- return dma_ops->map_page(dev, page, offset, size, direction);
-}
-
-static inline void dma_unmap_page(struct device *dev, dma_addr_t dma_address,
- size_t size,
- enum dma_data_direction direction)
-{
- dma_ops->unmap_page(dev, dma_address, size, direction);
-}
-
-static inline int dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, enum dma_data_direction direction)
-{
- return dma_ops->map_sg(dev, sg, nents, direction);
-}
-
-static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
- int nents, enum dma_data_direction direction)
+static inline struct dma_map_ops *get_dma_ops(struct device *dev)
{
- dma_ops->unmap_sg(dev, sg, nents, direction);
-}
-
-static inline void dma_sync_single_for_cpu(struct device *dev,
- dma_addr_t dma_handle, size_t size,
- enum dma_data_direction direction)
-{
- dma_ops->sync_single_for_cpu(dev, dma_handle, size, direction);
+#if defined(CONFIG_SPARC32) && defined(CONFIG_PCI)
+ if (dev->bus == &pci_bus_type)
+ return &pci32_dma_ops;
+#endif
+ return dma_ops;
}
-static inline void dma_sync_single_for_device(struct device *dev,
- dma_addr_t dma_handle,
- size_t size,
- enum dma_data_direction direction)
-{
- if (dma_ops->sync_single_for_device)
- dma_ops->sync_single_for_device(dev, dma_handle, size,
- direction);
-}
+#include <asm-generic/dma-mapping-common.h>
-static inline void dma_sync_sg_for_cpu(struct device *dev,
- struct scatterlist *sg, int nelems,
- enum dma_data_direction direction)
+static inline void *dma_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flag)
{
- dma_ops->sync_sg_for_cpu(dev, sg, nelems, direction);
-}
+ struct dma_map_ops *ops = get_dma_ops(dev);
+ void *cpu_addr;
-static inline void dma_sync_sg_for_device(struct device *dev,
- struct scatterlist *sg, int nelems,
- enum dma_data_direction direction)
-{
- if (dma_ops->sync_sg_for_device)
- dma_ops->sync_sg_for_device(dev, sg, nelems, direction);
+ cpu_addr = ops->alloc_coherent(dev, size, dma_handle, flag);
+ debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+ return cpu_addr;
}
-static inline void dma_sync_single_range_for_cpu(struct device *dev,
- dma_addr_t dma_handle,
- unsigned long offset,
- size_t size,
- enum dma_data_direction dir)
+static inline void dma_free_coherent(struct device *dev, size_t size,
+ void *cpu_addr, dma_addr_t dma_handle)
{
- dma_sync_single_for_cpu(dev, dma_handle+offset, size, dir);
-}
+ struct dma_map_ops *ops = get_dma_ops(dev);
-static inline void dma_sync_single_range_for_device(struct device *dev,
- dma_addr_t dma_handle,
- unsigned long offset,
- size_t size,
- enum dma_data_direction dir)
-{
- dma_sync_single_for_device(dev, dma_handle+offset, size, dir);
+ debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
+ ops->free_coherent(dev, size, cpu_addr, dma_handle);
}
-
static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
return (dma_addr == DMA_ERROR_CODE);
diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h
index 1934f2cbf51..a0b443cb3c1 100644
--- a/arch/sparc/include/asm/irq_64.h
+++ b/arch/sparc/include/asm/irq_64.h
@@ -89,8 +89,8 @@ static inline unsigned long get_softint(void)
return retval;
}
-void __trigger_all_cpu_backtrace(void);
-#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
+void arch_trigger_all_cpu_backtrace(void);
+#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
extern void *hardirq_stack[NR_CPUS];
extern void *softirq_stack[NR_CPUS];
diff --git a/arch/sparc/include/asm/pci.h b/arch/sparc/include/asm/pci.h
index 6e14fd17933..d9c031f9910 100644
--- a/arch/sparc/include/asm/pci.h
+++ b/arch/sparc/include/asm/pci.h
@@ -5,4 +5,7 @@
#else
#include <asm/pci_32.h>
#endif
+
+#include <asm-generic/pci-dma-compat.h>
+
#endif
diff --git a/arch/sparc/include/asm/pci_32.h b/arch/sparc/include/asm/pci_32.h
index b41c4c19815..ac0e8369fd9 100644
--- a/arch/sparc/include/asm/pci_32.h
+++ b/arch/sparc/include/asm/pci_32.h
@@ -31,42 +31,8 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
*/
#define PCI_DMA_BUS_IS_PHYS (0)
-#include <asm/scatterlist.h>
-
struct pci_dev;
-/* Allocate and map kernel buffer using consistent mode DMA for a device.
- * hwdev should be valid struct pci_dev pointer for PCI devices.
- */
-extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size, dma_addr_t *dma_handle);
-
-/* Free and unmap a consistent DMA buffer.
- * cpu_addr is what was returned from pci_alloc_consistent,
- * size must be the same as what as passed into pci_alloc_consistent,
- * and likewise dma_addr must be the same as what *dma_addrp was set to.
- *
- * References to the memory and mappings assosciated with cpu_addr/dma_addr
- * past this call are illegal.
- */
-extern void pci_free_consistent(struct pci_dev *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle);
-
-/* Map a single buffer of the indicated size for DMA in streaming mode.
- * The 32-bit bus address to use is returned.
- *
- * Once the device is given the dma address, the device owns this memory
- * until either pci_unmap_single or pci_dma_sync_single_for_cpu is performed.
- */
-extern dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size, int direction);
-
-/* Unmap a single streaming mode DMA translation. The dma_addr and size
- * must match what was provided for in a previous pci_map_single call. All
- * other usages are undefined.
- *
- * After this call, reads by the cpu to the buffer are guaranteed to see
- * whatever the device wrote there.
- */
-extern void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, size_t size, int direction);
-
/* pci_unmap_{single,page} is not a nop, thus... */
#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
dma_addr_t ADDR_NAME;
@@ -81,69 +47,6 @@ extern void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, size_t
#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
(((PTR)->LEN_NAME) = (VAL))
-/*
- * Same as above, only with pages instead of mapped addresses.
- */
-extern dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page,
- unsigned long offset, size_t size, int direction);
-extern void pci_unmap_page(struct pci_dev *hwdev,
- dma_addr_t dma_address, size_t size, int direction);
-
-/* Map a set of buffers described by scatterlist in streaming
- * mode for DMA. This is the scather-gather version of the
- * above pci_map_single interface. Here the scatter gather list
- * elements are each tagged with the appropriate dma address
- * and length. They are obtained via sg_dma_{address,length}(SG).
- *
- * NOTE: An implementation may be able to use a smaller number of
- * DMA address/length pairs than there are SG table elements.
- * (for example via virtual mapping capabilities)
- * The routine returns the number of addr/length pairs actually
- * used, at most nents.
- *
- * Device ownership issues as mentioned above for pci_map_single are
- * the same here.
- */
-extern int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction);
-
-/* Unmap a set of streaming mode DMA translations.
- * Again, cpu read rules concerning calls here are the same as for
- * pci_unmap_single() above.
- */
-extern void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nhwents, int direction);
-
-/* Make physical memory consistent for a single
- * streaming mode DMA translation after a transfer.
- *
- * If you perform a pci_map_single() but wish to interrogate the
- * buffer using the cpu, yet do not wish to teardown the PCI dma
- * mapping, you must call this function before doing so. At the
- * next point you give the PCI dma address back to the card, you
- * must first perform a pci_dma_sync_for_device, and then the device
- * again owns the buffer.
- */
-extern void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction);
-extern void pci_dma_sync_single_for_device(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction);
-
-/* Make physical memory consistent for a set of streaming
- * mode DMA translations after a transfer.
- *
- * The same as pci_dma_sync_single_* but for a scatter-gather list,
- * same rules and usage.
- */
-extern void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction);
-extern void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction);
-
-/* Return whether the given PCI device DMA address mask can
- * be supported properly. For example, if your device can
- * only drive the low 24-bits during PCI bus mastering, then
- * you would pass 0x00ffffff as the mask to this function.
- */
-static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask)
-{
- return 1;
-}
-
#ifdef CONFIG_PCI
static inline void pci_dma_burst_advice(struct pci_dev *pdev,
enum pci_dma_burst_strategy *strat,
@@ -154,14 +57,6 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
}
#endif
-#define PCI_DMA_ERROR_CODE (~(dma_addr_t)0x0)
-
-static inline int pci_dma_mapping_error(struct pci_dev *pdev,
- dma_addr_t dma_addr)
-{
- return (dma_addr == PCI_DMA_ERROR_CODE);
-}
-
struct device_node;
extern struct device_node *pci_device_to_OF_node(struct pci_dev *pdev);
diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h
index 7a1e3566e59..5cc9f6aa549 100644
--- a/arch/sparc/include/asm/pci_64.h
+++ b/arch/sparc/include/asm/pci_64.h
@@ -35,37 +35,6 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
*/
#define PCI_DMA_BUS_IS_PHYS (0)
-static inline void *pci_alloc_consistent(struct pci_dev *pdev, size_t size,
- dma_addr_t *dma_handle)
-{
- return dma_alloc_coherent(&pdev->dev, size, dma_handle, GFP_ATOMIC);
-}
-
-static inline void pci_free_consistent(struct pci_dev *pdev, size_t size,
- void *vaddr, dma_addr_t dma_handle)
-{
- return dma_free_coherent(&pdev->dev, size, vaddr, dma_handle);
-}
-
-static inline dma_addr_t pci_map_single(struct pci_dev *pdev, void *ptr,
- size_t size, int direction)
-{
- return dma_map_single(&pdev->dev, ptr, size,
- (enum dma_data_direction) direction);
-}
-
-static inline void pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr,
- size_t size, int direction)
-{
- dma_unmap_single(&pdev->dev, dma_addr, size,
- (enum dma_data_direction) direction);
-}
-
-#define pci_map_page(dev, page, off, size, dir) \
- pci_map_single(dev, (page_address(page) + (off)), size, dir)
-#define pci_unmap_page(dev,addr,sz,dir) \
- pci_unmap_single(dev,addr,sz,dir)
-
/* pci_unmap_{single,page} is not a nop, thus... */
#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
dma_addr_t ADDR_NAME;
@@ -80,57 +49,6 @@ static inline void pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr,
#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
(((PTR)->LEN_NAME) = (VAL))
-static inline int pci_map_sg(struct pci_dev *pdev, struct scatterlist *sg,
- int nents, int direction)
-{
- return dma_map_sg(&pdev->dev, sg, nents,
- (enum dma_data_direction) direction);
-}
-
-static inline void pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sg,
- int nents, int direction)
-{
- dma_unmap_sg(&pdev->dev, sg, nents,
- (enum dma_data_direction) direction);
-}
-
-static inline void pci_dma_sync_single_for_cpu(struct pci_dev *pdev,
- dma_addr_t dma_handle,
- size_t size, int direction)
-{
- dma_sync_single_for_cpu(&pdev->dev, dma_handle, size,
- (enum dma_data_direction) direction);
-}
-
-static inline void pci_dma_sync_single_for_device(struct pci_dev *pdev,
- dma_addr_t dma_handle,
- size_t size, int direction)
-{
- /* No flushing needed to sync cpu writes to the device. */
-}
-
-static inline void pci_dma_sync_sg_for_cpu(struct pci_dev *pdev,
- struct scatterlist *sg,
- int nents, int direction)
-{
- dma_sync_sg_for_cpu(&pdev->dev, sg, nents,
- (enum dma_data_direction) direction);
-}
-
-static inline void pci_dma_sync_sg_for_device(struct pci_dev *pdev,
- struct scatterlist *sg,
- int nelems, int direction)
-{
- /* No flushing needed to sync cpu writes to the device. */
-}
-
-/* Return whether the given PCI device DMA address mask can
- * be supported properly. For example, if your device can
- * only drive the low 24-bits during PCI bus mastering, then
- * you would pass 0x00ffffff as the mask to this function.
- */
-extern int pci_dma_supported(struct pci_dev *hwdev, u64 mask);
-
/* PCI IOMMU mapping bypass support. */
/* PCI 64-bit addressing works for all slots on all controller
@@ -140,12 +58,6 @@ extern int pci_dma_supported(struct pci_dev *hwdev, u64 mask);
#define PCI64_REQUIRED_MASK (~(dma64_addr_t)0)
#define PCI64_ADDR_BASE 0xfffc000000000000UL
-static inline int pci_dma_mapping_error(struct pci_dev *pdev,
- dma_addr_t dma_addr)
-{
- return dma_mapping_error(&pdev->dev, dma_addr);
-}
-
#ifdef CONFIG_PCI
static inline void pci_dma_burst_advice(struct pci_dev *pdev,
enum pci_dma_burst_strategy *strat,
diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h
index 46f91ab66a5..857630cff63 100644
--- a/arch/sparc/include/asm/spinlock_32.h
+++ b/arch/sparc/include/asm/spinlock_32.h
@@ -76,7 +76,7 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock)
*
* Unfortunately this scheme limits us to ~16,000,000 cpus.
*/
-static inline void __read_lock(raw_rwlock_t *rw)
+static inline void arch_read_lock(raw_rwlock_t *rw)
{
register raw_rwlock_t *lp asm("g1");
lp = rw;
@@ -92,11 +92,11 @@ static inline void __read_lock(raw_rwlock_t *rw)
#define __raw_read_lock(lock) \
do { unsigned long flags; \
local_irq_save(flags); \
- __read_lock(lock); \
+ arch_read_lock(lock); \
local_irq_restore(flags); \
} while(0)
-static inline void __read_unlock(raw_rwlock_t *rw)
+static inline void arch_read_unlock(raw_rwlock_t *rw)
{
register raw_rwlock_t *lp asm("g1");
lp = rw;
@@ -112,7 +112,7 @@ static inline void __read_unlock(raw_rwlock_t *rw)
#define __raw_read_unlock(lock) \
do { unsigned long flags; \
local_irq_save(flags); \
- __read_unlock(lock); \
+ arch_read_unlock(lock); \
local_irq_restore(flags); \
} while(0)
@@ -150,7 +150,7 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
return (val == 0);
}
-static inline int __read_trylock(raw_rwlock_t *rw)
+static inline int arch_read_trylock(raw_rwlock_t *rw)
{
register raw_rwlock_t *lp asm("g1");
register int res asm("o0");
@@ -169,7 +169,7 @@ static inline int __read_trylock(raw_rwlock_t *rw)
({ unsigned long flags; \
int res; \
local_irq_save(flags); \
- res = __read_trylock(lock); \
+ res = arch_read_trylock(lock); \
local_irq_restore(flags); \
res; \
})
diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h
index f6b2b92ad8d..43e51478358 100644
--- a/arch/sparc/include/asm/spinlock_64.h
+++ b/arch/sparc/include/asm/spinlock_64.h
@@ -92,7 +92,7 @@ static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long fla
/* Multi-reader locks, these are much saner than the 32-bit Sparc ones... */
-static void inline __read_lock(raw_rwlock_t *lock)
+static void inline arch_read_lock(raw_rwlock_t *lock)
{
unsigned long tmp1, tmp2;
@@ -115,7 +115,7 @@ static void inline __read_lock(raw_rwlock_t *lock)
: "memory");
}
-static int inline __read_trylock(raw_rwlock_t *lock)
+static int inline arch_read_trylock(raw_rwlock_t *lock)
{
int tmp1, tmp2;
@@ -136,7 +136,7 @@ static int inline __read_trylock(raw_rwlock_t *lock)
return tmp1;
}
-static void inline __read_unlock(raw_rwlock_t *lock)
+static void inline arch_read_unlock(raw_rwlock_t *lock)
{
unsigned long tmp1, tmp2;
@@ -152,7 +152,7 @@ static void inline __read_unlock(raw_rwlock_t *lock)
: "memory");
}
-static void inline __write_lock(raw_rwlock_t *lock)
+static void inline arch_write_lock(raw_rwlock_t *lock)
{
unsigned long mask, tmp1, tmp2;
@@ -177,7 +177,7 @@ static void inline __write_lock(raw_rwlock_t *lock)
: "memory");
}
-static void inline __write_unlock(raw_rwlock_t *lock)
+static void inline arch_write_unlock(raw_rwlock_t *lock)
{
__asm__ __volatile__(
" stw %%g0, [%0]"
@@ -186,7 +186,7 @@ static void inline __write_unlock(raw_rwlock_t *lock)
: "memory");
}
-static int inline __write_trylock(raw_rwlock_t *lock)
+static int inline arch_write_trylock(raw_rwlock_t *lock)
{
unsigned long mask, tmp1, tmp2, result;
@@ -210,14 +210,14 @@ static int inline __write_trylock(raw_rwlock_t *lock)
return result;
}
-#define __raw_read_lock(p) __read_lock(p)
-#define __raw_read_lock_flags(p, f) __read_lock(p)
-#define __raw_read_trylock(p) __read_trylock(p)
-#define __raw_read_unlock(p) __read_unlock(p)
-#define __raw_write_lock(p) __write_lock(p)
-#define __raw_write_lock_flags(p, f) __write_lock(p)
-#define __raw_write_unlock(p) __write_unlock(p)
-#define __raw_write_trylock(p) __write_trylock(p)
+#define __raw_read_lock(p) arch_read_lock(p)
+#define __raw_read_lock_flags(p, f) arch_read_lock(p)
+#define __raw_read_trylock(p) arch_read_trylock(p)
+#define __raw_read_unlock(p) arch_read_unlock(p)
+#define __raw_write_lock(p) arch_write_lock(p)
+#define __raw_write_lock_flags(p, f) arch_write_lock(p)
+#define __raw_write_unlock(p) arch_write_unlock(p)
+#define __raw_write_trylock(p) arch_write_trylock(p)
#define __raw_read_can_lock(rw) (!((rw)->lock & 0x80000000UL))
#define __raw_write_can_lock(rw) (!(rw)->lock)
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 475ce4696ac..29b88a58066 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -61,7 +61,7 @@ obj-$(CONFIG_SPARC64_SMP) += cpumap.o
obj-$(CONFIG_SPARC32) += devres.o
devres-y := ../../../kernel/irq/devres.o
-obj-$(CONFIG_SPARC32) += dma.o
+obj-y += dma.o
obj-$(CONFIG_SPARC32_PCI) += pcic.o
diff --git a/arch/sparc/kernel/dma.c b/arch/sparc/kernel/dma.c
index 524c32f97c5..e1ba8ee21b9 100644
--- a/arch/sparc/kernel/dma.c
+++ b/arch/sparc/kernel/dma.c
@@ -1,178 +1,13 @@
-/* dma.c: PCI and SBUS DMA accessors for 32-bit sparc.
- *
- * Copyright (C) 2008 David S. Miller <davem@davemloft.net>
- */
-
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/dma-mapping.h>
-#include <linux/scatterlist.h>
-#include <linux/mm.h>
-
-#ifdef CONFIG_PCI
-#include <linux/pci.h>
-#endif
+#include <linux/dma-debug.h>
-#include "dma.h"
+#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 15)
-int dma_supported(struct device *dev, u64 mask)
+static int __init dma_init(void)
{
-#ifdef CONFIG_PCI
- if (dev->bus == &pci_bus_type)
- return pci_dma_supported(to_pci_dev(dev), mask);
-#endif
+ dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
return 0;
}
-EXPORT_SYMBOL(dma_supported);
-
-int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-#ifdef CONFIG_PCI
- if (dev->bus == &pci_bus_type)
- return pci_set_dma_mask(to_pci_dev(dev), dma_mask);
-#endif
- return -EOPNOTSUPP;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
-static void *dma32_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag)
-{
-#ifdef CONFIG_PCI
- if (dev->bus == &pci_bus_type)
- return pci_alloc_consistent(to_pci_dev(dev), size, dma_handle);
-#endif
- return sbus_alloc_consistent(dev, size, dma_handle);
-}
-
-static void dma32_free_coherent(struct device *dev, size_t size,
- void *cpu_addr, dma_addr_t dma_handle)
-{
-#ifdef CONFIG_PCI
- if (dev->bus == &pci_bus_type) {
- pci_free_consistent(to_pci_dev(dev), size,
- cpu_addr, dma_handle);
- return;
- }
-#endif
- sbus_free_consistent(dev, size, cpu_addr, dma_handle);
-}
-
-static dma_addr_t dma32_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
- if (dev->bus == &pci_bus_type)
- return pci_map_page(to_pci_dev(dev), page, offset,
- size, (int)direction);
-#endif
- return sbus_map_single(dev, page_address(page) + offset,
- size, (int)direction);
-}
-
-static void dma32_unmap_page(struct device *dev, dma_addr_t dma_address,
- size_t size, enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
- if (dev->bus == &pci_bus_type) {
- pci_unmap_page(to_pci_dev(dev), dma_address,
- size, (int)direction);
- return;
- }
-#endif
- sbus_unmap_single(dev, dma_address, size, (int)direction);
-}
-
-static int dma32_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
- if (dev->bus == &pci_bus_type)
- return pci_map_sg(to_pci_dev(dev), sg, nents, (int)direction);
-#endif
- return sbus_map_sg(dev, sg, nents, direction);
-}
-
-void dma32_unmap_sg(struct device *dev, struct scatterlist *sg,
- int nents, enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
- if (dev->bus == &pci_bus_type) {
- pci_unmap_sg(to_pci_dev(dev), sg, nents, (int)direction);
- return;
- }
-#endif
- sbus_unmap_sg(dev, sg, nents, (int)direction);
-}
-
-static void dma32_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
- size_t size,
- enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
- if (dev->bus == &pci_bus_type) {
- pci_dma_sync_single_for_cpu(to_pci_dev(dev), dma_handle,
- size, (int)direction);
- return;
- }
-#endif
- sbus_dma_sync_single_for_cpu(dev, dma_handle, size, (int) direction);
-}
-
-static void dma32_sync_single_for_device(struct device *dev,
- dma_addr_t dma_handle, size_t size,
- enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
- if (dev->bus == &pci_bus_type) {
- pci_dma_sync_single_for_device(to_pci_dev(dev), dma_handle,
- size, (int)direction);
- return;
- }
-#endif
- sbus_dma_sync_single_for_device(dev, dma_handle, size, (int) direction);
-}
-
-static void dma32_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
- int nelems, enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
- if (dev->bus == &pci_bus_type) {
- pci_dma_sync_sg_for_cpu(to_pci_dev(dev), sg,
- nelems, (int)direction);
- return;
- }
-#endif
- BUG();
-}
-
-static void dma32_sync_sg_for_device(struct device *dev,
- struct scatterlist *sg, int nelems,
- enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
- if (dev->bus == &pci_bus_type) {
- pci_dma_sync_sg_for_device(to_pci_dev(dev), sg,
- nelems, (int)direction);
- return;
- }
-#endif
- BUG();
-}
-
-static const struct dma_ops dma32_dma_ops = {
- .alloc_coherent = dma32_alloc_coherent,
- .free_coherent = dma32_free_coherent,
- .map_page = dma32_map_page,
- .unmap_page = dma32_unmap_page,
- .map_sg = dma32_map_sg,
- .unmap_sg = dma32_unmap_sg,
- .sync_single_for_cpu = dma32_sync_single_for_cpu,
- .sync_single_for_device = dma32_sync_single_for_device,
- .sync_sg_for_cpu = dma32_sync_sg_for_cpu,
- .sync_sg_for_device = dma32_sync_sg_for_device,
-};
-
-const struct dma_ops *dma_ops = &dma32_dma_ops;
-EXPORT_SYMBOL(dma_ops);
+fs_initcall(dma_init);
diff --git a/arch/sparc/kernel/dma.h b/arch/sparc/kernel/dma.h
deleted file mode 100644
index f8d8951adb5..00000000000
--- a/arch/sparc/kernel/dma.h
+++ /dev/null
@@ -1,14 +0,0 @@
-void *sbus_alloc_consistent(struct device *dev, long len, u32 *dma_addrp);
-void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba);
-dma_addr_t sbus_map_single(struct device *dev, void *va,
- size_t len, int direction);
-void sbus_unmap_single(struct device *dev, dma_addr_t ba,
- size_t n, int direction);
-int sbus_map_sg(struct device *dev, struct scatterlist *sg,
- int n, int direction);
-void sbus_unmap_sg(struct device *dev, struct scatterlist *sg,
- int n, int direction);
-void sbus_dma_sync_single_for_cpu(struct device *dev, dma_addr_t ba,
- size_t size, int direction);
-void sbus_dma_sync_single_for_device(struct device *dev, dma_addr_t ba,
- size_t size, int direction);
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index 0aeaefe696b..7690cc219ec 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -353,7 +353,8 @@ static void dma_4u_free_coherent(struct device *dev, size_t size,
static dma_addr_t dma_4u_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t sz,
- enum dma_data_direction direction)
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
{
struct iommu *iommu;
struct strbuf *strbuf;
@@ -474,7 +475,8 @@ do_flush_sync:
}
static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr,
- size_t sz, enum dma_data_direction direction)
+ size_t sz, enum dma_data_direction direction,
+ struct dma_attrs *attrs)
{
struct iommu *iommu;
struct strbuf *strbuf;
@@ -520,7 +522,8 @@ static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr,
}
static int dma_4u_map_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction direction)
+ int nelems, enum dma_data_direction direction,
+ struct dma_attrs *attrs)
{
struct scatterlist *s, *outs, *segstart;
unsigned long flags, handle, prot, ctx;
@@ -691,7 +694,8 @@ static unsigned long fetch_sg_ctx(struct iommu *iommu, struct scatterlist *sg)
}
static void dma_4u_unmap_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction direction)
+ int nelems, enum dma_data_direction direction,
+ struct dma_attrs *attrs)
{
unsigned long flags, ctx;
struct scatterlist *sg;
@@ -822,7 +826,7 @@ static void dma_4u_sync_sg_for_cpu(struct device *dev,
spin_unlock_irqrestore(&iommu->lock, flags);
}
-static const struct dma_ops sun4u_dma_ops = {
+static struct dma_map_ops sun4u_dma_ops = {
.alloc_coherent = dma_4u_alloc_coherent,
.free_coherent = dma_4u_free_coherent,
.map_page = dma_4u_map_page,
@@ -833,9 +837,11 @@ static const struct dma_ops sun4u_dma_ops = {
.sync_sg_for_cpu = dma_4u_sync_sg_for_cpu,
};
-const struct dma_ops *dma_ops = &sun4u_dma_ops;
+struct dma_map_ops *dma_ops = &sun4u_dma_ops;
EXPORT_SYMBOL(dma_ops);
+extern int pci64_dma_supported(struct pci_dev *pdev, u64 device_mask);
+
int dma_supported(struct device *dev, u64 device_mask)
{
struct iommu *iommu = dev->archdata.iommu;
@@ -849,7 +855,7 @@ int dma_supported(struct device *dev, u64 device_mask)
#ifdef CONFIG_PCI
if (dev->bus == &pci_bus_type)
- return pci_dma_supported(to_pci_dev(dev), device_mask);
+ return pci64_dma_supported(to_pci_dev(dev), device_mask);
#endif
return 0;
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 87ea0d03d97..edbea232c61 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -48,8 +48,6 @@
#include <asm/iommu.h>
#include <asm/io-unit.h>
-#include "dma.h"
-
#define mmu_inval_dma_area(p, l) /* Anton pulled it out for 2.4.0-xx */
static struct resource *_sparc_find_resource(struct resource *r,
@@ -246,7 +244,8 @@ EXPORT_SYMBOL(sbus_set_sbus64);
* Typically devices use them for control blocks.
* CPU may access them without any explicit flushing.
*/
-void *sbus_alloc_consistent(struct device *dev, long len, u32 *dma_addrp)
+static void *sbus_alloc_coherent(struct device *dev, size_t len,
+ dma_addr_t *dma_addrp, gfp_t gfp)
{
struct of_device *op = to_of_device(dev);
unsigned long len_total = (len + PAGE_SIZE-1) & PAGE_MASK;
@@ -299,7 +298,8 @@ err_nopages:
return NULL;
}
-void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba)
+static void sbus_free_coherent(struct device *dev, size_t n, void *p,
+ dma_addr_t ba)
{
struct resource *res;
struct page *pgv;
@@ -317,7 +317,7 @@ void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba)
n = (n + PAGE_SIZE-1) & PAGE_MASK;
if ((res->end-res->start)+1 != n) {
- printk("sbus_free_consistent: region 0x%lx asked 0x%lx\n",
+ printk("sbus_free_consistent: region 0x%lx asked 0x%zx\n",
(long)((res->end-res->start)+1), n);
return;
}
@@ -337,8 +337,13 @@ void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba)
* CPU view of this memory may be inconsistent with
* a device view and explicit flushing is necessary.
*/
-dma_addr_t sbus_map_single(struct device *dev, void *va, size_t len, int direction)
+static dma_addr_t sbus_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t len,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
{
+ void *va = page_address(page) + offset;
+
/* XXX why are some lengths signed, others unsigned? */
if (len <= 0) {
return 0;
@@ -350,12 +355,14 @@ dma_addr_t sbus_map_single(struct device *dev, void *va, size_t len, int directi
return mmu_get_scsi_one(dev, va, len);
}
-void sbus_unmap_single(struct device *dev, dma_addr_t ba, size_t n, int direction)
+static void sbus_unmap_page(struct device *dev, dma_addr_t ba, size_t n,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
{
mmu_release_scsi_one(dev, ba, n);
}
-int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n, int direction)
+static int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
{
mmu_get_scsi_sgl(dev, sg, n);
@@ -366,19 +373,38 @@ int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n, int direction
return n;
}
-void sbus_unmap_sg(struct device *dev, struct scatterlist *sg, int n, int direction)
+static void sbus_unmap_sg(struct device *dev, struct scatterlist *sg, int n,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
{
mmu_release_scsi_sgl(dev, sg, n);
}
-void sbus_dma_sync_single_for_cpu(struct device *dev, dma_addr_t ba, size_t size, int direction)
+static void sbus_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+ int n, enum dma_data_direction dir)
{
+ BUG();
}
-void sbus_dma_sync_single_for_device(struct device *dev, dma_addr_t ba, size_t size, int direction)
+static void sbus_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+ int n, enum dma_data_direction dir)
{
+ BUG();
}
+struct dma_map_ops sbus_dma_ops = {
+ .alloc_coherent = sbus_alloc_coherent,
+ .free_coherent = sbus_free_coherent,
+ .map_page = sbus_map_page,
+ .unmap_page = sbus_unmap_page,
+ .map_sg = sbus_map_sg,
+ .unmap_sg = sbus_unmap_sg,
+ .sync_sg_for_cpu = sbus_sync_sg_for_cpu,
+ .sync_sg_for_device = sbus_sync_sg_for_device,
+};
+
+struct dma_map_ops *dma_ops = &sbus_dma_ops;
+EXPORT_SYMBOL(dma_ops);
+
static int __init sparc_register_ioport(void)
{
register_proc_sparc_ioport();
@@ -395,7 +421,8 @@ arch_initcall(sparc_register_ioport);
/* Allocate and map kernel buffer using consistent mode DMA for a device.
* hwdev should be valid struct pci_dev pointer for PCI devices.
*/
-void *pci_alloc_consistent(struct pci_dev *pdev, size_t len, dma_addr_t *pba)
+static void *pci32_alloc_coherent(struct device *dev, size_t len,
+ dma_addr_t *pba, gfp_t gfp)
{
unsigned long len_total = (len + PAGE_SIZE-1) & PAGE_MASK;
unsigned long va;
@@ -439,7 +466,6 @@ void *pci_alloc_consistent(struct pci_dev *pdev, size_t len, dma_addr_t *pba)
*pba = virt_to_phys(va); /* equals virt_to_bus (R.I.P.) for us. */
return (void *) res->start;
}
-EXPORT_SYMBOL(pci_alloc_consistent);
/* Free and unmap a consistent DMA buffer.
* cpu_addr is what was returned from pci_alloc_consistent,
@@ -449,7 +475,8 @@ EXPORT_SYMBOL(pci_alloc_consistent);
* References to the memory and mappings associated with cpu_addr/dma_addr
* past this call are illegal.
*/
-void pci_free_consistent(struct pci_dev *pdev, size_t n, void *p, dma_addr_t ba)
+static void pci32_free_coherent(struct device *dev, size_t n, void *p,
+ dma_addr_t ba)
{
struct resource *res;
unsigned long pgp;
@@ -481,60 +508,18 @@ void pci_free_consistent(struct pci_dev *pdev, size_t n, void *p, dma_addr_t ba)
free_pages(pgp, get_order(n));
}
-EXPORT_SYMBOL(pci_free_consistent);
-
-/* Map a single buffer of the indicated size for DMA in streaming mode.
- * The 32-bit bus address to use is returned.
- *
- * Once the device is given the dma address, the device owns this memory
- * until either pci_unmap_single or pci_dma_sync_single_* is performed.
- */
-dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
- int direction)
-{
- BUG_ON(direction == PCI_DMA_NONE);
- /* IIep is write-through, not flushing. */
- return virt_to_phys(ptr);
-}
-EXPORT_SYMBOL(pci_map_single);
-
-/* Unmap a single streaming mode DMA translation. The dma_addr and size
- * must match what was provided for in a previous pci_map_single call. All
- * other usages are undefined.
- *
- * After this call, reads by the cpu to the buffer are guaranteed to see
- * whatever the device wrote there.
- */
-void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t ba, size_t size,
- int direction)
-{
- BUG_ON(direction == PCI_DMA_NONE);
- if (direction != PCI_DMA_TODEVICE) {
- mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
- (size + PAGE_SIZE-1) & PAGE_MASK);
- }
-}
-EXPORT_SYMBOL(pci_unmap_single);
/*
* Same as pci_map_single, but with pages.
*/
-dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page,
- unsigned long offset, size_t size, int direction)
+static dma_addr_t pci32_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
{
- BUG_ON(direction == PCI_DMA_NONE);
/* IIep is write-through, not flushing. */
return page_to_phys(page) + offset;
}
-EXPORT_SYMBOL(pci_map_page);
-
-void pci_unmap_page(struct pci_dev *hwdev,
- dma_addr_t dma_address, size_t size, int direction)
-{
- BUG_ON(direction == PCI_DMA_NONE);
- /* mmu_inval_dma_area XXX */
-}
-EXPORT_SYMBOL(pci_unmap_page);
/* Map a set of buffers described by scatterlist in streaming
* mode for DMA. This is the scather-gather version of the
@@ -551,13 +536,13 @@ EXPORT_SYMBOL(pci_unmap_page);
* Device ownership issues as mentioned above for pci_map_single are
* the same here.
*/
-int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents,
- int direction)
+static int pci32_map_sg(struct device *device, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
{
struct scatterlist *sg;
int n;
- BUG_ON(direction == PCI_DMA_NONE);
/* IIep is write-through, not flushing. */
for_each_sg(sgl, sg, nents, n) {
BUG_ON(page_address(sg_page(sg)) == NULL);
@@ -566,20 +551,19 @@ int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents,
}
return nents;
}
-EXPORT_SYMBOL(pci_map_sg);
/* Unmap a set of streaming mode DMA translations.
* Again, cpu read rules concerning calls here are the same as for
* pci_unmap_single() above.
*/
-void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents,
- int direction)
+static void pci32_unmap_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
{
struct scatterlist *sg;
int n;
- BUG_ON(direction == PCI_DMA_NONE);
- if (direction != PCI_DMA_TODEVICE) {
+ if (dir != PCI_DMA_TODEVICE) {
for_each_sg(sgl, sg, nents, n) {
BUG_ON(page_address(sg_page(sg)) == NULL);
mmu_inval_dma_area(
@@ -588,7 +572,6 @@ void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents,
}
}
}
-EXPORT_SYMBOL(pci_unmap_sg);
/* Make physical memory consistent for a single
* streaming mode DMA translation before or after a transfer.
@@ -600,25 +583,23 @@ EXPORT_SYMBOL(pci_unmap_sg);
* must first perform a pci_dma_sync_for_device, and then the
* device again owns the buffer.
*/
-void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction)
+static void pci32_sync_single_for_cpu(struct device *dev, dma_addr_t ba,
+ size_t size, enum dma_data_direction dir)
{
- BUG_ON(direction == PCI_DMA_NONE);
- if (direction != PCI_DMA_TODEVICE) {
+ if (dir != PCI_DMA_TODEVICE) {
mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
(size + PAGE_SIZE-1) & PAGE_MASK);
}
}
-EXPORT_SYMBOL(pci_dma_sync_single_for_cpu);
-void pci_dma_sync_single_for_device(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction)
+static void pci32_sync_single_for_device(struct device *dev, dma_addr_t ba,
+ size_t size, enum dma_data_direction dir)
{
- BUG_ON(direction == PCI_DMA_NONE);
- if (direction != PCI_DMA_TODEVICE) {
+ if (dir != PCI_DMA_TODEVICE) {
mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
(size + PAGE_SIZE-1) & PAGE_MASK);
}
}
-EXPORT_SYMBOL(pci_dma_sync_single_for_device);
/* Make physical memory consistent for a set of streaming
* mode DMA translations after a transfer.
@@ -626,13 +607,13 @@ EXPORT_SYMBOL(pci_dma_sync_single_for_device);
* The same as pci_dma_sync_single_* but for a scatter-gather list,
* same rules and usage.
*/
-void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, int direction)
+static void pci32_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir)
{
struct scatterlist *sg;
int n;
- BUG_ON(direction == PCI_DMA_NONE);
- if (direction != PCI_DMA_TODEVICE) {
+ if (dir != PCI_DMA_TODEVICE) {
for_each_sg(sgl, sg, nents, n) {
BUG_ON(page_address(sg_page(sg)) == NULL);
mmu_inval_dma_area(
@@ -641,15 +622,14 @@ void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sgl, int
}
}
}
-EXPORT_SYMBOL(pci_dma_sync_sg_for_cpu);
-void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, int direction)
+static void pci32_sync_sg_for_device(struct device *device, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir)
{
struct scatterlist *sg;
int n;
- BUG_ON(direction == PCI_DMA_NONE);
- if (direction != PCI_DMA_TODEVICE) {
+ if (dir != PCI_DMA_TODEVICE) {
for_each_sg(sgl, sg, nents, n) {
BUG_ON(page_address(sg_page(sg)) == NULL);
mmu_inval_dma_area(
@@ -658,9 +638,49 @@ void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sgl,
}
}
}
-EXPORT_SYMBOL(pci_dma_sync_sg_for_device);
+
+struct dma_map_ops pci32_dma_ops = {
+ .alloc_coherent = pci32_alloc_coherent,
+ .free_coherent = pci32_free_coherent,
+ .map_page = pci32_map_page,
+ .map_sg = pci32_map_sg,
+ .unmap_sg = pci32_unmap_sg,
+ .sync_single_for_cpu = pci32_sync_single_for_cpu,
+ .sync_single_for_device = pci32_sync_single_for_device,
+ .sync_sg_for_cpu = pci32_sync_sg_for_cpu,
+ .sync_sg_for_device = pci32_sync_sg_for_device,
+};
+EXPORT_SYMBOL(pci32_dma_ops);
+
#endif /* CONFIG_PCI */
+/*
+ * Return whether the given PCI device DMA address mask can be
+ * supported properly. For example, if your device can only drive the
+ * low 24-bits during PCI bus mastering, then you would pass
+ * 0x00ffffff as the mask to this function.
+ */
+int dma_supported(struct device *dev, u64 mask)
+{
+#ifdef CONFIG_PCI
+ if (dev->bus == &pci_bus_type)
+ return 1;
+#endif
+ return 0;
+}
+EXPORT_SYMBOL(dma_supported);
+
+int dma_set_mask(struct device *dev, u64 dma_mask)
+{
+#ifdef CONFIG_PCI
+ if (dev->bus == &pci_bus_type)
+ return pci_set_dma_mask(to_pci_dev(dev), dma_mask);
+#endif
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+
#ifdef CONFIG_PROC_FS
static int
diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index 57859ad2354..c6864866280 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -1039,7 +1039,7 @@ static void ali_sound_dma_hack(struct pci_dev *pdev, int set_bit)
pci_dev_put(ali_isa_bridge);
}
-int pci_dma_supported(struct pci_dev *pdev, u64 device_mask)
+int pci64_dma_supported(struct pci_dev *pdev, u64 device_mask)
{
u64 dma_addr_mask;
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index 2485eaa2310..23c33ff9c31 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -232,7 +232,8 @@ static void dma_4v_free_coherent(struct device *dev, size_t size, void *cpu,
static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t sz,
- enum dma_data_direction direction)
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
{
struct iommu *iommu;
unsigned long flags, npages, oaddr;
@@ -296,7 +297,8 @@ iommu_map_fail:
}
static void dma_4v_unmap_page(struct device *dev, dma_addr_t bus_addr,
- size_t sz, enum dma_data_direction direction)
+ size_t sz, enum dma_data_direction direction,
+ struct dma_attrs *attrs)
{
struct pci_pbm_info *pbm;
struct iommu *iommu;
@@ -336,7 +338,8 @@ static void dma_4v_unmap_page(struct device *dev, dma_addr_t bus_addr,
}
static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction direction)
+ int nelems, enum dma_data_direction direction,
+ struct dma_attrs *attrs)
{
struct scatterlist *s, *outs, *segstart;
unsigned long flags, handle, prot;
@@ -478,7 +481,8 @@ iommu_map_failed:
}
static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction direction)
+ int nelems, enum dma_data_direction direction,
+ struct dma_attrs *attrs)
{
struct pci_pbm_info *pbm;
struct scatterlist *sg;
@@ -521,29 +525,13 @@ static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist,
spin_unlock_irqrestore(&iommu->lock, flags);
}
-static void dma_4v_sync_single_for_cpu(struct device *dev,
- dma_addr_t bus_addr, size_t sz,
- enum dma_data_direction direction)
-{
- /* Nothing to do... */
-}
-
-static void dma_4v_sync_sg_for_cpu(struct device *dev,
- struct scatterlist *sglist, int nelems,
- enum dma_data_direction direction)
-{
- /* Nothing to do... */
-}
-
-static const struct dma_ops sun4v_dma_ops = {
+static struct dma_map_ops sun4v_dma_ops = {
.alloc_coherent = dma_4v_alloc_coherent,
.free_coherent = dma_4v_free_coherent,
.map_page = dma_4v_map_page,
.unmap_page = dma_4v_unmap_page,
.map_sg = dma_4v_map_sg,
.unmap_sg = dma_4v_unmap_sg,
- .sync_single_for_cpu = dma_4v_sync_single_for_cpu,
- .sync_sg_for_cpu = dma_4v_sync_sg_for_cpu,
};
static void __devinit pci_sun4v_scan_bus(struct pci_pbm_info *pbm,
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 4041f94e772..18d67854a1b 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -251,7 +251,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp)
}
}
-void __trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(void)
{
struct thread_info *tp = current_thread_info();
struct pt_regs *regs = get_irq_regs();
@@ -304,7 +304,7 @@ void __trigger_all_cpu_backtrace(void)
static void sysrq_handle_globreg(int key, struct tty_struct *tty)
{
- __trigger_all_cpu_backtrace();
+ arch_trigger_all_cpu_backtrace();
}
static struct sysrq_key_op sparc_globalreg_op = {
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 13ffa5df37d..fc20fdc0f7f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -38,7 +38,7 @@ config X86
select HAVE_FUNCTION_GRAPH_FP_TEST
select HAVE_FUNCTION_TRACE_MCOUNT_TEST
select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
- select HAVE_FTRACE_SYSCALLS
+ select HAVE_SYSCALL_TRACEPOINTS
select HAVE_KVM
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
@@ -586,7 +586,6 @@ config GART_IOMMU
bool "GART IOMMU support" if EMBEDDED
default y
select SWIOTLB
- select AGP
depends on X86_64 && PCI
---help---
Support for full DMA access of devices with 32bit memory access only
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index edb992ebef9..d28fad19654 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -2355,7 +2355,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
CONFIG_HAVE_DYNAMIC_FTRACE=y
CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
CONFIG_HAVE_HW_BRANCH_TRACER=y
-CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
CONFIG_RING_BUFFER=y
CONFIG_TRACING=y
CONFIG_TRACING_SUPPORT=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index cee1dd2e69b..6c86acd847a 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -2329,7 +2329,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
CONFIG_HAVE_DYNAMIC_FTRACE=y
CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
CONFIG_HAVE_HW_BRANCH_TRACER=y
-CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
CONFIG_RING_BUFFER=y
CONFIG_TRACING=y
CONFIG_TRACING_SUPPORT=y
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index bdf96f119f0..ac95995b7ba 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -25,6 +25,7 @@
#ifdef CONFIG_AMD_IOMMU
extern int amd_iommu_init(void);
extern int amd_iommu_init_dma_ops(void);
+extern int amd_iommu_init_passthrough(void);
extern void amd_iommu_detect(void);
extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
extern void amd_iommu_flush_all_domains(void);
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 0c878caaa0a..2a2cc7a78a8 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -143,22 +143,29 @@
#define EVT_BUFFER_SIZE 8192 /* 512 entries */
#define EVT_LEN_MASK (0x9ULL << 56)
+#define PAGE_MODE_NONE 0x00
#define PAGE_MODE_1_LEVEL 0x01
#define PAGE_MODE_2_LEVEL 0x02
#define PAGE_MODE_3_LEVEL 0x03
-
-#define IOMMU_PDE_NL_0 0x000ULL
-#define IOMMU_PDE_NL_1 0x200ULL
-#define IOMMU_PDE_NL_2 0x400ULL
-#define IOMMU_PDE_NL_3 0x600ULL
-
-#define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL)
-#define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL)
-#define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL)
-
-#define IOMMU_MAP_SIZE_L1 (1ULL << 21)
-#define IOMMU_MAP_SIZE_L2 (1ULL << 30)
-#define IOMMU_MAP_SIZE_L3 (1ULL << 39)
+#define PAGE_MODE_4_LEVEL 0x04
+#define PAGE_MODE_5_LEVEL 0x05
+#define PAGE_MODE_6_LEVEL 0x06
+
+#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9))
+#define PM_LEVEL_SIZE(x) (((x) < 6) ? \
+ ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
+ (0xffffffffffffffffULL))
+#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
+#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
+#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
+ IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
+#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)
+
+#define PM_MAP_4k 0
+#define PM_ADDR_MASK 0x000ffffffffff000ULL
+#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \
+ (~((1ULL << (12 + ((lvl) * 9))) - 1)))
+#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
#define IOMMU_PTE_P (1ULL << 0)
#define IOMMU_PTE_TV (1ULL << 1)
@@ -167,11 +174,6 @@
#define IOMMU_PTE_IR (1ULL << 61)
#define IOMMU_PTE_IW (1ULL << 62)
-#define IOMMU_L1_PDE(address) \
- ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
-#define IOMMU_L2_PDE(address) \
- ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
-
#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
@@ -194,11 +196,14 @@
#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
domain for an IOMMU */
+#define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page
+ translation */
+
extern bool amd_iommu_dump;
#define DUMP_printk(format, arg...) \
do { \
if (amd_iommu_dump) \
- printk(KERN_INFO "AMD IOMMU: " format, ## arg); \
+ printk(KERN_INFO "AMD-Vi: " format, ## arg); \
} while(0);
/*
@@ -226,6 +231,7 @@ struct protection_domain {
int mode; /* paging mode (0-6 levels) */
u64 *pt_root; /* page table root pointer */
unsigned long flags; /* flags to find out type of domain */
+ bool updated; /* complete domain flush required */
unsigned dev_cnt; /* devices assigned to this domain */
void *priv; /* private data */
};
@@ -337,6 +343,9 @@ struct amd_iommu {
/* if one, we need to send a completion wait command */
bool need_sync;
+ /* becomes true if a command buffer reset is running */
+ bool reset_in_progress;
+
/* default dma_ops domain for that IOMMU */
struct dma_ops_domain *default_dom;
};
@@ -457,4 +466,7 @@ static inline void amd_iommu_stats_init(void) { }
#endif /* CONFIG_AMD_IOMMU_STATS */
+/* some function prototypes */
+extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
+
#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 1c3f9435f1c..0ee770d23d0 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -55,6 +55,24 @@ extern int dma_set_mask(struct device *dev, u64 mask);
extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag);
+static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+{
+ if (!dev->dma_mask)
+ return 0;
+
+ return addr + size <= *dev->dma_mask;
+}
+
+static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+ return paddr;
+}
+
+static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
+{
+ return daddr;
+}
+
static inline void
dma_cache_sync(struct device *dev, void *vaddr, size_t size,
enum dma_data_direction dir)
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index bd2c6511c88..db24c2278be 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -28,13 +28,6 @@
#endif
-/* FIXME: I don't want to stay hardcoded */
-#ifdef CONFIG_X86_64
-# define FTRACE_SYSCALL_MAX 296
-#else
-# define FTRACE_SYSCALL_MAX 333
-#endif
-
#ifdef CONFIG_FUNCTION_TRACER
#define MCOUNT_ADDR ((long)(mcount))
#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c86e5ed4af5..e63cf7d441e 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -45,8 +45,8 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
void __user *, size_t *, loff_t *);
extern int unknown_nmi_panic;
-void __trigger_all_cpu_backtrace(void);
-#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
+void arch_trigger_all_cpu_backtrace(void);
+#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
static inline void localise_nmi_watchdog(void)
{
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index fa64e401589..e7b7c938ae2 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,6 +84,16 @@ union cpuid10_edx {
#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
+/*
+ * We model BTS tracing as another fixed-mode PMC.
+ *
+ * We choose a value in the middle of the fixed counter range, since lower
+ * values are used by actual fixed counters and higher values are used
+ * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
+ */
+#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
+
+
#ifdef CONFIG_PERF_COUNTERS
extern void init_hw_perf_counters(void);
extern void perf_counters_lapic_init(void);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index fad7d40b75f..6f7786aea4f 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,7 +95,7 @@ struct thread_info {
#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
-#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */
+#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -118,17 +118,17 @@ struct thread_info {
#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
-#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE)
+#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY \
- (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \
- _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
+ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
+ _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
/* work to do in syscall_trace_leave() */
#define _TIF_WORK_SYSCALL_EXIT \
(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \
- _TIF_SYSCALL_FTRACE)
+ _TIF_SYSCALL_TRACEPOINT)
/* work to do on interrupt/exception return */
#define _TIF_WORK_MASK \
@@ -137,7 +137,8 @@ struct thread_info {
_TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
/* work to do on any return to user space */
-#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE)
+#define _TIF_ALLWORK_MASK \
+ ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT)
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 066ef590d7e..26d06e052a1 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -129,25 +129,34 @@ extern unsigned long node_remap_size[];
#endif
/* sched_domains SD_NODE_INIT for NUMA machines */
-#define SD_NODE_INIT (struct sched_domain) { \
- .min_interval = 8, \
- .max_interval = 32, \
- .busy_factor = 32, \
- .imbalance_pct = 125, \
- .cache_nice_tries = SD_CACHE_NICE_TRIES, \
- .busy_idx = 3, \
- .idle_idx = SD_IDLE_IDX, \
- .newidle_idx = SD_NEWIDLE_IDX, \
- .wake_idx = 1, \
- .forkexec_idx = SD_FORKEXEC_IDX, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_EXEC \
- | SD_BALANCE_FORK \
- | SD_WAKE_AFFINE \
- | SD_WAKE_BALANCE \
- | SD_SERIALIZE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
+#define SD_NODE_INIT (struct sched_domain) { \
+ .min_interval = 8, \
+ .max_interval = 32, \
+ .busy_factor = 32, \
+ .imbalance_pct = 125, \
+ .cache_nice_tries = SD_CACHE_NICE_TRIES, \
+ .busy_idx = 3, \
+ .idle_idx = SD_IDLE_IDX, \
+ .newidle_idx = SD_NEWIDLE_IDX, \
+ .wake_idx = 1, \
+ .forkexec_idx = SD_FORKEXEC_IDX, \
+ \
+ .flags = 1*SD_LOAD_BALANCE \
+ | 1*SD_BALANCE_NEWIDLE \
+ | 1*SD_BALANCE_EXEC \
+ | 1*SD_BALANCE_FORK \
+ | 0*SD_WAKE_IDLE \
+ | 1*SD_WAKE_AFFINE \
+ | 1*SD_WAKE_BALANCE \
+ | 0*SD_SHARE_CPUPOWER \
+ | 0*SD_POWERSAVINGS_BALANCE \
+ | 0*SD_SHARE_PKG_RESOURCES \
+ | 1*SD_SERIALIZE \
+ | 1*SD_WAKE_IDLE_FAR \
+ | 0*SD_PREFER_SIBLING \
+ , \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
}
#ifdef CONFIG_X86_64_ACPI_NUMA
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 732a3070615..8deaada61bc 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -345,6 +345,8 @@
#ifdef __KERNEL__
+#define NR_syscalls 337
+
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
#define __ARCH_WANT_OLD_STAT
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 900e1617e67..b9f3c60de5f 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -688,6 +688,12 @@ __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
#endif /* __NO_STUBS */
#ifdef __KERNEL__
+
+#ifndef COMPILE_OFFSETS
+#include <asm/asm-offsets.h>
+#define NR_syscalls (__NR_syscall_max + 1)
+#endif
+
/*
* "Conditional" syscalls
*
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 6c99f503780..98f230f6a28 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -41,9 +41,13 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
static LIST_HEAD(iommu_pd_list);
static DEFINE_SPINLOCK(iommu_pd_list_lock);
-#ifdef CONFIG_IOMMU_API
+/*
+ * Domain for untranslated devices - only allocated
+ * if iommu=pt passed on kernel cmd line.
+ */
+static struct protection_domain *pt_domain;
+
static struct iommu_ops amd_iommu_ops;
-#endif
/*
* general struct to manage commands send to an IOMMU
@@ -55,16 +59,16 @@ struct iommu_cmd {
static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
struct unity_map_entry *e);
static struct dma_ops_domain *find_protection_domain(u16 devid);
-static u64* alloc_pte(struct protection_domain *dom,
- unsigned long address, u64
- **pte_page, gfp_t gfp);
+static u64 *alloc_pte(struct protection_domain *domain,
+ unsigned long address, int end_lvl,
+ u64 **pte_page, gfp_t gfp);
static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
unsigned long start_page,
unsigned int pages);
-
-#ifndef BUS_NOTIFY_UNBOUND_DRIVER
-#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
-#endif
+static void reset_iommu_command_buffer(struct amd_iommu *iommu);
+static u64 *fetch_pte(struct protection_domain *domain,
+ unsigned long address, int map_size);
+static void update_domain(struct protection_domain *domain);
#ifdef CONFIG_AMD_IOMMU_STATS
@@ -138,7 +142,25 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
*
****************************************************************************/
-static void iommu_print_event(void *__evt)
+static void dump_dte_entry(u16 devid)
+{
+ int i;
+
+ for (i = 0; i < 8; ++i)
+ pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
+ amd_iommu_dev_table[devid].data[i]);
+}
+
+static void dump_command(unsigned long phys_addr)
+{
+ struct iommu_cmd *cmd = phys_to_virt(phys_addr);
+ int i;
+
+ for (i = 0; i < 4; ++i)
+ pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
+}
+
+static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
{
u32 *event = __evt;
int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
@@ -147,7 +169,7 @@ static void iommu_print_event(void *__evt)
int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
u64 address = (u64)(((u64)event[3]) << 32) | event[2];
- printk(KERN_ERR "AMD IOMMU: Event logged [");
+ printk(KERN_ERR "AMD-Vi: Event logged [");
switch (type) {
case EVENT_TYPE_ILL_DEV:
@@ -155,6 +177,7 @@ static void iommu_print_event(void *__evt)
"address=0x%016llx flags=0x%04x]\n",
PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
address, flags);
+ dump_dte_entry(devid);
break;
case EVENT_TYPE_IO_FAULT:
printk("IO_PAGE_FAULT device=%02x:%02x.%x "
@@ -176,6 +199,8 @@ static void iommu_print_event(void *__evt)
break;
case EVENT_TYPE_ILL_CMD:
printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
+ reset_iommu_command_buffer(iommu);
+ dump_command(address);
break;
case EVENT_TYPE_CMD_HARD_ERR:
printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
@@ -209,7 +234,7 @@ static void iommu_poll_events(struct amd_iommu *iommu)
tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
while (head != tail) {
- iommu_print_event(iommu->evt_buf + head);
+ iommu_print_event(iommu, iommu->evt_buf + head);
head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
}
@@ -296,8 +321,11 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
- if (unlikely(i == EXIT_LOOP_COUNT))
- panic("AMD IOMMU: Completion wait loop failed\n");
+ if (unlikely(i == EXIT_LOOP_COUNT)) {
+ spin_unlock(&iommu->lock);
+ reset_iommu_command_buffer(iommu);
+ spin_lock(&iommu->lock);
+ }
}
/*
@@ -445,47 +473,78 @@ static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid)
}
/*
+ * This function flushes one domain on one IOMMU
+ */
+static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid)
+{
+ struct iommu_cmd cmd;
+ unsigned long flags;
+
+ __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+ domid, 1, 1);
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ __iommu_queue_command(iommu, &cmd);
+ __iommu_completion_wait(iommu);
+ __iommu_wait_for_completion(iommu);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static void flush_all_domains_on_iommu(struct amd_iommu *iommu)
+{
+ int i;
+
+ for (i = 1; i < MAX_DOMAIN_ID; ++i) {
+ if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
+ continue;
+ flush_domain_on_iommu(iommu, i);
+ }
+
+}
+
+/*
* This function is used to flush the IO/TLB for a given protection domain
* on every IOMMU in the system
*/
static void iommu_flush_domain(u16 domid)
{
- unsigned long flags;
struct amd_iommu *iommu;
- struct iommu_cmd cmd;
INC_STATS_COUNTER(domain_flush_all);
- __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
- domid, 1, 1);
-
- for_each_iommu(iommu) {
- spin_lock_irqsave(&iommu->lock, flags);
- __iommu_queue_command(iommu, &cmd);
- __iommu_completion_wait(iommu);
- __iommu_wait_for_completion(iommu);
- spin_unlock_irqrestore(&iommu->lock, flags);
- }
+ for_each_iommu(iommu)
+ flush_domain_on_iommu(iommu, domid);
}
void amd_iommu_flush_all_domains(void)
{
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu)
+ flush_all_domains_on_iommu(iommu);
+}
+
+static void flush_all_devices_for_iommu(struct amd_iommu *iommu)
+{
int i;
- for (i = 1; i < MAX_DOMAIN_ID; ++i) {
- if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
+ for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+ if (iommu != amd_iommu_rlookup_table[i])
continue;
- iommu_flush_domain(i);
+
+ iommu_queue_inv_dev_entry(iommu, i);
+ iommu_completion_wait(iommu);
}
}
-void amd_iommu_flush_all_devices(void)
+static void flush_devices_by_domain(struct protection_domain *domain)
{
struct amd_iommu *iommu;
int i;
for (i = 0; i <= amd_iommu_last_bdf; ++i) {
- if (amd_iommu_pd_table[i] == NULL)
+ if ((domain == NULL && amd_iommu_pd_table[i] == NULL) ||
+ (amd_iommu_pd_table[i] != domain))
continue;
iommu = amd_iommu_rlookup_table[i];
@@ -497,6 +556,27 @@ void amd_iommu_flush_all_devices(void)
}
}
+static void reset_iommu_command_buffer(struct amd_iommu *iommu)
+{
+ pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
+
+ if (iommu->reset_in_progress)
+ panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
+
+ iommu->reset_in_progress = true;
+
+ amd_iommu_reset_cmd_buffer(iommu);
+ flush_all_devices_for_iommu(iommu);
+ flush_all_domains_on_iommu(iommu);
+
+ iommu->reset_in_progress = false;
+}
+
+void amd_iommu_flush_all_devices(void)
+{
+ flush_devices_by_domain(NULL);
+}
+
/****************************************************************************
*
* The functions below are used the create the page table mappings for
@@ -514,18 +594,21 @@ void amd_iommu_flush_all_devices(void)
static int iommu_map_page(struct protection_domain *dom,
unsigned long bus_addr,
unsigned long phys_addr,
- int prot)
+ int prot,
+ int map_size)
{
u64 __pte, *pte;
bus_addr = PAGE_ALIGN(bus_addr);
phys_addr = PAGE_ALIGN(phys_addr);
- /* only support 512GB address spaces for now */
- if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
+ BUG_ON(!PM_ALIGNED(map_size, bus_addr));
+ BUG_ON(!PM_ALIGNED(map_size, phys_addr));
+
+ if (!(prot & IOMMU_PROT_MASK))
return -EINVAL;
- pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
+ pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
if (IOMMU_PTE_PRESENT(*pte))
return -EBUSY;
@@ -538,29 +621,18 @@ static int iommu_map_page(struct protection_domain *dom,
*pte = __pte;
+ update_domain(dom);
+
return 0;
}
static void iommu_unmap_page(struct protection_domain *dom,
- unsigned long bus_addr)
+ unsigned long bus_addr, int map_size)
{
- u64 *pte;
-
- pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
-
- if (!IOMMU_PTE_PRESENT(*pte))
- return;
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
+ u64 *pte = fetch_pte(dom, bus_addr, map_size);
- if (!IOMMU_PTE_PRESENT(*pte))
- return;
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
-
- *pte = 0;
+ if (pte)
+ *pte = 0;
}
/*
@@ -615,7 +687,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
for (addr = e->address_start; addr < e->address_end;
addr += PAGE_SIZE) {
- ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot);
+ ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
+ PM_MAP_4k);
if (ret)
return ret;
/*
@@ -670,24 +743,29 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
* This function checks if there is a PTE for a given dma address. If
* there is one, it returns the pointer to it.
*/
-static u64* fetch_pte(struct protection_domain *domain,
- unsigned long address)
+static u64 *fetch_pte(struct protection_domain *domain,
+ unsigned long address, int map_size)
{
+ int level;
u64 *pte;
- pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
+ level = domain->mode - 1;
+ pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
+ while (level > map_size) {
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return NULL;
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+ level -= 1;
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[PM_LEVEL_INDEX(level, address)];
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+ if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
+ pte = NULL;
+ break;
+ }
+ }
return pte;
}
@@ -727,7 +805,7 @@ static int alloc_new_range(struct amd_iommu *iommu,
u64 *pte, *pte_page;
for (i = 0; i < num_ptes; ++i) {
- pte = alloc_pte(&dma_dom->domain, address,
+ pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
&pte_page, gfp);
if (!pte)
goto out_free;
@@ -760,16 +838,20 @@ static int alloc_new_range(struct amd_iommu *iommu,
for (i = dma_dom->aperture[index]->offset;
i < dma_dom->aperture_size;
i += PAGE_SIZE) {
- u64 *pte = fetch_pte(&dma_dom->domain, i);
+ u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
if (!pte || !IOMMU_PTE_PRESENT(*pte))
continue;
dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
}
+ update_domain(&dma_dom->domain);
+
return 0;
out_free:
+ update_domain(&dma_dom->domain);
+
free_page((unsigned long)dma_dom->aperture[index]->bitmap);
kfree(dma_dom->aperture[index]);
@@ -1009,7 +1091,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
dma_dom->domain.id = domain_id_alloc();
if (dma_dom->domain.id == 0)
goto free_dma_dom;
- dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
+ dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
dma_dom->domain.flags = PD_DMA_OPS_MASK;
dma_dom->domain.priv = dma_dom;
@@ -1063,6 +1145,41 @@ static struct protection_domain *domain_for_device(u16 devid)
return dom;
}
+static void set_dte_entry(u16 devid, struct protection_domain *domain)
+{
+ u64 pte_root = virt_to_phys(domain->pt_root);
+
+ pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
+ << DEV_ENTRY_MODE_SHIFT;
+ pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
+
+ amd_iommu_dev_table[devid].data[2] = domain->id;
+ amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
+ amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
+
+ amd_iommu_pd_table[devid] = domain;
+}
+
+/*
+ * If a device is not yet associated with a domain, this function does
+ * assigns it visible for the hardware
+ */
+static void __attach_device(struct amd_iommu *iommu,
+ struct protection_domain *domain,
+ u16 devid)
+{
+ /* lock domain */
+ spin_lock(&domain->lock);
+
+ /* update DTE entry */
+ set_dte_entry(devid, domain);
+
+ domain->dev_cnt += 1;
+
+ /* ready */
+ spin_unlock(&domain->lock);
+}
+
/*
* If a device is not yet associated with a domain, this function does
* assigns it visible for the hardware
@@ -1072,27 +1189,16 @@ static void attach_device(struct amd_iommu *iommu,
u16 devid)
{
unsigned long flags;
- u64 pte_root = virt_to_phys(domain->pt_root);
-
- domain->dev_cnt += 1;
-
- pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
- << DEV_ENTRY_MODE_SHIFT;
- pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
- amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
- amd_iommu_dev_table[devid].data[2] = domain->id;
-
- amd_iommu_pd_table[devid] = domain;
+ __attach_device(iommu, domain, devid);
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
- /*
- * We might boot into a crash-kernel here. The crashed kernel
- * left the caches in the IOMMU dirty. So we have to flush
- * here to evict all dirty stuff.
- */
+ /*
+ * We might boot into a crash-kernel here. The crashed kernel
+ * left the caches in the IOMMU dirty. So we have to flush
+ * here to evict all dirty stuff.
+ */
iommu_queue_inv_dev_entry(iommu, devid);
iommu_flush_tlb_pde(iommu, domain->id);
}
@@ -1119,6 +1225,15 @@ static void __detach_device(struct protection_domain *domain, u16 devid)
/* ready */
spin_unlock(&domain->lock);
+
+ /*
+ * If we run in passthrough mode the device must be assigned to the
+ * passthrough domain if it is detached from any other domain
+ */
+ if (iommu_pass_through) {
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+ __attach_device(iommu, pt_domain, devid);
+ }
}
/*
@@ -1164,6 +1279,8 @@ static int device_change_notifier(struct notifier_block *nb,
case BUS_NOTIFY_UNBOUND_DRIVER:
if (!domain)
goto out;
+ if (iommu_pass_through)
+ break;
detach_device(domain, devid);
break;
case BUS_NOTIFY_ADD_DEVICE:
@@ -1292,39 +1409,91 @@ static int get_device_resources(struct device *dev,
return 1;
}
+static void update_device_table(struct protection_domain *domain)
+{
+ unsigned long flags;
+ int i;
+
+ for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+ if (amd_iommu_pd_table[i] != domain)
+ continue;
+ write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ set_dte_entry(i, domain);
+ write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+ }
+}
+
+static void update_domain(struct protection_domain *domain)
+{
+ if (!domain->updated)
+ return;
+
+ update_device_table(domain);
+ flush_devices_by_domain(domain);
+ iommu_flush_domain(domain->id);
+
+ domain->updated = false;
+}
+
/*
- * If the pte_page is not yet allocated this function is called
+ * This function is used to add another level to an IO page table. Adding
+ * another level increases the size of the address space by 9 bits to a size up
+ * to 64 bits.
*/
-static u64* alloc_pte(struct protection_domain *dom,
- unsigned long address, u64 **pte_page, gfp_t gfp)
+static bool increase_address_space(struct protection_domain *domain,
+ gfp_t gfp)
+{
+ u64 *pte;
+
+ if (domain->mode == PAGE_MODE_6_LEVEL)
+ /* address space already 64 bit large */
+ return false;
+
+ pte = (void *)get_zeroed_page(gfp);
+ if (!pte)
+ return false;
+
+ *pte = PM_LEVEL_PDE(domain->mode,
+ virt_to_phys(domain->pt_root));
+ domain->pt_root = pte;
+ domain->mode += 1;
+ domain->updated = true;
+
+ return true;
+}
+
+static u64 *alloc_pte(struct protection_domain *domain,
+ unsigned long address,
+ int end_lvl,
+ u64 **pte_page,
+ gfp_t gfp)
{
u64 *pte, *page;
+ int level;
- pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
+ while (address > PM_LEVEL_SIZE(domain->mode))
+ increase_address_space(domain, gfp);
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(gfp);
- if (!page)
- return NULL;
- *pte = IOMMU_L2_PDE(virt_to_phys(page));
- }
+ level = domain->mode - 1;
+ pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+ while (level > end_lvl) {
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(gfp);
+ if (!page)
+ return NULL;
+ *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
+ }
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(gfp);
- if (!page)
- return NULL;
- *pte = IOMMU_L1_PDE(virt_to_phys(page));
- }
+ level -= 1;
- pte = IOMMU_PTE_PAGE(*pte);
+ pte = IOMMU_PTE_PAGE(*pte);
- if (pte_page)
- *pte_page = pte;
+ if (pte_page && level == end_lvl)
+ *pte_page = pte;
- pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+ pte = &pte[PM_LEVEL_INDEX(level, address)];
+ }
return pte;
}
@@ -1344,10 +1513,13 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
if (!pte) {
- pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
+ pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
+ GFP_ATOMIC);
aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
} else
- pte += IOMMU_PTE_L0_INDEX(address);
+ pte += PM_LEVEL_INDEX(0, address);
+
+ update_domain(&dom->domain);
return pte;
}
@@ -1409,7 +1581,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
if (!pte)
return;
- pte += IOMMU_PTE_L0_INDEX(address);
+ pte += PM_LEVEL_INDEX(0, address);
WARN_ON(!*pte);
@@ -1988,19 +2160,47 @@ static void cleanup_domain(struct protection_domain *domain)
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}
-static int amd_iommu_domain_init(struct iommu_domain *dom)
+static void protection_domain_free(struct protection_domain *domain)
+{
+ if (!domain)
+ return;
+
+ if (domain->id)
+ domain_id_free(domain->id);
+
+ kfree(domain);
+}
+
+static struct protection_domain *protection_domain_alloc(void)
{
struct protection_domain *domain;
domain = kzalloc(sizeof(*domain), GFP_KERNEL);
if (!domain)
- return -ENOMEM;
+ return NULL;
spin_lock_init(&domain->lock);
- domain->mode = PAGE_MODE_3_LEVEL;
domain->id = domain_id_alloc();
if (!domain->id)
+ goto out_err;
+
+ return domain;
+
+out_err:
+ kfree(domain);
+
+ return NULL;
+}
+
+static int amd_iommu_domain_init(struct iommu_domain *dom)
+{
+ struct protection_domain *domain;
+
+ domain = protection_domain_alloc();
+ if (!domain)
goto out_free;
+
+ domain->mode = PAGE_MODE_3_LEVEL;
domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
if (!domain->pt_root)
goto out_free;
@@ -2010,7 +2210,7 @@ static int amd_iommu_domain_init(struct iommu_domain *dom)
return 0;
out_free:
- kfree(domain);
+ protection_domain_free(domain);
return -ENOMEM;
}
@@ -2115,7 +2315,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
paddr &= PAGE_MASK;
for (i = 0; i < npages; ++i) {
- ret = iommu_map_page(domain, iova, paddr, prot);
+ ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
if (ret)
return ret;
@@ -2136,7 +2336,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
iova &= PAGE_MASK;
for (i = 0; i < npages; ++i) {
- iommu_unmap_page(domain, iova);
+ iommu_unmap_page(domain, iova, PM_MAP_4k);
iova += PAGE_SIZE;
}
@@ -2151,21 +2351,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
phys_addr_t paddr;
u64 *pte;
- pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)];
-
- if (!IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(iova)];
-
- if (!IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L0_INDEX(iova)];
+ pte = fetch_pte(domain, iova, PM_MAP_4k);
- if (!IOMMU_PTE_PRESENT(*pte))
+ if (!pte || !IOMMU_PTE_PRESENT(*pte))
return 0;
paddr = *pte & IOMMU_PAGE_MASK;
@@ -2191,3 +2379,46 @@ static struct iommu_ops amd_iommu_ops = {
.domain_has_cap = amd_iommu_domain_has_cap,
};
+/*****************************************************************************
+ *
+ * The next functions do a basic initialization of IOMMU for pass through
+ * mode
+ *
+ * In passthrough mode the IOMMU is initialized and enabled but not used for
+ * DMA-API translation.
+ *
+ *****************************************************************************/
+
+int __init amd_iommu_init_passthrough(void)
+{
+ struct pci_dev *dev = NULL;
+ u16 devid, devid2;
+
+ /* allocate passthroug domain */
+ pt_domain = protection_domain_alloc();
+ if (!pt_domain)
+ return -ENOMEM;
+
+ pt_domain->mode |= PAGE_MODE_NONE;
+
+ while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+ struct amd_iommu *iommu;
+
+ devid = calc_devid(dev->bus->number, dev->devfn);
+ if (devid > amd_iommu_last_bdf)
+ continue;
+
+ devid2 = amd_iommu_alias_table[devid];
+
+ iommu = amd_iommu_rlookup_table[devid2];
+ if (!iommu)
+ continue;
+
+ __attach_device(iommu, pt_domain, devid);
+ __attach_device(iommu, pt_domain, devid2);
+ }
+
+ pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
+
+ return 0;
+}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c1b17e97252..b4b61d462dc 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -252,7 +252,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
/* Function to enable the hardware */
static void iommu_enable(struct amd_iommu *iommu)
{
- printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
+ printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n",
dev_name(&iommu->dev->dev), iommu->cap_ptr);
iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
@@ -435,6 +435,20 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
}
/*
+ * This function resets the command buffer if the IOMMU stopped fetching
+ * commands from it.
+ */
+void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
+{
+ iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+
+ writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+ writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
+ iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+}
+
+/*
* This function writes the command buffer address to the hardware and
* enables it.
*/
@@ -450,11 +464,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
&entry, sizeof(entry));
- /* set head and tail to zero manually */
- writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
- writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
- iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+ amd_iommu_reset_cmd_buffer(iommu);
}
static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -858,7 +868,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
switch (*p) {
case ACPI_IVHD_TYPE:
- DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
+ DUMP_printk("device: %02x:%02x.%01x cap: %04x "
"seg: %d flags: %01x info %04x\n",
PCI_BUS(h->devid), PCI_SLOT(h->devid),
PCI_FUNC(h->devid), h->cap_ptr,
@@ -902,7 +912,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
IRQF_SAMPLE_RANDOM,
- "AMD IOMMU",
+ "AMD-Vi",
NULL);
if (r) {
@@ -1150,7 +1160,7 @@ int __init amd_iommu_init(void)
if (no_iommu) {
- printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n");
+ printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
return 0;
}
@@ -1242,22 +1252,28 @@ int __init amd_iommu_init(void)
if (ret)
goto free;
- ret = amd_iommu_init_dma_ops();
+ if (iommu_pass_through)
+ ret = amd_iommu_init_passthrough();
+ else
+ ret = amd_iommu_init_dma_ops();
if (ret)
goto free;
enable_iommus();
- printk(KERN_INFO "AMD IOMMU: device isolation ");
+ if (iommu_pass_through)
+ goto out;
+
+ printk(KERN_INFO "AMD-Vi: device isolation ");
if (amd_iommu_isolate)
printk("enabled\n");
else
printk("disabled\n");
if (amd_iommu_unmap_flush)
- printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
+ printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
else
- printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
+ printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
out:
return ret;
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index b3025b43b63..db7220220d0 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,7 @@
int unknown_nmi_panic;
int nmi_watchdog_enabled;
-static cpumask_var_t backtrace_mask;
+static cpumask_t backtrace_mask __read_mostly;
/* nmi_active:
* >0: the lapic NMI watchdog is active, but can be disabled
@@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void)
if (!prev_nmi_count)
goto error;
- alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO);
printk(KERN_INFO "Testing NMI watchdog ... ");
#ifdef CONFIG_SMP
@@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
}
/* We can be called before check_nmi_watchdog, hence NULL check. */
- if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) {
+ if (cpumask_test_cpu(cpu, &backtrace_mask)) {
static DEFINE_SPINLOCK(lock); /* Serialise the printks */
spin_lock(&lock);
printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
+ show_regs(regs);
dump_stack();
spin_unlock(&lock);
- cpumask_clear_cpu(cpu, backtrace_mask);
+ cpumask_clear_cpu(cpu, &backtrace_mask);
+
+ rc = 1;
}
/* Could check oops_in_progress here too, but it's safer not to */
@@ -552,14 +554,18 @@ int do_nmi_callback(struct pt_regs *regs, int cpu)
return 0;
}
-void __trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(void)
{
int i;
- cpumask_copy(backtrace_mask, cpu_online_mask);
+ cpumask_copy(&backtrace_mask, cpu_online_mask);
+
+ printk(KERN_INFO "sending NMI to all CPUs:\n");
+ apic->send_IPI_all(NMI_VECTOR);
+
/* Wait for up to 10 seconds for all CPUs to do the backtrace */
for (i = 0; i < 10 * 1000; i++) {
- if (cpumask_empty(backtrace_mask))
+ if (cpumask_empty(&backtrace_mask))
break;
mdelay(1);
}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 898ecc47e12..4a6aeedcd96 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -3,6 +3,7 @@
* This code generates raw asm output which is post-processed to extract
* and format the required data.
*/
+#define COMPILE_OFFSETS
#include <linux/crypto.h>
#include <linux/sched.h>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 900332b800f..f9cd0849bd4 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -6,6 +6,7 @@
* Copyright (C) 2009 Jaswinder Singh Rajput
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
*
* For licencing details see kernel-base/COPYING
*/
@@ -20,6 +21,7 @@
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
+#include <linux/cpu.h>
#include <asm/apic.h>
#include <asm/stacktrace.h>
@@ -27,12 +29,52 @@
static u64 perf_counter_mask __read_mostly;
+/* The maximal number of PEBS counters: */
+#define MAX_PEBS_COUNTERS 4
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE 24
+
+/* The size of a per-cpu BTS buffer in bytes: */
+#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024)
+
+/* The BTS overflow threshold in bytes from the end of the buffer: */
+#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64)
+
+
+/*
+ * Bits in the debugctlmsr controlling branch tracing.
+ */
+#define X86_DEBUGCTL_TR (1 << 6)
+#define X86_DEBUGCTL_BTS (1 << 7)
+#define X86_DEBUGCTL_BTINT (1 << 8)
+#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
+#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+ u64 bts_buffer_base;
+ u64 bts_index;
+ u64 bts_absolute_maximum;
+ u64 bts_interrupt_threshold;
+ u64 pebs_buffer_base;
+ u64 pebs_index;
+ u64 pebs_absolute_maximum;
+ u64 pebs_interrupt_threshold;
+ u64 pebs_counter_reset[MAX_PEBS_COUNTERS];
+};
+
struct cpu_hw_counters {
struct perf_counter *counters[X86_PMC_IDX_MAX];
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long interrupts;
int enabled;
+ struct debug_store *ds;
};
/*
@@ -58,6 +100,8 @@ struct x86_pmu {
int apic;
u64 max_period;
u64 intel_ctrl;
+ void (*enable_bts)(u64 config);
+ void (*disable_bts)(void);
};
static struct x86_pmu x86_pmu __read_mostly;
@@ -577,6 +621,9 @@ x86_perf_counter_update(struct perf_counter *counter,
u64 prev_raw_count, new_raw_count;
s64 delta;
+ if (idx == X86_PMC_IDX_FIXED_BTS)
+ return 0;
+
/*
* Careful: an NMI might modify the previous counter value.
*
@@ -666,10 +713,110 @@ static void release_pmc_hardware(void)
#endif
}
+static inline bool bts_available(void)
+{
+ return x86_pmu.enable_bts != NULL;
+}
+
+static inline void init_debug_store_on_cpu(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
+
+ if (!ds)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+ (u32)((u64)(unsigned long)ds),
+ (u32)((u64)(unsigned long)ds >> 32));
+}
+
+static inline void fini_debug_store_on_cpu(int cpu)
+{
+ if (!per_cpu(cpu_hw_counters, cpu).ds)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static void release_bts_hardware(void)
+{
+ int cpu;
+
+ if (!bts_available())
+ return;
+
+ get_online_cpus();
+
+ for_each_online_cpu(cpu)
+ fini_debug_store_on_cpu(cpu);
+
+ for_each_possible_cpu(cpu) {
+ struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
+
+ if (!ds)
+ continue;
+
+ per_cpu(cpu_hw_counters, cpu).ds = NULL;
+
+ kfree((void *)(unsigned long)ds->bts_buffer_base);
+ kfree(ds);
+ }
+
+ put_online_cpus();
+}
+
+static int reserve_bts_hardware(void)
+{
+ int cpu, err = 0;
+
+ if (!bts_available())
+ return 0;
+
+ get_online_cpus();
+
+ for_each_possible_cpu(cpu) {
+ struct debug_store *ds;
+ void *buffer;
+
+ err = -ENOMEM;
+ buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+ if (unlikely(!buffer))
+ break;
+
+ ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+ if (unlikely(!ds)) {
+ kfree(buffer);
+ break;
+ }
+
+ ds->bts_buffer_base = (u64)(unsigned long)buffer;
+ ds->bts_index = ds->bts_buffer_base;
+ ds->bts_absolute_maximum =
+ ds->bts_buffer_base + BTS_BUFFER_SIZE;
+ ds->bts_interrupt_threshold =
+ ds->bts_absolute_maximum - BTS_OVFL_TH;
+
+ per_cpu(cpu_hw_counters, cpu).ds = ds;
+ err = 0;
+ }
+
+ if (err)
+ release_bts_hardware();
+ else {
+ for_each_online_cpu(cpu)
+ init_debug_store_on_cpu(cpu);
+ }
+
+ put_online_cpus();
+
+ return err;
+}
+
static void hw_perf_counter_destroy(struct perf_counter *counter)
{
if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
release_pmc_hardware();
+ release_bts_hardware();
mutex_unlock(&pmc_reserve_mutex);
}
}
@@ -712,6 +859,42 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
return 0;
}
+static void intel_pmu_enable_bts(u64 config)
+{
+ unsigned long debugctlmsr;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr |= X86_DEBUGCTL_TR;
+ debugctlmsr |= X86_DEBUGCTL_BTS;
+ debugctlmsr |= X86_DEBUGCTL_BTINT;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+ debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+ debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_bts(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ unsigned long debugctlmsr;
+
+ if (!cpuc->ds)
+ return;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr &=
+ ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
+ X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+
+ update_debugctlmsr(debugctlmsr);
+}
+
/*
* Setup the hardware configuration for a given attr_type
*/
@@ -728,9 +911,13 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
err = 0;
if (!atomic_inc_not_zero(&active_counters)) {
mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
- err = -EBUSY;
- else
+ if (atomic_read(&active_counters) == 0) {
+ if (!reserve_pmc_hardware())
+ err = -EBUSY;
+ else
+ err = reserve_bts_hardware();
+ }
+ if (!err)
atomic_inc(&active_counters);
mutex_unlock(&pmc_reserve_mutex);
}
@@ -793,6 +980,20 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
if (config == -1LL)
return -EINVAL;
+ /*
+ * Branch tracing:
+ */
+ if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
+ (hwc->sample_period == 1)) {
+ /* BTS is not supported by this architecture. */
+ if (!bts_available())
+ return -EOPNOTSUPP;
+
+ /* BTS is currently only allowed for user-mode. */
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+ return -EOPNOTSUPP;
+ }
+
hwc->config |= config;
return 0;
@@ -817,7 +1018,18 @@ static void p6_pmu_disable_all(void)
static void intel_pmu_disable_all(void)
{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ if (!cpuc->enabled)
+ return;
+
+ cpuc->enabled = 0;
+ barrier();
+
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+ if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ intel_pmu_disable_bts();
}
static void amd_pmu_disable_all(void)
@@ -875,7 +1087,25 @@ static void p6_pmu_enable_all(void)
static void intel_pmu_enable_all(void)
{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ if (cpuc->enabled)
+ return;
+
+ cpuc->enabled = 1;
+ barrier();
+
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+ if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+ struct perf_counter *counter =
+ cpuc->counters[X86_PMC_IDX_FIXED_BTS];
+
+ if (WARN_ON_ONCE(!counter))
+ return;
+
+ intel_pmu_enable_bts(counter->hw.config);
+ }
}
static void amd_pmu_enable_all(void)
@@ -962,6 +1192,11 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
static inline void
intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
+ if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+ intel_pmu_disable_bts();
+ return;
+ }
+
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_disable_fixed(hwc, idx);
return;
@@ -990,6 +1225,9 @@ x86_perf_counter_set_period(struct perf_counter *counter,
s64 period = hwc->sample_period;
int err, ret = 0;
+ if (idx == X86_PMC_IDX_FIXED_BTS)
+ return 0;
+
/*
* If we are way outside a reasoable range then just skip forward:
*/
@@ -1072,6 +1310,14 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
+ if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+ if (!__get_cpu_var(cpu_hw_counters).enabled)
+ return;
+
+ intel_pmu_enable_bts(hwc->config);
+ return;
+ }
+
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_enable_fixed(hwc, idx);
return;
@@ -1093,11 +1339,16 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
{
unsigned int event;
+ event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+ if (unlikely((event ==
+ x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
+ (hwc->sample_period == 1)))
+ return X86_PMC_IDX_FIXED_BTS;
+
if (!x86_pmu.num_counters_fixed)
return -1;
- event = hwc->config & ARCH_PERFMON_EVENT_MASK;
-
if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
return X86_PMC_IDX_FIXED_INSTRUCTIONS;
if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
@@ -1118,7 +1369,15 @@ static int x86_pmu_enable(struct perf_counter *counter)
int idx;
idx = fixed_mode_idx(counter, hwc);
- if (idx >= 0) {
+ if (idx == X86_PMC_IDX_FIXED_BTS) {
+ /* BTS is already occupied. */
+ if (test_and_set_bit(idx, cpuc->used_mask))
+ return -EAGAIN;
+
+ hwc->config_base = 0;
+ hwc->counter_base = 0;
+ hwc->idx = idx;
+ } else if (idx >= 0) {
/*
* Try to get the fixed counter, if that is already taken
* then try to get a generic counter:
@@ -1229,6 +1488,44 @@ void perf_counter_print_debug(void)
local_irq_restore(flags);
}
+static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
+ struct perf_sample_data *data)
+{
+ struct debug_store *ds = cpuc->ds;
+ struct bts_record {
+ u64 from;
+ u64 to;
+ u64 flags;
+ };
+ struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
+ unsigned long orig_ip = data->regs->ip;
+ struct bts_record *at, *top;
+
+ if (!counter)
+ return;
+
+ if (!ds)
+ return;
+
+ at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+ top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+ ds->bts_index = ds->bts_buffer_base;
+
+ for (; at < top; at++) {
+ data->regs->ip = at->from;
+ data->addr = at->to;
+
+ perf_counter_output(counter, 1, data);
+ }
+
+ data->regs->ip = orig_ip;
+ data->addr = 0;
+
+ /* There's new data available. */
+ counter->pending_kill = POLL_IN;
+}
+
static void x86_pmu_disable(struct perf_counter *counter)
{
struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -1253,6 +1550,15 @@ static void x86_pmu_disable(struct perf_counter *counter)
* that we are disabling:
*/
x86_perf_counter_update(counter, hwc, idx);
+
+ /* Drain the remaining BTS records. */
+ if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+ struct perf_sample_data data;
+ struct pt_regs regs;
+
+ data.regs = &regs;
+ intel_pmu_drain_bts_buffer(cpuc, &data);
+ }
cpuc->counters[idx] = NULL;
clear_bit(idx, cpuc->used_mask);
@@ -1280,6 +1586,7 @@ static int intel_pmu_save_and_restart(struct perf_counter *counter)
static void intel_pmu_reset(void)
{
+ struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds;
unsigned long flags;
int idx;
@@ -1297,6 +1604,8 @@ static void intel_pmu_reset(void)
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
}
+ if (ds)
+ ds->bts_index = ds->bts_buffer_base;
local_irq_restore(flags);
}
@@ -1362,6 +1671,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
cpuc = &__get_cpu_var(cpu_hw_counters);
perf_disable();
+ intel_pmu_drain_bts_buffer(cpuc, &data);
status = intel_pmu_get_status();
if (!status) {
perf_enable();
@@ -1571,6 +1881,8 @@ static struct x86_pmu intel_pmu = {
* the generic counter period:
*/
.max_period = (1ULL << 31) - 1,
+ .enable_bts = intel_pmu_enable_bts,
+ .disable_bts = intel_pmu_disable_bts,
};
static struct x86_pmu amd_pmu = {
@@ -1962,3 +2274,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
return entry;
}
+
+void hw_perf_counter_setup_online(int cpu)
+{
+ init_debug_store_on_cpu(cpu);
+}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d94e1ea3b9f..9dbb527e165 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
unsigned long return_hooker = (unsigned long)
&return_to_handler;
- /* Nmi's are currently unsupported */
- if (unlikely(in_nmi()))
- return;
-
if (unlikely(atomic_read(&current->tracing_graph_pause)))
return;
@@ -498,37 +494,56 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
struct syscall_metadata *syscall_nr_to_meta(int nr)
{
- if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0)
+ if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
return NULL;
return syscalls_metadata[nr];
}
-void arch_init_ftrace_syscalls(void)
+int syscall_name_to_nr(char *name)
+{
+ int i;
+
+ if (!syscalls_metadata)
+ return -1;
+
+ for (i = 0; i < NR_syscalls; i++) {
+ if (syscalls_metadata[i]) {
+ if (!strcmp(syscalls_metadata[i]->name, name))
+ return i;
+ }
+ }
+ return -1;
+}
+
+void set_syscall_enter_id(int num, int id)
+{
+ syscalls_metadata[num]->enter_id = id;
+}
+
+void set_syscall_exit_id(int num, int id)
+{
+ syscalls_metadata[num]->exit_id = id;
+}
+
+static int __init arch_init_ftrace_syscalls(void)
{
int i;
struct syscall_metadata *meta;
unsigned long **psys_syscall_table = &sys_call_table;
- static atomic_t refs;
-
- if (atomic_inc_return(&refs) != 1)
- goto end;
syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
- FTRACE_SYSCALL_MAX, GFP_KERNEL);
+ NR_syscalls, GFP_KERNEL);
if (!syscalls_metadata) {
WARN_ON(1);
- return;
+ return -ENOMEM;
}
- for (i = 0; i < FTRACE_SYSCALL_MAX; i++) {
+ for (i = 0; i < NR_syscalls; i++) {
meta = find_syscall_meta(psys_syscall_table[i]);
syscalls_metadata[i] = meta;
}
- return;
-
- /* Paranoid: avoid overflow */
-end:
- atomic_dec(&refs);
+ return 0;
}
+arch_initcall(arch_init_ftrace_syscalls);
#endif
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index fa80f60e960..d71c8655905 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -33,7 +33,14 @@ int no_iommu __read_mostly;
/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;
-int iommu_pass_through;
+/*
+ * This variable becomes 1 if iommu=pt is passed on the kernel command line.
+ * If this variable is 1, IOMMU implementations do no DMA ranslation for
+ * devices and allow every device to access to whole physical memory. This is
+ * useful if a user want to use an IOMMU only for KVM device assignment to
+ * guests and not for driver dma translation.
+ */
+int iommu_pass_through __read_mostly;
dma_addr_t bad_dma_address __read_mostly = 0;
EXPORT_SYMBOL(bad_dma_address);
@@ -153,7 +160,7 @@ again:
return NULL;
addr = page_to_phys(page);
- if (!is_buffer_dma_capable(dma_mask, addr, size)) {
+ if (addr + size > dma_mask) {
__free_pages(page, get_order(size));
if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) {
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index d2e56b8f48e..98a827ee9ed 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -190,14 +190,13 @@ static void iommu_full(struct device *dev, size_t size, int dir)
static inline int
need_iommu(struct device *dev, unsigned long addr, size_t size)
{
- return force_iommu ||
- !is_buffer_dma_capable(*dev->dma_mask, addr, size);
+ return force_iommu || !dma_capable(dev, addr, size);
}
static inline int
nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
{
- return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
+ return !dma_capable(dev, addr, size);
}
/* Map a single continuous physical area into the IOMMU.
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 71d412a09f3..a3933d4330c 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -14,7 +14,7 @@
static int
check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
{
- if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
+ if (hwdev && !dma_capable(hwdev, bus, size)) {
if (*hwdev->dma_mask >= DMA_BIT_MASK(32))
printk(KERN_ERR
"nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -79,12 +79,29 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
free_pages((unsigned long)vaddr, get_order(size));
}
+static void nommu_sync_single_for_device(struct device *dev,
+ dma_addr_t addr, size_t size,
+ enum dma_data_direction dir)
+{
+ flush_write_buffers();
+}
+
+
+static void nommu_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sg, int nelems,
+ enum dma_data_direction dir)
+{
+ flush_write_buffers();
+}
+
struct dma_map_ops nommu_dma_ops = {
- .alloc_coherent = dma_generic_alloc_coherent,
- .free_coherent = nommu_free_coherent,
- .map_sg = nommu_map_sg,
- .map_page = nommu_map_page,
- .is_phys = 1,
+ .alloc_coherent = dma_generic_alloc_coherent,
+ .free_coherent = nommu_free_coherent,
+ .map_sg = nommu_map_sg,
+ .map_page = nommu_map_page,
+ .sync_single_for_device = nommu_sync_single_for_device,
+ .sync_sg_for_device = nommu_sync_sg_for_device,
+ .is_phys = 1,
};
void __init no_iommu_init(void)
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 6af96ee4420..e8a35016115 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -13,31 +13,6 @@
int swiotlb __read_mostly;
-void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs)
-{
- return alloc_bootmem_low_pages(size);
-}
-
-void *swiotlb_alloc(unsigned order, unsigned long nslabs)
-{
- return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
-}
-
-dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
-{
- return paddr;
-}
-
-phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
-{
- return baddr;
-}
-
-int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
-{
- return 0;
-}
-
static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
dma_addr_t *dma_handle, gfp_t flags)
{
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 09ecbde91c1..8d7d5c9c1be 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -35,10 +35,11 @@
#include <asm/proto.h>
#include <asm/ds.h>
-#include <trace/syscall.h>
-
#include "tls.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
enum x86_regset {
REGSET_GENERAL,
REGSET_FP,
@@ -1497,8 +1498,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
tracehook_report_syscall_entry(regs))
ret = -1L;
- if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
- ftrace_syscall_enter(regs);
+ if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+ trace_sys_enter(regs, regs->orig_ax);
if (unlikely(current->audit_context)) {
if (IS_IA32)
@@ -1523,8 +1524,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
if (unlikely(current->audit_context))
audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
- if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
- ftrace_syscall_exit(regs);
+ if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+ trace_sys_exit(regs, regs->ax);
if (test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall_exit(regs, 0);
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 6bc211accf0..45e00eb09c3 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -18,9 +18,9 @@
#include <asm/ia32.h>
#include <asm/syscalls.h>
-asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long off)
+SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, off)
{
long error;
struct file *file;
@@ -226,7 +226,7 @@ bottomup:
}
-asmlinkage long sys_uname(struct new_utsname __user *name)
+SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
{
int err;
down_read(&uts_sem);
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 89b9a5cd63d..cb88b1a0bd5 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -1,11 +1,14 @@
/**
* @file nmi_int.c
*
- * @remark Copyright 2002-2008 OProfile authors
+ * @remark Copyright 2002-2009 OProfile authors
* @remark Read the file COPYING
*
* @author John Levon <levon@movementarian.org>
* @author Robert Richter <robert.richter@amd.com>
+ * @author Barry Kasindorf <barry.kasindorf@amd.com>
+ * @author Jason Yeh <jason.yeh@amd.com>
+ * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
*/
#include <linux/init.h>
@@ -24,13 +27,35 @@
#include "op_counter.h"
#include "op_x86_model.h"
-static struct op_x86_model_spec const *model;
+static struct op_x86_model_spec *model;
static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
/* 0 == registered but off, 1 == registered and on */
static int nmi_enabled = 0;
+struct op_counter_config counter_config[OP_MAX_COUNTER];
+
+/* common functions */
+
+u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
+ struct op_counter_config *counter_config)
+{
+ u64 val = 0;
+ u16 event = (u16)counter_config->event;
+
+ val |= ARCH_PERFMON_EVENTSEL_INT;
+ val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0;
+ val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0;
+ val |= (counter_config->unit_mask & 0xFF) << 8;
+ event &= model->event_mask ? model->event_mask : 0xFF;
+ val |= event & 0xFF;
+ val |= (event & 0x0F00) << 24;
+
+ return val;
+}
+
+
static int profile_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data)
{
@@ -52,36 +77,214 @@ static int profile_exceptions_notify(struct notifier_block *self,
static void nmi_cpu_save_registers(struct op_msrs *msrs)
{
- unsigned int const nr_ctrs = model->num_counters;
- unsigned int const nr_ctrls = model->num_controls;
struct op_msr *counters = msrs->counters;
struct op_msr *controls = msrs->controls;
unsigned int i;
- for (i = 0; i < nr_ctrs; ++i) {
- if (counters[i].addr) {
- rdmsr(counters[i].addr,
- counters[i].saved.low,
- counters[i].saved.high);
- }
+ for (i = 0; i < model->num_counters; ++i) {
+ if (counters[i].addr)
+ rdmsrl(counters[i].addr, counters[i].saved);
+ }
+
+ for (i = 0; i < model->num_controls; ++i) {
+ if (controls[i].addr)
+ rdmsrl(controls[i].addr, controls[i].saved);
+ }
+}
+
+static void nmi_cpu_start(void *dummy)
+{
+ struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
+ model->start(msrs);
+}
+
+static int nmi_start(void)
+{
+ on_each_cpu(nmi_cpu_start, NULL, 1);
+ return 0;
+}
+
+static void nmi_cpu_stop(void *dummy)
+{
+ struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
+ model->stop(msrs);
+}
+
+static void nmi_stop(void)
+{
+ on_each_cpu(nmi_cpu_stop, NULL, 1);
+}
+
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+
+static DEFINE_PER_CPU(int, switch_index);
+
+static inline int has_mux(void)
+{
+ return !!model->switch_ctrl;
+}
+
+inline int op_x86_phys_to_virt(int phys)
+{
+ return __get_cpu_var(switch_index) + phys;
+}
+
+inline int op_x86_virt_to_phys(int virt)
+{
+ return virt % model->num_counters;
+}
+
+static void nmi_shutdown_mux(void)
+{
+ int i;
+
+ if (!has_mux())
+ return;
+
+ for_each_possible_cpu(i) {
+ kfree(per_cpu(cpu_msrs, i).multiplex);
+ per_cpu(cpu_msrs, i).multiplex = NULL;
+ per_cpu(switch_index, i) = 0;
}
+}
+
+static int nmi_setup_mux(void)
+{
+ size_t multiplex_size =
+ sizeof(struct op_msr) * model->num_virt_counters;
+ int i;
+
+ if (!has_mux())
+ return 1;
+
+ for_each_possible_cpu(i) {
+ per_cpu(cpu_msrs, i).multiplex =
+ kmalloc(multiplex_size, GFP_KERNEL);
+ if (!per_cpu(cpu_msrs, i).multiplex)
+ return 0;
+ }
+
+ return 1;
+}
+
+static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
+{
+ int i;
+ struct op_msr *multiplex = msrs->multiplex;
+
+ if (!has_mux())
+ return;
- for (i = 0; i < nr_ctrls; ++i) {
- if (controls[i].addr) {
- rdmsr(controls[i].addr,
- controls[i].saved.low,
- controls[i].saved.high);
+ for (i = 0; i < model->num_virt_counters; ++i) {
+ if (counter_config[i].enabled) {
+ multiplex[i].saved = -(u64)counter_config[i].count;
+ } else {
+ multiplex[i].addr = 0;
+ multiplex[i].saved = 0;
}
}
+
+ per_cpu(switch_index, cpu) = 0;
+}
+
+static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs)
+{
+ struct op_msr *multiplex = msrs->multiplex;
+ int i;
+
+ for (i = 0; i < model->num_counters; ++i) {
+ int virt = op_x86_phys_to_virt(i);
+ if (multiplex[virt].addr)
+ rdmsrl(multiplex[virt].addr, multiplex[virt].saved);
+ }
+}
+
+static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs)
+{
+ struct op_msr *multiplex = msrs->multiplex;
+ int i;
+
+ for (i = 0; i < model->num_counters; ++i) {
+ int virt = op_x86_phys_to_virt(i);
+ if (multiplex[virt].addr)
+ wrmsrl(multiplex[virt].addr, multiplex[virt].saved);
+ }
}
-static void nmi_save_registers(void *dummy)
+static void nmi_cpu_switch(void *dummy)
{
int cpu = smp_processor_id();
+ int si = per_cpu(switch_index, cpu);
struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
- nmi_cpu_save_registers(msrs);
+
+ nmi_cpu_stop(NULL);
+ nmi_cpu_save_mpx_registers(msrs);
+
+ /* move to next set */
+ si += model->num_counters;
+ if ((si > model->num_virt_counters) || (counter_config[si].count == 0))
+ per_cpu(switch_index, cpu) = 0;
+ else
+ per_cpu(switch_index, cpu) = si;
+
+ model->switch_ctrl(model, msrs);
+ nmi_cpu_restore_mpx_registers(msrs);
+
+ nmi_cpu_start(NULL);
+}
+
+
+/*
+ * Quick check to see if multiplexing is necessary.
+ * The check should be sufficient since counters are used
+ * in ordre.
+ */
+static int nmi_multiplex_on(void)
+{
+ return counter_config[model->num_counters].count ? 0 : -EINVAL;
+}
+
+static int nmi_switch_event(void)
+{
+ if (!has_mux())
+ return -ENOSYS; /* not implemented */
+ if (nmi_multiplex_on() < 0)
+ return -EINVAL; /* not necessary */
+
+ on_each_cpu(nmi_cpu_switch, NULL, 1);
+
+ return 0;
+}
+
+static inline void mux_init(struct oprofile_operations *ops)
+{
+ if (has_mux())
+ ops->switch_events = nmi_switch_event;
+}
+
+static void mux_clone(int cpu)
+{
+ if (!has_mux())
+ return;
+
+ memcpy(per_cpu(cpu_msrs, cpu).multiplex,
+ per_cpu(cpu_msrs, 0).multiplex,
+ sizeof(struct op_msr) * model->num_virt_counters);
}
+#else
+
+inline int op_x86_phys_to_virt(int phys) { return phys; }
+inline int op_x86_virt_to_phys(int virt) { return virt; }
+static inline void nmi_shutdown_mux(void) { }
+static inline int nmi_setup_mux(void) { return 1; }
+static inline void
+nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { }
+static inline void mux_init(struct oprofile_operations *ops) { }
+static void mux_clone(int cpu) { }
+
+#endif
+
static void free_msrs(void)
{
int i;
@@ -95,38 +298,32 @@ static void free_msrs(void)
static int allocate_msrs(void)
{
- int success = 1;
size_t controls_size = sizeof(struct op_msr) * model->num_controls;
size_t counters_size = sizeof(struct op_msr) * model->num_counters;
int i;
for_each_possible_cpu(i) {
per_cpu(cpu_msrs, i).counters = kmalloc(counters_size,
- GFP_KERNEL);
- if (!per_cpu(cpu_msrs, i).counters) {
- success = 0;
- break;
- }
+ GFP_KERNEL);
+ if (!per_cpu(cpu_msrs, i).counters)
+ return 0;
per_cpu(cpu_msrs, i).controls = kmalloc(controls_size,
- GFP_KERNEL);
- if (!per_cpu(cpu_msrs, i).controls) {
- success = 0;
- break;
- }
+ GFP_KERNEL);
+ if (!per_cpu(cpu_msrs, i).controls)
+ return 0;
}
- if (!success)
- free_msrs();
-
- return success;
+ return 1;
}
static void nmi_cpu_setup(void *dummy)
{
int cpu = smp_processor_id();
struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
+ nmi_cpu_save_registers(msrs);
spin_lock(&oprofilefs_lock);
- model->setup_ctrs(msrs);
+ model->setup_ctrs(model, msrs);
+ nmi_cpu_setup_mux(cpu, msrs);
spin_unlock(&oprofilefs_lock);
per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC);
apic_write(APIC_LVTPC, APIC_DM_NMI);
@@ -144,11 +341,15 @@ static int nmi_setup(void)
int cpu;
if (!allocate_msrs())
- return -ENOMEM;
+ err = -ENOMEM;
+ else if (!nmi_setup_mux())
+ err = -ENOMEM;
+ else
+ err = register_die_notifier(&profile_exceptions_nb);
- err = register_die_notifier(&profile_exceptions_nb);
if (err) {
free_msrs();
+ nmi_shutdown_mux();
return err;
}
@@ -159,45 +360,38 @@ static int nmi_setup(void)
/* Assume saved/restored counters are the same on all CPUs */
model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
for_each_possible_cpu(cpu) {
- if (cpu != 0) {
- memcpy(per_cpu(cpu_msrs, cpu).counters,
- per_cpu(cpu_msrs, 0).counters,
- sizeof(struct op_msr) * model->num_counters);
-
- memcpy(per_cpu(cpu_msrs, cpu).controls,
- per_cpu(cpu_msrs, 0).controls,
- sizeof(struct op_msr) * model->num_controls);
- }
+ if (!cpu)
+ continue;
+
+ memcpy(per_cpu(cpu_msrs, cpu).counters,
+ per_cpu(cpu_msrs, 0).counters,
+ sizeof(struct op_msr) * model->num_counters);
+
+ memcpy(per_cpu(cpu_msrs, cpu).controls,
+ per_cpu(cpu_msrs, 0).controls,
+ sizeof(struct op_msr) * model->num_controls);
+ mux_clone(cpu);
}
- on_each_cpu(nmi_save_registers, NULL, 1);
on_each_cpu(nmi_cpu_setup, NULL, 1);
nmi_enabled = 1;
return 0;
}
-static void nmi_restore_registers(struct op_msrs *msrs)
+static void nmi_cpu_restore_registers(struct op_msrs *msrs)
{
- unsigned int const nr_ctrs = model->num_counters;
- unsigned int const nr_ctrls = model->num_controls;
struct op_msr *counters = msrs->counters;
struct op_msr *controls = msrs->controls;
unsigned int i;
- for (i = 0; i < nr_ctrls; ++i) {
- if (controls[i].addr) {
- wrmsr(controls[i].addr,
- controls[i].saved.low,
- controls[i].saved.high);
- }
+ for (i = 0; i < model->num_controls; ++i) {
+ if (controls[i].addr)
+ wrmsrl(controls[i].addr, controls[i].saved);
}
- for (i = 0; i < nr_ctrs; ++i) {
- if (counters[i].addr) {
- wrmsr(counters[i].addr,
- counters[i].saved.low,
- counters[i].saved.high);
- }
+ for (i = 0; i < model->num_counters; ++i) {
+ if (counters[i].addr)
+ wrmsrl(counters[i].addr, counters[i].saved);
}
}
@@ -205,7 +399,7 @@ static void nmi_cpu_shutdown(void *dummy)
{
unsigned int v;
int cpu = smp_processor_id();
- struct op_msrs *msrs = &__get_cpu_var(cpu_msrs);
+ struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
/* restoring APIC_LVTPC can trigger an apic error because the delivery
* mode and vector nr combination can be illegal. That's by design: on
@@ -216,7 +410,7 @@ static void nmi_cpu_shutdown(void *dummy)
apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
apic_write(APIC_LVTERR, v);
- nmi_restore_registers(msrs);
+ nmi_cpu_restore_registers(msrs);
}
static void nmi_shutdown(void)
@@ -226,42 +420,18 @@ static void nmi_shutdown(void)
nmi_enabled = 0;
on_each_cpu(nmi_cpu_shutdown, NULL, 1);
unregister_die_notifier(&profile_exceptions_nb);
+ nmi_shutdown_mux();
msrs = &get_cpu_var(cpu_msrs);
model->shutdown(msrs);
free_msrs();
put_cpu_var(cpu_msrs);
}
-static void nmi_cpu_start(void *dummy)
-{
- struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
- model->start(msrs);
-}
-
-static int nmi_start(void)
-{
- on_each_cpu(nmi_cpu_start, NULL, 1);
- return 0;
-}
-
-static void nmi_cpu_stop(void *dummy)
-{
- struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
- model->stop(msrs);
-}
-
-static void nmi_stop(void)
-{
- on_each_cpu(nmi_cpu_stop, NULL, 1);
-}
-
-struct op_counter_config counter_config[OP_MAX_COUNTER];
-
static int nmi_create_files(struct super_block *sb, struct dentry *root)
{
unsigned int i;
- for (i = 0; i < model->num_counters; ++i) {
+ for (i = 0; i < model->num_virt_counters; ++i) {
struct dentry *dir;
char buf[4];
@@ -270,7 +440,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
* NOTE: assumes 1:1 mapping here (that counters are organized
* sequentially in their struct assignment).
*/
- if (unlikely(!avail_to_resrv_perfctr_nmi_bit(i)))
+ if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i)))
continue;
snprintf(buf, sizeof(buf), "%d", i);
@@ -402,6 +572,7 @@ module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0);
static int __init ppro_init(char **cpu_type)
{
__u8 cpu_model = boot_cpu_data.x86_model;
+ struct op_x86_model_spec *spec = &op_ppro_spec; /* default */
if (force_arch_perfmon && cpu_has_arch_perfmon)
return 0;
@@ -428,7 +599,7 @@ static int __init ppro_init(char **cpu_type)
*cpu_type = "i386/core_2";
break;
case 26:
- arch_perfmon_setup_counters();
+ spec = &op_arch_perfmon_spec;
*cpu_type = "i386/core_i7";
break;
case 28:
@@ -439,17 +610,7 @@ static int __init ppro_init(char **cpu_type)
return 0;
}
- model = &op_ppro_spec;
- return 1;
-}
-
-static int __init arch_perfmon_init(char **cpu_type)
-{
- if (!cpu_has_arch_perfmon)
- return 0;
- *cpu_type = "i386/arch_perfmon";
- model = &op_arch_perfmon_spec;
- arch_perfmon_setup_counters();
+ model = spec;
return 1;
}
@@ -471,27 +632,26 @@ int __init op_nmi_init(struct oprofile_operations *ops)
/* Needs to be at least an Athlon (or hammer in 32bit mode) */
switch (family) {
- default:
- return -ENODEV;
case 6:
- model = &op_amd_spec;
cpu_type = "i386/athlon";
break;
case 0xf:
- model = &op_amd_spec;
- /* Actually it could be i386/hammer too, but give
- user space an consistent name. */
+ /*
+ * Actually it could be i386/hammer too, but
+ * give user space an consistent name.
+ */
cpu_type = "x86-64/hammer";
break;
case 0x10:
- model = &op_amd_spec;
cpu_type = "x86-64/family10";
break;
case 0x11:
- model = &op_amd_spec;
cpu_type = "x86-64/family11h";
break;
+ default:
+ return -ENODEV;
}
+ model = &op_amd_spec;
break;
case X86_VENDOR_INTEL:
@@ -510,8 +670,15 @@ int __init op_nmi_init(struct oprofile_operations *ops)
break;
}
- if (!cpu_type && !arch_perfmon_init(&cpu_type))
+ if (cpu_type)
+ break;
+
+ if (!cpu_has_arch_perfmon)
return -ENODEV;
+
+ /* use arch perfmon as fallback */
+ cpu_type = "i386/arch_perfmon";
+ model = &op_arch_perfmon_spec;
break;
default:
@@ -522,18 +689,23 @@ int __init op_nmi_init(struct oprofile_operations *ops)
register_cpu_notifier(&oprofile_cpu_nb);
#endif
/* default values, can be overwritten by model */
- ops->create_files = nmi_create_files;
- ops->setup = nmi_setup;
- ops->shutdown = nmi_shutdown;
- ops->start = nmi_start;
- ops->stop = nmi_stop;
- ops->cpu_type = cpu_type;
+ ops->create_files = nmi_create_files;
+ ops->setup = nmi_setup;
+ ops->shutdown = nmi_shutdown;
+ ops->start = nmi_start;
+ ops->stop = nmi_stop;
+ ops->cpu_type = cpu_type;
if (model->init)
ret = model->init(ops);
if (ret)
return ret;
+ if (!model->num_virt_counters)
+ model->num_virt_counters = model->num_counters;
+
+ mux_init(ops);
+
init_sysfs();
using_nmi = 1;
printk(KERN_INFO "oprofile: using NMI interrupt.\n");
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
index 91b6a116165..e28398df0df 100644
--- a/arch/x86/oprofile/op_counter.h
+++ b/arch/x86/oprofile/op_counter.h
@@ -10,7 +10,7 @@
#ifndef OP_COUNTER_H
#define OP_COUNTER_H
-#define OP_MAX_COUNTER 8
+#define OP_MAX_COUNTER 32
/* Per-perfctr configuration as set via
* oprofilefs.
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 8fdf06e4edf..39686c29f03 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -9,12 +9,15 @@
* @author Philippe Elie
* @author Graydon Hoare
* @author Robert Richter <robert.richter@amd.com>
- * @author Barry Kasindorf
+ * @author Barry Kasindorf <barry.kasindorf@amd.com>
+ * @author Jason Yeh <jason.yeh@amd.com>
+ * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
*/
#include <linux/oprofile.h>
#include <linux/device.h>
#include <linux/pci.h>
+#include <linux/percpu.h>
#include <asm/ptrace.h>
#include <asm/msr.h>
@@ -25,43 +28,36 @@
#define NUM_COUNTERS 4
#define NUM_CONTROLS 4
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+#define NUM_VIRT_COUNTERS 32
+#define NUM_VIRT_CONTROLS 32
+#else
+#define NUM_VIRT_COUNTERS NUM_COUNTERS
+#define NUM_VIRT_CONTROLS NUM_CONTROLS
+#endif
+
+#define OP_EVENT_MASK 0x0FFF
+#define OP_CTR_OVERFLOW (1ULL<<31)
-#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
-#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
-#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0)
-#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
-
-#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
-#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
-#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
-#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
-#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
-#define CTRL_CLEAR_LO(x) (x &= (1<<21))
-#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0)
-#define CTRL_SET_ENABLE(val) (val |= 1<<20)
-#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
-#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
-#define CTRL_SET_UM(val, m) (val |= (m << 8))
-#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
-#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
-#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
-#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
-
-static unsigned long reset_value[NUM_COUNTERS];
+#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21))
+
+static unsigned long reset_value[NUM_VIRT_COUNTERS];
#ifdef CONFIG_OPROFILE_IBS
/* IbsFetchCtl bits/masks */
-#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */
-#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */
-#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */
+#define IBS_FETCH_RAND_EN (1ULL<<57)
+#define IBS_FETCH_VAL (1ULL<<49)
+#define IBS_FETCH_ENABLE (1ULL<<48)
+#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL
/*IbsOpCtl bits */
-#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */
-#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */
+#define IBS_OP_CNT_CTL (1ULL<<19)
+#define IBS_OP_VAL (1ULL<<18)
+#define IBS_OP_ENABLE (1ULL<<17)
-#define IBS_FETCH_SIZE 6
-#define IBS_OP_SIZE 12
+#define IBS_FETCH_SIZE 6
+#define IBS_OP_SIZE 12
static int has_ibs; /* AMD Family10h and later */
@@ -78,6 +74,45 @@ static struct op_ibs_config ibs_config;
#endif
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+
+static void op_mux_fill_in_addresses(struct op_msrs * const msrs)
+{
+ int i;
+
+ for (i = 0; i < NUM_VIRT_COUNTERS; i++) {
+ int hw_counter = op_x86_virt_to_phys(i);
+ if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
+ msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter;
+ else
+ msrs->multiplex[i].addr = 0;
+ }
+}
+
+static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
+ struct op_msrs const * const msrs)
+{
+ u64 val;
+ int i;
+
+ /* enable active counters */
+ for (i = 0; i < NUM_COUNTERS; ++i) {
+ int virt = op_x86_phys_to_virt(i);
+ if (!counter_config[virt].enabled)
+ continue;
+ rdmsrl(msrs->controls[i].addr, val);
+ val &= model->reserved;
+ val |= op_x86_get_ctrl(model, &counter_config[virt]);
+ wrmsrl(msrs->controls[i].addr, val);
+ }
+}
+
+#else
+
+static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { }
+
+#endif
+
/* functions for op_amd_spec */
static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
@@ -97,150 +132,174 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
else
msrs->controls[i].addr = 0;
}
-}
+ op_mux_fill_in_addresses(msrs);
+}
-static void op_amd_setup_ctrs(struct op_msrs const * const msrs)
+static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
+ struct op_msrs const * const msrs)
{
- unsigned int low, high;
+ u64 val;
int i;
+ /* setup reset_value */
+ for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
+ if (counter_config[i].enabled)
+ reset_value[i] = counter_config[i].count;
+ else
+ reset_value[i] = 0;
+ }
+
/* clear all counters */
- for (i = 0 ; i < NUM_CONTROLS; ++i) {
- if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
+ for (i = 0; i < NUM_CONTROLS; ++i) {
+ if (unlikely(!msrs->controls[i].addr))
continue;
- CTRL_READ(low, high, msrs, i);
- CTRL_CLEAR_LO(low);
- CTRL_CLEAR_HI(high);
- CTRL_WRITE(low, high, msrs, i);
+ rdmsrl(msrs->controls[i].addr, val);
+ val &= model->reserved;
+ wrmsrl(msrs->controls[i].addr, val);
}
/* avoid a false detection of ctr overflows in NMI handler */
for (i = 0; i < NUM_COUNTERS; ++i) {
- if (unlikely(!CTR_IS_RESERVED(msrs, i)))
+ if (unlikely(!msrs->counters[i].addr))
continue;
- CTR_WRITE(1, msrs, i);
+ wrmsrl(msrs->counters[i].addr, -1LL);
}
/* enable active counters */
for (i = 0; i < NUM_COUNTERS; ++i) {
- if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
- reset_value[i] = counter_config[i].count;
+ int virt = op_x86_phys_to_virt(i);
+ if (!counter_config[virt].enabled)
+ continue;
+ if (!msrs->counters[i].addr)
+ continue;
- CTR_WRITE(counter_config[i].count, msrs, i);
-
- CTRL_READ(low, high, msrs, i);
- CTRL_CLEAR_LO(low);
- CTRL_CLEAR_HI(high);
- CTRL_SET_ENABLE(low);
- CTRL_SET_USR(low, counter_config[i].user);
- CTRL_SET_KERN(low, counter_config[i].kernel);
- CTRL_SET_UM(low, counter_config[i].unit_mask);
- CTRL_SET_EVENT_LOW(low, counter_config[i].event);
- CTRL_SET_EVENT_HIGH(high, counter_config[i].event);
- CTRL_SET_HOST_ONLY(high, 0);
- CTRL_SET_GUEST_ONLY(high, 0);
-
- CTRL_WRITE(low, high, msrs, i);
- } else {
- reset_value[i] = 0;
- }
+ /* setup counter registers */
+ wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
+
+ /* setup control registers */
+ rdmsrl(msrs->controls[i].addr, val);
+ val &= model->reserved;
+ val |= op_x86_get_ctrl(model, &counter_config[virt]);
+ wrmsrl(msrs->controls[i].addr, val);
}
}
#ifdef CONFIG_OPROFILE_IBS
-static inline int
+static inline void
op_amd_handle_ibs(struct pt_regs * const regs,
struct op_msrs const * const msrs)
{
- u32 low, high;
- u64 msr;
+ u64 val, ctl;
struct op_entry entry;
if (!has_ibs)
- return 1;
+ return;
if (ibs_config.fetch_enabled) {
- rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
- if (high & IBS_FETCH_HIGH_VALID_BIT) {
- rdmsrl(MSR_AMD64_IBSFETCHLINAD, msr);
- oprofile_write_reserve(&entry, regs, msr,
+ rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
+ if (ctl & IBS_FETCH_VAL) {
+ rdmsrl(MSR_AMD64_IBSFETCHLINAD, val);
+ oprofile_write_reserve(&entry, regs, val,
IBS_FETCH_CODE, IBS_FETCH_SIZE);
- oprofile_add_data(&entry, (u32)msr);
- oprofile_add_data(&entry, (u32)(msr >> 32));
- oprofile_add_data(&entry, low);
- oprofile_add_data(&entry, high);
- rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, msr);
- oprofile_add_data(&entry, (u32)msr);
- oprofile_add_data(&entry, (u32)(msr >> 32));
+ oprofile_add_data64(&entry, val);
+ oprofile_add_data64(&entry, ctl);
+ rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val);
+ oprofile_add_data64(&entry, val);
oprofile_write_commit(&entry);
/* reenable the IRQ */
- high &= ~IBS_FETCH_HIGH_VALID_BIT;
- high |= IBS_FETCH_HIGH_ENABLE;
- low &= IBS_FETCH_LOW_MAX_CNT_MASK;
- wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
+ ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK);
+ ctl |= IBS_FETCH_ENABLE;
+ wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
}
}
if (ibs_config.op_enabled) {
- rdmsr(MSR_AMD64_IBSOPCTL, low, high);
- if (low & IBS_OP_LOW_VALID_BIT) {
- rdmsrl(MSR_AMD64_IBSOPRIP, msr);
- oprofile_write_reserve(&entry, regs, msr,
+ rdmsrl(MSR_AMD64_IBSOPCTL, ctl);
+ if (ctl & IBS_OP_VAL) {
+ rdmsrl(MSR_AMD64_IBSOPRIP, val);
+ oprofile_write_reserve(&entry, regs, val,
IBS_OP_CODE, IBS_OP_SIZE);
- oprofile_add_data(&entry, (u32)msr);
- oprofile_add_data(&entry, (u32)(msr >> 32));
- rdmsrl(MSR_AMD64_IBSOPDATA, msr);
- oprofile_add_data(&entry, (u32)msr);
- oprofile_add_data(&entry, (u32)(msr >> 32));
- rdmsrl(MSR_AMD64_IBSOPDATA2, msr);
- oprofile_add_data(&entry, (u32)msr);
- oprofile_add_data(&entry, (u32)(msr >> 32));
- rdmsrl(MSR_AMD64_IBSOPDATA3, msr);
- oprofile_add_data(&entry, (u32)msr);
- oprofile_add_data(&entry, (u32)(msr >> 32));
- rdmsrl(MSR_AMD64_IBSDCLINAD, msr);
- oprofile_add_data(&entry, (u32)msr);
- oprofile_add_data(&entry, (u32)(msr >> 32));
- rdmsrl(MSR_AMD64_IBSDCPHYSAD, msr);
- oprofile_add_data(&entry, (u32)msr);
- oprofile_add_data(&entry, (u32)(msr >> 32));
+ oprofile_add_data64(&entry, val);
+ rdmsrl(MSR_AMD64_IBSOPDATA, val);
+ oprofile_add_data64(&entry, val);
+ rdmsrl(MSR_AMD64_IBSOPDATA2, val);
+ oprofile_add_data64(&entry, val);
+ rdmsrl(MSR_AMD64_IBSOPDATA3, val);
+ oprofile_add_data64(&entry, val);
+ rdmsrl(MSR_AMD64_IBSDCLINAD, val);
+ oprofile_add_data64(&entry, val);
+ rdmsrl(MSR_AMD64_IBSDCPHYSAD, val);
+ oprofile_add_data64(&entry, val);
oprofile_write_commit(&entry);
/* reenable the IRQ */
- high = 0;
- low &= ~IBS_OP_LOW_VALID_BIT;
- low |= IBS_OP_LOW_ENABLE;
- wrmsr(MSR_AMD64_IBSOPCTL, low, high);
+ ctl &= ~IBS_OP_VAL & 0xFFFFFFFF;
+ ctl |= IBS_OP_ENABLE;
+ wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
}
}
+}
- return 1;
+static inline void op_amd_start_ibs(void)
+{
+ u64 val;
+ if (has_ibs && ibs_config.fetch_enabled) {
+ val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF;
+ val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
+ val |= IBS_FETCH_ENABLE;
+ wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
+ }
+
+ if (has_ibs && ibs_config.op_enabled) {
+ val = (ibs_config.max_cnt_op >> 4) & 0xFFFF;
+ val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0;
+ val |= IBS_OP_ENABLE;
+ wrmsrl(MSR_AMD64_IBSOPCTL, val);
+ }
+}
+
+static void op_amd_stop_ibs(void)
+{
+ if (has_ibs && ibs_config.fetch_enabled)
+ /* clear max count and enable */
+ wrmsrl(MSR_AMD64_IBSFETCHCTL, 0);
+
+ if (has_ibs && ibs_config.op_enabled)
+ /* clear max count and enable */
+ wrmsrl(MSR_AMD64_IBSOPCTL, 0);
}
+#else
+
+static inline void op_amd_handle_ibs(struct pt_regs * const regs,
+ struct op_msrs const * const msrs) { }
+static inline void op_amd_start_ibs(void) { }
+static inline void op_amd_stop_ibs(void) { }
+
#endif
static int op_amd_check_ctrs(struct pt_regs * const regs,
struct op_msrs const * const msrs)
{
- unsigned int low, high;
+ u64 val;
int i;
- for (i = 0 ; i < NUM_COUNTERS; ++i) {
- if (!reset_value[i])
+ for (i = 0; i < NUM_COUNTERS; ++i) {
+ int virt = op_x86_phys_to_virt(i);
+ if (!reset_value[virt])
continue;
- CTR_READ(low, high, msrs, i);
- if (CTR_OVERFLOWED(low)) {
- oprofile_add_sample(regs, i);
- CTR_WRITE(reset_value[i], msrs, i);
- }
+ rdmsrl(msrs->counters[i].addr, val);
+ /* bit is clear if overflowed: */
+ if (val & OP_CTR_OVERFLOW)
+ continue;
+ oprofile_add_sample(regs, virt);
+ wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
}
-#ifdef CONFIG_OPROFILE_IBS
op_amd_handle_ibs(regs, msrs);
-#endif
/* See op_model_ppro.c */
return 1;
@@ -248,79 +307,50 @@ static int op_amd_check_ctrs(struct pt_regs * const regs,
static void op_amd_start(struct op_msrs const * const msrs)
{
- unsigned int low, high;
+ u64 val;
int i;
- for (i = 0 ; i < NUM_COUNTERS ; ++i) {
- if (reset_value[i]) {
- CTRL_READ(low, high, msrs, i);
- CTRL_SET_ACTIVE(low);
- CTRL_WRITE(low, high, msrs, i);
- }
- }
-#ifdef CONFIG_OPROFILE_IBS
- if (has_ibs && ibs_config.fetch_enabled) {
- low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF;
- high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */
- + IBS_FETCH_HIGH_ENABLE;
- wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
+ for (i = 0; i < NUM_COUNTERS; ++i) {
+ if (!reset_value[op_x86_phys_to_virt(i)])
+ continue;
+ rdmsrl(msrs->controls[i].addr, val);
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(msrs->controls[i].addr, val);
}
- if (has_ibs && ibs_config.op_enabled) {
- low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF)
- + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */
- + IBS_OP_LOW_ENABLE;
- high = 0;
- wrmsr(MSR_AMD64_IBSOPCTL, low, high);
- }
-#endif
+ op_amd_start_ibs();
}
-
static void op_amd_stop(struct op_msrs const * const msrs)
{
- unsigned int low, high;
+ u64 val;
int i;
/*
* Subtle: stop on all counters to avoid race with setting our
* pm callback
*/
- for (i = 0 ; i < NUM_COUNTERS ; ++i) {
- if (!reset_value[i])
+ for (i = 0; i < NUM_COUNTERS; ++i) {
+ if (!reset_value[op_x86_phys_to_virt(i)])
continue;
- CTRL_READ(low, high, msrs, i);
- CTRL_SET_INACTIVE(low);
- CTRL_WRITE(low, high, msrs, i);
- }
-
-#ifdef CONFIG_OPROFILE_IBS
- if (has_ibs && ibs_config.fetch_enabled) {
- /* clear max count and enable */
- low = 0;
- high = 0;
- wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
+ rdmsrl(msrs->controls[i].addr, val);
+ val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(msrs->controls[i].addr, val);
}
- if (has_ibs && ibs_config.op_enabled) {
- /* clear max count and enable */
- low = 0;
- high = 0;
- wrmsr(MSR_AMD64_IBSOPCTL, low, high);
- }
-#endif
+ op_amd_stop_ibs();
}
static void op_amd_shutdown(struct op_msrs const * const msrs)
{
int i;
- for (i = 0 ; i < NUM_COUNTERS ; ++i) {
- if (CTR_IS_RESERVED(msrs, i))
+ for (i = 0; i < NUM_COUNTERS; ++i) {
+ if (msrs->counters[i].addr)
release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
}
- for (i = 0 ; i < NUM_CONTROLS ; ++i) {
- if (CTRL_IS_RESERVED(msrs, i))
+ for (i = 0; i < NUM_CONTROLS; ++i) {
+ if (msrs->controls[i].addr)
release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
}
}
@@ -490,15 +520,21 @@ static void op_amd_exit(void) {}
#endif /* CONFIG_OPROFILE_IBS */
-struct op_x86_model_spec const op_amd_spec = {
- .init = op_amd_init,
- .exit = op_amd_exit,
+struct op_x86_model_spec op_amd_spec = {
.num_counters = NUM_COUNTERS,
.num_controls = NUM_CONTROLS,
+ .num_virt_counters = NUM_VIRT_COUNTERS,
+ .reserved = MSR_AMD_EVENTSEL_RESERVED,
+ .event_mask = OP_EVENT_MASK,
+ .init = op_amd_init,
+ .exit = op_amd_exit,
.fill_in_addresses = &op_amd_fill_in_addresses,
.setup_ctrs = &op_amd_setup_ctrs,
.check_ctrs = &op_amd_check_ctrs,
.start = &op_amd_start,
.stop = &op_amd_stop,
- .shutdown = &op_amd_shutdown
+ .shutdown = &op_amd_shutdown,
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+ .switch_ctrl = &op_mux_switch_ctrl,
+#endif
};
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 819b131fd75..ac6b354becd 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -32,6 +32,8 @@
#define NUM_CCCRS_HT2 9
#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2)
+#define OP_CTR_OVERFLOW (1ULL<<31)
+
static unsigned int num_counters = NUM_COUNTERS_NON_HT;
static unsigned int num_controls = NUM_CONTROLS_NON_HT;
@@ -350,8 +352,6 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
-#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
-#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
#define CCCR_RESERVED_BITS 0x38030FFF
#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
@@ -361,17 +361,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
-#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
-#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
-#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
-#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
-#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0)
-#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0)
-#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000))
-
/* this assigns a "stagger" to the current CPU, which is used throughout
the code in this module as an extra array offset, to select the "even"
@@ -515,7 +507,7 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
if (ev->bindings[i].virt_counter & counter_bit) {
/* modify ESCR */
- ESCR_READ(escr, high, ev, i);
+ rdmsr(ev->bindings[i].escr_address, escr, high);
ESCR_CLEAR(escr);
if (stag == 0) {
ESCR_SET_USR_0(escr, counter_config[ctr].user);
@@ -526,10 +518,11 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
}
ESCR_SET_EVENT_SELECT(escr, ev->event_select);
ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
- ESCR_WRITE(escr, high, ev, i);
+ wrmsr(ev->bindings[i].escr_address, escr, high);
/* modify CCCR */
- CCCR_READ(cccr, high, VIRT_CTR(stag, ctr));
+ rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address,
+ cccr, high);
CCCR_CLEAR(cccr);
CCCR_SET_REQUIRED_BITS(cccr);
CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
@@ -537,7 +530,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
CCCR_SET_PMI_OVF_0(cccr);
else
CCCR_SET_PMI_OVF_1(cccr);
- CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr));
+ wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address,
+ cccr, high);
return;
}
}
@@ -548,7 +542,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
}
-static void p4_setup_ctrs(struct op_msrs const * const msrs)
+static void p4_setup_ctrs(struct op_x86_model_spec const *model,
+ struct op_msrs const * const msrs)
{
unsigned int i;
unsigned int low, high;
@@ -563,8 +558,8 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
}
/* clear the cccrs we will use */
- for (i = 0 ; i < num_counters ; i++) {
- if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
+ for (i = 0; i < num_counters; i++) {
+ if (unlikely(!msrs->controls[i].addr))
continue;
rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
CCCR_CLEAR(low);
@@ -574,17 +569,18 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
/* clear all escrs (including those outside our concern) */
for (i = num_counters; i < num_controls; i++) {
- if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
+ if (unlikely(!msrs->controls[i].addr))
continue;
wrmsr(msrs->controls[i].addr, 0, 0);
}
/* setup all counters */
- for (i = 0 ; i < num_counters ; ++i) {
- if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) {
+ for (i = 0; i < num_counters; ++i) {
+ if (counter_config[i].enabled && msrs->controls[i].addr) {
reset_value[i] = counter_config[i].count;
pmc_setup_one_p4_counter(i);
- CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i));
+ wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address,
+ -(u64)counter_config[i].count);
} else {
reset_value[i] = 0;
}
@@ -624,14 +620,16 @@ static int p4_check_ctrs(struct pt_regs * const regs,
real = VIRT_CTR(stag, i);
- CCCR_READ(low, high, real);
- CTR_READ(ctr, high, real);
- if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) {
+ rdmsr(p4_counters[real].cccr_address, low, high);
+ rdmsr(p4_counters[real].counter_address, ctr, high);
+ if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) {
oprofile_add_sample(regs, i);
- CTR_WRITE(reset_value[i], real);
+ wrmsrl(p4_counters[real].counter_address,
+ -(u64)reset_value[i]);
CCCR_CLEAR_OVF(low);
- CCCR_WRITE(low, high, real);
- CTR_WRITE(reset_value[i], real);
+ wrmsr(p4_counters[real].cccr_address, low, high);
+ wrmsrl(p4_counters[real].counter_address,
+ -(u64)reset_value[i]);
}
}
@@ -653,9 +651,9 @@ static void p4_start(struct op_msrs const * const msrs)
for (i = 0; i < num_counters; ++i) {
if (!reset_value[i])
continue;
- CCCR_READ(low, high, VIRT_CTR(stag, i));
+ rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
CCCR_SET_ENABLE(low);
- CCCR_WRITE(low, high, VIRT_CTR(stag, i));
+ wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
}
}
@@ -670,9 +668,9 @@ static void p4_stop(struct op_msrs const * const msrs)
for (i = 0; i < num_counters; ++i) {
if (!reset_value[i])
continue;
- CCCR_READ(low, high, VIRT_CTR(stag, i));
+ rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
CCCR_SET_DISABLE(low);
- CCCR_WRITE(low, high, VIRT_CTR(stag, i));
+ wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
}
}
@@ -680,8 +678,8 @@ static void p4_shutdown(struct op_msrs const * const msrs)
{
int i;
- for (i = 0 ; i < num_counters ; ++i) {
- if (CTR_IS_RESERVED(msrs, i))
+ for (i = 0; i < num_counters; ++i) {
+ if (msrs->counters[i].addr)
release_perfctr_nmi(msrs->counters[i].addr);
}
/*
@@ -689,15 +687,15 @@ static void p4_shutdown(struct op_msrs const * const msrs)
* conjunction with the counter registers (hence the starting offset).
* This saves a few bits.
*/
- for (i = num_counters ; i < num_controls ; ++i) {
- if (CTRL_IS_RESERVED(msrs, i))
+ for (i = num_counters; i < num_controls; ++i) {
+ if (msrs->controls[i].addr)
release_evntsel_nmi(msrs->controls[i].addr);
}
}
#ifdef CONFIG_SMP
-struct op_x86_model_spec const op_p4_ht2_spec = {
+struct op_x86_model_spec op_p4_ht2_spec = {
.num_counters = NUM_COUNTERS_HT2,
.num_controls = NUM_CONTROLS_HT2,
.fill_in_addresses = &p4_fill_in_addresses,
@@ -709,7 +707,7 @@ struct op_x86_model_spec const op_p4_ht2_spec = {
};
#endif
-struct op_x86_model_spec const op_p4_spec = {
+struct op_x86_model_spec op_p4_spec = {
.num_counters = NUM_COUNTERS_NON_HT,
.num_controls = NUM_CONTROLS_NON_HT,
.fill_in_addresses = &p4_fill_in_addresses,
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 4da7230b3d1..4899215999d 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -10,6 +10,7 @@
* @author Philippe Elie
* @author Graydon Hoare
* @author Andi Kleen
+ * @author Robert Richter <robert.richter@amd.com>
*/
#include <linux/oprofile.h>
@@ -18,7 +19,6 @@
#include <asm/msr.h>
#include <asm/apic.h>
#include <asm/nmi.h>
-#include <asm/perf_counter.h>
#include "op_x86_model.h"
#include "op_counter.h"
@@ -26,20 +26,7 @@
static int num_counters = 2;
static int counter_width = 32;
-#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
-#define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1))))
-
-#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
-#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0)
-#define CTRL_WRITE(l, h, msrs, c) do {wrmsr((msrs->controls[(c)].addr), (l), (h)); } while (0)
-#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
-#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
-#define CTRL_CLEAR(x) (x &= (1<<21))
-#define CTRL_SET_ENABLE(val) (val |= 1<<20)
-#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
-#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
-#define CTRL_SET_UM(val, m) (val |= (m << 8))
-#define CTRL_SET_EVENT(val, e) (val |= e)
+#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21))
static u64 *reset_value;
@@ -63,9 +50,10 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs)
}
-static void ppro_setup_ctrs(struct op_msrs const * const msrs)
+static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
+ struct op_msrs const * const msrs)
{
- unsigned int low, high;
+ u64 val;
int i;
if (!reset_value) {
@@ -93,36 +81,30 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
}
/* clear all counters */
- for (i = 0 ; i < num_counters; ++i) {
- if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
+ for (i = 0; i < num_counters; ++i) {
+ if (unlikely(!msrs->controls[i].addr))
continue;
- CTRL_READ(low, high, msrs, i);
- CTRL_CLEAR(low);
- CTRL_WRITE(low, high, msrs, i);
+ rdmsrl(msrs->controls[i].addr, val);
+ val &= model->reserved;
+ wrmsrl(msrs->controls[i].addr, val);
}
/* avoid a false detection of ctr overflows in NMI handler */
for (i = 0; i < num_counters; ++i) {
- if (unlikely(!CTR_IS_RESERVED(msrs, i)))
+ if (unlikely(!msrs->counters[i].addr))
continue;
wrmsrl(msrs->counters[i].addr, -1LL);
}
/* enable active counters */
for (i = 0; i < num_counters; ++i) {
- if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
+ if (counter_config[i].enabled && msrs->counters[i].addr) {
reset_value[i] = counter_config[i].count;
-
wrmsrl(msrs->counters[i].addr, -reset_value[i]);
-
- CTRL_READ(low, high, msrs, i);
- CTRL_CLEAR(low);
- CTRL_SET_ENABLE(low);
- CTRL_SET_USR(low, counter_config[i].user);
- CTRL_SET_KERN(low, counter_config[i].kernel);
- CTRL_SET_UM(low, counter_config[i].unit_mask);
- CTRL_SET_EVENT(low, counter_config[i].event);
- CTRL_WRITE(low, high, msrs, i);
+ rdmsrl(msrs->controls[i].addr, val);
+ val &= model->reserved;
+ val |= op_x86_get_ctrl(model, &counter_config[i]);
+ wrmsrl(msrs->controls[i].addr, val);
} else {
reset_value[i] = 0;
}
@@ -143,14 +125,14 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
if (unlikely(!reset_value))
goto out;
- for (i = 0 ; i < num_counters; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if (!reset_value[i])
continue;
rdmsrl(msrs->counters[i].addr, val);
- if (CTR_OVERFLOWED(val)) {
- oprofile_add_sample(regs, i);
- wrmsrl(msrs->counters[i].addr, -reset_value[i]);
- }
+ if (val & (1ULL << (counter_width - 1)))
+ continue;
+ oprofile_add_sample(regs, i);
+ wrmsrl(msrs->counters[i].addr, -reset_value[i]);
}
out:
@@ -171,16 +153,16 @@ out:
static void ppro_start(struct op_msrs const * const msrs)
{
- unsigned int low, high;
+ u64 val;
int i;
if (!reset_value)
return;
for (i = 0; i < num_counters; ++i) {
if (reset_value[i]) {
- CTRL_READ(low, high, msrs, i);
- CTRL_SET_ACTIVE(low);
- CTRL_WRITE(low, high, msrs, i);
+ rdmsrl(msrs->controls[i].addr, val);
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(msrs->controls[i].addr, val);
}
}
}
@@ -188,7 +170,7 @@ static void ppro_start(struct op_msrs const * const msrs)
static void ppro_stop(struct op_msrs const * const msrs)
{
- unsigned int low, high;
+ u64 val;
int i;
if (!reset_value)
@@ -196,9 +178,9 @@ static void ppro_stop(struct op_msrs const * const msrs)
for (i = 0; i < num_counters; ++i) {
if (!reset_value[i])
continue;
- CTRL_READ(low, high, msrs, i);
- CTRL_SET_INACTIVE(low);
- CTRL_WRITE(low, high, msrs, i);
+ rdmsrl(msrs->controls[i].addr, val);
+ val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(msrs->controls[i].addr, val);
}
}
@@ -206,12 +188,12 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
{
int i;
- for (i = 0 ; i < num_counters ; ++i) {
- if (CTR_IS_RESERVED(msrs, i))
+ for (i = 0; i < num_counters; ++i) {
+ if (msrs->counters[i].addr)
release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
}
- for (i = 0 ; i < num_counters ; ++i) {
- if (CTRL_IS_RESERVED(msrs, i))
+ for (i = 0; i < num_counters; ++i) {
+ if (msrs->controls[i].addr)
release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
}
if (reset_value) {
@@ -222,8 +204,9 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
struct op_x86_model_spec op_ppro_spec = {
- .num_counters = 2, /* can be overriden */
- .num_controls = 2, /* dito */
+ .num_counters = 2,
+ .num_controls = 2,
+ .reserved = MSR_PPRO_EVENTSEL_RESERVED,
.fill_in_addresses = &ppro_fill_in_addresses,
.setup_ctrs = &ppro_setup_ctrs,
.check_ctrs = &ppro_check_ctrs,
@@ -241,7 +224,7 @@ struct op_x86_model_spec op_ppro_spec = {
* the specific CPU.
*/
-void arch_perfmon_setup_counters(void)
+static void arch_perfmon_setup_counters(void)
{
union cpuid10_eax eax;
@@ -259,11 +242,17 @@ void arch_perfmon_setup_counters(void)
op_arch_perfmon_spec.num_counters = num_counters;
op_arch_perfmon_spec.num_controls = num_counters;
- op_ppro_spec.num_counters = num_counters;
- op_ppro_spec.num_controls = num_counters;
+}
+
+static int arch_perfmon_init(struct oprofile_operations *ignore)
+{
+ arch_perfmon_setup_counters();
+ return 0;
}
struct op_x86_model_spec op_arch_perfmon_spec = {
+ .reserved = MSR_PPRO_EVENTSEL_RESERVED,
+ .init = &arch_perfmon_init,
/* num_counters/num_controls filled in at runtime */
.fill_in_addresses = &ppro_fill_in_addresses,
/* user space does the cpuid check for available events */
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 825e79064d6..b83776180c7 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -6,51 +6,66 @@
* @remark Read the file COPYING
*
* @author Graydon Hoare
+ * @author Robert Richter <robert.richter@amd.com>
*/
#ifndef OP_X86_MODEL_H
#define OP_X86_MODEL_H
-struct op_saved_msr {
- unsigned int high;
- unsigned int low;
-};
+#include <asm/types.h>
+#include <asm/perf_counter.h>
struct op_msr {
- unsigned long addr;
- struct op_saved_msr saved;
+ unsigned long addr;
+ u64 saved;
};
struct op_msrs {
struct op_msr *counters;
struct op_msr *controls;
+ struct op_msr *multiplex;
};
struct pt_regs;
+struct oprofile_operations;
+
/* The model vtable abstracts the differences between
* various x86 CPU models' perfctr support.
*/
struct op_x86_model_spec {
- int (*init)(struct oprofile_operations *ops);
- void (*exit)(void);
- unsigned int num_counters;
- unsigned int num_controls;
- void (*fill_in_addresses)(struct op_msrs * const msrs);
- void (*setup_ctrs)(struct op_msrs const * const msrs);
- int (*check_ctrs)(struct pt_regs * const regs,
- struct op_msrs const * const msrs);
- void (*start)(struct op_msrs const * const msrs);
- void (*stop)(struct op_msrs const * const msrs);
- void (*shutdown)(struct op_msrs const * const msrs);
+ unsigned int num_counters;
+ unsigned int num_controls;
+ unsigned int num_virt_counters;
+ u64 reserved;
+ u16 event_mask;
+ int (*init)(struct oprofile_operations *ops);
+ void (*exit)(void);
+ void (*fill_in_addresses)(struct op_msrs * const msrs);
+ void (*setup_ctrs)(struct op_x86_model_spec const *model,
+ struct op_msrs const * const msrs);
+ int (*check_ctrs)(struct pt_regs * const regs,
+ struct op_msrs const * const msrs);
+ void (*start)(struct op_msrs const * const msrs);
+ void (*stop)(struct op_msrs const * const msrs);
+ void (*shutdown)(struct op_msrs const * const msrs);
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+ void (*switch_ctrl)(struct op_x86_model_spec const *model,
+ struct op_msrs const * const msrs);
+#endif
};
+struct op_counter_config;
+
+extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
+ struct op_counter_config *counter_config);
+extern int op_x86_phys_to_virt(int phys);
+extern int op_x86_virt_to_phys(int virt);
+
extern struct op_x86_model_spec op_ppro_spec;
-extern struct op_x86_model_spec const op_p4_spec;
-extern struct op_x86_model_spec const op_p4_ht2_spec;
-extern struct op_x86_model_spec const op_amd_spec;
+extern struct op_x86_model_spec op_p4_spec;
+extern struct op_x86_model_spec op_p4_ht2_spec;
+extern struct op_x86_model_spec op_amd_spec;
extern struct op_x86_model_spec op_arch_perfmon_spec;
-extern void arch_perfmon_setup_counters(void);
-
#endif /* OP_X86_MODEL_H */
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index bd13c3e4c6d..347d882b3bb 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -192,13 +192,14 @@ struct pci_raw_ops pci_direct_conf2 = {
static int __init pci_sanity_check(struct pci_raw_ops *o)
{
u32 x = 0;
- int devfn;
+ int year, devfn;
if (pci_probe & PCI_NO_CHECKS)
return 1;
/* Assume Type 1 works for newer systems.
This handles machines that don't have anything on PCI Bus 0. */
- if (dmi_get_year(DMI_BIOS_DATE) >= 2001)
+ dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL);
+ if (year >= 2001)
return 1;
for (devfn = 0; devfn < 0x100; devfn++) {
diff --git a/drivers/acpi/blacklist.c b/drivers/acpi/blacklist.c
index f6baa77deef..0c4ca4d318b 100644
--- a/drivers/acpi/blacklist.c
+++ b/drivers/acpi/blacklist.c
@@ -78,9 +78,10 @@ static struct acpi_blacklist_item acpi_blacklist[] __initdata = {
static int __init blacklist_by_year(void)
{
- int year = dmi_get_year(DMI_BIOS_DATE);
+ int year;
+
/* Doesn't exist? Likely an old system */
- if (year == -1) {
+ if (!dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL)) {
printk(KERN_ERR PREFIX "no DMI BIOS year, "
"acpi=force is required to enable ACPI\n" );
return 1;
diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig
index b17c57f8503..ab2fa4eeb36 100644
--- a/drivers/ata/Kconfig
+++ b/drivers/ata/Kconfig
@@ -26,6 +26,17 @@ config ATA_NONSTANDARD
bool
default n
+config ATA_VERBOSE_ERROR
+ bool "Verbose ATA error reporting"
+ default y
+ help
+ This option adds parsing of ATA command descriptions and error bits
+ in libata kernel output, making it easier to interpret.
+ This option will enlarge the kernel by approx. 6KB. Disable it only
+ if kernel size is more important than ease of debugging.
+
+ If unsure, say Y.
+
config ATA_ACPI
bool "ATA ACPI Support"
depends on ACPI && PCI
@@ -586,6 +597,16 @@ config PATA_RB532
If unsure, say N.
+config PATA_RDC
+ tristate "RDC PATA support"
+ depends on PCI
+ help
+ This option enables basic support for the later RDC PATA controllers
+ controllers via the new ATA layer. For the RDC 1010, you need to
+ enable the IT821X driver instead.
+
+ If unsure, say N.
+
config PATA_RZ1000
tristate "PC Tech RZ1000 PATA support"
depends on PCI
diff --git a/drivers/ata/Makefile b/drivers/ata/Makefile
index 38906f9bbb4..463eb52236a 100644
--- a/drivers/ata/Makefile
+++ b/drivers/ata/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_PATA_PDC_OLD) += pata_pdc202xx_old.o
obj-$(CONFIG_PATA_QDI) += pata_qdi.o
obj-$(CONFIG_PATA_RADISYS) += pata_radisys.o
obj-$(CONFIG_PATA_RB532) += pata_rb532_cf.o
+obj-$(CONFIG_PATA_RDC) += pata_rdc.o
obj-$(CONFIG_PATA_RZ1000) += pata_rz1000.o
obj-$(CONFIG_PATA_SC1200) += pata_sc1200.o
obj-$(CONFIG_PATA_SERVERWORKS) += pata_serverworks.o
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index fe3eba5d6b3..d4cd9c20331 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -329,10 +329,24 @@ static ssize_t ahci_activity_store(struct ata_device *dev,
enum sw_activity val);
static void ahci_init_sw_activity(struct ata_link *link);
+static ssize_t ahci_show_host_caps(struct device *dev,
+ struct device_attribute *attr, char *buf);
+static ssize_t ahci_show_host_version(struct device *dev,
+ struct device_attribute *attr, char *buf);
+static ssize_t ahci_show_port_cmd(struct device *dev,
+ struct device_attribute *attr, char *buf);
+
+DEVICE_ATTR(ahci_host_caps, S_IRUGO, ahci_show_host_caps, NULL);
+DEVICE_ATTR(ahci_host_version, S_IRUGO, ahci_show_host_version, NULL);
+DEVICE_ATTR(ahci_port_cmd, S_IRUGO, ahci_show_port_cmd, NULL);
+
static struct device_attribute *ahci_shost_attrs[] = {
&dev_attr_link_power_management_policy,
&dev_attr_em_message_type,
&dev_attr_em_message,
+ &dev_attr_ahci_host_caps,
+ &dev_attr_ahci_host_version,
+ &dev_attr_ahci_port_cmd,
NULL
};
@@ -539,6 +553,12 @@ static const struct pci_device_id ahci_pci_tbl[] = {
{ PCI_VDEVICE(ATI, 0x4394), board_ahci_sb700 }, /* ATI SB700/800 */
{ PCI_VDEVICE(ATI, 0x4395), board_ahci_sb700 }, /* ATI SB700/800 */
+ /* AMD */
+ { PCI_VDEVICE(AMD, 0x7800), board_ahci }, /* AMD SB900 */
+ /* AMD is using RAID class only for ahci controllers */
+ { PCI_VENDOR_ID_AMD, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
+ PCI_CLASS_STORAGE_RAID << 8, 0xffffff, board_ahci },
+
/* VIA */
{ PCI_VDEVICE(VIA, 0x3349), board_ahci_vt8251 }, /* VIA VT8251 */
{ PCI_VDEVICE(VIA, 0x6287), board_ahci_vt8251 }, /* VIA VT8251 */
@@ -702,6 +722,36 @@ static void ahci_enable_ahci(void __iomem *mmio)
WARN_ON(1);
}
+static ssize_t ahci_show_host_caps(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct Scsi_Host *shost = class_to_shost(dev);
+ struct ata_port *ap = ata_shost_to_port(shost);
+ struct ahci_host_priv *hpriv = ap->host->private_data;
+
+ return sprintf(buf, "%x\n", hpriv->cap);
+}
+
+static ssize_t ahci_show_host_version(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct Scsi_Host *shost = class_to_shost(dev);
+ struct ata_port *ap = ata_shost_to_port(shost);
+ void __iomem *mmio = ap->host->iomap[AHCI_PCI_BAR];
+
+ return sprintf(buf, "%x\n", readl(mmio + HOST_VERSION));
+}
+
+static ssize_t ahci_show_port_cmd(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct Scsi_Host *shost = class_to_shost(dev);
+ struct ata_port *ap = ata_shost_to_port(shost);
+ void __iomem *port_mmio = ahci_port_base(ap);
+
+ return sprintf(buf, "%x\n", readl(port_mmio + PORT_CMD));
+}
+
/**
* ahci_save_initial_config - Save and fixup initial config values
* @pdev: target PCI device
@@ -1584,7 +1634,7 @@ static void ahci_fill_cmd_slot(struct ahci_port_priv *pp, unsigned int tag,
pp->cmd_slot[tag].tbl_addr_hi = cpu_to_le32((cmd_tbl_dma >> 16) >> 16);
}
-static int ahci_kick_engine(struct ata_port *ap, int force_restart)
+static int ahci_kick_engine(struct ata_port *ap)
{
void __iomem *port_mmio = ahci_port_base(ap);
struct ahci_host_priv *hpriv = ap->host->private_data;
@@ -1592,18 +1642,16 @@ static int ahci_kick_engine(struct ata_port *ap, int force_restart)
u32 tmp;
int busy, rc;
- /* do we need to kick the port? */
- busy = status & (ATA_BUSY | ATA_DRQ);
- if (!busy && !force_restart)
- return 0;
-
/* stop engine */
rc = ahci_stop_engine(ap);
if (rc)
goto out_restart;
- /* need to do CLO? */
- if (!busy) {
+ /* need to do CLO?
+ * always do CLO if PMP is attached (AHCI-1.3 9.2)
+ */
+ busy = status & (ATA_BUSY | ATA_DRQ);
+ if (!busy && !sata_pmp_attached(ap)) {
rc = 0;
goto out_restart;
}
@@ -1651,7 +1699,7 @@ static int ahci_exec_polled_cmd(struct ata_port *ap, int pmp,
tmp = ata_wait_register(port_mmio + PORT_CMD_ISSUE, 0x1, 0x1,
1, timeout_msec);
if (tmp & 0x1) {
- ahci_kick_engine(ap, 1);
+ ahci_kick_engine(ap);
return -EBUSY;
}
} else
@@ -1674,7 +1722,7 @@ static int ahci_do_softreset(struct ata_link *link, unsigned int *class,
DPRINTK("ENTER\n");
/* prepare for SRST (AHCI-1.1 10.4.1) */
- rc = ahci_kick_engine(ap, 1);
+ rc = ahci_kick_engine(ap);
if (rc && rc != -EOPNOTSUPP)
ata_link_printk(link, KERN_WARNING,
"failed to reset engine (errno=%d)\n", rc);
@@ -1890,7 +1938,7 @@ static int ahci_p5wdh_hardreset(struct ata_link *link, unsigned int *class,
rc = ata_wait_after_reset(link, jiffies + 2 * HZ,
ahci_check_ready);
if (rc)
- ahci_kick_engine(ap, 0);
+ ahci_kick_engine(ap);
}
return rc;
}
@@ -2271,7 +2319,7 @@ static void ahci_post_internal_cmd(struct ata_queued_cmd *qc)
/* make DMA engine forget about the failed command */
if (qc->flags & ATA_QCFLAG_FAILED)
- ahci_kick_engine(ap, 1);
+ ahci_kick_engine(ap);
}
static void ahci_pmp_attach(struct ata_port *ap)
@@ -2603,14 +2651,18 @@ static void ahci_p5wdh_workaround(struct ata_host *host)
}
/*
- * SB600 ahci controller on ASUS M2A-VM can't do 64bit DMA with older
- * BIOS. The oldest version known to be broken is 0901 and working is
- * 1501 which was released on 2007-10-26. Force 32bit DMA on anything
- * older than 1501. Please read bko#9412 for more info.
+ * SB600 ahci controller on certain boards can't do 64bit DMA with
+ * older BIOS.
*/
-static bool ahci_asus_m2a_vm_32bit_only(struct pci_dev *pdev)
+static bool ahci_sb600_32bit_only(struct pci_dev *pdev)
{
static const struct dmi_system_id sysids[] = {
+ /*
+ * The oldest version known to be broken is 0901 and
+ * working is 1501 which was released on 2007-10-26.
+ * Force 32bit DMA on anything older than 1501.
+ * Please read bko#9412 for more info.
+ */
{
.ident = "ASUS M2A-VM",
.matches = {
@@ -2618,31 +2670,48 @@ static bool ahci_asus_m2a_vm_32bit_only(struct pci_dev *pdev)
"ASUSTeK Computer INC."),
DMI_MATCH(DMI_BOARD_NAME, "M2A-VM"),
},
+ .driver_data = "20071026", /* yyyymmdd */
+ },
+ /*
+ * It's yet unknown whether more recent BIOS fixes the
+ * problem. Blacklist the whole board for the time
+ * being. Please read the following thread for more
+ * info.
+ *
+ * http://thread.gmane.org/gmane.linux.ide/42326
+ */
+ {
+ .ident = "Gigabyte GA-MA69VM-S2",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR,
+ "Gigabyte Technology Co., Ltd."),
+ DMI_MATCH(DMI_BOARD_NAME, "GA-MA69VM-S2"),
+ },
},
{ }
};
- const char *cutoff_mmdd = "10/26";
- const char *date;
- int year;
+ const struct dmi_system_id *match;
+ match = dmi_first_match(sysids);
if (pdev->bus->number != 0 || pdev->devfn != PCI_DEVFN(0x12, 0) ||
- !dmi_check_system(sysids))
+ !match)
return false;
- /*
- * Argh.... both version and date are free form strings.
- * Let's hope they're using the same date format across
- * different versions.
- */
- date = dmi_get_system_info(DMI_BIOS_DATE);
- year = dmi_get_year(DMI_BIOS_DATE);
- if (date && strlen(date) >= 10 && date[2] == '/' && date[5] == '/' &&
- (year > 2007 ||
- (year == 2007 && strncmp(date, cutoff_mmdd, 5) >= 0)))
- return false;
+ if (match->driver_data) {
+ int year, month, date;
+ char buf[9];
+
+ dmi_get_date(DMI_BIOS_DATE, &year, &month, &date);
+ snprintf(buf, sizeof(buf), "%04d%02d%02d", year, month, date);
- dev_printk(KERN_WARNING, &pdev->dev, "ASUS M2A-VM: BIOS too old, "
- "forcing 32bit DMA, update BIOS\n");
+ if (strcmp(buf, match->driver_data) >= 0)
+ return false;
+
+ dev_printk(KERN_WARNING, &pdev->dev, "%s: BIOS too old, "
+ "forcing 32bit DMA, update BIOS\n", match->ident);
+ } else
+ dev_printk(KERN_WARNING, &pdev->dev, "%s: this board can't "
+ "do 64bit DMA, forcing 32bit\n", match->ident);
return true;
}
@@ -2857,8 +2926,8 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
if (board_id == board_ahci_sb700 && pdev->revision >= 0x40)
hpriv->flags &= ~AHCI_HFLAG_IGN_SERR_INTERNAL;
- /* apply ASUS M2A_VM quirk */
- if (ahci_asus_m2a_vm_32bit_only(pdev))
+ /* apply sb600 32bit only quirk */
+ if (ahci_sb600_32bit_only(pdev))
hpriv->flags |= AHCI_HFLAG_32BIT_ONLY;
if (!(hpriv->flags & AHCI_HFLAG_NO_MSI))
@@ -2869,7 +2938,7 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
/* prepare host */
if (hpriv->cap & HOST_CAP_NCQ)
- pi.flags |= ATA_FLAG_NCQ;
+ pi.flags |= ATA_FLAG_NCQ | ATA_FLAG_FPDMA_AA;
if (hpriv->cap & HOST_CAP_PMP)
pi.flags |= ATA_FLAG_PMP;
diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c
index ac176da1f94..01964b6e6f6 100644
--- a/drivers/ata/libata-acpi.c
+++ b/drivers/ata/libata-acpi.c
@@ -689,6 +689,7 @@ static int ata_acpi_run_tf(struct ata_device *dev,
struct ata_taskfile tf, ptf, rtf;
unsigned int err_mask;
const char *level;
+ const char *descr;
char msg[60];
int rc;
@@ -736,11 +737,13 @@ static int ata_acpi_run_tf(struct ata_device *dev,
snprintf(msg, sizeof(msg), "filtered out");
rc = 0;
}
+ descr = ata_get_cmd_descript(tf.command);
ata_dev_printk(dev, level,
- "ACPI cmd %02x/%02x:%02x:%02x:%02x:%02x:%02x %s\n",
+ "ACPI cmd %02x/%02x:%02x:%02x:%02x:%02x:%02x (%s) %s\n",
tf.command, tf.feature, tf.nsect, tf.lbal,
- tf.lbam, tf.lbah, tf.device, msg);
+ tf.lbam, tf.lbah, tf.device,
+ (descr ? descr : "unknown"), msg);
return rc;
}
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 072ba5ea138..df31deac5c8 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -709,7 +709,13 @@ u64 ata_tf_read_block(struct ata_taskfile *tf, struct ata_device *dev)
head = tf->device & 0xf;
sect = tf->lbal;
- block = (cyl * dev->heads + head) * dev->sectors + sect;
+ if (!sect) {
+ ata_dev_printk(dev, KERN_WARNING, "device reported "
+ "invalid CHS sector 0\n");
+ sect = 1; /* oh well */
+ }
+
+ block = (cyl * dev->heads + head) * dev->sectors + sect - 1;
}
return block;
@@ -2299,29 +2305,49 @@ static inline u8 ata_dev_knobble(struct ata_device *dev)
return ((ap->cbl == ATA_CBL_SATA) && (!ata_id_is_sata(dev->id)));
}
-static void ata_dev_config_ncq(struct ata_device *dev,
+static int ata_dev_config_ncq(struct ata_device *dev,
char *desc, size_t desc_sz)
{
struct ata_port *ap = dev->link->ap;
int hdepth = 0, ddepth = ata_id_queue_depth(dev->id);
+ unsigned int err_mask;
+ char *aa_desc = "";
if (!ata_id_has_ncq(dev->id)) {
desc[0] = '\0';
- return;
+ return 0;
}
if (dev->horkage & ATA_HORKAGE_NONCQ) {
snprintf(desc, desc_sz, "NCQ (not used)");
- return;
+ return 0;
}
if (ap->flags & ATA_FLAG_NCQ) {
hdepth = min(ap->scsi_host->can_queue, ATA_MAX_QUEUE - 1);
dev->flags |= ATA_DFLAG_NCQ;
}
+ if (!(dev->horkage & ATA_HORKAGE_BROKEN_FPDMA_AA) &&
+ (ap->flags & ATA_FLAG_FPDMA_AA) &&
+ ata_id_has_fpdma_aa(dev->id)) {
+ err_mask = ata_dev_set_feature(dev, SETFEATURES_SATA_ENABLE,
+ SATA_FPDMA_AA);
+ if (err_mask) {
+ ata_dev_printk(dev, KERN_ERR, "failed to enable AA"
+ "(error_mask=0x%x)\n", err_mask);
+ if (err_mask != AC_ERR_DEV) {
+ dev->horkage |= ATA_HORKAGE_BROKEN_FPDMA_AA;
+ return -EIO;
+ }
+ } else
+ aa_desc = ", AA";
+ }
+
if (hdepth >= ddepth)
- snprintf(desc, desc_sz, "NCQ (depth %d)", ddepth);
+ snprintf(desc, desc_sz, "NCQ (depth %d)%s", ddepth, aa_desc);
else
- snprintf(desc, desc_sz, "NCQ (depth %d/%d)", hdepth, ddepth);
+ snprintf(desc, desc_sz, "NCQ (depth %d/%d)%s", hdepth,
+ ddepth, aa_desc);
+ return 0;
}
/**
@@ -2461,7 +2487,7 @@ int ata_dev_configure(struct ata_device *dev)
if (ata_id_has_lba(id)) {
const char *lba_desc;
- char ncq_desc[20];
+ char ncq_desc[24];
lba_desc = "LBA";
dev->flags |= ATA_DFLAG_LBA;
@@ -2475,7 +2501,9 @@ int ata_dev_configure(struct ata_device *dev)
}
/* config NCQ */
- ata_dev_config_ncq(dev, ncq_desc, sizeof(ncq_desc));
+ rc = ata_dev_config_ncq(dev, ncq_desc, sizeof(ncq_desc));
+ if (rc)
+ return rc;
/* print device info to dmesg */
if (ata_msg_drv(ap) && print_info) {
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 79711b64054..a04488f0de8 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -40,6 +40,7 @@
#include <scsi/scsi_eh.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_dbg.h>
#include "../scsi/scsi_transport_api.h"
#include <linux/libata.h>
@@ -999,7 +1000,9 @@ static void __ata_port_freeze(struct ata_port *ap)
* ata_port_freeze - abort & freeze port
* @ap: ATA port to freeze
*
- * Abort and freeze @ap.
+ * Abort and freeze @ap. The freeze operation must be called
+ * first, because some hardware requires special operations
+ * before the taskfile registers are accessible.
*
* LOCKING:
* spin_lock_irqsave(host lock)
@@ -1013,8 +1016,8 @@ int ata_port_freeze(struct ata_port *ap)
WARN_ON(!ap->ops->error_handler);
- nr_aborted = ata_port_abort(ap);
__ata_port_freeze(ap);
+ nr_aborted = ata_port_abort(ap);
return nr_aborted;
}
@@ -2110,6 +2113,116 @@ void ata_eh_autopsy(struct ata_port *ap)
}
/**
+ * ata_get_cmd_descript - get description for ATA command
+ * @command: ATA command code to get description for
+ *
+ * Return a textual description of the given command, or NULL if the
+ * command is not known.
+ *
+ * LOCKING:
+ * None
+ */
+const char *ata_get_cmd_descript(u8 command)
+{
+#ifdef CONFIG_ATA_VERBOSE_ERROR
+ static const struct
+ {
+ u8 command;
+ const char *text;
+ } cmd_descr[] = {
+ { ATA_CMD_DEV_RESET, "DEVICE RESET" },
+ { ATA_CMD_CHK_POWER, "CHECK POWER MODE" },
+ { ATA_CMD_STANDBY, "STANDBY" },
+ { ATA_CMD_IDLE, "IDLE" },
+ { ATA_CMD_EDD, "EXECUTE DEVICE DIAGNOSTIC" },
+ { ATA_CMD_DOWNLOAD_MICRO, "DOWNLOAD MICROCODE" },
+ { ATA_CMD_NOP, "NOP" },
+ { ATA_CMD_FLUSH, "FLUSH CACHE" },
+ { ATA_CMD_FLUSH_EXT, "FLUSH CACHE EXT" },
+ { ATA_CMD_ID_ATA, "IDENTIFY DEVICE" },
+ { ATA_CMD_ID_ATAPI, "IDENTIFY PACKET DEVICE" },
+ { ATA_CMD_SERVICE, "SERVICE" },
+ { ATA_CMD_READ, "READ DMA" },
+ { ATA_CMD_READ_EXT, "READ DMA EXT" },
+ { ATA_CMD_READ_QUEUED, "READ DMA QUEUED" },
+ { ATA_CMD_READ_STREAM_EXT, "READ STREAM EXT" },
+ { ATA_CMD_READ_STREAM_DMA_EXT, "READ STREAM DMA EXT" },
+ { ATA_CMD_WRITE, "WRITE DMA" },
+ { ATA_CMD_WRITE_EXT, "WRITE DMA EXT" },
+ { ATA_CMD_WRITE_QUEUED, "WRITE DMA QUEUED EXT" },
+ { ATA_CMD_WRITE_STREAM_EXT, "WRITE STREAM EXT" },
+ { ATA_CMD_WRITE_STREAM_DMA_EXT, "WRITE STREAM DMA EXT" },
+ { ATA_CMD_WRITE_FUA_EXT, "WRITE DMA FUA EXT" },
+ { ATA_CMD_WRITE_QUEUED_FUA_EXT, "WRITE DMA QUEUED FUA EXT" },
+ { ATA_CMD_FPDMA_READ, "READ FPDMA QUEUED" },
+ { ATA_CMD_FPDMA_WRITE, "WRITE FPDMA QUEUED" },
+ { ATA_CMD_PIO_READ, "READ SECTOR(S)" },
+ { ATA_CMD_PIO_READ_EXT, "READ SECTOR(S) EXT" },
+ { ATA_CMD_PIO_WRITE, "WRITE SECTOR(S)" },
+ { ATA_CMD_PIO_WRITE_EXT, "WRITE SECTOR(S) EXT" },
+ { ATA_CMD_READ_MULTI, "READ MULTIPLE" },
+ { ATA_CMD_READ_MULTI_EXT, "READ MULTIPLE EXT" },
+ { ATA_CMD_WRITE_MULTI, "WRITE MULTIPLE" },
+ { ATA_CMD_WRITE_MULTI_EXT, "WRITE MULTIPLE EXT" },
+ { ATA_CMD_WRITE_MULTI_FUA_EXT, "WRITE MULTIPLE FUA EXT" },
+ { ATA_CMD_SET_FEATURES, "SET FEATURES" },
+ { ATA_CMD_SET_MULTI, "SET MULTIPLE MODE" },
+ { ATA_CMD_VERIFY, "READ VERIFY SECTOR(S)" },
+ { ATA_CMD_VERIFY_EXT, "READ VERIFY SECTOR(S) EXT" },
+ { ATA_CMD_WRITE_UNCORR_EXT, "WRITE UNCORRECTABLE EXT" },
+ { ATA_CMD_STANDBYNOW1, "STANDBY IMMEDIATE" },
+ { ATA_CMD_IDLEIMMEDIATE, "IDLE IMMEDIATE" },
+ { ATA_CMD_SLEEP, "SLEEP" },
+ { ATA_CMD_INIT_DEV_PARAMS, "INITIALIZE DEVICE PARAMETERS" },
+ { ATA_CMD_READ_NATIVE_MAX, "READ NATIVE MAX ADDRESS" },
+ { ATA_CMD_READ_NATIVE_MAX_EXT, "READ NATIVE MAX ADDRESS EXT" },
+ { ATA_CMD_SET_MAX, "SET MAX ADDRESS" },
+ { ATA_CMD_SET_MAX_EXT, "SET MAX ADDRESS EXT" },
+ { ATA_CMD_READ_LOG_EXT, "READ LOG EXT" },
+ { ATA_CMD_WRITE_LOG_EXT, "WRITE LOG EXT" },
+ { ATA_CMD_READ_LOG_DMA_EXT, "READ LOG DMA EXT" },
+ { ATA_CMD_WRITE_LOG_DMA_EXT, "WRITE LOG DMA EXT" },
+ { ATA_CMD_TRUSTED_RCV, "TRUSTED RECEIVE" },
+ { ATA_CMD_TRUSTED_RCV_DMA, "TRUSTED RECEIVE DMA" },
+ { ATA_CMD_TRUSTED_SND, "TRUSTED SEND" },
+ { ATA_CMD_TRUSTED_SND_DMA, "TRUSTED SEND DMA" },
+ { ATA_CMD_PMP_READ, "READ BUFFER" },
+ { ATA_CMD_PMP_WRITE, "WRITE BUFFER" },
+ { ATA_CMD_CONF_OVERLAY, "DEVICE CONFIGURATION OVERLAY" },
+ { ATA_CMD_SEC_SET_PASS, "SECURITY SET PASSWORD" },
+ { ATA_CMD_SEC_UNLOCK, "SECURITY UNLOCK" },
+ { ATA_CMD_SEC_ERASE_PREP, "SECURITY ERASE PREPARE" },
+ { ATA_CMD_SEC_ERASE_UNIT, "SECURITY ERASE UNIT" },
+ { ATA_CMD_SEC_FREEZE_LOCK, "SECURITY FREEZE LOCK" },
+ { ATA_CMD_SEC_DISABLE_PASS, "SECURITY DISABLE PASSWORD" },
+ { ATA_CMD_CONFIG_STREAM, "CONFIGURE STREAM" },
+ { ATA_CMD_SMART, "SMART" },
+ { ATA_CMD_MEDIA_LOCK, "DOOR LOCK" },
+ { ATA_CMD_MEDIA_UNLOCK, "DOOR UNLOCK" },
+ { ATA_CMD_CHK_MED_CRD_TYP, "CHECK MEDIA CARD TYPE" },
+ { ATA_CMD_CFA_REQ_EXT_ERR, "CFA REQUEST EXTENDED ERROR" },
+ { ATA_CMD_CFA_WRITE_NE, "CFA WRITE SECTORS WITHOUT ERASE" },
+ { ATA_CMD_CFA_TRANS_SECT, "CFA TRANSLATE SECTOR" },
+ { ATA_CMD_CFA_ERASE, "CFA ERASE SECTORS" },
+ { ATA_CMD_CFA_WRITE_MULT_NE, "CFA WRITE MULTIPLE WITHOUT ERASE" },
+ { ATA_CMD_READ_LONG, "READ LONG (with retries)" },
+ { ATA_CMD_READ_LONG_ONCE, "READ LONG (without retries)" },
+ { ATA_CMD_WRITE_LONG, "WRITE LONG (with retries)" },
+ { ATA_CMD_WRITE_LONG_ONCE, "WRITE LONG (without retries)" },
+ { ATA_CMD_RESTORE, "RECALIBRATE" },
+ { 0, NULL } /* terminate list */
+ };
+
+ unsigned int i;
+ for (i = 0; cmd_descr[i].text; i++)
+ if (cmd_descr[i].command == command)
+ return cmd_descr[i].text;
+#endif
+
+ return NULL;
+}
+
+/**
* ata_eh_link_report - report error handling to user
* @link: ATA link EH is going on
*
@@ -2175,6 +2288,7 @@ static void ata_eh_link_report(struct ata_link *link)
ata_link_printk(link, KERN_ERR, "%s\n", desc);
}
+#ifdef CONFIG_ATA_VERBOSE_ERROR
if (ehc->i.serror)
ata_link_printk(link, KERN_ERR,
"SError: { %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s}\n",
@@ -2195,6 +2309,7 @@ static void ata_eh_link_report(struct ata_link *link)
ehc->i.serror & SERR_TRANS_ST_ERROR ? "TrStaTrns " : "",
ehc->i.serror & SERR_UNRECOG_FIS ? "UnrecFIS " : "",
ehc->i.serror & SERR_DEV_XCHG ? "DevExch " : "");
+#endif
for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
struct ata_queued_cmd *qc = __ata_qc_from_tag(ap, tag);
@@ -2226,14 +2341,23 @@ static void ata_eh_link_report(struct ata_link *link)
dma_str[qc->dma_dir]);
}
- if (ata_is_atapi(qc->tf.protocol))
- snprintf(cdb_buf, sizeof(cdb_buf),
+ if (ata_is_atapi(qc->tf.protocol)) {
+ if (qc->scsicmd)
+ scsi_print_command(qc->scsicmd);
+ else
+ snprintf(cdb_buf, sizeof(cdb_buf),
"cdb %02x %02x %02x %02x %02x %02x %02x %02x "
"%02x %02x %02x %02x %02x %02x %02x %02x\n ",
cdb[0], cdb[1], cdb[2], cdb[3],
cdb[4], cdb[5], cdb[6], cdb[7],
cdb[8], cdb[9], cdb[10], cdb[11],
cdb[12], cdb[13], cdb[14], cdb[15]);
+ } else {
+ const char *descr = ata_get_cmd_descript(cmd->command);
+ if (descr)
+ ata_dev_printk(qc->dev, KERN_ERR,
+ "failed command: %s\n", descr);
+ }
ata_dev_printk(qc->dev, KERN_ERR,
"cmd %02x/%02x:%02x:%02x:%02x:%02x/%02x:%02x:%02x:%02x:%02x/%02x "
@@ -2252,6 +2376,7 @@ static void ata_eh_link_report(struct ata_link *link)
res->device, qc->err_mask, ata_err_string(qc->err_mask),
qc->err_mask & AC_ERR_NCQ ? " <F>" : "");
+#ifdef CONFIG_ATA_VERBOSE_ERROR
if (res->command & (ATA_BUSY | ATA_DRDY | ATA_DF | ATA_DRQ |
ATA_ERR)) {
if (res->command & ATA_BUSY)
@@ -2275,6 +2400,7 @@ static void ata_eh_link_report(struct ata_link *link)
res->feature & ATA_UNC ? "UNC " : "",
res->feature & ATA_IDNF ? "IDNF " : "",
res->feature & ATA_ABORTED ? "ABRT " : "");
+#endif
}
}
@@ -2574,11 +2700,17 @@ int ata_eh_reset(struct ata_link *link, int classify,
postreset(slave, classes);
}
- /* clear cached SError */
+ /*
+ * Some controllers can't be frozen very well and may set
+ * spuruious error conditions during reset. Clear accumulated
+ * error information. As reset is the final recovery action,
+ * nothing is lost by doing this.
+ */
spin_lock_irqsave(link->ap->lock, flags);
- link->eh_info.serror = 0;
+ memset(&link->eh_info, 0, sizeof(link->eh_info));
if (slave)
- slave->eh_info.serror = 0;
+ memset(&slave->eh_info, 0, sizeof(link->eh_info));
+ ap->pflags &= ~ATA_PFLAG_EH_PENDING;
spin_unlock_irqrestore(link->ap->lock, flags);
/* Make sure onlineness and classification result correspond.
diff --git a/drivers/ata/libata-pmp.c b/drivers/ata/libata-pmp.c
index 619f2c33950..51f0ffb78cb 100644
--- a/drivers/ata/libata-pmp.c
+++ b/drivers/ata/libata-pmp.c
@@ -221,6 +221,8 @@ static const char *sata_pmp_spec_rev_str(const u32 *gscr)
{
u32 rev = gscr[SATA_PMP_GSCR_REV];
+ if (rev & (1 << 3))
+ return "1.2";
if (rev & (1 << 2))
return "1.1";
if (rev & (1 << 1))
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index d0dfeef55db..b4ee28dec52 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1119,10 +1119,6 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
blk_queue_dma_drain(q, atapi_drain_needed, buf, ATAPI_MAX_DRAIN);
} else {
- if (ata_id_is_ssd(dev->id))
- queue_flag_set_unlocked(QUEUE_FLAG_NONROT,
- sdev->request_queue);
-
/* ATA devices must be sector aligned */
blk_queue_update_dma_alignment(sdev->request_queue,
ATA_SECT_SIZE - 1);
@@ -1257,23 +1253,6 @@ int ata_scsi_change_queue_depth(struct scsi_device *sdev, int queue_depth)
return queue_depth;
}
-/* XXX: for spindown warning */
-static void ata_delayed_done_timerfn(unsigned long arg)
-{
- struct scsi_cmnd *scmd = (void *)arg;
-
- scmd->scsi_done(scmd);
-}
-
-/* XXX: for spindown warning */
-static void ata_delayed_done(struct scsi_cmnd *scmd)
-{
- static struct timer_list timer;
-
- setup_timer(&timer, ata_delayed_done_timerfn, (unsigned long)scmd);
- mod_timer(&timer, jiffies + 5 * HZ);
-}
-
/**
* ata_scsi_start_stop_xlat - Translate SCSI START STOP UNIT command
* @qc: Storage for translated ATA taskfile
@@ -1338,32 +1317,6 @@ static unsigned int ata_scsi_start_stop_xlat(struct ata_queued_cmd *qc)
system_entering_hibernation())
goto skip;
- /* XXX: This is for backward compatibility, will be
- * removed. Read Documentation/feature-removal-schedule.txt
- * for more info.
- */
- if ((qc->dev->flags & ATA_DFLAG_SPUNDOWN) &&
- (system_state == SYSTEM_HALT ||
- system_state == SYSTEM_POWER_OFF)) {
- static unsigned long warned;
-
- if (!test_and_set_bit(0, &warned)) {
- ata_dev_printk(qc->dev, KERN_WARNING,
- "DISK MIGHT NOT BE SPUN DOWN PROPERLY. "
- "UPDATE SHUTDOWN UTILITY\n");
- ata_dev_printk(qc->dev, KERN_WARNING,
- "For more info, visit "
- "http://linux-ata.org/shutdown.html\n");
-
- /* ->scsi_done is not used, use it for
- * delayed completion.
- */
- scmd->scsi_done = qc->scsidone;
- qc->scsidone = ata_delayed_done;
- }
- goto skip;
- }
-
/* Issue ATA STANDBY IMMEDIATE command */
tf->command = ATA_CMD_STANDBYNOW1;
}
@@ -1764,14 +1717,6 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc)
}
}
- /* XXX: track spindown state for spindown skipping and warning */
- if (unlikely(qc->tf.command == ATA_CMD_STANDBY ||
- qc->tf.command == ATA_CMD_STANDBYNOW1))
- qc->dev->flags |= ATA_DFLAG_SPUNDOWN;
- else if (likely(system_state != SYSTEM_HALT &&
- system_state != SYSTEM_POWER_OFF))
- qc->dev->flags &= ~ATA_DFLAG_SPUNDOWN;
-
if (need_sense && !ap->ops->error_handler)
ata_dump_status(ap->print_id, &qc->result_tf);
@@ -2815,28 +2760,6 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc)
goto invalid_fld;
/*
- * Filter TPM commands by default. These provide an
- * essentially uncontrolled encrypted "back door" between
- * applications and the disk. Set libata.allow_tpm=1 if you
- * have a real reason for wanting to use them. This ensures
- * that installed software cannot easily mess stuff up without
- * user intent. DVR type users will probably ship with this enabled
- * for movie content management.
- *
- * Note that for ATA8 we can issue a DCS change and DCS freeze lock
- * for this and should do in future but that it is not sufficient as
- * DCS is an optional feature set. Thus we also do the software filter
- * so that we comply with the TC consortium stated goal that the user
- * can turn off TC features of their system.
- */
- if (tf->command >= 0x5C && tf->command <= 0x5F && !libata_allow_tpm)
- goto invalid_fld;
-
- /* We may not issue DMA commands if no DMA mode is set */
- if (tf->protocol == ATA_PROT_DMA && dev->dma_mode == 0)
- goto invalid_fld;
-
- /*
* 12 and 16 byte CDBs use different offsets to
* provide the various register values.
*/
@@ -2885,6 +2808,41 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc)
tf->device = dev->devno ?
tf->device | ATA_DEV1 : tf->device & ~ATA_DEV1;
+ /* READ/WRITE LONG use a non-standard sect_size */
+ qc->sect_size = ATA_SECT_SIZE;
+ switch (tf->command) {
+ case ATA_CMD_READ_LONG:
+ case ATA_CMD_READ_LONG_ONCE:
+ case ATA_CMD_WRITE_LONG:
+ case ATA_CMD_WRITE_LONG_ONCE:
+ if (tf->protocol != ATA_PROT_PIO || tf->nsect != 1)
+ goto invalid_fld;
+ qc->sect_size = scsi_bufflen(scmd);
+ }
+
+ /*
+ * Set flags so that all registers will be written, pass on
+ * write indication (used for PIO/DMA setup), result TF is
+ * copied back and we don't whine too much about its failure.
+ */
+ tf->flags = ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
+ if (scmd->sc_data_direction == DMA_TO_DEVICE)
+ tf->flags |= ATA_TFLAG_WRITE;
+
+ qc->flags |= ATA_QCFLAG_RESULT_TF | ATA_QCFLAG_QUIET;
+
+ /*
+ * Set transfer length.
+ *
+ * TODO: find out if we need to do more here to
+ * cover scatter/gather case.
+ */
+ ata_qc_set_pc_nbytes(qc);
+
+ /* We may not issue DMA commands if no DMA mode is set */
+ if (tf->protocol == ATA_PROT_DMA && dev->dma_mode == 0)
+ goto invalid_fld;
+
/* sanity check for pio multi commands */
if ((cdb[1] & 0xe0) && !is_multi_taskfile(tf))
goto invalid_fld;
@@ -2901,18 +2859,6 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc)
multi_count);
}
- /* READ/WRITE LONG use a non-standard sect_size */
- qc->sect_size = ATA_SECT_SIZE;
- switch (tf->command) {
- case ATA_CMD_READ_LONG:
- case ATA_CMD_READ_LONG_ONCE:
- case ATA_CMD_WRITE_LONG:
- case ATA_CMD_WRITE_LONG_ONCE:
- if (tf->protocol != ATA_PROT_PIO || tf->nsect != 1)
- goto invalid_fld;
- qc->sect_size = scsi_bufflen(scmd);
- }
-
/*
* Filter SET_FEATURES - XFER MODE command -- otherwise,
* SET_FEATURES - XFER MODE must be preceded/succeeded
@@ -2920,30 +2866,27 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc)
* controller (i.e. the reason for ->set_piomode(),
* ->set_dmamode(), and ->post_set_mode() hooks).
*/
- if ((tf->command == ATA_CMD_SET_FEATURES)
- && (tf->feature == SETFEATURES_XFER))
+ if (tf->command == ATA_CMD_SET_FEATURES &&
+ tf->feature == SETFEATURES_XFER)
goto invalid_fld;
/*
- * Set flags so that all registers will be written,
- * and pass on write indication (used for PIO/DMA
- * setup.)
- */
- tf->flags |= (ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE);
-
- if (scmd->sc_data_direction == DMA_TO_DEVICE)
- tf->flags |= ATA_TFLAG_WRITE;
-
- /*
- * Set transfer length.
+ * Filter TPM commands by default. These provide an
+ * essentially uncontrolled encrypted "back door" between
+ * applications and the disk. Set libata.allow_tpm=1 if you
+ * have a real reason for wanting to use them. This ensures
+ * that installed software cannot easily mess stuff up without
+ * user intent. DVR type users will probably ship with this enabled
+ * for movie content management.
*
- * TODO: find out if we need to do more here to
- * cover scatter/gather case.
+ * Note that for ATA8 we can issue a DCS change and DCS freeze lock
+ * for this and should do in future but that it is not sufficient as
+ * DCS is an optional feature set. Thus we also do the software filter
+ * so that we comply with the TC consortium stated goal that the user
+ * can turn off TC features of their system.
*/
- ata_qc_set_pc_nbytes(qc);
-
- /* request result TF and be quiet about device error */
- qc->flags |= ATA_QCFLAG_RESULT_TF | ATA_QCFLAG_QUIET;
+ if (tf->command >= 0x5C && tf->command <= 0x5F && !libata_allow_tpm)
+ goto invalid_fld;
return 0;
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 89a1e0018e7..be8e2628f82 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -164,6 +164,7 @@ extern void ata_eh_about_to_do(struct ata_link *link, struct ata_device *dev,
extern void ata_eh_done(struct ata_link *link, struct ata_device *dev,
unsigned int action);
extern void ata_eh_autopsy(struct ata_port *ap);
+const char *ata_get_cmd_descript(u8 command);
extern void ata_eh_report(struct ata_port *ap);
extern int ata_eh_reset(struct ata_link *link, int classify,
ata_prereset_fn_t prereset, ata_reset_fn_t softreset,
diff --git a/drivers/ata/pata_atiixp.c b/drivers/ata/pata_atiixp.c
index 45915566e4e..aa4b3f6ae77 100644
--- a/drivers/ata/pata_atiixp.c
+++ b/drivers/ata/pata_atiixp.c
@@ -246,6 +246,7 @@ static const struct pci_device_id atiixp[] = {
{ PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP400_IDE), },
{ PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP600_IDE), },
{ PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP700_IDE), },
+ { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_SB900_IDE), },
{ },
};
diff --git a/drivers/ata/pata_cs5535.c b/drivers/ata/pata_cs5535.c
index d33aa28239a..403f56165ce 100644
--- a/drivers/ata/pata_cs5535.c
+++ b/drivers/ata/pata_cs5535.c
@@ -202,7 +202,8 @@ static int cs5535_init_one(struct pci_dev *dev, const struct pci_device_id *id)
}
static const struct pci_device_id cs5535[] = {
- { PCI_VDEVICE(NS, 0x002D), },
+ { PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_CS5535_IDE), },
+ { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_CS5535_IDE), },
{ },
};
diff --git a/drivers/ata/pata_octeon_cf.c b/drivers/ata/pata_octeon_cf.c
index abdd19fe990..d6f69561dc8 100644
--- a/drivers/ata/pata_octeon_cf.c
+++ b/drivers/ata/pata_octeon_cf.c
@@ -213,7 +213,7 @@ static void octeon_cf_set_dmamode(struct ata_port *ap, struct ata_device *dev)
* This is tI, C.F. spec. says 0, but Sony CF card requires
* more, we use 20 nS.
*/
- dma_tim.s.dmack_s = ns_to_tim_reg(tim_mult, 20);;
+ dma_tim.s.dmack_s = ns_to_tim_reg(tim_mult, 20);
dma_tim.s.dmack_h = ns_to_tim_reg(tim_mult, dma_ackh);
dma_tim.s.dmarq = dma_arq;
@@ -841,7 +841,7 @@ static int __devinit octeon_cf_probe(struct platform_device *pdev)
ocd = pdev->dev.platform_data;
cs0 = devm_ioremap_nocache(&pdev->dev, res_cs0->start,
- res_cs0->end - res_cs0->start + 1);
+ resource_size(res_cs0));
if (!cs0)
return -ENOMEM;
diff --git a/drivers/ata/pata_platform.c b/drivers/ata/pata_platform.c
index d8d743af322..3f6ebc6c665 100644
--- a/drivers/ata/pata_platform.c
+++ b/drivers/ata/pata_platform.c
@@ -151,14 +151,14 @@ int __devinit __pata_platform_probe(struct device *dev,
*/
if (mmio) {
ap->ioaddr.cmd_addr = devm_ioremap(dev, io_res->start,
- io_res->end - io_res->start + 1);
+ resource_size(io_res));
ap->ioaddr.ctl_addr = devm_ioremap(dev, ctl_res->start,
- ctl_res->end - ctl_res->start + 1);
+ resource_size(ctl_res));
} else {
ap->ioaddr.cmd_addr = devm_ioport_map(dev, io_res->start,
- io_res->end - io_res->start + 1);
+ resource_size(io_res));
ap->ioaddr.ctl_addr = devm_ioport_map(dev, ctl_res->start,
- ctl_res->end - ctl_res->start + 1);
+ resource_size(ctl_res));
}
if (!ap->ioaddr.cmd_addr || !ap->ioaddr.ctl_addr) {
dev_err(dev, "failed to map IO/CTL base\n");
diff --git a/drivers/ata/pata_rb532_cf.c b/drivers/ata/pata_rb532_cf.c
index 8e3cdef8a25..45f1e10f917 100644
--- a/drivers/ata/pata_rb532_cf.c
+++ b/drivers/ata/pata_rb532_cf.c
@@ -151,7 +151,7 @@ static __devinit int rb532_pata_driver_probe(struct platform_device *pdev)
info->irq = irq;
info->iobase = devm_ioremap_nocache(&pdev->dev, res->start,
- res->end - res->start + 1);
+ resource_size(res));
if (!info->iobase)
return -ENOMEM;
diff --git a/drivers/ata/pata_rdc.c b/drivers/ata/pata_rdc.c
new file mode 100644
index 00000000000..c843a1e07c4
--- /dev/null
+++ b/drivers/ata/pata_rdc.c
@@ -0,0 +1,400 @@
+/*
+ * pata_rdc - Driver for later RDC PATA controllers
+ *
+ * This is actually a driver for hardware meeting
+ * INCITS 370-2004 (1510D): ATA Host Adapter Standards
+ *
+ * Based on ata_piix.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <scsi/scsi_host.h>
+#include <linux/libata.h>
+#include <linux/dmi.h>
+
+#define DRV_NAME "pata_rdc"
+#define DRV_VERSION "0.01"
+
+struct rdc_host_priv {
+ u32 saved_iocfg;
+};
+
+/**
+ * rdc_pata_cable_detect - Probe host controller cable detect info
+ * @ap: Port for which cable detect info is desired
+ *
+ * Read 80c cable indicator from ATA PCI device's PCI config
+ * register. This register is normally set by firmware (BIOS).
+ *
+ * LOCKING:
+ * None (inherited from caller).
+ */
+
+static int rdc_pata_cable_detect(struct ata_port *ap)
+{
+ struct rdc_host_priv *hpriv = ap->host->private_data;
+ u8 mask;
+
+ /* check BIOS cable detect results */
+ mask = 0x30 << (2 * ap->port_no);
+ if ((hpriv->saved_iocfg & mask) == 0)
+ return ATA_CBL_PATA40;
+ return ATA_CBL_PATA80;
+}
+
+/**
+ * rdc_pata_prereset - prereset for PATA host controller
+ * @link: Target link
+ * @deadline: deadline jiffies for the operation
+ *
+ * LOCKING:
+ * None (inherited from caller).
+ */
+static int rdc_pata_prereset(struct ata_link *link, unsigned long deadline)
+{
+ struct ata_port *ap = link->ap;
+ struct pci_dev *pdev = to_pci_dev(ap->host->dev);
+
+ static const struct pci_bits rdc_enable_bits[] = {
+ { 0x41U, 1U, 0x80UL, 0x80UL }, /* port 0 */
+ { 0x43U, 1U, 0x80UL, 0x80UL }, /* port 1 */
+ };
+
+ if (!pci_test_config_bits(pdev, &rdc_enable_bits[ap->port_no]))
+ return -ENOENT;
+ return ata_sff_prereset(link, deadline);
+}
+
+/**
+ * rdc_set_piomode - Initialize host controller PATA PIO timings
+ * @ap: Port whose timings we are configuring
+ * @adev: um
+ *
+ * Set PIO mode for device, in host controller PCI config space.
+ *
+ * LOCKING:
+ * None (inherited from caller).
+ */
+
+static void rdc_set_piomode(struct ata_port *ap, struct ata_device *adev)
+{
+ unsigned int pio = adev->pio_mode - XFER_PIO_0;
+ struct pci_dev *dev = to_pci_dev(ap->host->dev);
+ unsigned int is_slave = (adev->devno != 0);
+ unsigned int master_port= ap->port_no ? 0x42 : 0x40;
+ unsigned int slave_port = 0x44;
+ u16 master_data;
+ u8 slave_data;
+ u8 udma_enable;
+ int control = 0;
+
+ static const /* ISP RTC */
+ u8 timings[][2] = { { 0, 0 },
+ { 0, 0 },
+ { 1, 0 },
+ { 2, 1 },
+ { 2, 3 }, };
+
+ if (pio >= 2)
+ control |= 1; /* TIME1 enable */
+ if (ata_pio_need_iordy(adev))
+ control |= 2; /* IE enable */
+
+ if (adev->class == ATA_DEV_ATA)
+ control |= 4; /* PPE enable */
+
+ /* PIO configuration clears DTE unconditionally. It will be
+ * programmed in set_dmamode which is guaranteed to be called
+ * after set_piomode if any DMA mode is available.
+ */
+ pci_read_config_word(dev, master_port, &master_data);
+ if (is_slave) {
+ /* clear TIME1|IE1|PPE1|DTE1 */
+ master_data &= 0xff0f;
+ /* Enable SITRE (separate slave timing register) */
+ master_data |= 0x4000;
+ /* enable PPE1, IE1 and TIME1 as needed */
+ master_data |= (control << 4);
+ pci_read_config_byte(dev, slave_port, &slave_data);
+ slave_data &= (ap->port_no ? 0x0f : 0xf0);
+ /* Load the timing nibble for this slave */
+ slave_data |= ((timings[pio][0] << 2) | timings[pio][1])
+ << (ap->port_no ? 4 : 0);
+ } else {
+ /* clear ISP|RCT|TIME0|IE0|PPE0|DTE0 */
+ master_data &= 0xccf0;
+ /* Enable PPE, IE and TIME as appropriate */
+ master_data |= control;
+ /* load ISP and RCT */
+ master_data |=
+ (timings[pio][0] << 12) |
+ (timings[pio][1] << 8);
+ }
+ pci_write_config_word(dev, master_port, master_data);
+ if (is_slave)
+ pci_write_config_byte(dev, slave_port, slave_data);
+
+ /* Ensure the UDMA bit is off - it will be turned back on if
+ UDMA is selected */
+
+ pci_read_config_byte(dev, 0x48, &udma_enable);
+ udma_enable &= ~(1 << (2 * ap->port_no + adev->devno));
+ pci_write_config_byte(dev, 0x48, udma_enable);
+}
+
+/**
+ * rdc_set_dmamode - Initialize host controller PATA PIO timings
+ * @ap: Port whose timings we are configuring
+ * @adev: Drive in question
+ *
+ * Set UDMA mode for device, in host controller PCI config space.
+ *
+ * LOCKING:
+ * None (inherited from caller).
+ */
+
+static void rdc_set_dmamode(struct ata_port *ap, struct ata_device *adev)
+{
+ struct pci_dev *dev = to_pci_dev(ap->host->dev);
+ u8 master_port = ap->port_no ? 0x42 : 0x40;
+ u16 master_data;
+ u8 speed = adev->dma_mode;
+ int devid = adev->devno + 2 * ap->port_no;
+ u8 udma_enable = 0;
+
+ static const /* ISP RTC */
+ u8 timings[][2] = { { 0, 0 },
+ { 0, 0 },
+ { 1, 0 },
+ { 2, 1 },
+ { 2, 3 }, };
+
+ pci_read_config_word(dev, master_port, &master_data);
+ pci_read_config_byte(dev, 0x48, &udma_enable);
+
+ if (speed >= XFER_UDMA_0) {
+ unsigned int udma = adev->dma_mode - XFER_UDMA_0;
+ u16 udma_timing;
+ u16 ideconf;
+ int u_clock, u_speed;
+
+ /*
+ * UDMA is handled by a combination of clock switching and
+ * selection of dividers
+ *
+ * Handy rule: Odd modes are UDMATIMx 01, even are 02
+ * except UDMA0 which is 00
+ */
+ u_speed = min(2 - (udma & 1), udma);
+ if (udma == 5)
+ u_clock = 0x1000; /* 100Mhz */
+ else if (udma > 2)
+ u_clock = 1; /* 66Mhz */
+ else
+ u_clock = 0; /* 33Mhz */
+
+ udma_enable |= (1 << devid);
+
+ /* Load the CT/RP selection */
+ pci_read_config_word(dev, 0x4A, &udma_timing);
+ udma_timing &= ~(3 << (4 * devid));
+ udma_timing |= u_speed << (4 * devid);
+ pci_write_config_word(dev, 0x4A, udma_timing);
+
+ /* Select a 33/66/100Mhz clock */
+ pci_read_config_word(dev, 0x54, &ideconf);
+ ideconf &= ~(0x1001 << devid);
+ ideconf |= u_clock << devid;
+ pci_write_config_word(dev, 0x54, ideconf);
+ } else {
+ /*
+ * MWDMA is driven by the PIO timings. We must also enable
+ * IORDY unconditionally along with TIME1. PPE has already
+ * been set when the PIO timing was set.
+ */
+ unsigned int mwdma = adev->dma_mode - XFER_MW_DMA_0;
+ unsigned int control;
+ u8 slave_data;
+ const unsigned int needed_pio[3] = {
+ XFER_PIO_0, XFER_PIO_3, XFER_PIO_4
+ };
+ int pio = needed_pio[mwdma] - XFER_PIO_0;
+
+ control = 3; /* IORDY|TIME1 */
+
+ /* If the drive MWDMA is faster than it can do PIO then
+ we must force PIO into PIO0 */
+
+ if (adev->pio_mode < needed_pio[mwdma])
+ /* Enable DMA timing only */
+ control |= 8; /* PIO cycles in PIO0 */
+
+ if (adev->devno) { /* Slave */
+ master_data &= 0xFF4F; /* Mask out IORDY|TIME1|DMAONLY */
+ master_data |= control << 4;
+ pci_read_config_byte(dev, 0x44, &slave_data);
+ slave_data &= (ap->port_no ? 0x0f : 0xf0);
+ /* Load the matching timing */
+ slave_data |= ((timings[pio][0] << 2) | timings[pio][1]) << (ap->port_no ? 4 : 0);
+ pci_write_config_byte(dev, 0x44, slave_data);
+ } else { /* Master */
+ master_data &= 0xCCF4; /* Mask out IORDY|TIME1|DMAONLY
+ and master timing bits */
+ master_data |= control;
+ master_data |=
+ (timings[pio][0] << 12) |
+ (timings[pio][1] << 8);
+ }
+
+ udma_enable &= ~(1 << devid);
+ pci_write_config_word(dev, master_port, master_data);
+ }
+ pci_write_config_byte(dev, 0x48, udma_enable);
+}
+
+static struct ata_port_operations rdc_pata_ops = {
+ .inherits = &ata_bmdma32_port_ops,
+ .cable_detect = rdc_pata_cable_detect,
+ .set_piomode = rdc_set_piomode,
+ .set_dmamode = rdc_set_dmamode,
+ .prereset = rdc_pata_prereset,
+};
+
+static struct ata_port_info rdc_port_info = {
+
+ .flags = ATA_FLAG_SLAVE_POSS,
+ .pio_mask = ATA_PIO4,
+ .mwdma_mask = ATA_MWDMA2,
+ .udma_mask = ATA_UDMA5,
+ .port_ops = &rdc_pata_ops,
+};
+
+static struct scsi_host_template rdc_sht = {
+ ATA_BMDMA_SHT(DRV_NAME),
+};
+
+/**
+ * rdc_init_one - Register PIIX ATA PCI device with kernel services
+ * @pdev: PCI device to register
+ * @ent: Entry in rdc_pci_tbl matching with @pdev
+ *
+ * Called from kernel PCI layer. We probe for combined mode (sigh),
+ * and then hand over control to libata, for it to do the rest.
+ *
+ * LOCKING:
+ * Inherited from PCI layer (may sleep).
+ *
+ * RETURNS:
+ * Zero on success, or -ERRNO value.
+ */
+
+static int __devinit rdc_init_one(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ static int printed_version;
+ struct device *dev = &pdev->dev;
+ struct ata_port_info port_info[2];
+ const struct ata_port_info *ppi[] = { &port_info[0], &port_info[1] };
+ unsigned long port_flags;
+ struct ata_host *host;
+ struct rdc_host_priv *hpriv;
+ int rc;
+
+ if (!printed_version++)
+ dev_printk(KERN_DEBUG, &pdev->dev,
+ "version " DRV_VERSION "\n");
+
+ port_info[0] = rdc_port_info;
+ port_info[1] = rdc_port_info;
+
+ port_flags = port_info[0].flags;
+
+ /* enable device and prepare host */
+ rc = pcim_enable_device(pdev);
+ if (rc)
+ return rc;
+
+ hpriv = devm_kzalloc(dev, sizeof(*hpriv), GFP_KERNEL);
+ if (!hpriv)
+ return -ENOMEM;
+
+ /* Save IOCFG, this will be used for cable detection, quirk
+ * detection and restoration on detach.
+ */
+ pci_read_config_dword(pdev, 0x54, &hpriv->saved_iocfg);
+
+ rc = ata_pci_sff_prepare_host(pdev, ppi, &host);
+ if (rc)
+ return rc;
+ host->private_data = hpriv;
+
+ pci_intx(pdev, 1);
+
+ host->flags |= ATA_HOST_PARALLEL_SCAN;
+
+ pci_set_master(pdev);
+ return ata_pci_sff_activate_host(host, ata_sff_interrupt, &rdc_sht);
+}
+
+static void rdc_remove_one(struct pci_dev *pdev)
+{
+ struct ata_host *host = dev_get_drvdata(&pdev->dev);
+ struct rdc_host_priv *hpriv = host->private_data;
+
+ pci_write_config_dword(pdev, 0x54, hpriv->saved_iocfg);
+
+ ata_pci_remove_one(pdev);
+}
+
+static const struct pci_device_id rdc_pci_tbl[] = {
+ { PCI_DEVICE(0x17F3, 0x1011), },
+ { PCI_DEVICE(0x17F3, 0x1012), },
+ { } /* terminate list */
+};
+
+static struct pci_driver rdc_pci_driver = {
+ .name = DRV_NAME,
+ .id_table = rdc_pci_tbl,
+ .probe = rdc_init_one,
+ .remove = rdc_remove_one,
+};
+
+
+static int __init rdc_init(void)
+{
+ return pci_register_driver(&rdc_pci_driver);
+}
+
+static void __exit rdc_exit(void)
+{
+ pci_unregister_driver(&rdc_pci_driver);
+}
+
+module_init(rdc_init);
+module_exit(rdc_exit);
+
+MODULE_AUTHOR("Alan Cox (based on ata_piix)");
+MODULE_DESCRIPTION("SCSI low-level driver for RDC PATA controllers");
+MODULE_LICENSE("GPL");
+MODULE_DEVICE_TABLE(pci, rdc_pci_tbl);
+MODULE_VERSION(DRV_VERSION);
diff --git a/drivers/ata/pata_rz1000.c b/drivers/ata/pata_rz1000.c
index 0c574c065c6..a5e4dfe60b4 100644
--- a/drivers/ata/pata_rz1000.c
+++ b/drivers/ata/pata_rz1000.c
@@ -85,7 +85,6 @@ static int rz1000_fifo_disable(struct pci_dev *pdev)
static int rz1000_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
{
- static int printed_version;
static const struct ata_port_info info = {
.flags = ATA_FLAG_SLAVE_POSS,
.pio_mask = ATA_PIO4,
@@ -93,8 +92,7 @@ static int rz1000_init_one (struct pci_dev *pdev, const struct pci_device_id *en
};
const struct ata_port_info *ppi[] = { &info, NULL };
- if (!printed_version++)
- printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+ printk_once(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
if (rz1000_fifo_disable(pdev) == 0)
return ata_pci_sff_init_one(pdev, ppi, &rz1000_sht, NULL);
diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c
index 94eaa432c40..d344db42a00 100644
--- a/drivers/ata/sata_fsl.c
+++ b/drivers/ata/sata_fsl.c
@@ -1257,6 +1257,7 @@ static struct scsi_host_template sata_fsl_sht = {
static struct ata_port_operations sata_fsl_ops = {
.inherits = &sata_pmp_port_ops,
+ .qc_defer = ata_std_qc_defer,
.qc_prep = sata_fsl_qc_prep,
.qc_issue = sata_fsl_qc_issue,
.qc_fill_rtf = sata_fsl_qc_fill_rtf,
diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c
index 8d890cc5a7e..4406902b429 100644
--- a/drivers/ata/sata_inic162x.c
+++ b/drivers/ata/sata_inic162x.c
@@ -405,7 +405,7 @@ static irqreturn_t inic_interrupt(int irq, void *dev_instance)
struct ata_host *host = dev_instance;
struct inic_host_priv *hpriv = host->private_data;
u16 host_irq_stat;
- int i, handled = 0;;
+ int i, handled = 0;
host_irq_stat = readw(hpriv->mmio_base + HOST_IRQ_STAT);
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index c19417e0220..17f9ff9067a 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -4013,7 +4013,7 @@ static int mv_platform_probe(struct platform_device *pdev)
host->iomap = NULL;
hpriv->base = devm_ioremap(&pdev->dev, res->start,
- res->end - res->start + 1);
+ resource_size(res));
hpriv->base -= SATAHC0_REG_BASE;
/*
diff --git a/drivers/ata/sata_sil.c b/drivers/ata/sata_sil.c
index 35bd5cc7f28..3cb69d5fb81 100644
--- a/drivers/ata/sata_sil.c
+++ b/drivers/ata/sata_sil.c
@@ -565,6 +565,19 @@ static void sil_freeze(struct ata_port *ap)
tmp |= SIL_MASK_IDE0_INT << ap->port_no;
writel(tmp, mmio_base + SIL_SYSCFG);
readl(mmio_base + SIL_SYSCFG); /* flush */
+
+ /* Ensure DMA_ENABLE is off.
+ *
+ * This is because the controller will not give us access to the
+ * taskfile registers while a DMA is in progress
+ */
+ iowrite8(ioread8(ap->ioaddr.bmdma_addr) & ~SIL_DMA_ENABLE,
+ ap->ioaddr.bmdma_addr);
+
+ /* According to ata_bmdma_stop, an HDMA transition requires
+ * on PIO cycle. But we can't read a taskfile register.
+ */
+ ioread8(ap->ioaddr.bmdma_addr);
}
static void sil_thaw(struct ata_port *ap)
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index 77aa8d7ecec..e6946fc527d 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -846,6 +846,17 @@ static void sil24_qc_prep(struct ata_queued_cmd *qc)
if (!ata_is_atapi(qc->tf.protocol)) {
prb = &cb->ata.prb;
sge = cb->ata.sge;
+ if (ata_is_data(qc->tf.protocol)) {
+ u16 prot = 0;
+ ctrl = PRB_CTRL_PROTOCOL;
+ if (ata_is_ncq(qc->tf.protocol))
+ prot |= PRB_PROT_NCQ;
+ if (qc->tf.flags & ATA_TFLAG_WRITE)
+ prot |= PRB_PROT_WRITE;
+ else
+ prot |= PRB_PROT_READ;
+ prb->prot = cpu_to_le16(prot);
+ }
} else {
prb = &cb->atapi.prb;
sge = cb->atapi.sge;
diff --git a/drivers/ata/sata_sis.c b/drivers/ata/sata_sis.c
index 8f983322861..f8a91bfd66a 100644
--- a/drivers/ata/sata_sis.c
+++ b/drivers/ata/sata_sis.c
@@ -109,8 +109,9 @@ MODULE_LICENSE("GPL");
MODULE_DEVICE_TABLE(pci, sis_pci_tbl);
MODULE_VERSION(DRV_VERSION);
-static unsigned int get_scr_cfg_addr(struct ata_port *ap, unsigned int sc_reg)
+static unsigned int get_scr_cfg_addr(struct ata_link *link, unsigned int sc_reg)
{
+ struct ata_port *ap = link->ap;
struct pci_dev *pdev = to_pci_dev(ap->host->dev);
unsigned int addr = SIS_SCR_BASE + (4 * sc_reg);
u8 pmr;
@@ -131,6 +132,9 @@ static unsigned int get_scr_cfg_addr(struct ata_port *ap, unsigned int sc_reg)
break;
}
}
+ if (link->pmp)
+ addr += 0x10;
+
return addr;
}
@@ -138,24 +142,12 @@ static u32 sis_scr_cfg_read(struct ata_link *link,
unsigned int sc_reg, u32 *val)
{
struct pci_dev *pdev = to_pci_dev(link->ap->host->dev);
- unsigned int cfg_addr = get_scr_cfg_addr(link->ap, sc_reg);
- u32 val2 = 0;
- u8 pmr;
+ unsigned int cfg_addr = get_scr_cfg_addr(link, sc_reg);
if (sc_reg == SCR_ERROR) /* doesn't exist in PCI cfg space */
return -EINVAL;
- pci_read_config_byte(pdev, SIS_PMR, &pmr);
-
pci_read_config_dword(pdev, cfg_addr, val);
-
- if ((pdev->device == 0x0182) || (pdev->device == 0x0183) ||
- (pdev->device == 0x1182) || (pmr & SIS_PMR_COMBINED))
- pci_read_config_dword(pdev, cfg_addr+0x10, &val2);
-
- *val |= val2;
- *val &= 0xfffffffb; /* avoid problems with powerdowned ports */
-
return 0;
}
@@ -163,28 +155,16 @@ static int sis_scr_cfg_write(struct ata_link *link,
unsigned int sc_reg, u32 val)
{
struct pci_dev *pdev = to_pci_dev(link->ap->host->dev);
- unsigned int cfg_addr = get_scr_cfg_addr(link->ap, sc_reg);
- u8 pmr;
-
- if (sc_reg == SCR_ERROR) /* doesn't exist in PCI cfg space */
- return -EINVAL;
-
- pci_read_config_byte(pdev, SIS_PMR, &pmr);
+ unsigned int cfg_addr = get_scr_cfg_addr(link, sc_reg);
pci_write_config_dword(pdev, cfg_addr, val);
-
- if ((pdev->device == 0x0182) || (pdev->device == 0x0183) ||
- (pdev->device == 0x1182) || (pmr & SIS_PMR_COMBINED))
- pci_write_config_dword(pdev, cfg_addr+0x10, val);
-
return 0;
}
static int sis_scr_read(struct ata_link *link, unsigned int sc_reg, u32 *val)
{
struct ata_port *ap = link->ap;
- struct pci_dev *pdev = to_pci_dev(ap->host->dev);
- u8 pmr;
+ void __iomem *base = ap->ioaddr.scr_addr + link->pmp * 0x10;
if (sc_reg > SCR_CONTROL)
return -EINVAL;
@@ -192,39 +172,23 @@ static int sis_scr_read(struct ata_link *link, unsigned int sc_reg, u32 *val)
if (ap->flags & SIS_FLAG_CFGSCR)
return sis_scr_cfg_read(link, sc_reg, val);
- pci_read_config_byte(pdev, SIS_PMR, &pmr);
-
- *val = ioread32(ap->ioaddr.scr_addr + (sc_reg * 4));
-
- if ((pdev->device == 0x0182) || (pdev->device == 0x0183) ||
- (pdev->device == 0x1182) || (pmr & SIS_PMR_COMBINED))
- *val |= ioread32(ap->ioaddr.scr_addr + (sc_reg * 4) + 0x10);
-
- *val &= 0xfffffffb;
-
+ *val = ioread32(base + sc_reg * 4);
return 0;
}
static int sis_scr_write(struct ata_link *link, unsigned int sc_reg, u32 val)
{
struct ata_port *ap = link->ap;
- struct pci_dev *pdev = to_pci_dev(ap->host->dev);
- u8 pmr;
+ void __iomem *base = ap->ioaddr.scr_addr + link->pmp * 0x10;
if (sc_reg > SCR_CONTROL)
return -EINVAL;
- pci_read_config_byte(pdev, SIS_PMR, &pmr);
-
if (ap->flags & SIS_FLAG_CFGSCR)
return sis_scr_cfg_write(link, sc_reg, val);
- else {
- iowrite32(val, ap->ioaddr.scr_addr + (sc_reg * 4));
- if ((pdev->device == 0x0182) || (pdev->device == 0x0183) ||
- (pdev->device == 0x1182) || (pmr & SIS_PMR_COMBINED))
- iowrite32(val, ap->ioaddr.scr_addr + (sc_reg * 4)+0x10);
- return 0;
- }
+
+ iowrite32(val, base + (sc_reg * 4));
+ return 0;
}
static int sis_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
@@ -236,7 +200,7 @@ static int sis_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
u32 genctl, val;
u8 pmr;
u8 port2_start = 0x20;
- int rc;
+ int i, rc;
if (!printed_version++)
dev_printk(KERN_INFO, &pdev->dev, "version " DRV_VERSION "\n");
@@ -319,6 +283,17 @@ static int sis_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
if (rc)
return rc;
+ for (i = 0; i < 2; i++) {
+ struct ata_port *ap = host->ports[i];
+
+ if (ap->flags & ATA_FLAG_SATA &&
+ ap->flags & ATA_FLAG_SLAVE_POSS) {
+ rc = ata_slave_link_init(ap);
+ if (rc)
+ return rc;
+ }
+ }
+
if (!(pi.flags & SIS_FLAG_CFGSCR)) {
void __iomem *mmio;
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 5d7a02f63e1..50eecfe1d72 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -24,6 +24,7 @@
#include <linux/sysrq.h>
#include <linux/kbd_kern.h>
#include <linux/proc_fs.h>
+#include <linux/nmi.h>
#include <linux/quotaops.h>
#include <linux/perf_counter.h>
#include <linux/kernel.h>
@@ -222,12 +223,20 @@ static DECLARE_WORK(sysrq_showallcpus, sysrq_showregs_othercpus);
static void sysrq_handle_showallcpus(int key, struct tty_struct *tty)
{
- struct pt_regs *regs = get_irq_regs();
- if (regs) {
- printk(KERN_INFO "CPU%d:\n", smp_processor_id());
- show_regs(regs);
+ /*
+ * Fall back to the workqueue based printing if the
+ * backtrace printing did not succeed or the
+ * architecture has no support for it:
+ */
+ if (!trigger_all_cpu_backtrace()) {
+ struct pt_regs *regs = get_irq_regs();
+
+ if (regs) {
+ printk(KERN_INFO "CPU%d:\n", smp_processor_id());
+ show_regs(regs);
+ }
+ schedule_work(&sysrq_showallcpus);
}
- schedule_work(&sysrq_showallcpus);
}
static struct sysrq_key_op sysrq_showallcpus_op = {
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 24c84ae8152..938100f14b1 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -568,35 +568,76 @@ const struct dmi_device * dmi_find_device(int type, const char *name,
EXPORT_SYMBOL(dmi_find_device);
/**
- * dmi_get_year - Return year of a DMI date
- * @field: data index (like dmi_get_system_info)
+ * dmi_get_date - parse a DMI date
+ * @field: data index (see enum dmi_field)
+ * @yearp: optional out parameter for the year
+ * @monthp: optional out parameter for the month
+ * @dayp: optional out parameter for the day
*
- * Returns -1 when the field doesn't exist. 0 when it is broken.
+ * The date field is assumed to be in the form resembling
+ * [mm[/dd]]/yy[yy] and the result is stored in the out
+ * parameters any or all of which can be omitted.
+ *
+ * If the field doesn't exist, all out parameters are set to zero
+ * and false is returned. Otherwise, true is returned with any
+ * invalid part of date set to zero.
+ *
+ * On return, year, month and day are guaranteed to be in the
+ * range of [0,9999], [0,12] and [0,31] respectively.
*/
-int dmi_get_year(int field)
+bool dmi_get_date(int field, int *yearp, int *monthp, int *dayp)
{
- int year;
- const char *s = dmi_get_system_info(field);
+ int year = 0, month = 0, day = 0;
+ bool exists;
+ const char *s, *y;
+ char *e;
- if (!s)
- return -1;
- if (*s == '\0')
- return 0;
- s = strrchr(s, '/');
- if (!s)
- return 0;
+ s = dmi_get_system_info(field);
+ exists = s;
+ if (!exists)
+ goto out;
- s += 1;
- year = simple_strtoul(s, NULL, 0);
- if (year && year < 100) { /* 2-digit year */
+ /*
+ * Determine year first. We assume the date string resembles
+ * mm/dd/yy[yy] but the original code extracted only the year
+ * from the end. Keep the behavior in the spirit of no
+ * surprises.
+ */
+ y = strrchr(s, '/');
+ if (!y)
+ goto out;
+
+ y++;
+ year = simple_strtoul(y, &e, 10);
+ if (y != e && year < 100) { /* 2-digit year */
year += 1900;
if (year < 1996) /* no dates < spec 1.0 */
year += 100;
}
+ if (year > 9999) /* year should fit in %04d */
+ year = 0;
+
+ /* parse the mm and dd */
+ month = simple_strtoul(s, &e, 10);
+ if (s == e || *e != '/' || !month || month > 12) {
+ month = 0;
+ goto out;
+ }
- return year;
+ s = e + 1;
+ day = simple_strtoul(s, &e, 10);
+ if (s == y || s == e || *e != '/' || day > 31)
+ day = 0;
+out:
+ if (yearp)
+ *yearp = year;
+ if (monthp)
+ *monthp = month;
+ if (dayp)
+ *dayp = day;
+ return exists;
}
-EXPORT_SYMBOL(dmi_get_year);
+EXPORT_SYMBOL(dmi_get_date);
/**
* dmi_walk - Walk the DMI table and get called back for every record
diff --git a/drivers/ide/atiixp.c b/drivers/ide/atiixp.c
index 923cbfe259d..6396c3ad325 100644
--- a/drivers/ide/atiixp.c
+++ b/drivers/ide/atiixp.c
@@ -177,6 +177,7 @@ static const struct pci_device_id atiixp_pci_tbl[] = {
{ PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP400_IDE), 0 },
{ PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP600_IDE), 1 },
{ PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP700_IDE), 0 },
+ { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_SB900_IDE), 0 },
{ 0, },
};
MODULE_DEVICE_TABLE(pci, atiixp_pci_tbl);
diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index 242257b1944..a7aae24f288 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -21,7 +21,6 @@
#include <linux/sched.h>
#include <linux/oprofile.h>
-#include <linux/vmalloc.h>
#include <linux/errno.h>
#include "event_buffer.h"
@@ -407,6 +406,21 @@ int oprofile_add_data(struct op_entry *entry, unsigned long val)
return op_cpu_buffer_add_data(entry, val);
}
+int oprofile_add_data64(struct op_entry *entry, u64 val)
+{
+ if (!entry->event)
+ return 0;
+ if (op_cpu_buffer_get_size(entry) < 2)
+ /*
+ * the function returns 0 to indicate a too small
+ * buffer, even if there is some space left
+ */
+ return 0;
+ if (!op_cpu_buffer_add_data(entry, (u32)val))
+ return 0;
+ return op_cpu_buffer_add_data(entry, (u32)(val >> 32));
+}
+
int oprofile_write_commit(struct op_entry *entry)
{
if (!entry->event)
diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c
index 3cffce90f82..dc8a0428260 100644
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -12,6 +12,8 @@
#include <linux/init.h>
#include <linux/oprofile.h>
#include <linux/moduleparam.h>
+#include <linux/workqueue.h>
+#include <linux/time.h>
#include <asm/mutex.h>
#include "oprof.h"
@@ -87,6 +89,69 @@ out:
return err;
}
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+
+static void switch_worker(struct work_struct *work);
+static DECLARE_DELAYED_WORK(switch_work, switch_worker);
+
+static void start_switch_worker(void)
+{
+ if (oprofile_ops.switch_events)
+ schedule_delayed_work(&switch_work, oprofile_time_slice);
+}
+
+static void stop_switch_worker(void)
+{
+ cancel_delayed_work_sync(&switch_work);
+}
+
+static void switch_worker(struct work_struct *work)
+{
+ if (oprofile_ops.switch_events())
+ return;
+
+ atomic_inc(&oprofile_stats.multiplex_counter);
+ start_switch_worker();
+}
+
+/* User inputs in ms, converts to jiffies */
+int oprofile_set_timeout(unsigned long val_msec)
+{
+ int err = 0;
+ unsigned long time_slice;
+
+ mutex_lock(&start_mutex);
+
+ if (oprofile_started) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (!oprofile_ops.switch_events) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ time_slice = msecs_to_jiffies(val_msec);
+ if (time_slice == MAX_JIFFY_OFFSET) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ oprofile_time_slice = time_slice;
+
+out:
+ mutex_unlock(&start_mutex);
+ return err;
+
+}
+
+#else
+
+static inline void start_switch_worker(void) { }
+static inline void stop_switch_worker(void) { }
+
+#endif
/* Actually start profiling (echo 1>/dev/oprofile/enable) */
int oprofile_start(void)
@@ -108,6 +173,8 @@ int oprofile_start(void)
if ((err = oprofile_ops.start()))
goto out;
+ start_switch_worker();
+
oprofile_started = 1;
out:
mutex_unlock(&start_mutex);
@@ -123,6 +190,9 @@ void oprofile_stop(void)
goto out;
oprofile_ops.stop();
oprofile_started = 0;
+
+ stop_switch_worker();
+
/* wake up the daemon to read what remains */
wake_up_buffer_waiter();
out:
@@ -155,7 +225,6 @@ post_sync:
mutex_unlock(&start_mutex);
}
-
int oprofile_set_backtrace(unsigned long val)
{
int err = 0;
diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h
index c288d3c24b5..cb92f5c98c1 100644
--- a/drivers/oprofile/oprof.h
+++ b/drivers/oprofile/oprof.h
@@ -24,6 +24,8 @@ struct oprofile_operations;
extern unsigned long oprofile_buffer_size;
extern unsigned long oprofile_cpu_buffer_size;
extern unsigned long oprofile_buffer_watershed;
+extern unsigned long oprofile_time_slice;
+
extern struct oprofile_operations oprofile_ops;
extern unsigned long oprofile_started;
extern unsigned long oprofile_backtrace_depth;
@@ -35,5 +37,6 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root);
void oprofile_timer_init(struct oprofile_operations *ops);
int oprofile_set_backtrace(unsigned long depth);
+int oprofile_set_timeout(unsigned long time);
#endif /* OPROF_H */
diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c
index 5d36ffc30dd..bbd7516e086 100644
--- a/drivers/oprofile/oprofile_files.c
+++ b/drivers/oprofile/oprofile_files.c
@@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <linux/oprofile.h>
+#include <linux/jiffies.h>
#include "event_buffer.h"
#include "oprofile_stats.h"
@@ -17,10 +18,51 @@
#define BUFFER_SIZE_DEFAULT 131072
#define CPU_BUFFER_SIZE_DEFAULT 8192
#define BUFFER_WATERSHED_DEFAULT 32768 /* FIXME: tune */
+#define TIME_SLICE_DEFAULT 1
unsigned long oprofile_buffer_size;
unsigned long oprofile_cpu_buffer_size;
unsigned long oprofile_buffer_watershed;
+unsigned long oprofile_time_slice;
+
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+
+static ssize_t timeout_read(struct file *file, char __user *buf,
+ size_t count, loff_t *offset)
+{
+ return oprofilefs_ulong_to_user(jiffies_to_msecs(oprofile_time_slice),
+ buf, count, offset);
+}
+
+
+static ssize_t timeout_write(struct file *file, char const __user *buf,
+ size_t count, loff_t *offset)
+{
+ unsigned long val;
+ int retval;
+
+ if (*offset)
+ return -EINVAL;
+
+ retval = oprofilefs_ulong_from_user(&val, buf, count);
+ if (retval)
+ return retval;
+
+ retval = oprofile_set_timeout(val);
+
+ if (retval)
+ return retval;
+ return count;
+}
+
+
+static const struct file_operations timeout_fops = {
+ .read = timeout_read,
+ .write = timeout_write,
+};
+
+#endif
+
static ssize_t depth_read(struct file *file, char __user *buf, size_t count, loff_t *offset)
{
@@ -129,6 +171,7 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root)
oprofile_buffer_size = BUFFER_SIZE_DEFAULT;
oprofile_cpu_buffer_size = CPU_BUFFER_SIZE_DEFAULT;
oprofile_buffer_watershed = BUFFER_WATERSHED_DEFAULT;
+ oprofile_time_slice = msecs_to_jiffies(TIME_SLICE_DEFAULT);
oprofilefs_create_file(sb, root, "enable", &enable_fops);
oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666);
@@ -139,6 +182,9 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root)
oprofilefs_create_file(sb, root, "cpu_type", &cpu_type_fops);
oprofilefs_create_file(sb, root, "backtrace_depth", &depth_fops);
oprofilefs_create_file(sb, root, "pointer_size", &pointer_size_fops);
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+ oprofilefs_create_file(sb, root, "time_slice", &timeout_fops);
+#endif
oprofile_create_stats_files(sb, root);
if (oprofile_ops.create_files)
oprofile_ops.create_files(sb, root);
diff --git a/drivers/oprofile/oprofile_stats.c b/drivers/oprofile/oprofile_stats.c
index 3c2270a8300..61689e814d4 100644
--- a/drivers/oprofile/oprofile_stats.c
+++ b/drivers/oprofile/oprofile_stats.c
@@ -34,6 +34,7 @@ void oprofile_reset_stats(void)
atomic_set(&oprofile_stats.sample_lost_no_mapping, 0);
atomic_set(&oprofile_stats.event_lost_overflow, 0);
atomic_set(&oprofile_stats.bt_lost_no_mapping, 0);
+ atomic_set(&oprofile_stats.multiplex_counter, 0);
}
@@ -76,4 +77,8 @@ void oprofile_create_stats_files(struct super_block *sb, struct dentry *root)
&oprofile_stats.event_lost_overflow);
oprofilefs_create_ro_atomic(sb, dir, "bt_lost_no_mapping",
&oprofile_stats.bt_lost_no_mapping);
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+ oprofilefs_create_ro_atomic(sb, dir, "multiplex_counter",
+ &oprofile_stats.multiplex_counter);
+#endif
}
diff --git a/drivers/oprofile/oprofile_stats.h b/drivers/oprofile/oprofile_stats.h
index 3da0d08dc1f..0b54e46c3c1 100644
--- a/drivers/oprofile/oprofile_stats.h
+++ b/drivers/oprofile/oprofile_stats.h
@@ -17,6 +17,7 @@ struct oprofile_stat_struct {
atomic_t sample_lost_no_mapping;
atomic_t bt_lost_no_mapping;
atomic_t event_lost_overflow;
+ atomic_t multiplex_counter;
};
extern struct oprofile_stat_struct oprofile_stats;
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 4f5b8712931..44803644ca0 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -55,15 +55,12 @@ static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
return desc->irq_2_iommu;
}
-static struct irq_2_iommu *irq_2_iommu_alloc_node(unsigned int irq, int node)
+static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
{
struct irq_desc *desc;
struct irq_2_iommu *irq_iommu;
- /*
- * alloc irq desc if not allocated already.
- */
- desc = irq_to_desc_alloc_node(irq, node);
+ desc = irq_to_desc(irq);
if (!desc) {
printk(KERN_INFO "can not get irq_desc for %d\n", irq);
return NULL;
@@ -72,16 +69,11 @@ static struct irq_2_iommu *irq_2_iommu_alloc_node(unsigned int irq, int node)
irq_iommu = desc->irq_2_iommu;
if (!irq_iommu)
- desc->irq_2_iommu = get_one_free_irq_2_iommu(node);
+ desc->irq_2_iommu = get_one_free_irq_2_iommu(irq_node(irq));
return desc->irq_2_iommu;
}
-static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
-{
- return irq_2_iommu_alloc_node(irq, cpu_to_node(boot_cpu_id));
-}
-
#else /* !CONFIG_SPARSE_IRQ */
static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 06b96562396..85ce23997be 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -992,7 +992,7 @@ DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82454NX,
static void __devinit quirk_amd_ide_mode(struct pci_dev *pdev)
{
- /* set sb600/sb700/sb800 sata to ahci mode */
+ /* set SBX00 SATA in IDE mode to AHCI mode */
u8 tmp;
pci_read_config_byte(pdev, PCI_CLASS_DEVICE, &tmp);
@@ -1011,6 +1011,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP600_SATA, quirk
DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP600_SATA, quirk_amd_ide_mode);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP700_SATA, quirk_amd_ide_mode);
DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP700_SATA, quirk_amd_ide_mode);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_SB900_SATA_IDE, quirk_amd_ide_mode);
+DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_SB900_SATA_IDE, quirk_amd_ide_mode);
/*
* Serverworks CSB5 IDE does not fully support native mode
diff --git a/fs/dcache.c b/fs/dcache.c
index 9e5cd3c3a6b..a100fa35a48 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
#include <linux/swap.h>
#include <linux/bootmem.h>
#include <linux/fs_struct.h>
+#include <linux/hardirq.h>
#include "internal.h"
int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/locks.c b/fs/locks.c
index 52366e877d7..19ee18a6829 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
* give it the opportunity to lock the file.
*/
if (found)
- cond_resched_bkl();
+ cond_resched();
find_conflict:
for_each_lock(inode, before) {
diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h
index 5406a601185..e694263445f 100644
--- a/include/asm-generic/dma-mapping-common.h
+++ b/include/asm-generic/dma-mapping-common.h
@@ -103,7 +103,6 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
if (ops->sync_single_for_cpu)
ops->sync_single_for_cpu(dev, addr, size, dir);
debug_dma_sync_single_for_cpu(dev, addr, size, dir);
- flush_write_buffers();
}
static inline void dma_sync_single_for_device(struct device *dev,
@@ -116,7 +115,6 @@ static inline void dma_sync_single_for_device(struct device *dev,
if (ops->sync_single_for_device)
ops->sync_single_for_device(dev, addr, size, dir);
debug_dma_sync_single_for_device(dev, addr, size, dir);
- flush_write_buffers();
}
static inline void dma_sync_single_range_for_cpu(struct device *dev,
@@ -132,7 +130,6 @@ static inline void dma_sync_single_range_for_cpu(struct device *dev,
ops->sync_single_range_for_cpu(dev, addr, offset, size, dir);
debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir);
- flush_write_buffers();
} else
dma_sync_single_for_cpu(dev, addr, size, dir);
}
@@ -150,7 +147,6 @@ static inline void dma_sync_single_range_for_device(struct device *dev,
ops->sync_single_range_for_device(dev, addr, offset, size, dir);
debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir);
- flush_write_buffers();
} else
dma_sync_single_for_device(dev, addr, size, dir);
}
@@ -165,7 +161,6 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
if (ops->sync_sg_for_cpu)
ops->sync_sg_for_cpu(dev, sg, nelems, dir);
debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
- flush_write_buffers();
}
static inline void
@@ -179,7 +174,6 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
ops->sync_sg_for_device(dev, sg, nelems, dir);
debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
- flush_write_buffers();
}
#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL)
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 9c75921f0c1..6299a259ed1 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -210,15 +210,25 @@ enum {
ATA_CMD_STANDBY = 0xE2, /* place in standby power mode */
ATA_CMD_IDLE = 0xE3, /* place in idle power mode */
ATA_CMD_EDD = 0x90, /* execute device diagnostic */
+ ATA_CMD_DOWNLOAD_MICRO = 0x92,
+ ATA_CMD_NOP = 0x00,
ATA_CMD_FLUSH = 0xE7,
ATA_CMD_FLUSH_EXT = 0xEA,
ATA_CMD_ID_ATA = 0xEC,
ATA_CMD_ID_ATAPI = 0xA1,
+ ATA_CMD_SERVICE = 0xA2,
ATA_CMD_READ = 0xC8,
ATA_CMD_READ_EXT = 0x25,
+ ATA_CMD_READ_QUEUED = 0x26,
+ ATA_CMD_READ_STREAM_EXT = 0x2B,
+ ATA_CMD_READ_STREAM_DMA_EXT = 0x2A,
ATA_CMD_WRITE = 0xCA,
ATA_CMD_WRITE_EXT = 0x35,
+ ATA_CMD_WRITE_QUEUED = 0x36,
+ ATA_CMD_WRITE_STREAM_EXT = 0x3B,
+ ATA_CMD_WRITE_STREAM_DMA_EXT = 0x3A,
ATA_CMD_WRITE_FUA_EXT = 0x3D,
+ ATA_CMD_WRITE_QUEUED_FUA_EXT = 0x3E,
ATA_CMD_FPDMA_READ = 0x60,
ATA_CMD_FPDMA_WRITE = 0x61,
ATA_CMD_PIO_READ = 0x20,
@@ -235,6 +245,7 @@ enum {
ATA_CMD_PACKET = 0xA0,
ATA_CMD_VERIFY = 0x40,
ATA_CMD_VERIFY_EXT = 0x42,
+ ATA_CMD_WRITE_UNCORR_EXT = 0x45,
ATA_CMD_STANDBYNOW1 = 0xE0,
ATA_CMD_IDLEIMMEDIATE = 0xE1,
ATA_CMD_SLEEP = 0xE6,
@@ -243,15 +254,34 @@ enum {
ATA_CMD_READ_NATIVE_MAX_EXT = 0x27,
ATA_CMD_SET_MAX = 0xF9,
ATA_CMD_SET_MAX_EXT = 0x37,
- ATA_CMD_READ_LOG_EXT = 0x2f,
+ ATA_CMD_READ_LOG_EXT = 0x2F,
+ ATA_CMD_WRITE_LOG_EXT = 0x3F,
+ ATA_CMD_READ_LOG_DMA_EXT = 0x47,
+ ATA_CMD_WRITE_LOG_DMA_EXT = 0x57,
+ ATA_CMD_TRUSTED_RCV = 0x5C,
+ ATA_CMD_TRUSTED_RCV_DMA = 0x5D,
+ ATA_CMD_TRUSTED_SND = 0x5E,
+ ATA_CMD_TRUSTED_SND_DMA = 0x5F,
ATA_CMD_PMP_READ = 0xE4,
ATA_CMD_PMP_WRITE = 0xE8,
ATA_CMD_CONF_OVERLAY = 0xB1,
+ ATA_CMD_SEC_SET_PASS = 0xF1,
+ ATA_CMD_SEC_UNLOCK = 0xF2,
+ ATA_CMD_SEC_ERASE_PREP = 0xF3,
+ ATA_CMD_SEC_ERASE_UNIT = 0xF4,
ATA_CMD_SEC_FREEZE_LOCK = 0xF5,
+ ATA_CMD_SEC_DISABLE_PASS = 0xF6,
+ ATA_CMD_CONFIG_STREAM = 0x51,
ATA_CMD_SMART = 0xB0,
ATA_CMD_MEDIA_LOCK = 0xDE,
ATA_CMD_MEDIA_UNLOCK = 0xDF,
ATA_CMD_DSM = 0x06,
+ ATA_CMD_CHK_MED_CRD_TYP = 0xD1,
+ ATA_CMD_CFA_REQ_EXT_ERR = 0x03,
+ ATA_CMD_CFA_WRITE_NE = 0x38,
+ ATA_CMD_CFA_TRANS_SECT = 0x87,
+ ATA_CMD_CFA_ERASE = 0xC0,
+ ATA_CMD_CFA_WRITE_MULT_NE = 0xCD,
/* marked obsolete in the ATA/ATAPI-7 spec */
ATA_CMD_RESTORE = 0x10,
@@ -306,6 +336,7 @@ enum {
/* SETFEATURE Sector counts for SATA features */
SATA_AN = 0x05, /* Asynchronous Notification */
SATA_DIPM = 0x03, /* Device Initiated Power Management */
+ SATA_FPDMA_AA = 0x02, /* DMA Setup FIS Auto-Activate */
/* feature values for SET_MAX */
ATA_SET_MAX_ADDR = 0x00,
@@ -525,6 +556,9 @@ static inline int ata_is_data(u8 prot)
#define ata_id_has_atapi_AN(id) \
( (((id)[76] != 0x0000) && ((id)[76] != 0xffff)) && \
((id)[78] & (1 << 5)) )
+#define ata_id_has_fpdma_aa(id) \
+ ( (((id)[76] != 0x0000) && ((id)[76] != 0xffff)) && \
+ ((id)[78] & (1 << 2)) )
#define ata_id_iordy_disable(id) ((id)[ATA_ID_CAPABILITY] & (1 << 10))
#define ata_id_has_iordy(id) ((id)[ATA_ID_CAPABILITY] & (1 << 11))
#define ata_id_u32(id,n) \
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 4d668e05d45..47536197ffd 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -48,6 +48,15 @@ struct notifier_block;
#ifdef CONFIG_SMP
/* Need to know about CPUs going up/down? */
+#if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE)
+#define cpu_notifier(fn, pri) { \
+ static struct notifier_block fn##_nb __cpuinitdata = \
+ { .notifier_call = fn, .priority = pri }; \
+ register_cpu_notifier(&fn##_nb); \
+}
+#else /* #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */
+#define cpu_notifier(fn, pri) do { (void)(fn); } while (0)
+#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */
#ifdef CONFIG_HOTPLUG_CPU
extern int register_cpu_notifier(struct notifier_block *nb);
extern void unregister_cpu_notifier(struct notifier_block *nb);
@@ -74,6 +83,8 @@ extern void cpu_maps_update_done(void);
#else /* CONFIG_SMP */
+#define cpu_notifier(fn, pri) do { (void)(fn); } while (0)
+
static inline int register_cpu_notifier(struct notifier_block *nb)
{
return 0;
@@ -99,11 +110,7 @@ extern struct sysdev_class cpu_sysdev_class;
extern void get_online_cpus(void);
extern void put_online_cpus(void);
-#define hotcpu_notifier(fn, pri) { \
- static struct notifier_block fn##_nb __cpuinitdata = \
- { .notifier_call = fn, .priority = pri }; \
- register_cpu_notifier(&fn##_nb); \
-}
+#define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri)
#define register_hotcpu_notifier(nb) register_cpu_notifier(nb)
#define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb)
int cpu_down(unsigned int cpu);
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 07dfd460d28..c0f6c3cd788 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -98,11 +98,6 @@ static inline int is_device_dma_capable(struct device *dev)
return dev->dma_mask != NULL && *dev->dma_mask != DMA_MASK_NONE;
}
-static inline int is_buffer_dma_capable(u64 mask, dma_addr_t addr, size_t size)
-{
- return addr + size <= mask;
-}
-
#ifdef CONFIG_HAS_DMA
#include <asm/dma-mapping.h>
#else
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index bb5489c82c9..a8a3e1ac281 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -43,7 +43,7 @@ extern const char * dmi_get_system_info(int field);
extern const struct dmi_device * dmi_find_device(int type, const char *name,
const struct dmi_device *from);
extern void dmi_scan_machine(void);
-extern int dmi_get_year(int field);
+extern bool dmi_get_date(int field, int *yearp, int *monthp, int *dayp);
extern int dmi_name_in_vendors(const char *str);
extern int dmi_name_in_serial(const char *str);
extern int dmi_available;
@@ -58,7 +58,16 @@ static inline const char * dmi_get_system_info(int field) { return NULL; }
static inline const struct dmi_device * dmi_find_device(int type, const char *name,
const struct dmi_device *from) { return NULL; }
static inline void dmi_scan_machine(void) { return; }
-static inline int dmi_get_year(int year) { return 0; }
+static inline bool dmi_get_date(int field, int *yearp, int *monthp, int *dayp)
+{
+ if (yearp)
+ *yearp = 0;
+ if (monthp)
+ *monthp = 0;
+ if (dayp)
+ *dayp = 0;
+ return false;
+}
static inline int dmi_name_in_vendors(const char *s) { return 0; }
static inline int dmi_name_in_serial(const char *s) { return 0; }
#define dmi_available 0
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index a81170de7f6..23f7179bf74 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -93,16 +93,22 @@ void tracing_generic_entry_update(struct trace_entry *entry,
unsigned long flags,
int pc);
struct ring_buffer_event *
-trace_current_buffer_lock_reserve(int type, unsigned long len,
+trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer,
+ int type, unsigned long len,
unsigned long flags, int pc);
-void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
+void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
unsigned long flags, int pc);
-void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
+void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
unsigned long flags, int pc);
-void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
+void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event);
void tracing_record_cmdline(struct task_struct *tsk);
+struct event_filter;
+
struct ftrace_event_call {
struct list_head list;
char *name;
@@ -110,16 +116,18 @@ struct ftrace_event_call {
struct dentry *dir;
struct trace_event *event;
int enabled;
- int (*regfunc)(void);
- void (*unregfunc)(void);
+ int (*regfunc)(void *);
+ void (*unregfunc)(void *);
int id;
int (*raw_init)(void);
- int (*show_format)(struct trace_seq *s);
- int (*define_fields)(void);
+ int (*show_format)(struct ftrace_event_call *call,
+ struct trace_seq *s);
+ int (*define_fields)(struct ftrace_event_call *);
struct list_head fields;
int filter_active;
- void *filter;
+ struct event_filter *filter;
void *mod;
+ void *data;
atomic_t profile_count;
int (*profile_enable)(struct ftrace_event_call *);
@@ -129,15 +137,25 @@ struct ftrace_event_call {
#define MAX_FILTER_PRED 32
#define MAX_FILTER_STR_VAL 128
-extern int init_preds(struct ftrace_event_call *call);
extern void destroy_preds(struct ftrace_event_call *call);
extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
-extern int filter_current_check_discard(struct ftrace_event_call *call,
+extern int filter_current_check_discard(struct ring_buffer *buffer,
+ struct ftrace_event_call *call,
void *rec,
struct ring_buffer_event *event);
-extern int trace_define_field(struct ftrace_event_call *call, char *type,
- char *name, int offset, int size, int is_signed);
+enum {
+ FILTER_OTHER = 0,
+ FILTER_STATIC_STRING,
+ FILTER_DYN_STRING,
+ FILTER_PTR_STRING,
+};
+
+extern int trace_define_field(struct ftrace_event_call *call,
+ const char *type, const char *name,
+ int offset, int size, int is_signed,
+ int filter_type);
+extern int trace_define_common_fields(struct ftrace_event_call *call);
#define is_signed_type(type) (((type)(-1)) < 0)
@@ -162,11 +180,4 @@ do { \
__trace_printk(ip, fmt, ##args); \
} while (0)
-#define __common_field(type, item, is_signed) \
- ret = trace_define_field(event_call, #type, "common_" #item, \
- offsetof(typeof(field.ent), item), \
- sizeof(field.ent.item), is_signed); \
- if (ret) \
- return ret;
-
#endif /* _LINUX_FTRACE_EVENT_H */
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 8246c697863..6d527ee82b2 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -64,6 +64,12 @@
#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
#define NMI_OFFSET (1UL << NMI_SHIFT)
+#ifndef PREEMPT_ACTIVE
+#define PREEMPT_ACTIVE_BITS 1
+#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
+#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
+#endif
+
#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
#error PREEMPT_ACTIVE is too low!
#endif
@@ -132,7 +138,7 @@ static inline void account_system_vtime(struct task_struct *tsk)
}
#endif
-#if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU)
+#if defined(CONFIG_NO_HZ)
extern void rcu_irq_enter(void);
extern void rcu_irq_exit(void);
extern void rcu_nmi_enter(void);
@@ -142,7 +148,7 @@ extern void rcu_nmi_exit(void);
# define rcu_irq_exit() do { } while (0)
# define rcu_nmi_enter() do { } while (0)
# define rcu_nmi_exit() do { } while (0)
-#endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */
+#endif /* #if defined(CONFIG_NO_HZ) */
/*
* It is safe to do non-atomic ops on ->hardirq_context,
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 7fc01b13be4..9e7f2e8fc66 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -94,6 +94,16 @@ extern struct group_info init_groups;
# define CAP_INIT_BSET CAP_INIT_EFF_SET
#endif
+#ifdef CONFIG_TREE_PREEMPT_RCU
+#define INIT_TASK_RCU_PREEMPT(tsk) \
+ .rcu_read_lock_nesting = 0, \
+ .rcu_read_unlock_special = 0, \
+ .rcu_blocked_node = NULL, \
+ .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),
+#else
+#define INIT_TASK_RCU_PREEMPT(tsk)
+#endif
+
extern struct cred init_cred;
#ifdef CONFIG_PERF_COUNTERS
@@ -173,6 +183,7 @@ extern struct cred init_cred;
INIT_LOCKDEP \
INIT_FTRACE_GRAPH \
INIT_TRACE_RECURSION \
+ INIT_TASK_RCU_PREEMPT(tsk) \
}
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 35e7df1e9f3..1ac57e522a1 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -50,6 +50,9 @@
* IRQF_IRQPOLL - Interrupt is used for polling (only the interrupt that is
* registered first in an shared interrupt is considered for
* performance reasons)
+ * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished.
+ * Used by threaded interrupts which need to keep the
+ * irq line disabled until the threaded handler has been run.
*/
#define IRQF_DISABLED 0x00000020
#define IRQF_SAMPLE_RANDOM 0x00000040
@@ -59,6 +62,7 @@
#define IRQF_PERCPU 0x00000400
#define IRQF_NOBALANCING 0x00000800
#define IRQF_IRQPOLL 0x00001000
+#define IRQF_ONESHOT 0x00002000
/*
* Bits used by threaded handlers:
diff --git a/include/linux/irq.h b/include/linux/irq.h
index cb2e77a3f7f..ae9653dbcd7 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -69,6 +69,8 @@ typedef void (*irq_flow_handler_t)(unsigned int irq,
#define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */
#define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/
#define IRQ_SUSPENDED 0x04000000 /* IRQ has gone through suspend sequence */
+#define IRQ_ONESHOT 0x08000000 /* IRQ is not unmasked after hardirq */
+#define IRQ_NESTED_THREAD 0x10000000 /* IRQ is nested into another, no own handler thread */
#ifdef CONFIG_IRQ_PER_CPU
# define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
@@ -100,6 +102,9 @@ struct msi_desc;
* @set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ
* @set_wake: enable/disable power-management wake-on of an IRQ
*
+ * @bus_lock: function to lock access to slow bus (i2c) chips
+ * @bus_sync_unlock: function to sync and unlock slow bus (i2c) chips
+ *
* @release: release function solely used by UML
* @typename: obsoleted by name, kept as migration helper
*/
@@ -123,6 +128,9 @@ struct irq_chip {
int (*set_type)(unsigned int irq, unsigned int flow_type);
int (*set_wake)(unsigned int irq, unsigned int on);
+ void (*bus_lock)(unsigned int irq);
+ void (*bus_sync_unlock)(unsigned int irq);
+
/* Currently used only by UML, might disappear one day.*/
#ifdef CONFIG_IRQ_RELEASE_METHOD
void (*release)(unsigned int irq, void *dev_id);
@@ -220,13 +228,6 @@ static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node);
/*
- * Migration helpers for obsolete names, they will go away:
- */
-#define hw_interrupt_type irq_chip
-#define no_irq_type no_irq_chip
-typedef struct irq_desc irq_desc_t;
-
-/*
* Pick up the arch-dependent methods:
*/
#include <asm/hw_irq.h>
@@ -289,6 +290,7 @@ extern void handle_edge_irq(unsigned int irq, struct irq_desc *desc);
extern void handle_simple_irq(unsigned int irq, struct irq_desc *desc);
extern void handle_percpu_irq(unsigned int irq, struct irq_desc *desc);
extern void handle_bad_irq(unsigned int irq, struct irq_desc *desc);
+extern void handle_nested_irq(unsigned int irq);
/*
* Monolithic do_IRQ implementation.
@@ -379,6 +381,8 @@ set_irq_chained_handler(unsigned int irq,
__set_irq_handler(irq, handle, 1, NULL);
}
+extern void set_irq_nested_thread(unsigned int irq, int nest);
+
extern void set_irq_noprobe(unsigned int irq);
extern void set_irq_probe(unsigned int irq);
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index ec87b212ff7..7bf89bc8cbc 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -41,6 +41,12 @@ extern struct irq_desc *irq_to_desc(unsigned int irq);
; \
else
+#ifdef CONFIG_SMP
+#define irq_node(irq) (irq_to_desc(irq)->node)
+#else
+#define irq_node(irq) 0
+#endif
+
#endif /* CONFIG_GENERIC_HARDIRQS */
#define for_each_irq_nr(irq) \
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d6320a3e8de..2b5b1e0899a 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -125,7 +125,7 @@ extern int _cond_resched(void);
#endif
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
- void __might_sleep(char *file, int line);
+ void __might_sleep(char *file, int line, int preempt_offset);
/**
* might_sleep - annotation for functions that can sleep
*
@@ -137,8 +137,9 @@ extern int _cond_resched(void);
* supposed to.
*/
# define might_sleep() \
- do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
+ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
#else
+ static inline void __might_sleep(char *file, int line, int preempt_offset) { }
# define might_sleep() do { might_resched(); } while (0)
#endif
diff --git a/include/linux/libata.h b/include/linux/libata.h
index e5b6e33c657..76319bf03e3 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -143,7 +143,6 @@ enum {
ATA_DFLAG_PIO = (1 << 12), /* device limited to PIO mode */
ATA_DFLAG_NCQ_OFF = (1 << 13), /* device limited to non-NCQ mode */
- ATA_DFLAG_SPUNDOWN = (1 << 14), /* XXX: for spindown_compat */
ATA_DFLAG_SLEEPING = (1 << 15), /* device is sleeping */
ATA_DFLAG_DUBIOUS_XFER = (1 << 16), /* data transfer not verified */
ATA_DFLAG_NO_UNLOAD = (1 << 17), /* device doesn't support unload */
@@ -190,6 +189,7 @@ enum {
ATA_FLAG_NO_POWEROFF_SPINDOWN = (1 << 11), /* don't spindown before poweroff */
ATA_FLAG_NO_HIBERNATE_SPINDOWN = (1 << 12), /* don't spindown before hibernation */
ATA_FLAG_DEBUGMSG = (1 << 13),
+ ATA_FLAG_FPDMA_AA = (1 << 14), /* driver supports Auto-Activate */
ATA_FLAG_IGN_SIMPLEX = (1 << 15), /* ignore SIMPLEX */
ATA_FLAG_NO_IORDY = (1 << 16), /* controller lacks iordy */
ATA_FLAG_ACPI_SATA = (1 << 17), /* need native SATA ACPI layout */
@@ -386,6 +386,7 @@ enum {
ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firmware update warning */
ATA_HORKAGE_1_5_GBPS = (1 << 13), /* force 1.5 Gbps */
ATA_HORKAGE_NOSETXFER = (1 << 14), /* skip SETXFER, SATA only */
+ ATA_HORKAGE_BROKEN_FPDMA_AA = (1 << 15), /* skip AA */
/* DMA mask for user DMA control: User visible values; DO NOT
renumber */
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index b25d1b53df0..9ccf0e286b2 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -149,6 +149,12 @@ struct lock_list {
struct lock_class *class;
struct stack_trace trace;
int distance;
+
+ /*
+ * The parent field is used to implement breadth-first search, and the
+ * bit 0 is reused to indicate if the lock has been accessed in BFS.
+ */
+ struct lock_list *parent;
};
/*
@@ -208,10 +214,12 @@ struct held_lock {
* interrupt context:
*/
unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */
- unsigned int trylock:1;
+ unsigned int trylock:1; /* 16 bits */
+
unsigned int read:2; /* see lock_acquire() comment */
unsigned int check:2; /* see lock_acquire() comment */
unsigned int hardirqs_off:1;
+ unsigned int references:11; /* 32 bits */
};
/*
@@ -291,6 +299,10 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
extern void lock_release(struct lockdep_map *lock, int nested,
unsigned long ip);
+#define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map)
+
+extern int lock_is_held(struct lockdep_map *lock);
+
extern void lock_set_class(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, unsigned int subclass,
unsigned long ip);
@@ -309,6 +321,8 @@ extern void lockdep_trace_alloc(gfp_t mask);
#define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0)
+#define lockdep_assert_held(l) WARN_ON(debug_locks && !lockdep_is_held(l))
+
#else /* !LOCKDEP */
static inline void lockdep_off(void)
@@ -353,6 +367,8 @@ struct lock_class_key { };
#define lockdep_depth(tsk) (0)
+#define lockdep_assert_held(l) do { } while (0)
+
#endif /* !LOCKDEP */
#ifdef CONFIG_LOCK_STAT
diff --git a/include/linux/module.h b/include/linux/module.h
index 098bdb7bfac..f8f92d015ef 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -17,10 +17,12 @@
#include <linux/moduleparam.h>
#include <linux/marker.h>
#include <linux/tracepoint.h>
-#include <asm/local.h>
+#include <asm/local.h>
#include <asm/module.h>
+#include <trace/events/module.h>
+
/* Not Yet Implemented */
#define MODULE_SUPPORTED_DEVICE(name)
@@ -462,7 +464,10 @@ static inline local_t *__module_ref_addr(struct module *mod, int cpu)
static inline void __module_get(struct module *module)
{
if (module) {
- local_inc(__module_ref_addr(module, get_cpu()));
+ unsigned int cpu = get_cpu();
+ local_inc(__module_ref_addr(module, cpu));
+ trace_module_get(module, _THIS_IP_,
+ local_read(__module_ref_addr(module, cpu)));
put_cpu();
}
}
@@ -473,8 +478,11 @@ static inline int try_module_get(struct module *module)
if (module) {
unsigned int cpu = get_cpu();
- if (likely(module_is_live(module)))
+ if (likely(module_is_live(module))) {
local_inc(__module_ref_addr(module, cpu));
+ trace_module_get(module, _THIS_IP_,
+ local_read(__module_ref_addr(module, cpu)));
+ }
else
ret = 0;
put_cpu();
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 29af2d5df09..b752e807add 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -28,8 +28,23 @@ static inline void acpi_nmi_disable(void) { }
static inline void acpi_nmi_enable(void) { }
#endif
-#ifndef trigger_all_cpu_backtrace
-#define trigger_all_cpu_backtrace() do { } while (0)
+/*
+ * Create trigger_all_cpu_backtrace() out of the arch-provided
+ * base function. Return whether such support was available,
+ * to allow calling code to fall back to some other mechanism:
+ */
+#ifdef arch_trigger_all_cpu_backtrace
+static inline bool trigger_all_cpu_backtrace(void)
+{
+ arch_trigger_all_cpu_backtrace();
+
+ return true;
+}
+#else
+static inline bool trigger_all_cpu_backtrace(void)
+{
+ return false;
+}
#endif
#endif
diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h
index 1d9518bc4c5..5171639ecf0 100644
--- a/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@ -67,6 +67,9 @@ struct oprofile_operations {
/* Initiate a stack backtrace. Optional. */
void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
+
+ /* Multiplex between different events. Optional. */
+ int (*switch_events)(void);
/* CPU identification string. */
char * cpu_type;
};
@@ -171,7 +174,6 @@ struct op_sample;
struct op_entry {
struct ring_buffer_event *event;
struct op_sample *sample;
- unsigned long irq_flags;
unsigned long size;
unsigned long *data;
};
@@ -180,6 +182,7 @@ void oprofile_write_reserve(struct op_entry *entry,
struct pt_regs * const regs,
unsigned long pc, int code, int size);
int oprofile_add_data(struct op_entry *entry, unsigned long val);
+int oprofile_add_data64(struct op_entry *entry, u64 val);
int oprofile_write_commit(struct op_entry *entry);
#endif /* OPROFILE_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index aec3252afcf..ed5d7501e18 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -132,7 +132,7 @@ static inline int page_cache_get_speculative(struct page *page)
{
VM_BUG_ON(in_interrupt());
-#if !defined(CONFIG_SMP) && defined(CONFIG_CLASSIC_RCU)
+#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
# ifdef CONFIG_PREEMPT
VM_BUG_ON(!in_atomic());
# endif
@@ -170,7 +170,7 @@ static inline int page_cache_add_speculative(struct page *page, int count)
{
VM_BUG_ON(in_interrupt());
-#if !defined(CONFIG_SMP) && defined(CONFIG_CLASSIC_RCU)
+#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
# ifdef CONFIG_PREEMPT
VM_BUG_ON(!in_atomic());
# endif
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 73b46b6b904..c8fdcadce43 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -376,6 +376,9 @@
#define PCI_DEVICE_ID_ATI_IXP600_IDE 0x438c
#define PCI_DEVICE_ID_ATI_IXP700_SATA 0x4390
#define PCI_DEVICE_ID_ATI_IXP700_IDE 0x439c
+/* AMD SB Chipset */
+#define PCI_DEVICE_ID_AMD_SB900_IDE 0x780c
+#define PCI_DEVICE_ID_AMD_SB900_SATA_IDE 0x7800
#define PCI_VENDOR_ID_VLSI 0x1004
#define PCI_DEVICE_ID_VLSI_82C592 0x0005
@@ -537,6 +540,7 @@
#define PCI_DEVICE_ID_AMD_8131_BRIDGE 0x7450
#define PCI_DEVICE_ID_AMD_8131_APIC 0x7451
#define PCI_DEVICE_ID_AMD_8132_BRIDGE 0x7458
+#define PCI_DEVICE_ID_AMD_CS5535_IDE 0x208F
#define PCI_DEVICE_ID_AMD_CS5536_ISA 0x2090
#define PCI_DEVICE_ID_AMD_CS5536_FLASH 0x2091
#define PCI_DEVICE_ID_AMD_CS5536_AUDIO 0x2093
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index b53f7006cc4..972f90d7a32 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -216,6 +216,7 @@ struct perf_counter_attr {
#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2)
#define PERF_COUNTER_IOC_RESET _IO ('$', 3)
#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64)
+#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5)
enum perf_counter_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
@@ -415,6 +416,9 @@ enum perf_callchain_context {
PERF_CONTEXT_MAX = (__u64)-4095,
};
+#define PERF_FLAG_FD_NO_GROUP (1U << 0)
+#define PERF_FLAG_FD_OUTPUT (1U << 1)
+
#ifdef __KERNEL__
/*
* Kernel-internal data types and definitions:
@@ -536,6 +540,7 @@ struct perf_counter {
struct list_head sibling_list;
int nr_siblings;
struct perf_counter *group_leader;
+ struct perf_counter *output;
const struct pmu *pmu;
enum perf_counter_active_state state;
@@ -761,6 +766,8 @@ extern int sysctl_perf_counter_mlock;
extern int sysctl_perf_counter_sample_rate;
extern void perf_counter_init(void);
+extern void perf_tpcounter_event(int event_id, u64 addr, u64 count,
+ void *record, int entry_size);
#ifndef perf_misc_flags
#define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
deleted file mode 100644
index bfd92e1e5d2..00000000000
--- a/include/linux/rcuclassic.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion (classic version)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright IBM Corporation, 2001
- *
- * Author: Dipankar Sarma <dipankar@in.ibm.com>
- *
- * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
- * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- * Papers:
- * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
- * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU
- *
- */
-
-#ifndef __LINUX_RCUCLASSIC_H
-#define __LINUX_RCUCLASSIC_H
-
-#include <linux/cache.h>
-#include <linux/spinlock.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/seqlock.h>
-
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rcp->jiffies_stall */
-#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
- long cur; /* Current batch number. */
- long completed; /* Number of the last completed batch */
- long pending; /* Number of the last pending batch */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
- unsigned long gp_start; /* Time at which GP started in jiffies. */
- unsigned long jiffies_stall;
- /* Time at which to check for CPU stalls. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
- int signaled;
-
- spinlock_t lock ____cacheline_internodealigned_in_smp;
- DECLARE_BITMAP(cpumask, NR_CPUS); /* CPUs that need to switch for */
- /* current batch to proceed. */
-} ____cacheline_internodealigned_in_smp;
-
-/* Is batch a before batch b ? */
-static inline int rcu_batch_before(long a, long b)
-{
- return (a - b) < 0;
-}
-
-/* Is batch a after batch b ? */
-static inline int rcu_batch_after(long a, long b)
-{
- return (a - b) > 0;
-}
-
-/* Per-CPU data for Read-Copy UPdate. */
-struct rcu_data {
- /* 1) quiescent state handling : */
- long quiescbatch; /* Batch # for grace period */
- int passed_quiesc; /* User-mode/idle loop etc. */
- int qs_pending; /* core waits for quiesc state */
-
- /* 2) batch handling */
- /*
- * if nxtlist is not NULL, then:
- * batch:
- * The batch # for the last entry of nxtlist
- * [*nxttail[1], NULL = *nxttail[2]):
- * Entries that batch # <= batch
- * [*nxttail[0], *nxttail[1]):
- * Entries that batch # <= batch - 1
- * [nxtlist, *nxttail[0]):
- * Entries that batch # <= batch - 2
- * The grace period for these entries has completed, and
- * the other grace-period-completed entries may be moved
- * here temporarily in rcu_process_callbacks().
- */
- long batch;
- struct rcu_head *nxtlist;
- struct rcu_head **nxttail[3];
- long qlen; /* # of queued callbacks */
- struct rcu_head *donelist;
- struct rcu_head **donetail;
- long blimit; /* Upper limit on a processed batch */
- int cpu;
- struct rcu_head barrier;
-};
-
-/*
- * Increment the quiescent state counter.
- * The counter is a bit degenerated: We do not need to know
- * how many quiescent states passed, just if there was at least
- * one since the start of the grace period. Thus just a flag.
- */
-extern void rcu_qsctr_inc(int cpu);
-extern void rcu_bh_qsctr_inc(int cpu);
-
-extern int rcu_pending(int cpu);
-extern int rcu_needs_cpu(int cpu);
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-extern struct lockdep_map rcu_lock_map;
-# define rcu_read_acquire() \
- lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
-# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_)
-#else
-# define rcu_read_acquire() do { } while (0)
-# define rcu_read_release() do { } while (0)
-#endif
-
-#define __rcu_read_lock() \
- do { \
- preempt_disable(); \
- __acquire(RCU); \
- rcu_read_acquire(); \
- } while (0)
-#define __rcu_read_unlock() \
- do { \
- rcu_read_release(); \
- __release(RCU); \
- preempt_enable(); \
- } while (0)
-#define __rcu_read_lock_bh() \
- do { \
- local_bh_disable(); \
- __acquire(RCU_BH); \
- rcu_read_acquire(); \
- } while (0)
-#define __rcu_read_unlock_bh() \
- do { \
- rcu_read_release(); \
- __release(RCU_BH); \
- local_bh_enable(); \
- } while (0)
-
-#define __synchronize_sched() synchronize_rcu()
-
-#define call_rcu_sched(head, func) call_rcu(head, func)
-
-extern void __rcu_init(void);
-#define rcu_init_sched() do { } while (0)
-extern void rcu_check_callbacks(int cpu, int user);
-extern void rcu_restart_cpu(int cpu);
-
-extern long rcu_batches_completed(void);
-extern long rcu_batches_completed_bh(void);
-
-#define rcu_enter_nohz() do { } while (0)
-#define rcu_exit_nohz() do { } while (0)
-
-/* A context switch is a grace period for rcuclassic. */
-static inline int rcu_blocking_is_gp(void)
-{
- return num_online_cpus() == 1;
-}
-
-#endif /* __LINUX_RCUCLASSIC_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 15fbb3ca634..95e0615f4d7 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -51,18 +51,26 @@ struct rcu_head {
void (*func)(struct rcu_head *head);
};
-/* Internal to kernel, but needed by rcupreempt.h. */
+/* Exported common interfaces */
+extern void synchronize_rcu(void);
+extern void synchronize_rcu_bh(void);
+extern void rcu_barrier(void);
+extern void rcu_barrier_bh(void);
+extern void rcu_barrier_sched(void);
+extern void synchronize_sched_expedited(void);
+extern int sched_expedited_torture_stats(char *page);
+
+/* Internal to kernel */
+extern void rcu_init(void);
+extern void rcu_scheduler_starting(void);
+extern int rcu_needs_cpu(int cpu);
extern int rcu_scheduler_active;
-#if defined(CONFIG_CLASSIC_RCU)
-#include <linux/rcuclassic.h>
-#elif defined(CONFIG_TREE_RCU)
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
#include <linux/rcutree.h>
-#elif defined(CONFIG_PREEMPT_RCU)
-#include <linux/rcupreempt.h>
#else
#error "Unknown RCU implementation specified to kernel configuration"
-#endif /* #else #if defined(CONFIG_CLASSIC_RCU) */
+#endif
#define RCU_HEAD_INIT { .next = NULL, .func = NULL }
#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
@@ -70,6 +78,16 @@ extern int rcu_scheduler_active;
(ptr)->next = NULL; (ptr)->func = NULL; \
} while (0)
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern struct lockdep_map rcu_lock_map;
+# define rcu_read_acquire() \
+ lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
+# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_)
+#else
+# define rcu_read_acquire() do { } while (0)
+# define rcu_read_release() do { } while (0)
+#endif
+
/**
* rcu_read_lock - mark the beginning of an RCU read-side critical section.
*
@@ -99,7 +117,12 @@ extern int rcu_scheduler_active;
*
* It is illegal to block while in an RCU read-side critical section.
*/
-#define rcu_read_lock() __rcu_read_lock()
+static inline void rcu_read_lock(void)
+{
+ __rcu_read_lock();
+ __acquire(RCU);
+ rcu_read_acquire();
+}
/**
* rcu_read_unlock - marks the end of an RCU read-side critical section.
@@ -116,7 +139,12 @@ extern int rcu_scheduler_active;
* used as well. RCU does not care how the writers keep out of each
* others' way, as long as they do so.
*/
-#define rcu_read_unlock() __rcu_read_unlock()
+static inline void rcu_read_unlock(void)
+{
+ rcu_read_release();
+ __release(RCU);
+ __rcu_read_unlock();
+}
/**
* rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section
@@ -129,14 +157,24 @@ extern int rcu_scheduler_active;
* can use just rcu_read_lock().
*
*/
-#define rcu_read_lock_bh() __rcu_read_lock_bh()
+static inline void rcu_read_lock_bh(void)
+{
+ __rcu_read_lock_bh();
+ __acquire(RCU_BH);
+ rcu_read_acquire();
+}
/*
* rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section
*
* See rcu_read_lock_bh() for more information.
*/
-#define rcu_read_unlock_bh() __rcu_read_unlock_bh()
+static inline void rcu_read_unlock_bh(void)
+{
+ rcu_read_release();
+ __release(RCU_BH);
+ __rcu_read_unlock_bh();
+}
/**
* rcu_read_lock_sched - mark the beginning of a RCU-classic critical section
@@ -147,17 +185,34 @@ extern int rcu_scheduler_active;
* - call_rcu_sched() and rcu_barrier_sched()
* on the write-side to insure proper synchronization.
*/
-#define rcu_read_lock_sched() preempt_disable()
-#define rcu_read_lock_sched_notrace() preempt_disable_notrace()
+static inline void rcu_read_lock_sched(void)
+{
+ preempt_disable();
+ __acquire(RCU_SCHED);
+ rcu_read_acquire();
+}
+static inline notrace void rcu_read_lock_sched_notrace(void)
+{
+ preempt_disable_notrace();
+ __acquire(RCU_SCHED);
+}
/*
* rcu_read_unlock_sched - marks the end of a RCU-classic critical section
*
* See rcu_read_lock_sched for more information.
*/
-#define rcu_read_unlock_sched() preempt_enable()
-#define rcu_read_unlock_sched_notrace() preempt_enable_notrace()
-
+static inline void rcu_read_unlock_sched(void)
+{
+ rcu_read_release();
+ __release(RCU_SCHED);
+ preempt_enable();
+}
+static inline notrace void rcu_read_unlock_sched_notrace(void)
+{
+ __release(RCU_SCHED);
+ preempt_enable_notrace();
+}
/**
@@ -259,15 +314,4 @@ extern void call_rcu(struct rcu_head *head,
extern void call_rcu_bh(struct rcu_head *head,
void (*func)(struct rcu_head *head));
-/* Exported common interfaces */
-extern void synchronize_rcu(void);
-extern void rcu_barrier(void);
-extern void rcu_barrier_bh(void);
-extern void rcu_barrier_sched(void);
-
-/* Internal to kernel */
-extern void rcu_init(void);
-extern void rcu_scheduler_starting(void);
-extern int rcu_needs_cpu(int cpu);
-
#endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
deleted file mode 100644
index fce522782ff..00000000000
--- a/include/linux/rcupreempt.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion (RT implementation)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2006
- *
- * Author: Paul McKenney <paulmck@us.ibm.com>
- *
- * Based on the original work by Paul McKenney <paul.mckenney@us.ibm.com>
- * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- * Papers:
- * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
- * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU
- *
- */
-
-#ifndef __LINUX_RCUPREEMPT_H
-#define __LINUX_RCUPREEMPT_H
-
-#include <linux/cache.h>
-#include <linux/spinlock.h>
-#include <linux/threads.h>
-#include <linux/smp.h>
-#include <linux/cpumask.h>
-#include <linux/seqlock.h>
-
-extern void rcu_qsctr_inc(int cpu);
-static inline void rcu_bh_qsctr_inc(int cpu) { }
-
-/*
- * Someone might want to pass call_rcu_bh as a function pointer.
- * So this needs to just be a rename and not a macro function.
- * (no parentheses)
- */
-#define call_rcu_bh call_rcu
-
-/**
- * call_rcu_sched - Queue RCU callback for invocation after sched grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
- *
- * The update function will be invoked some time after a full
- * synchronize_sched()-style grace period elapses, in other words after
- * all currently executing preempt-disabled sections of code (including
- * hardirq handlers, NMI handlers, and local_irq_save() blocks) have
- * completed.
- */
-extern void call_rcu_sched(struct rcu_head *head,
- void (*func)(struct rcu_head *head));
-
-extern void __rcu_read_lock(void) __acquires(RCU);
-extern void __rcu_read_unlock(void) __releases(RCU);
-extern int rcu_pending(int cpu);
-extern int rcu_needs_cpu(int cpu);
-
-#define __rcu_read_lock_bh() { rcu_read_lock(); local_bh_disable(); }
-#define __rcu_read_unlock_bh() { local_bh_enable(); rcu_read_unlock(); }
-
-extern void __synchronize_sched(void);
-
-extern void __rcu_init(void);
-extern void rcu_init_sched(void);
-extern void rcu_check_callbacks(int cpu, int user);
-extern void rcu_restart_cpu(int cpu);
-extern long rcu_batches_completed(void);
-
-/*
- * Return the number of RCU batches processed thus far. Useful for debug
- * and statistic. The _bh variant is identifcal to straight RCU
- */
-static inline long rcu_batches_completed_bh(void)
-{
- return rcu_batches_completed();
-}
-
-#ifdef CONFIG_RCU_TRACE
-struct rcupreempt_trace;
-extern long *rcupreempt_flipctr(int cpu);
-extern long rcupreempt_data_completed(void);
-extern int rcupreempt_flip_flag(int cpu);
-extern int rcupreempt_mb_flag(int cpu);
-extern char *rcupreempt_try_flip_state_name(void);
-extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu);
-#endif
-
-struct softirq_action;
-
-#ifdef CONFIG_NO_HZ
-extern void rcu_enter_nohz(void);
-extern void rcu_exit_nohz(void);
-#else
-# define rcu_enter_nohz() do { } while (0)
-# define rcu_exit_nohz() do { } while (0)
-#endif
-
-/*
- * A context switch is a grace period for rcupreempt synchronize_rcu()
- * only during early boot, before the scheduler has been initialized.
- * So, how the heck do we get a context switch? Well, if the caller
- * invokes synchronize_rcu(), they are willing to accept a context
- * switch, so we simply pretend that one happened.
- *
- * After boot, there might be a blocked or preempted task in an RCU
- * read-side critical section, so we cannot then take the fastpath.
- */
-static inline int rcu_blocking_is_gp(void)
-{
- return num_online_cpus() == 1 && !rcu_scheduler_active;
-}
-
-#endif /* __LINUX_RCUPREEMPT_H */
diff --git a/include/linux/rcupreempt_trace.h b/include/linux/rcupreempt_trace.h
deleted file mode 100644
index b99ae073192..00000000000
--- a/include/linux/rcupreempt_trace.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion (RT implementation)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2006
- *
- * Author: Paul McKenney <paulmck@us.ibm.com>
- *
- * Based on the original work by Paul McKenney <paul.mckenney@us.ibm.com>
- * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- * Papers:
- * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
- * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
- *
- * For detailed explanation of the Preemptible Read-Copy Update mechanism see -
- * http://lwn.net/Articles/253651/
- */
-
-#ifndef __LINUX_RCUPREEMPT_TRACE_H
-#define __LINUX_RCUPREEMPT_TRACE_H
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-
-#include <asm/atomic.h>
-
-/*
- * PREEMPT_RCU data structures.
- */
-
-struct rcupreempt_trace {
- long next_length;
- long next_add;
- long wait_length;
- long wait_add;
- long done_length;
- long done_add;
- long done_remove;
- atomic_t done_invoked;
- long rcu_check_callbacks;
- atomic_t rcu_try_flip_1;
- atomic_t rcu_try_flip_e1;
- long rcu_try_flip_i1;
- long rcu_try_flip_ie1;
- long rcu_try_flip_g1;
- long rcu_try_flip_a1;
- long rcu_try_flip_ae1;
- long rcu_try_flip_a2;
- long rcu_try_flip_z1;
- long rcu_try_flip_ze1;
- long rcu_try_flip_z2;
- long rcu_try_flip_m1;
- long rcu_try_flip_me1;
- long rcu_try_flip_m2;
-};
-
-#ifdef CONFIG_RCU_TRACE
-#define RCU_TRACE(fn, arg) fn(arg);
-#else
-#define RCU_TRACE(fn, arg)
-#endif
-
-extern void rcupreempt_trace_move2done(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_invoke(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_next_add(struct rcupreempt_trace *trace);
-
-#endif /* __LINUX_RCUPREEMPT_TRACE_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 5a5153806c4..a8930771782 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,264 +30,57 @@
#ifndef __LINUX_RCUTREE_H
#define __LINUX_RCUTREE_H
-#include <linux/cache.h>
-#include <linux/spinlock.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/seqlock.h>
+extern void rcu_sched_qs(int cpu);
+extern void rcu_bh_qs(int cpu);
-/*
- * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
- * In theory, it should be possible to add more levels straightforwardly.
- * In practice, this has not been tested, so there is probably some
- * bug somewhere.
- */
-#define MAX_RCU_LVLS 3
-#define RCU_FANOUT (CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
-#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
-
-#if NR_CPUS <= RCU_FANOUT
-# define NUM_RCU_LVLS 1
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 (NR_CPUS)
-# define NUM_RCU_LVL_2 0
-# define NUM_RCU_LVL_3 0
-#elif NR_CPUS <= RCU_FANOUT_SQ
-# define NUM_RCU_LVLS 2
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
-# define NUM_RCU_LVL_2 (NR_CPUS)
-# define NUM_RCU_LVL_3 0
-#elif NR_CPUS <= RCU_FANOUT_CUBE
-# define NUM_RCU_LVLS 3
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
-# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
-# define NUM_RCU_LVL_3 NR_CPUS
-#else
-# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
-#endif /* #if (NR_CPUS) <= RCU_FANOUT */
-
-#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
-#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
-
-/*
- * Dynticks per-CPU state.
- */
-struct rcu_dynticks {
- int dynticks_nesting; /* Track nesting level, sort of. */
- int dynticks; /* Even value for dynticks-idle, else odd. */
- int dynticks_nmi; /* Even value for either dynticks-idle or */
- /* not in nmi handler, else odd. So this */
- /* remains even for nmi from irq handler. */
-};
-
-/*
- * Definition for node within the RCU grace-period-detection hierarchy.
- */
-struct rcu_node {
- spinlock_t lock;
- unsigned long qsmask; /* CPUs or groups that need to switch in */
- /* order for current grace period to proceed.*/
- unsigned long qsmaskinit;
- /* Per-GP initialization for qsmask. */
- unsigned long grpmask; /* Mask to apply to parent qsmask. */
- int grplo; /* lowest-numbered CPU or group here. */
- int grphi; /* highest-numbered CPU or group here. */
- u8 grpnum; /* CPU/group number for next level up. */
- u8 level; /* root is at level 0. */
- struct rcu_node *parent;
-} ____cacheline_internodealigned_in_smp;
-
-/* Index values for nxttail array in struct rcu_data. */
-#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
-#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
-#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
-#define RCU_NEXT_TAIL 3
-#define RCU_NEXT_SIZE 4
-
-/* Per-CPU data for read-copy update. */
-struct rcu_data {
- /* 1) quiescent-state and grace-period handling : */
- long completed; /* Track rsp->completed gp number */
- /* in order to detect GP end. */
- long gpnum; /* Highest gp number that this CPU */
- /* is aware of having started. */
- long passed_quiesc_completed;
- /* Value of completed at time of qs. */
- bool passed_quiesc; /* User-mode/idle loop etc. */
- bool qs_pending; /* Core waits for quiesc state. */
- bool beenonline; /* CPU online at least once. */
- struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
- unsigned long grpmask; /* Mask to apply to leaf qsmask. */
-
- /* 2) batch handling */
- /*
- * If nxtlist is not NULL, it is partitioned as follows.
- * Any of the partitions might be empty, in which case the
- * pointer to that partition will be equal to the pointer for
- * the following partition. When the list is empty, all of
- * the nxttail elements point to nxtlist, which is NULL.
- *
- * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
- * Entries that might have arrived after current GP ended
- * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
- * Entries known to have arrived before current GP ended
- * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
- * Entries that batch # <= ->completed - 1: waiting for current GP
- * [nxtlist, *nxttail[RCU_DONE_TAIL]):
- * Entries that batch # <= ->completed
- * The grace period for these entries has completed, and
- * the other grace-period-completed entries may be moved
- * here temporarily in rcu_process_callbacks().
- */
- struct rcu_head *nxtlist;
- struct rcu_head **nxttail[RCU_NEXT_SIZE];
- long qlen; /* # of queued callbacks */
- long blimit; /* Upper limit on a processed batch */
-
-#ifdef CONFIG_NO_HZ
- /* 3) dynticks interface. */
- struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
- int dynticks_snap; /* Per-GP tracking for dynticks. */
- int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
-#endif /* #ifdef CONFIG_NO_HZ */
-
- /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
-#ifdef CONFIG_NO_HZ
- unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
-#endif /* #ifdef CONFIG_NO_HZ */
- unsigned long offline_fqs; /* Kicked due to being offline. */
- unsigned long resched_ipi; /* Sent a resched IPI. */
-
- /* 5) __rcu_pending() statistics. */
- long n_rcu_pending; /* rcu_pending() calls since boot. */
- long n_rp_qs_pending;
- long n_rp_cb_ready;
- long n_rp_cpu_needs_gp;
- long n_rp_gp_completed;
- long n_rp_gp_started;
- long n_rp_need_fqs;
- long n_rp_need_nothing;
-
- int cpu;
-};
-
-/* Values for signaled field in struct rcu_state. */
-#define RCU_GP_INIT 0 /* Grace period being initialized. */
-#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */
-#define RCU_FORCE_QS 2 /* Need to force quiescent state. */
-#ifdef CONFIG_NO_HZ
-#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
-#else /* #ifdef CONFIG_NO_HZ */
-#define RCU_SIGNAL_INIT RCU_FORCE_QS
-#endif /* #else #ifdef CONFIG_NO_HZ */
-
-#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
-#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
-#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
- /* to take at least one */
- /* scheduling clock irq */
- /* before ratting on them. */
-
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
-/*
- * RCU global state, including node hierarchy. This hierarchy is
- * represented in "heap" form in a dense array. The root (first level)
- * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
- * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
- * and the third level in ->node[m+1] and following (->node[m+1] referenced
- * by ->level[2]). The number of levels is determined by the number of
- * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
- * consisting of a single rcu_node.
- */
-struct rcu_state {
- struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
- struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
- u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
- u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
- struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
-
- /* The following fields are guarded by the root rcu_node's lock. */
-
- u8 signaled ____cacheline_internodealigned_in_smp;
- /* Force QS state. */
- long gpnum; /* Current gp number. */
- long completed; /* # of last completed gp. */
- spinlock_t onofflock; /* exclude on/offline and */
- /* starting new GP. */
- spinlock_t fqslock; /* Only one task forcing */
- /* quiescent states. */
- unsigned long jiffies_force_qs; /* Time at which to invoke */
- /* force_quiescent_state(). */
- unsigned long n_force_qs; /* Number of calls to */
- /* force_quiescent_state(). */
- unsigned long n_force_qs_lh; /* ~Number of calls leaving */
- /* due to lock unavailable. */
- unsigned long n_force_qs_ngp; /* Number of calls leaving */
- /* due to no GP active. */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
- unsigned long gp_start; /* Time at which GP started, */
- /* but in jiffies. */
- unsigned long jiffies_stall; /* Time at which to check */
- /* for CPU stalls. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-#ifdef CONFIG_NO_HZ
- long dynticks_completed; /* Value of completed @ snap. */
-#endif /* #ifdef CONFIG_NO_HZ */
-};
+extern int rcu_needs_cpu(int cpu);
-extern void rcu_qsctr_inc(int cpu);
-extern void rcu_bh_qsctr_inc(int cpu);
+#ifdef CONFIG_TREE_PREEMPT_RCU
-extern int rcu_pending(int cpu);
-extern int rcu_needs_cpu(int cpu);
+extern void __rcu_read_lock(void);
+extern void __rcu_read_unlock(void);
+extern void exit_rcu(void);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-extern struct lockdep_map rcu_lock_map;
-# define rcu_read_acquire() \
- lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
-# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_)
-#else
-# define rcu_read_acquire() do { } while (0)
-# define rcu_read_release() do { } while (0)
-#endif
+#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
static inline void __rcu_read_lock(void)
{
preempt_disable();
- __acquire(RCU);
- rcu_read_acquire();
}
+
static inline void __rcu_read_unlock(void)
{
- rcu_read_release();
- __release(RCU);
preempt_enable();
}
+
+static inline void exit_rcu(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+
static inline void __rcu_read_lock_bh(void)
{
local_bh_disable();
- __acquire(RCU_BH);
- rcu_read_acquire();
}
static inline void __rcu_read_unlock_bh(void)
{
- rcu_read_release();
- __release(RCU_BH);
local_bh_enable();
}
#define __synchronize_sched() synchronize_rcu()
-#define call_rcu_sched(head, func) call_rcu(head, func)
+extern void call_rcu_sched(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu));
-static inline void rcu_init_sched(void)
+static inline void synchronize_rcu_expedited(void)
{
+ synchronize_sched_expedited();
+}
+
+static inline void synchronize_rcu_bh_expedited(void)
+{
+ synchronize_sched_expedited();
}
extern void __rcu_init(void);
@@ -296,6 +89,11 @@ extern void rcu_restart_cpu(int cpu);
extern long rcu_batches_completed(void);
extern long rcu_batches_completed_bh(void);
+extern long rcu_batches_completed_sched(void);
+
+static inline void rcu_init_sched(void)
+{
+}
#ifdef CONFIG_NO_HZ
void rcu_enter_nohz(void);
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 29f8599e6be..5fcc31ed577 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -75,20 +75,6 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event)
}
/*
- * ring_buffer_event_discard can discard any event in the ring buffer.
- * it is up to the caller to protect against a reader from
- * consuming it or a writer from wrapping and replacing it.
- *
- * No external protection is needed if this is called before
- * the event is commited. But in that case it would be better to
- * use ring_buffer_discard_commit.
- *
- * Note, if an event that has not been committed is discarded
- * with ring_buffer_event_discard, it must still be committed.
- */
-void ring_buffer_event_discard(struct ring_buffer_event *event);
-
-/*
* ring_buffer_discard_commit will remove an event that has not
* ben committed yet. If this is used, then ring_buffer_unlock_commit
* must not be called on the discarded event. This function
@@ -154,8 +140,17 @@ unsigned long ring_buffer_size(struct ring_buffer *buffer);
void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
void ring_buffer_reset(struct ring_buffer *buffer);
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
struct ring_buffer *buffer_b, int cpu);
+#else
+static inline int
+ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu)
+{
+ return -ENODEV;
+}
+#endif
int ring_buffer_empty(struct ring_buffer *buffer);
int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu);
@@ -170,7 +165,6 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu);
unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu);
-unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu);
u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu);
void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9304027673b..f3d74bd04d1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -38,6 +38,8 @@
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
+/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
+#define SCHED_RESET_ON_FORK 0x40000000
#ifdef __KERNEL__
@@ -796,18 +798,19 @@ enum cpu_idle_type {
#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE
#ifdef CONFIG_SMP
-#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */
-#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */
-#define SD_BALANCE_EXEC 4 /* Balance on exec */
-#define SD_BALANCE_FORK 8 /* Balance on fork, clone */
-#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */
-#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */
-#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */
-#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */
-#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */
-#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */
-#define SD_SERIALIZE 1024 /* Only a single load balancing instance */
-#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */
+#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
+#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
+#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
+#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
+#define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */
+#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
+#define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */
+#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
+#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */
+#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
+#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
+#define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */
+#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
enum powersavings_balance_level {
POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */
@@ -827,7 +830,7 @@ static inline int sd_balance_for_mc_power(void)
if (sched_smt_power_savings)
return SD_POWERSAVINGS_BALANCE;
- return 0;
+ return SD_PREFER_SIBLING;
}
static inline int sd_balance_for_package_power(void)
@@ -835,7 +838,7 @@ static inline int sd_balance_for_package_power(void)
if (sched_mc_power_savings | sched_smt_power_savings)
return SD_POWERSAVINGS_BALANCE;
- return 0;
+ return SD_PREFER_SIBLING;
}
/*
@@ -857,15 +860,9 @@ struct sched_group {
/*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
- * single CPU. This is read only (except for setup, hotplug CPU).
- * Note : Never change cpu_power without recompute its reciprocal
+ * single CPU.
*/
- unsigned int __cpu_power;
- /*
- * reciprocal value of cpu_power to avoid expensive divides
- * (see include/linux/reciprocal_div.h)
- */
- u32 reciprocal_cpu_power;
+ unsigned int cpu_power;
/*
* The CPUs this group covers.
@@ -918,6 +915,7 @@ struct sched_domain {
unsigned int newidle_idx;
unsigned int wake_idx;
unsigned int forkexec_idx;
+ unsigned int smt_gain;
int flags; /* See SD_* */
enum sched_domain_level level;
@@ -1045,7 +1043,6 @@ struct sched_class {
struct rq *busiest, struct sched_domain *sd,
enum cpu_idle_type idle);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
- int (*needs_post_schedule) (struct rq *this_rq);
void (*post_schedule) (struct rq *this_rq);
void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
@@ -1110,6 +1107,8 @@ struct sched_entity {
u64 wait_max;
u64 wait_count;
u64 wait_sum;
+ u64 iowait_count;
+ u64 iowait_sum;
u64 sleep_start;
u64 sleep_max;
@@ -1163,6 +1162,8 @@ struct sched_rt_entity {
#endif
};
+struct rcu_node;
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
@@ -1206,10 +1207,12 @@ struct task_struct {
unsigned int policy;
cpumask_t cpus_allowed;
-#ifdef CONFIG_PREEMPT_RCU
+#ifdef CONFIG_TREE_PREEMPT_RCU
int rcu_read_lock_nesting;
- int rcu_flipctr_idx;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
+ char rcu_read_unlock_special;
+ struct rcu_node *rcu_blocked_node;
+ struct list_head rcu_node_entry;
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
@@ -1230,11 +1233,19 @@ struct task_struct {
unsigned did_exec:1;
unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */
+ unsigned in_iowait:1;
+
+
+ /* Revert to default priority/policy when forking */
+ unsigned sched_reset_on_fork:1;
+
pid_t pid;
pid_t tgid;
+#ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector gcc feature */
unsigned long stack_canary;
+#endif
/*
* pointers to (original) parent process, youngest child, younger sibling,
@@ -1725,6 +1736,28 @@ extern cputime_t task_gtime(struct task_struct *p);
#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
#define used_math() tsk_used_math(current)
+#ifdef CONFIG_TREE_PREEMPT_RCU
+
+#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
+#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+#define RCU_READ_UNLOCK_GOT_QS (1 << 2) /* CPU has responded to RCU core. */
+
+static inline void rcu_copy_process(struct task_struct *p)
+{
+ p->rcu_read_lock_nesting = 0;
+ p->rcu_read_unlock_special = 0;
+ p->rcu_blocked_node = NULL;
+ INIT_LIST_HEAD(&p->rcu_node_entry);
+}
+
+#else
+
+static inline void rcu_copy_process(struct task_struct *p)
+{
+}
+
+#endif
+
#ifdef CONFIG_SMP
extern int set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask);
@@ -1814,11 +1847,12 @@ extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_shares_ratelimit;
extern unsigned int sysctl_sched_shares_thresh;
-#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_child_runs_first;
+#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
+extern unsigned int sysctl_sched_time_avg;
extern unsigned int sysctl_timer_migration;
int sched_nr_latency_handler(struct ctl_table *table, int write,
@@ -2282,23 +2316,31 @@ static inline int need_resched(void)
* cond_resched_softirq() will enable bhs before scheduling.
*/
extern int _cond_resched(void);
-#ifdef CONFIG_PREEMPT_BKL
-static inline int cond_resched(void)
-{
- return 0;
-}
+
+#define cond_resched() ({ \
+ __might_sleep(__FILE__, __LINE__, 0); \
+ _cond_resched(); \
+})
+
+extern int __cond_resched_lock(spinlock_t *lock);
+
+#ifdef CONFIG_PREEMPT
+#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET
#else
-static inline int cond_resched(void)
-{
- return _cond_resched();
-}
+#define PREEMPT_LOCK_OFFSET 0
#endif
-extern int cond_resched_lock(spinlock_t * lock);
-extern int cond_resched_softirq(void);
-static inline int cond_resched_bkl(void)
-{
- return _cond_resched();
-}
+
+#define cond_resched_lock(lock) ({ \
+ __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
+ __cond_resched_lock(lock); \
+})
+
+extern int __cond_resched_softirq(void);
+
+#define cond_resched_softirq() ({ \
+ __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \
+ __cond_resched_softirq(); \
+})
/*
* Does a critical section need to be broken due to another
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 4be57ab0347..f0ca7a7a175 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -143,15 +143,6 @@ static inline void smp_mb__after_lock(void) { smp_mb(); }
*/
#define spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock)
-/*
- * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
- */
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-# include <linux/spinlock_api_smp.h>
-#else
-# include <linux/spinlock_api_up.h>
-#endif
-
#ifdef CONFIG_DEBUG_SPINLOCK
extern void _raw_spin_lock(spinlock_t *lock);
#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
@@ -268,50 +259,16 @@ static inline void smp_mb__after_lock(void) { smp_mb(); }
#define spin_lock_irq(lock) _spin_lock_irq(lock)
#define spin_lock_bh(lock) _spin_lock_bh(lock)
-
#define read_lock_irq(lock) _read_lock_irq(lock)
#define read_lock_bh(lock) _read_lock_bh(lock)
-
#define write_lock_irq(lock) _write_lock_irq(lock)
#define write_lock_bh(lock) _write_lock_bh(lock)
-
-/*
- * We inline the unlock functions in the nondebug case:
- */
-#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \
- !defined(CONFIG_SMP)
-# define spin_unlock(lock) _spin_unlock(lock)
-# define read_unlock(lock) _read_unlock(lock)
-# define write_unlock(lock) _write_unlock(lock)
-# define spin_unlock_irq(lock) _spin_unlock_irq(lock)
-# define read_unlock_irq(lock) _read_unlock_irq(lock)
-# define write_unlock_irq(lock) _write_unlock_irq(lock)
-#else
-# define spin_unlock(lock) \
- do {__raw_spin_unlock(&(lock)->raw_lock); __release(lock); } while (0)
-# define read_unlock(lock) \
- do {__raw_read_unlock(&(lock)->raw_lock); __release(lock); } while (0)
-# define write_unlock(lock) \
- do {__raw_write_unlock(&(lock)->raw_lock); __release(lock); } while (0)
-# define spin_unlock_irq(lock) \
-do { \
- __raw_spin_unlock(&(lock)->raw_lock); \
- __release(lock); \
- local_irq_enable(); \
-} while (0)
-# define read_unlock_irq(lock) \
-do { \
- __raw_read_unlock(&(lock)->raw_lock); \
- __release(lock); \
- local_irq_enable(); \
-} while (0)
-# define write_unlock_irq(lock) \
-do { \
- __raw_write_unlock(&(lock)->raw_lock); \
- __release(lock); \
- local_irq_enable(); \
-} while (0)
-#endif
+#define spin_unlock(lock) _spin_unlock(lock)
+#define read_unlock(lock) _read_unlock(lock)
+#define write_unlock(lock) _write_unlock(lock)
+#define spin_unlock_irq(lock) _spin_unlock_irq(lock)
+#define read_unlock_irq(lock) _read_unlock_irq(lock)
+#define write_unlock_irq(lock) _write_unlock_irq(lock)
#define spin_unlock_irqrestore(lock, flags) \
do { \
@@ -380,4 +337,13 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
*/
#define spin_can_lock(lock) (!spin_is_locked(lock))
+/*
+ * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
+ */
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+# include <linux/spinlock_api_smp.h>
+#else
+# include <linux/spinlock_api_up.h>
+#endif
+
#endif /* __LINUX_SPINLOCK_H */
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index d79845d034b..7a7e18fc241 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -60,4 +60,398 @@ void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
__releases(lock);
+/*
+ * We inline the unlock functions in the nondebug case:
+ */
+#if !defined(CONFIG_DEBUG_SPINLOCK) && !defined(CONFIG_PREEMPT)
+#define __always_inline__spin_unlock
+#define __always_inline__read_unlock
+#define __always_inline__write_unlock
+#define __always_inline__spin_unlock_irq
+#define __always_inline__read_unlock_irq
+#define __always_inline__write_unlock_irq
+#endif
+
+#ifndef CONFIG_DEBUG_SPINLOCK
+#ifndef CONFIG_GENERIC_LOCKBREAK
+
+#ifdef __always_inline__spin_lock
+#define _spin_lock(lock) __spin_lock(lock)
+#endif
+
+#ifdef __always_inline__read_lock
+#define _read_lock(lock) __read_lock(lock)
+#endif
+
+#ifdef __always_inline__write_lock
+#define _write_lock(lock) __write_lock(lock)
+#endif
+
+#ifdef __always_inline__spin_lock_bh
+#define _spin_lock_bh(lock) __spin_lock_bh(lock)
+#endif
+
+#ifdef __always_inline__read_lock_bh
+#define _read_lock_bh(lock) __read_lock_bh(lock)
+#endif
+
+#ifdef __always_inline__write_lock_bh
+#define _write_lock_bh(lock) __write_lock_bh(lock)
+#endif
+
+#ifdef __always_inline__spin_lock_irq
+#define _spin_lock_irq(lock) __spin_lock_irq(lock)
+#endif
+
+#ifdef __always_inline__read_lock_irq
+#define _read_lock_irq(lock) __read_lock_irq(lock)
+#endif
+
+#ifdef __always_inline__write_lock_irq
+#define _write_lock_irq(lock) __write_lock_irq(lock)
+#endif
+
+#ifdef __always_inline__spin_lock_irqsave
+#define _spin_lock_irqsave(lock) __spin_lock_irqsave(lock)
+#endif
+
+#ifdef __always_inline__read_lock_irqsave
+#define _read_lock_irqsave(lock) __read_lock_irqsave(lock)
+#endif
+
+#ifdef __always_inline__write_lock_irqsave
+#define _write_lock_irqsave(lock) __write_lock_irqsave(lock)
+#endif
+
+#endif /* !CONFIG_GENERIC_LOCKBREAK */
+
+#ifdef __always_inline__spin_trylock
+#define _spin_trylock(lock) __spin_trylock(lock)
+#endif
+
+#ifdef __always_inline__read_trylock
+#define _read_trylock(lock) __read_trylock(lock)
+#endif
+
+#ifdef __always_inline__write_trylock
+#define _write_trylock(lock) __write_trylock(lock)
+#endif
+
+#ifdef __always_inline__spin_trylock_bh
+#define _spin_trylock_bh(lock) __spin_trylock_bh(lock)
+#endif
+
+#ifdef __always_inline__spin_unlock
+#define _spin_unlock(lock) __spin_unlock(lock)
+#endif
+
+#ifdef __always_inline__read_unlock
+#define _read_unlock(lock) __read_unlock(lock)
+#endif
+
+#ifdef __always_inline__write_unlock
+#define _write_unlock(lock) __write_unlock(lock)
+#endif
+
+#ifdef __always_inline__spin_unlock_bh
+#define _spin_unlock_bh(lock) __spin_unlock_bh(lock)
+#endif
+
+#ifdef __always_inline__read_unlock_bh
+#define _read_unlock_bh(lock) __read_unlock_bh(lock)
+#endif
+
+#ifdef __always_inline__write_unlock_bh
+#define _write_unlock_bh(lock) __write_unlock_bh(lock)
+#endif
+
+#ifdef __always_inline__spin_unlock_irq
+#define _spin_unlock_irq(lock) __spin_unlock_irq(lock)
+#endif
+
+#ifdef __always_inline__read_unlock_irq
+#define _read_unlock_irq(lock) __read_unlock_irq(lock)
+#endif
+
+#ifdef __always_inline__write_unlock_irq
+#define _write_unlock_irq(lock) __write_unlock_irq(lock)
+#endif
+
+#ifdef __always_inline__spin_unlock_irqrestore
+#define _spin_unlock_irqrestore(lock, flags) __spin_unlock_irqrestore(lock, flags)
+#endif
+
+#ifdef __always_inline__read_unlock_irqrestore
+#define _read_unlock_irqrestore(lock, flags) __read_unlock_irqrestore(lock, flags)
+#endif
+
+#ifdef __always_inline__write_unlock_irqrestore
+#define _write_unlock_irqrestore(lock, flags) __write_unlock_irqrestore(lock, flags)
+#endif
+
+#endif /* CONFIG_DEBUG_SPINLOCK */
+
+static inline int __spin_trylock(spinlock_t *lock)
+{
+ preempt_disable();
+ if (_raw_spin_trylock(lock)) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ return 1;
+ }
+ preempt_enable();
+ return 0;
+}
+
+static inline int __read_trylock(rwlock_t *lock)
+{
+ preempt_disable();
+ if (_raw_read_trylock(lock)) {
+ rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
+ return 1;
+ }
+ preempt_enable();
+ return 0;
+}
+
+static inline int __write_trylock(rwlock_t *lock)
+{
+ preempt_disable();
+ if (_raw_write_trylock(lock)) {
+ rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ return 1;
+ }
+ preempt_enable();
+ return 0;
+}
+
+/*
+ * If lockdep is enabled then we use the non-preemption spin-ops
+ * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
+ * not re-enabled during lock-acquire (which the preempt-spin-ops do):
+ */
+#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
+
+static inline void __read_lock(rwlock_t *lock)
+{
+ preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+}
+
+static inline unsigned long __spin_lock_irqsave(spinlock_t *lock)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ /*
+ * On lockdep we dont want the hand-coded irq-enable of
+ * _raw_spin_lock_flags() code, because lockdep assumes
+ * that interrupts are not re-enabled during lock-acquire:
+ */
+#ifdef CONFIG_LOCKDEP
+ LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+#else
+ _raw_spin_lock_flags(lock, &flags);
+#endif
+ return flags;
+}
+
+static inline void __spin_lock_irq(spinlock_t *lock)
+{
+ local_irq_disable();
+ preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+}
+
+static inline void __spin_lock_bh(spinlock_t *lock)
+{
+ local_bh_disable();
+ preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+}
+
+static inline unsigned long __read_lock_irqsave(rwlock_t *lock)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
+ _raw_read_lock_flags, &flags);
+ return flags;
+}
+
+static inline void __read_lock_irq(rwlock_t *lock)
+{
+ local_irq_disable();
+ preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+}
+
+static inline void __read_lock_bh(rwlock_t *lock)
+{
+ local_bh_disable();
+ preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+}
+
+static inline unsigned long __write_lock_irqsave(rwlock_t *lock)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
+ _raw_write_lock_flags, &flags);
+ return flags;
+}
+
+static inline void __write_lock_irq(rwlock_t *lock)
+{
+ local_irq_disable();
+ preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+}
+
+static inline void __write_lock_bh(rwlock_t *lock)
+{
+ local_bh_disable();
+ preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+}
+
+static inline void __spin_lock(spinlock_t *lock)
+{
+ preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+}
+
+static inline void __write_lock(rwlock_t *lock)
+{
+ preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+}
+
+#endif /* CONFIG_PREEMPT */
+
+static inline void __spin_unlock(spinlock_t *lock)
+{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_spin_unlock(lock);
+ preempt_enable();
+}
+
+static inline void __write_unlock(rwlock_t *lock)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_write_unlock(lock);
+ preempt_enable();
+}
+
+static inline void __read_unlock(rwlock_t *lock)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_read_unlock(lock);
+ preempt_enable();
+}
+
+static inline void __spin_unlock_irqrestore(spinlock_t *lock,
+ unsigned long flags)
+{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_spin_unlock(lock);
+ local_irq_restore(flags);
+ preempt_enable();
+}
+
+static inline void __spin_unlock_irq(spinlock_t *lock)
+{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_spin_unlock(lock);
+ local_irq_enable();
+ preempt_enable();
+}
+
+static inline void __spin_unlock_bh(spinlock_t *lock)
+{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_spin_unlock(lock);
+ preempt_enable_no_resched();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+}
+
+static inline void __read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_read_unlock(lock);
+ local_irq_restore(flags);
+ preempt_enable();
+}
+
+static inline void __read_unlock_irq(rwlock_t *lock)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_read_unlock(lock);
+ local_irq_enable();
+ preempt_enable();
+}
+
+static inline void __read_unlock_bh(rwlock_t *lock)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_read_unlock(lock);
+ preempt_enable_no_resched();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+}
+
+static inline void __write_unlock_irqrestore(rwlock_t *lock,
+ unsigned long flags)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_write_unlock(lock);
+ local_irq_restore(flags);
+ preempt_enable();
+}
+
+static inline void __write_unlock_irq(rwlock_t *lock)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_write_unlock(lock);
+ local_irq_enable();
+ preempt_enable();
+}
+
+static inline void __write_unlock_bh(rwlock_t *lock)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_write_unlock(lock);
+ preempt_enable_no_resched();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+}
+
+static inline int __spin_trylock_bh(spinlock_t *lock)
+{
+ local_bh_disable();
+ preempt_disable();
+ if (_raw_spin_trylock(lock)) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ return 1;
+ }
+ preempt_enable_no_resched();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+ return 0;
+}
+
#endif /* __LINUX_SPINLOCK_API_SMP_H */
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index cb1a6631b8f..73b1f1cec42 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -14,7 +14,6 @@ struct scatterlist;
*/
#define IO_TLB_SEGSIZE 128
-
/*
* log of the size of each IO TLB slab. The number of slabs is command line
* controllable.
@@ -24,16 +23,6 @@ struct scatterlist;
extern void
swiotlb_init(void);
-extern void *swiotlb_alloc_boot(size_t bytes, unsigned long nslabs);
-extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
-
-extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev,
- phys_addr_t address);
-extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev,
- dma_addr_t address);
-
-extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size);
-
extern void
*swiotlb_alloc_coherent(struct device *hwdev, size_t size,
dma_addr_t *dma_handle, gfp_t flags);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 80de7003d8c..a8e37821cc6 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -64,6 +64,7 @@ struct perf_counter_attr;
#include <linux/sem.h>
#include <asm/siginfo.h>
#include <asm/signal.h>
+#include <linux/unistd.h>
#include <linux/quota.h>
#include <linux/key.h>
#include <trace/syscall.h>
@@ -97,6 +98,53 @@ struct perf_counter_attr;
#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
+#ifdef CONFIG_EVENT_PROFILE
+#define TRACE_SYS_ENTER_PROFILE(sname) \
+static int prof_sysenter_enable_##sname(struct ftrace_event_call *event_call) \
+{ \
+ int ret = 0; \
+ if (!atomic_inc_return(&event_enter_##sname.profile_count)) \
+ ret = reg_prof_syscall_enter("sys"#sname); \
+ return ret; \
+} \
+ \
+static void prof_sysenter_disable_##sname(struct ftrace_event_call *event_call)\
+{ \
+ if (atomic_add_negative(-1, &event_enter_##sname.profile_count)) \
+ unreg_prof_syscall_enter("sys"#sname); \
+}
+
+#define TRACE_SYS_EXIT_PROFILE(sname) \
+static int prof_sysexit_enable_##sname(struct ftrace_event_call *event_call) \
+{ \
+ int ret = 0; \
+ if (!atomic_inc_return(&event_exit_##sname.profile_count)) \
+ ret = reg_prof_syscall_exit("sys"#sname); \
+ return ret; \
+} \
+ \
+static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \
+{ \
+ if (atomic_add_negative(-1, &event_exit_##sname.profile_count)) \
+ unreg_prof_syscall_exit("sys"#sname); \
+}
+
+#define TRACE_SYS_ENTER_PROFILE_INIT(sname) \
+ .profile_count = ATOMIC_INIT(-1), \
+ .profile_enable = prof_sysenter_enable_##sname, \
+ .profile_disable = prof_sysenter_disable_##sname,
+
+#define TRACE_SYS_EXIT_PROFILE_INIT(sname) \
+ .profile_count = ATOMIC_INIT(-1), \
+ .profile_enable = prof_sysexit_enable_##sname, \
+ .profile_disable = prof_sysexit_disable_##sname,
+#else
+#define TRACE_SYS_ENTER_PROFILE(sname)
+#define TRACE_SYS_ENTER_PROFILE_INIT(sname)
+#define TRACE_SYS_EXIT_PROFILE(sname)
+#define TRACE_SYS_EXIT_PROFILE_INIT(sname)
+#endif
+
#ifdef CONFIG_FTRACE_SYSCALLS
#define __SC_STR_ADECL1(t, a) #a
#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__)
@@ -112,7 +160,81 @@ struct perf_counter_attr;
#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__)
#define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__)
+#define SYSCALL_TRACE_ENTER_EVENT(sname) \
+ static struct ftrace_event_call event_enter_##sname; \
+ struct trace_event enter_syscall_print_##sname = { \
+ .trace = print_syscall_enter, \
+ }; \
+ static int init_enter_##sname(void) \
+ { \
+ int num, id; \
+ num = syscall_name_to_nr("sys"#sname); \
+ if (num < 0) \
+ return -ENOSYS; \
+ id = register_ftrace_event(&enter_syscall_print_##sname);\
+ if (!id) \
+ return -ENODEV; \
+ event_enter_##sname.id = id; \
+ set_syscall_enter_id(num, id); \
+ INIT_LIST_HEAD(&event_enter_##sname.fields); \
+ return 0; \
+ } \
+ TRACE_SYS_ENTER_PROFILE(sname); \
+ static struct ftrace_event_call __used \
+ __attribute__((__aligned__(4))) \
+ __attribute__((section("_ftrace_events"))) \
+ event_enter_##sname = { \
+ .name = "sys_enter"#sname, \
+ .system = "syscalls", \
+ .event = &event_syscall_enter, \
+ .raw_init = init_enter_##sname, \
+ .show_format = syscall_enter_format, \
+ .define_fields = syscall_enter_define_fields, \
+ .regfunc = reg_event_syscall_enter, \
+ .unregfunc = unreg_event_syscall_enter, \
+ .data = "sys"#sname, \
+ TRACE_SYS_ENTER_PROFILE_INIT(sname) \
+ }
+
+#define SYSCALL_TRACE_EXIT_EVENT(sname) \
+ static struct ftrace_event_call event_exit_##sname; \
+ struct trace_event exit_syscall_print_##sname = { \
+ .trace = print_syscall_exit, \
+ }; \
+ static int init_exit_##sname(void) \
+ { \
+ int num, id; \
+ num = syscall_name_to_nr("sys"#sname); \
+ if (num < 0) \
+ return -ENOSYS; \
+ id = register_ftrace_event(&exit_syscall_print_##sname);\
+ if (!id) \
+ return -ENODEV; \
+ event_exit_##sname.id = id; \
+ set_syscall_exit_id(num, id); \
+ INIT_LIST_HEAD(&event_exit_##sname.fields); \
+ return 0; \
+ } \
+ TRACE_SYS_EXIT_PROFILE(sname); \
+ static struct ftrace_event_call __used \
+ __attribute__((__aligned__(4))) \
+ __attribute__((section("_ftrace_events"))) \
+ event_exit_##sname = { \
+ .name = "sys_exit"#sname, \
+ .system = "syscalls", \
+ .event = &event_syscall_exit, \
+ .raw_init = init_exit_##sname, \
+ .show_format = syscall_exit_format, \
+ .define_fields = syscall_exit_define_fields, \
+ .regfunc = reg_event_syscall_exit, \
+ .unregfunc = unreg_event_syscall_exit, \
+ .data = "sys"#sname, \
+ TRACE_SYS_EXIT_PROFILE_INIT(sname) \
+ }
+
#define SYSCALL_METADATA(sname, nb) \
+ SYSCALL_TRACE_ENTER_EVENT(sname); \
+ SYSCALL_TRACE_EXIT_EVENT(sname); \
static const struct syscall_metadata __used \
__attribute__((__aligned__(4))) \
__attribute__((section("__syscalls_metadata"))) \
@@ -121,18 +243,23 @@ struct perf_counter_attr;
.nb_args = nb, \
.types = types_##sname, \
.args = args_##sname, \
- }
+ .enter_event = &event_enter_##sname, \
+ .exit_event = &event_exit_##sname, \
+ };
#define SYSCALL_DEFINE0(sname) \
+ SYSCALL_TRACE_ENTER_EVENT(_##sname); \
+ SYSCALL_TRACE_EXIT_EVENT(_##sname); \
static const struct syscall_metadata __used \
__attribute__((__aligned__(4))) \
__attribute__((section("__syscalls_metadata"))) \
__syscall_meta_##sname = { \
.name = "sys_"#sname, \
.nb_args = 0, \
+ .enter_event = &event_enter__##sname, \
+ .exit_event = &event_exit__##sname, \
}; \
asmlinkage long sys_##sname(void)
-
#else
#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void)
#endif
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 7402c1a27c4..85e8cf7d393 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -85,20 +85,29 @@ int arch_update_cpu_topology(void);
#define ARCH_HAS_SCHED_WAKE_IDLE
/* Common values for SMT siblings */
#ifndef SD_SIBLING_INIT
-#define SD_SIBLING_INIT (struct sched_domain) { \
- .min_interval = 1, \
- .max_interval = 2, \
- .busy_factor = 64, \
- .imbalance_pct = 110, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_NEWIDLE \
- | SD_BALANCE_FORK \
- | SD_BALANCE_EXEC \
- | SD_WAKE_AFFINE \
- | SD_WAKE_BALANCE \
- | SD_SHARE_CPUPOWER, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
+#define SD_SIBLING_INIT (struct sched_domain) { \
+ .min_interval = 1, \
+ .max_interval = 2, \
+ .busy_factor = 64, \
+ .imbalance_pct = 110, \
+ \
+ .flags = 1*SD_LOAD_BALANCE \
+ | 1*SD_BALANCE_NEWIDLE \
+ | 1*SD_BALANCE_EXEC \
+ | 1*SD_BALANCE_FORK \
+ | 0*SD_WAKE_IDLE \
+ | 1*SD_WAKE_AFFINE \
+ | 1*SD_WAKE_BALANCE \
+ | 1*SD_SHARE_CPUPOWER \
+ | 0*SD_POWERSAVINGS_BALANCE \
+ | 0*SD_SHARE_PKG_RESOURCES \
+ | 0*SD_SERIALIZE \
+ | 0*SD_WAKE_IDLE_FAR \
+ | 0*SD_PREFER_SIBLING \
+ , \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+ .smt_gain = 1178, /* 15% */ \
}
#endif
#endif /* CONFIG_SCHED_SMT */
@@ -106,69 +115,94 @@ int arch_update_cpu_topology(void);
#ifdef CONFIG_SCHED_MC
/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
#ifndef SD_MC_INIT
-#define SD_MC_INIT (struct sched_domain) { \
- .min_interval = 1, \
- .max_interval = 4, \
- .busy_factor = 64, \
- .imbalance_pct = 125, \
- .cache_nice_tries = 1, \
- .busy_idx = 2, \
- .wake_idx = 1, \
- .forkexec_idx = 1, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_FORK \
- | SD_BALANCE_EXEC \
- | SD_WAKE_AFFINE \
- | SD_WAKE_BALANCE \
- | SD_SHARE_PKG_RESOURCES\
- | sd_balance_for_mc_power()\
- | sd_power_saving_flags(),\
- .last_balance = jiffies, \
- .balance_interval = 1, \
+#define SD_MC_INIT (struct sched_domain) { \
+ .min_interval = 1, \
+ .max_interval = 4, \
+ .busy_factor = 64, \
+ .imbalance_pct = 125, \
+ .cache_nice_tries = 1, \
+ .busy_idx = 2, \
+ .wake_idx = 1, \
+ .forkexec_idx = 1, \
+ \
+ .flags = 1*SD_LOAD_BALANCE \
+ | 1*SD_BALANCE_NEWIDLE \
+ | 1*SD_BALANCE_EXEC \
+ | 1*SD_BALANCE_FORK \
+ | 1*SD_WAKE_IDLE \
+ | 1*SD_WAKE_AFFINE \
+ | 1*SD_WAKE_BALANCE \
+ | 0*SD_SHARE_CPUPOWER \
+ | 1*SD_SHARE_PKG_RESOURCES \
+ | 0*SD_SERIALIZE \
+ | 0*SD_WAKE_IDLE_FAR \
+ | sd_balance_for_mc_power() \
+ | sd_power_saving_flags() \
+ , \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
}
#endif
#endif /* CONFIG_SCHED_MC */
/* Common values for CPUs */
#ifndef SD_CPU_INIT
-#define SD_CPU_INIT (struct sched_domain) { \
- .min_interval = 1, \
- .max_interval = 4, \
- .busy_factor = 64, \
- .imbalance_pct = 125, \
- .cache_nice_tries = 1, \
- .busy_idx = 2, \
- .idle_idx = 1, \
- .newidle_idx = 2, \
- .wake_idx = 1, \
- .forkexec_idx = 1, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_EXEC \
- | SD_BALANCE_FORK \
- | SD_WAKE_AFFINE \
- | SD_WAKE_BALANCE \
- | sd_balance_for_package_power()\
- | sd_power_saving_flags(),\
- .last_balance = jiffies, \
- .balance_interval = 1, \
+#define SD_CPU_INIT (struct sched_domain) { \
+ .min_interval = 1, \
+ .max_interval = 4, \
+ .busy_factor = 64, \
+ .imbalance_pct = 125, \
+ .cache_nice_tries = 1, \
+ .busy_idx = 2, \
+ .idle_idx = 1, \
+ .newidle_idx = 2, \
+ .wake_idx = 1, \
+ .forkexec_idx = 1, \
+ \
+ .flags = 1*SD_LOAD_BALANCE \
+ | 1*SD_BALANCE_NEWIDLE \
+ | 1*SD_BALANCE_EXEC \
+ | 1*SD_BALANCE_FORK \
+ | 1*SD_WAKE_IDLE \
+ | 0*SD_WAKE_AFFINE \
+ | 1*SD_WAKE_BALANCE \
+ | 0*SD_SHARE_CPUPOWER \
+ | 0*SD_SHARE_PKG_RESOURCES \
+ | 0*SD_SERIALIZE \
+ | 0*SD_WAKE_IDLE_FAR \
+ | sd_balance_for_package_power() \
+ | sd_power_saving_flags() \
+ , \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
}
#endif
/* sched_domains SD_ALLNODES_INIT for NUMA machines */
-#define SD_ALLNODES_INIT (struct sched_domain) { \
- .min_interval = 64, \
- .max_interval = 64*num_online_cpus(), \
- .busy_factor = 128, \
- .imbalance_pct = 133, \
- .cache_nice_tries = 1, \
- .busy_idx = 3, \
- .idle_idx = 3, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_NEWIDLE \
- | SD_WAKE_AFFINE \
- | SD_SERIALIZE, \
- .last_balance = jiffies, \
- .balance_interval = 64, \
+#define SD_ALLNODES_INIT (struct sched_domain) { \
+ .min_interval = 64, \
+ .max_interval = 64*num_online_cpus(), \
+ .busy_factor = 128, \
+ .imbalance_pct = 133, \
+ .cache_nice_tries = 1, \
+ .busy_idx = 3, \
+ .idle_idx = 3, \
+ .flags = 1*SD_LOAD_BALANCE \
+ | 1*SD_BALANCE_NEWIDLE \
+ | 0*SD_BALANCE_EXEC \
+ | 0*SD_BALANCE_FORK \
+ | 0*SD_WAKE_IDLE \
+ | 1*SD_WAKE_AFFINE \
+ | 0*SD_WAKE_BALANCE \
+ | 0*SD_SHARE_CPUPOWER \
+ | 0*SD_POWERSAVINGS_BALANCE \
+ | 0*SD_SHARE_PKG_RESOURCES \
+ | 1*SD_SERIALIZE \
+ | 1*SD_WAKE_IDLE_FAR \
+ | 0*SD_PREFER_SIBLING \
+ , \
+ .last_balance = jiffies, \
+ .balance_interval = 64, \
}
#ifdef CONFIG_NUMA
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index b9dc4ca0246..63a3f7a8058 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -23,6 +23,8 @@ struct tracepoint;
struct tracepoint {
const char *name; /* Tracepoint name */
int state; /* State. */
+ void (*regfunc)(void);
+ void (*unregfunc)(void);
void **funcs;
} __attribute__((aligned(32))); /*
* Aligned on 32 bytes because it is
@@ -78,12 +80,16 @@ struct tracepoint {
return tracepoint_probe_unregister(#name, (void *)probe);\
}
-#define DEFINE_TRACE(name) \
+
+#define DEFINE_TRACE_FN(name, reg, unreg) \
static const char __tpstrtab_##name[] \
__attribute__((section("__tracepoints_strings"))) = #name; \
struct tracepoint __tracepoint_##name \
__attribute__((section("__tracepoints"), aligned(32))) = \
- { __tpstrtab_##name, 0, NULL }
+ { __tpstrtab_##name, 0, reg, unreg, NULL }
+
+#define DEFINE_TRACE(name) \
+ DEFINE_TRACE_FN(name, NULL, NULL);
#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \
EXPORT_SYMBOL_GPL(__tracepoint_##name)
@@ -108,6 +114,7 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin,
return -ENOSYS; \
}
+#define DEFINE_TRACE_FN(name, reg, unreg)
#define DEFINE_TRACE(name)
#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
#define EXPORT_TRACEPOINT_SYMBOL(name)
@@ -158,6 +165,15 @@ static inline void tracepoint_synchronize_unregister(void)
#define PARAMS(args...) args
+#endif /* _LINUX_TRACEPOINT_H */
+
+/*
+ * Note: we keep the TRACE_EVENT outside the include file ifdef protection.
+ * This is due to the way trace events work. If a file includes two
+ * trace event headers under one "CREATE_TRACE_POINTS" the first include
+ * will override the TRACE_EVENT and break the second include.
+ */
+
#ifndef TRACE_EVENT
/*
* For use with the TRACE_EVENT macro:
@@ -259,10 +275,15 @@ static inline void tracepoint_synchronize_unregister(void)
* can also by used by generic instrumentation like SystemTap), and
* it is also used to expose a structured trace record in
* /sys/kernel/debug/tracing/events/.
+ *
+ * A set of (un)registration functions can be passed to the variant
+ * TRACE_EVENT_FN to perform any (un)registration work.
*/
#define TRACE_EVENT(name, proto, args, struct, assign, print) \
DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
-#endif
+#define TRACE_EVENT_FN(name, proto, args, struct, \
+ assign, print, reg, unreg) \
+ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
-#endif
+#endif /* ifdef TRACE_EVENT (see note above) */
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index f7a7ae1e8f9..2a4b3bf7403 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -26,6 +26,11 @@
#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
DEFINE_TRACE(name)
+#undef TRACE_EVENT_FN
+#define TRACE_EVENT_FN(name, proto, args, tstruct, \
+ assign, print, reg, unreg) \
+ DEFINE_TRACE_FN(name, reg, unreg)
+
#undef DECLARE_TRACE
#define DECLARE_TRACE(name, proto, args) \
DEFINE_TRACE(name)
@@ -56,6 +61,8 @@
#include <trace/ftrace.h>
#endif
+#undef TRACE_EVENT
+#undef TRACE_EVENT_FN
#undef TRACE_HEADER_MULTI_READ
/* Only undef what we defined in this file */
diff --git a/include/trace/events/module.h b/include/trace/events/module.h
new file mode 100644
index 00000000000..84160fb1847
--- /dev/null
+++ b/include/trace/events/module.h
@@ -0,0 +1,126 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM module
+
+#if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MODULE_H
+
+#include <linux/tracepoint.h>
+
+#ifdef CONFIG_MODULES
+
+struct module;
+
+#define show_module_flags(flags) __print_flags(flags, "", \
+ { (1UL << TAINT_PROPRIETARY_MODULE), "P" }, \
+ { (1UL << TAINT_FORCED_MODULE), "F" }, \
+ { (1UL << TAINT_CRAP), "C" })
+
+TRACE_EVENT(module_load,
+
+ TP_PROTO(struct module *mod),
+
+ TP_ARGS(mod),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, taints )
+ __string( name, mod->name )
+ ),
+
+ TP_fast_assign(
+ __entry->taints = mod->taints;
+ __assign_str(name, mod->name);
+ ),
+
+ TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints))
+);
+
+TRACE_EVENT(module_free,
+
+ TP_PROTO(struct module *mod),
+
+ TP_ARGS(mod),
+
+ TP_STRUCT__entry(
+ __string( name, mod->name )
+ ),
+
+ TP_fast_assign(
+ __assign_str(name, mod->name);
+ ),
+
+ TP_printk("%s", __get_str(name))
+);
+
+TRACE_EVENT(module_get,
+
+ TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
+
+ TP_ARGS(mod, ip, refcnt),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, ip )
+ __field( int, refcnt )
+ __string( name, mod->name )
+ ),
+
+ TP_fast_assign(
+ __entry->ip = ip;
+ __entry->refcnt = refcnt;
+ __assign_str(name, mod->name);
+ ),
+
+ TP_printk("%s call_site=%pf refcnt=%d",
+ __get_str(name), (void *)__entry->ip, __entry->refcnt)
+);
+
+TRACE_EVENT(module_put,
+
+ TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
+
+ TP_ARGS(mod, ip, refcnt),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, ip )
+ __field( int, refcnt )
+ __string( name, mod->name )
+ ),
+
+ TP_fast_assign(
+ __entry->ip = ip;
+ __entry->refcnt = refcnt;
+ __assign_str(name, mod->name);
+ ),
+
+ TP_printk("%s call_site=%pf refcnt=%d",
+ __get_str(name), (void *)__entry->ip, __entry->refcnt)
+);
+
+TRACE_EVENT(module_request,
+
+ TP_PROTO(char *name, bool wait, unsigned long ip),
+
+ TP_ARGS(name, wait, ip),
+
+ TP_STRUCT__entry(
+ __field( bool, wait )
+ __field( unsigned long, ip )
+ __string( name, name )
+ ),
+
+ TP_fast_assign(
+ __entry->wait = wait;
+ __entry->ip = ip;
+ __assign_str(name, name);
+ ),
+
+ TP_printk("%s wait=%d call_site=%pf",
+ __get_str(name), (int)__entry->wait, (void *)__entry->ip)
+);
+
+#endif /* CONFIG_MODULES */
+
+#endif /* _TRACE_MODULE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 8949bb7eb08..b48f1ad7c94 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -94,6 +94,7 @@ TRACE_EVENT(sched_wakeup,
__field( pid_t, pid )
__field( int, prio )
__field( int, success )
+ __field( int, cpu )
),
TP_fast_assign(
@@ -101,11 +102,12 @@ TRACE_EVENT(sched_wakeup,
__entry->pid = p->pid;
__entry->prio = p->prio;
__entry->success = success;
+ __entry->cpu = task_cpu(p);
),
- TP_printk("task %s:%d [%d] success=%d",
+ TP_printk("task %s:%d [%d] success=%d [%03d]",
__entry->comm, __entry->pid, __entry->prio,
- __entry->success)
+ __entry->success, __entry->cpu)
);
/*
@@ -125,6 +127,7 @@ TRACE_EVENT(sched_wakeup_new,
__field( pid_t, pid )
__field( int, prio )
__field( int, success )
+ __field( int, cpu )
),
TP_fast_assign(
@@ -132,11 +135,12 @@ TRACE_EVENT(sched_wakeup_new,
__entry->pid = p->pid;
__entry->prio = p->prio;
__entry->success = success;
+ __entry->cpu = task_cpu(p);
),
- TP_printk("task %s:%d [%d] success=%d",
+ TP_printk("task %s:%d [%d] success=%d [%03d]",
__entry->comm, __entry->pid, __entry->prio,
- __entry->success)
+ __entry->success, __entry->cpu)
);
/*
@@ -340,6 +344,101 @@ TRACE_EVENT(sched_signal_send,
__entry->sig, __entry->comm, __entry->pid)
);
+/*
+ * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
+ * adding sched_stat support to SCHED_FIFO/RR would be welcome.
+ */
+
+/*
+ * Tracepoint for accounting wait time (time the task is runnable
+ * but not actually running due to scheduler contention).
+ */
+TRACE_EVENT(sched_stat_wait,
+
+ TP_PROTO(struct task_struct *tsk, u64 delay),
+
+ TP_ARGS(tsk, delay),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( u64, delay )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->delay = delay;
+ )
+ TP_perf_assign(
+ __perf_count(delay);
+ ),
+
+ TP_printk("task: %s:%d wait: %Lu [ns]",
+ __entry->comm, __entry->pid,
+ (unsigned long long)__entry->delay)
+);
+
+/*
+ * Tracepoint for accounting sleep time (time the task is not runnable,
+ * including iowait, see below).
+ */
+TRACE_EVENT(sched_stat_sleep,
+
+ TP_PROTO(struct task_struct *tsk, u64 delay),
+
+ TP_ARGS(tsk, delay),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( u64, delay )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->delay = delay;
+ )
+ TP_perf_assign(
+ __perf_count(delay);
+ ),
+
+ TP_printk("task: %s:%d sleep: %Lu [ns]",
+ __entry->comm, __entry->pid,
+ (unsigned long long)__entry->delay)
+);
+
+/*
+ * Tracepoint for accounting iowait time (time the task is not runnable
+ * due to waiting on IO to complete).
+ */
+TRACE_EVENT(sched_stat_iowait,
+
+ TP_PROTO(struct task_struct *tsk, u64 delay),
+
+ TP_ARGS(tsk, delay),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( u64, delay )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->delay = delay;
+ )
+ TP_perf_assign(
+ __perf_count(delay);
+ ),
+
+ TP_printk("task: %s:%d iowait: %Lu [ns]",
+ __entry->comm, __entry->pid,
+ (unsigned long long)__entry->delay)
+);
+
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h
new file mode 100644
index 00000000000..397dff2dbd5
--- /dev/null
+++ b/include/trace/events/syscalls.h
@@ -0,0 +1,70 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM syscalls
+
+#if !defined(_TRACE_EVENTS_SYSCALLS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EVENTS_SYSCALLS_H
+
+#include <linux/tracepoint.h>
+
+#include <asm/ptrace.h>
+#include <asm/syscall.h>
+
+
+#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
+
+extern void syscall_regfunc(void);
+extern void syscall_unregfunc(void);
+
+TRACE_EVENT_FN(sys_enter,
+
+ TP_PROTO(struct pt_regs *regs, long id),
+
+ TP_ARGS(regs, id),
+
+ TP_STRUCT__entry(
+ __field( long, id )
+ __array( unsigned long, args, 6 )
+ ),
+
+ TP_fast_assign(
+ __entry->id = id;
+ syscall_get_arguments(current, regs, 0, 6, __entry->args);
+ ),
+
+ TP_printk("NR %ld (%lx, %lx, %lx, %lx, %lx, %lx)",
+ __entry->id,
+ __entry->args[0], __entry->args[1], __entry->args[2],
+ __entry->args[3], __entry->args[4], __entry->args[5]),
+
+ syscall_regfunc, syscall_unregfunc
+);
+
+TRACE_EVENT_FN(sys_exit,
+
+ TP_PROTO(struct pt_regs *regs, long ret),
+
+ TP_ARGS(regs, ret),
+
+ TP_STRUCT__entry(
+ __field( long, id )
+ __field( long, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->id = syscall_get_nr(current, regs);
+ __entry->ret = ret;
+ ),
+
+ TP_printk("NR %ld = %ld",
+ __entry->id, __entry->ret),
+
+ syscall_regfunc, syscall_unregfunc
+);
+
+#endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */
+
+#endif /* _TRACE_EVENTS_SYSCALLS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index f64fbaae781..308bafd9332 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -21,11 +21,14 @@
#undef __field
#define __field(type, item) type item;
+#undef __field_ext
+#define __field_ext(type, item, filter_type) type item;
+
#undef __array
#define __array(type, item, len) type item[len];
#undef __dynamic_array
-#define __dynamic_array(type, item, len) unsigned short __data_loc_##item;
+#define __dynamic_array(type, item, len) u32 __data_loc_##item;
#undef __string
#define __string(item, src) __dynamic_array(char, item, -1)
@@ -42,6 +45,16 @@
}; \
static struct ftrace_event_call event_##name
+#undef __cpparg
+#define __cpparg(arg...) arg
+
+/* Callbacks are meaningless to ftrace. */
+#undef TRACE_EVENT_FN
+#define TRACE_EVENT_FN(name, proto, args, tstruct, \
+ assign, print, reg, unreg) \
+ TRACE_EVENT(name, __cpparg(proto), __cpparg(args), \
+ __cpparg(tstruct), __cpparg(assign), __cpparg(print)) \
+
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
@@ -51,23 +64,27 @@
* Include the following:
*
* struct ftrace_data_offsets_<call> {
- * int <item1>;
- * int <item2>;
+ * u32 <item1>;
+ * u32 <item2>;
* [...]
* };
*
- * The __dynamic_array() macro will create each int <item>, this is
+ * The __dynamic_array() macro will create each u32 <item>, this is
* to keep the offset of each array from the beginning of the event.
+ * The size of an array is also encoded, in the higher 16 bits of <item>.
*/
#undef __field
-#define __field(type, item);
+#define __field(type, item)
+
+#undef __field_ext
+#define __field_ext(type, item, filter_type)
#undef __array
#define __array(type, item, len)
#undef __dynamic_array
-#define __dynamic_array(type, item, len) int item;
+#define __dynamic_array(type, item, len) u32 item;
#undef __string
#define __string(item, src) __dynamic_array(char, item, -1)
@@ -109,6 +126,9 @@
if (!ret) \
return 0;
+#undef __field_ext
+#define __field_ext(type, item, filter_type) __field(type, item)
+
#undef __array
#define __array(type, item, len) \
ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
@@ -120,7 +140,7 @@
#undef __dynamic_array
#define __dynamic_array(type, item, len) \
- ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t" \
+ ret = trace_seq_printf(s, "\tfield:__data_loc " #type "[] " #item ";\t"\
"offset:%u;\tsize:%u;\n", \
(unsigned int)offsetof(typeof(field), \
__data_loc_##item), \
@@ -150,7 +170,8 @@
#undef TRACE_EVENT
#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
static int \
-ftrace_format_##call(struct trace_seq *s) \
+ftrace_format_##call(struct ftrace_event_call *unused, \
+ struct trace_seq *s) \
{ \
struct ftrace_raw_##call field __attribute__((unused)); \
int ret = 0; \
@@ -210,7 +231,7 @@ ftrace_format_##call(struct trace_seq *s) \
#undef __get_dynamic_array
#define __get_dynamic_array(field) \
- ((void *)__entry + __entry->__data_loc_##field)
+ ((void *)__entry + (__entry->__data_loc_##field & 0xffff))
#undef __get_str
#define __get_str(field) (char *)__get_dynamic_array(field)
@@ -263,28 +284,33 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
-#undef __field
-#define __field(type, item) \
+#undef __field_ext
+#define __field_ext(type, item, filter_type) \
ret = trace_define_field(event_call, #type, #item, \
offsetof(typeof(field), item), \
- sizeof(field.item), is_signed_type(type)); \
+ sizeof(field.item), \
+ is_signed_type(type), filter_type); \
if (ret) \
return ret;
+#undef __field
+#define __field(type, item) __field_ext(type, item, FILTER_OTHER)
+
#undef __array
#define __array(type, item, len) \
BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
ret = trace_define_field(event_call, #type "[" #len "]", #item, \
offsetof(typeof(field), item), \
- sizeof(field.item), 0); \
+ sizeof(field.item), 0, FILTER_OTHER); \
if (ret) \
return ret;
#undef __dynamic_array
#define __dynamic_array(type, item, len) \
- ret = trace_define_field(event_call, "__data_loc" "[" #type "]", #item,\
- offsetof(typeof(field), __data_loc_##item), \
- sizeof(field.__data_loc_##item), 0);
+ ret = trace_define_field(event_call, "__data_loc " #type "[]", #item, \
+ offsetof(typeof(field), __data_loc_##item), \
+ sizeof(field.__data_loc_##item), 0, \
+ FILTER_OTHER);
#undef __string
#define __string(item, src) __dynamic_array(char, item, -1)
@@ -292,17 +318,14 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
#undef TRACE_EVENT
#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
int \
-ftrace_define_fields_##call(void) \
+ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
{ \
struct ftrace_raw_##call field; \
- struct ftrace_event_call *event_call = &event_##call; \
int ret; \
\
- __common_field(int, type, 1); \
- __common_field(unsigned char, flags, 0); \
- __common_field(unsigned char, preempt_count, 0); \
- __common_field(int, pid, 1); \
- __common_field(int, tgid, 1); \
+ ret = trace_define_common_fields(event_call); \
+ if (ret) \
+ return ret; \
\
tstruct; \
\
@@ -321,6 +344,9 @@ ftrace_define_fields_##call(void) \
#undef __field
#define __field(type, item)
+#undef __field_ext
+#define __field_ext(type, item, filter_type)
+
#undef __array
#define __array(type, item, len)
@@ -328,6 +354,7 @@ ftrace_define_fields_##call(void) \
#define __dynamic_array(type, item, len) \
__data_offsets->item = __data_size + \
offsetof(typeof(*entry), __data); \
+ __data_offsets->item |= (len * sizeof(type)) << 16; \
__data_size += (len) * sizeof(type);
#undef __string
@@ -433,13 +460,15 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
* {
* struct ring_buffer_event *event;
* struct ftrace_raw_<call> *entry; <-- defined in stage 1
+ * struct ring_buffer *buffer;
* unsigned long irq_flags;
* int pc;
*
* local_save_flags(irq_flags);
* pc = preempt_count();
*
- * event = trace_current_buffer_lock_reserve(event_<call>.id,
+ * event = trace_current_buffer_lock_reserve(&buffer,
+ * event_<call>.id,
* sizeof(struct ftrace_raw_<call>),
* irq_flags, pc);
* if (!event)
@@ -449,7 +478,7 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
* <assign>; <-- Here we assign the entries by the __field and
* __array macros.
*
- * trace_current_buffer_unlock_commit(event, irq_flags, pc);
+ * trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
* }
*
* static int ftrace_raw_reg_event_<call>(void)
@@ -541,6 +570,7 @@ static void ftrace_raw_event_##call(proto) \
struct ftrace_event_call *event_call = &event_##call; \
struct ring_buffer_event *event; \
struct ftrace_raw_##call *entry; \
+ struct ring_buffer *buffer; \
unsigned long irq_flags; \
int __data_size; \
int pc; \
@@ -550,7 +580,8 @@ static void ftrace_raw_event_##call(proto) \
\
__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
\
- event = trace_current_buffer_lock_reserve(event_##call.id, \
+ event = trace_current_buffer_lock_reserve(&buffer, \
+ event_##call.id, \
sizeof(*entry) + __data_size, \
irq_flags, pc); \
if (!event) \
@@ -562,11 +593,12 @@ static void ftrace_raw_event_##call(proto) \
\
{ assign; } \
\
- if (!filter_current_check_discard(event_call, entry, event)) \
- trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
+ if (!filter_current_check_discard(buffer, event_call, entry, event)) \
+ trace_nowake_buffer_unlock_commit(buffer, \
+ event, irq_flags, pc); \
} \
\
-static int ftrace_raw_reg_event_##call(void) \
+static int ftrace_raw_reg_event_##call(void *ptr) \
{ \
int ret; \
\
@@ -577,7 +609,7 @@ static int ftrace_raw_reg_event_##call(void) \
return ret; \
} \
\
-static void ftrace_raw_unreg_event_##call(void) \
+static void ftrace_raw_unreg_event_##call(void *ptr) \
{ \
unregister_trace_##call(ftrace_raw_event_##call); \
} \
@@ -595,7 +627,6 @@ static int ftrace_raw_init_event_##call(void) \
return -ENODEV; \
event_##call.id = id; \
INIT_LIST_HEAD(&event_##call.fields); \
- init_preds(&event_##call); \
return 0; \
} \
\
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 8cfe515cbc4..5dc283ba5ae 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -1,8 +1,13 @@
#ifndef _TRACE_SYSCALL_H
#define _TRACE_SYSCALL_H
+#include <linux/tracepoint.h>
+#include <linux/unistd.h>
+#include <linux/ftrace_event.h>
+
#include <asm/ptrace.h>
+
/*
* A syscall entry in the ftrace syscalls array.
*
@@ -10,26 +15,49 @@
* @nb_args: number of parameters it takes
* @types: list of types as strings
* @args: list of args as strings (args[i] matches types[i])
+ * @enter_id: associated ftrace enter event id
+ * @exit_id: associated ftrace exit event id
+ * @enter_event: associated syscall_enter trace event
+ * @exit_event: associated syscall_exit trace event
*/
struct syscall_metadata {
const char *name;
int nb_args;
const char **types;
const char **args;
+ int enter_id;
+ int exit_id;
+
+ struct ftrace_event_call *enter_event;
+ struct ftrace_event_call *exit_event;
};
#ifdef CONFIG_FTRACE_SYSCALLS
-extern void arch_init_ftrace_syscalls(void);
extern struct syscall_metadata *syscall_nr_to_meta(int nr);
-extern void start_ftrace_syscalls(void);
-extern void stop_ftrace_syscalls(void);
-extern void ftrace_syscall_enter(struct pt_regs *regs);
-extern void ftrace_syscall_exit(struct pt_regs *regs);
-#else
-static inline void start_ftrace_syscalls(void) { }
-static inline void stop_ftrace_syscalls(void) { }
-static inline void ftrace_syscall_enter(struct pt_regs *regs) { }
-static inline void ftrace_syscall_exit(struct pt_regs *regs) { }
+extern int syscall_name_to_nr(char *name);
+void set_syscall_enter_id(int num, int id);
+void set_syscall_exit_id(int num, int id);
+extern struct trace_event event_syscall_enter;
+extern struct trace_event event_syscall_exit;
+extern int reg_event_syscall_enter(void *ptr);
+extern void unreg_event_syscall_enter(void *ptr);
+extern int reg_event_syscall_exit(void *ptr);
+extern void unreg_event_syscall_exit(void *ptr);
+extern int syscall_enter_format(struct ftrace_event_call *call,
+ struct trace_seq *s);
+extern int syscall_exit_format(struct ftrace_event_call *call,
+ struct trace_seq *s);
+extern int syscall_enter_define_fields(struct ftrace_event_call *call);
+extern int syscall_exit_define_fields(struct ftrace_event_call *call);
+enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags);
+enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags);
+#endif
+#ifdef CONFIG_EVENT_PROFILE
+int reg_prof_syscall_enter(char *name);
+void unreg_prof_syscall_enter(char *name);
+int reg_prof_syscall_exit(char *name);
+void unreg_prof_syscall_exit(char *name);
+
#endif
#endif /* _TRACE_SYSCALL_H */
diff --git a/init/Kconfig b/init/Kconfig
index 3f7e60995c8..8e8b76d8a27 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -316,38 +316,28 @@ choice
prompt "RCU Implementation"
default TREE_RCU
-config CLASSIC_RCU
- bool "Classic RCU"
- help
- This option selects the classic RCU implementation that is
- designed for best read-side performance on non-realtime
- systems.
-
- Select this option if you are unsure.
-
config TREE_RCU
bool "Tree-based hierarchical RCU"
help
This option selects the RCU implementation that is
designed for very large SMP system with hundreds or
- thousands of CPUs.
+ thousands of CPUs. It also scales down nicely to
+ smaller systems.
-config PREEMPT_RCU
- bool "Preemptible RCU"
+config TREE_PREEMPT_RCU
+ bool "Preemptable tree-based hierarchical RCU"
depends on PREEMPT
help
- This option reduces the latency of the kernel by making certain
- RCU sections preemptible. Normally RCU code is non-preemptible, if
- this option is selected then read-only RCU sections become
- preemptible. This helps latency, but may expose bugs due to
- now-naive assumptions about each RCU read-side critical section
- remaining on a given CPU through its execution.
+ This option selects the RCU implementation that is
+ designed for very large SMP systems with hundreds or
+ thousands of CPUs, but for which real-time response
+ is also required.
endchoice
config RCU_TRACE
bool "Enable tracing for RCU"
- depends on TREE_RCU || PREEMPT_RCU
+ depends on TREE_RCU || TREE_PREEMPT_RCU
help
This option provides tracing in RCU which presents stats
in debugfs for debugging RCU implementation.
@@ -359,7 +349,7 @@ config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
range 2 64 if 64BIT
range 2 32 if !64BIT
- depends on TREE_RCU
+ depends on TREE_RCU || TREE_PREEMPT_RCU
default 64 if 64BIT
default 32 if !64BIT
help
@@ -374,7 +364,7 @@ config RCU_FANOUT
config RCU_FANOUT_EXACT
bool "Disable tree-based hierarchical RCU auto-balancing"
- depends on TREE_RCU
+ depends on TREE_RCU || TREE_PREEMPT_RCU
default n
help
This option forces use of the exact RCU_FANOUT value specified,
@@ -387,18 +377,12 @@ config RCU_FANOUT_EXACT
Say N if unsure.
config TREE_RCU_TRACE
- def_bool RCU_TRACE && TREE_RCU
- select DEBUG_FS
- help
- This option provides tracing for the TREE_RCU implementation,
- permitting Makefile to trivially select kernel/rcutree_trace.c.
-
-config PREEMPT_RCU_TRACE
- def_bool RCU_TRACE && PREEMPT_RCU
+ def_bool RCU_TRACE && ( TREE_RCU || TREE_PREEMPT_RCU )
select DEBUG_FS
help
- This option provides tracing for the PREEMPT_RCU implementation,
- permitting Makefile to trivially select kernel/rcupreempt_trace.c.
+ This option provides tracing for the TREE_RCU and
+ TREE_PREEMPT_RCU implementations, permitting Makefile to
+ trivially select kernel/rcutree_trace.c.
endmenu # "RCU Subsystem"
diff --git a/init/main.c b/init/main.c
index 11f4f145be3..b34fd8e5ede 100644
--- a/init/main.c
+++ b/init/main.c
@@ -451,6 +451,7 @@ static noinline void __init_refok rest_init(void)
{
int pid;
+ rcu_scheduler_starting();
kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
@@ -462,7 +463,6 @@ static noinline void __init_refok rest_init(void)
* at least once to get things moving:
*/
init_idle_bootup_task(current);
- rcu_scheduler_starting();
preempt_enable_no_resched();
schedule();
preempt_disable();
@@ -631,7 +631,6 @@ asmlinkage void __init start_kernel(void)
softirq_init();
timekeeping_init();
time_init();
- sched_clock_init();
profile_init();
if (!irqs_disabled())
printk(KERN_CRIT "start_kernel(): bug: interrupts were "
@@ -682,6 +681,7 @@ asmlinkage void __init start_kernel(void)
numa_policy_init();
if (late_time_init)
late_time_init();
+ sched_clock_init();
calibrate_delay();
pidmap_init();
anon_vma_init();
diff --git a/kernel/Makefile b/kernel/Makefile
index 2093a691f1c..b833bd5cc12 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -80,11 +80,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
-obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
obj-$(CONFIG_TREE_RCU) += rcutree.o
-obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
+obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
-obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/exit.c b/kernel/exit.c
index c98ff7a8025..ae5d8660ddf 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1014,6 +1014,7 @@ NORET_TYPE void do_exit(long code)
validate_creds_for_do_exit(tsk);
preempt_disable();
+ exit_rcu();
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
schedule();
diff --git a/kernel/fork.c b/kernel/fork.c
index aab8579c609..bfee931ee3f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1007,10 +1007,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
copy_flags(clone_flags, p);
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
-#ifdef CONFIG_PREEMPT_RCU
- p->rcu_read_lock_nesting = 0;
- p->rcu_flipctr_idx = 0;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
+ rcu_copy_process(p);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
diff --git a/kernel/futex.c b/kernel/futex.c
index e18cfbdc719..248dd119a86 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -115,6 +115,9 @@ struct futex_q {
/* rt_waiter storage for requeue_pi: */
struct rt_mutex_waiter *rt_waiter;
+ /* The expected requeue pi target futex key: */
+ union futex_key *requeue_pi_key;
+
/* Bitset for the optional bitmasked wakeup */
u32 bitset;
};
@@ -1089,6 +1092,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
if (!top_waiter)
return 0;
+ /* Ensure we requeue to the expected futex. */
+ if (!match_futex(top_waiter->requeue_pi_key, key2))
+ return -EINVAL;
+
/*
* Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
* the contended case or if set_waiters is 1. The pi_state is returned
@@ -1276,6 +1283,12 @@ retry_private:
continue;
}
+ /* Ensure we requeue to the expected futex for requeue_pi. */
+ if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
+ ret = -EINVAL;
+ break;
+ }
+
/*
* Requeue nr_requeue waiters and possibly one more in the case
* of requeue_pi if we couldn't acquire the lock atomically.
@@ -1751,6 +1764,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
q.pi_state = NULL;
q.bitset = bitset;
q.rt_waiter = NULL;
+ q.requeue_pi_key = NULL;
if (abs_time) {
to = &timeout;
@@ -1858,6 +1872,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
q.pi_state = NULL;
q.rt_waiter = NULL;
+ q.requeue_pi_key = NULL;
retry:
q.key = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -2118,11 +2133,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* We call schedule in futex_wait_queue_me() when we enqueue and return there
* via the following:
* 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
- * 2) wakeup on uaddr2 after a requeue and subsequent unlock
- * 3) signal (before or after requeue)
- * 4) timeout (before or after requeue)
+ * 2) wakeup on uaddr2 after a requeue
+ * 3) signal
+ * 4) timeout
*
- * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
+ * If 3, cleanup and return -ERESTARTNOINTR.
*
* If 2, we may then block on trying to take the rt_mutex and return via:
* 5) successful lock
@@ -2130,7 +2145,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* 7) timeout
* 8) other lock acquisition failure
*
- * If 6, we setup a restart_block with futex_lock_pi() as the function.
+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
*
* If 4 or 7, we cleanup and return with -ETIMEDOUT.
*
@@ -2169,15 +2184,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
debug_rt_mutex_init_waiter(&rt_waiter);
rt_waiter.task = NULL;
- q.pi_state = NULL;
- q.bitset = bitset;
- q.rt_waiter = &rt_waiter;
-
key2 = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
+ q.pi_state = NULL;
+ q.bitset = bitset;
+ q.rt_waiter = &rt_waiter;
+ q.requeue_pi_key = &key2;
+
/* Prepare to wait on uaddr. */
ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
if (ret)
@@ -2248,14 +2264,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
rt_mutex_unlock(pi_mutex);
} else if (ret == -EINTR) {
/*
- * We've already been requeued, but we have no way to
- * restart by calling futex_lock_pi() directly. We
- * could restart the syscall, but that will look at
- * the user space value and return right away. So we
- * drop back with EWOULDBLOCK to tell user space that
- * "val" has been changed. That's the same what the
- * restart of the syscall would do in
- * futex_wait_setup().
+ * We've already been requeued, but cannot restart by calling
+ * futex_lock_pi() directly. We could restart this syscall, but
+ * it would detect that the user space "val" changed and return
+ * -EWOULDBLOCK. Save the overhead of the restart and return
+ * -EWOULDBLOCK directly.
*/
ret = -EWOULDBLOCK;
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 13c68e71b72..c1660194d11 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -222,6 +222,34 @@ int set_irq_chip_data(unsigned int irq, void *data)
}
EXPORT_SYMBOL(set_irq_chip_data);
+/**
+ * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
+ *
+ * @irq: Interrupt number
+ * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
+ *
+ * The IRQ_NESTED_THREAD flag indicates that on
+ * request_threaded_irq() no separate interrupt thread should be
+ * created for the irq as the handler are called nested in the
+ * context of a demultiplexing interrupt handler thread.
+ */
+void set_irq_nested_thread(unsigned int irq, int nest)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc)
+ return;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ if (nest)
+ desc->status |= IRQ_NESTED_THREAD;
+ else
+ desc->status &= ~IRQ_NESTED_THREAD;
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+EXPORT_SYMBOL_GPL(set_irq_nested_thread);
+
/*
* default enable function
*/
@@ -299,6 +327,45 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
}
}
+/*
+ * handle_nested_irq - Handle a nested irq from a irq thread
+ * @irq: the interrupt number
+ *
+ * Handle interrupts which are nested into a threaded interrupt
+ * handler. The handler function is called inside the calling
+ * threads context.
+ */
+void handle_nested_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ irqreturn_t action_ret;
+
+ might_sleep();
+
+ spin_lock_irq(&desc->lock);
+
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ action = desc->action;
+ if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+ goto out_unlock;
+
+ desc->status |= IRQ_INPROGRESS;
+ spin_unlock_irq(&desc->lock);
+
+ action_ret = action->thread_fn(action->irq, action->dev_id);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+
+ spin_lock_irq(&desc->lock);
+ desc->status &= ~IRQ_INPROGRESS;
+
+out_unlock:
+ spin_unlock_irq(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_nested_irq);
+
/**
* handle_simple_irq - Simple and software-decoded IRQs.
* @irq: the interrupt number
@@ -382,7 +449,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
spin_lock(&desc->lock);
desc->status &= ~IRQ_INPROGRESS;
- if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
+
+ if (unlikely(desc->status & IRQ_ONESHOT))
+ desc->status |= IRQ_MASKED;
+ else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
desc->chip->unmask(irq);
out_unlock:
spin_unlock(&desc->lock);
@@ -572,6 +642,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
desc->chip = &dummy_irq_chip;
}
+ chip_bus_lock(irq, desc);
spin_lock_irqsave(&desc->lock, flags);
/* Uninstall? */
@@ -591,6 +662,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
desc->chip->startup(irq);
}
spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(irq, desc);
}
EXPORT_SYMBOL_GPL(__set_irq_handler);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 065205bdd92..a81cf80554d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -161,7 +161,7 @@ int __init early_irq_init(void)
desc = irq_desc_legacy;
legacy_count = ARRAY_SIZE(irq_desc_legacy);
- node = first_online_node;
+ node = first_online_node;
/* allocate irq_desc_ptrs array based on nr_irqs */
irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
@@ -172,6 +172,9 @@ int __init early_irq_init(void)
for (i = 0; i < legacy_count; i++) {
desc[i].irq = i;
+#ifdef CONFIG_SMP
+ desc[i].node = node;
+#endif
desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
alloc_desc_masks(&desc[i], node, true);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e70ed5592eb..1b5d742c6a7 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -44,6 +44,19 @@ extern int irq_select_affinity_usr(unsigned int irq);
extern void irq_set_thread_affinity(struct irq_desc *desc);
+/* Inline functions for support of irq chips on slow busses */
+static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
+{
+ if (unlikely(desc->chip->bus_lock))
+ desc->chip->bus_lock(irq);
+}
+
+static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
+{
+ if (unlikely(desc->chip->bus_sync_unlock))
+ desc->chip->bus_sync_unlock(irq);
+}
+
/*
* Debugging printout:
*/
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0ec9ed83173..bde4c667d24 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -230,9 +230,11 @@ void disable_irq_nosync(unsigned int irq)
if (!desc)
return;
+ chip_bus_lock(irq, desc);
spin_lock_irqsave(&desc->lock, flags);
__disable_irq(desc, irq, false);
spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(irq, desc);
}
EXPORT_SYMBOL(disable_irq_nosync);
@@ -294,7 +296,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
* matches the last disable, processing of interrupts on this
* IRQ line is re-enabled.
*
- * This function may be called from IRQ context.
+ * This function may be called from IRQ context only when
+ * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
*/
void enable_irq(unsigned int irq)
{
@@ -304,9 +307,11 @@ void enable_irq(unsigned int irq)
if (!desc)
return;
+ chip_bus_lock(irq, desc);
spin_lock_irqsave(&desc->lock, flags);
__enable_irq(desc, irq, false);
spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(irq, desc);
}
EXPORT_SYMBOL(enable_irq);
@@ -436,6 +441,26 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
return ret;
}
+/*
+ * Default primary interrupt handler for threaded interrupts. Is
+ * assigned as primary handler when request_threaded_irq is called
+ * with handler == NULL. Useful for oneshot interrupts.
+ */
+static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
+{
+ return IRQ_WAKE_THREAD;
+}
+
+/*
+ * Primary handler for nested threaded interrupts. Should never be
+ * called.
+ */
+static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
+{
+ WARN(1, "Primary handler called for nested irq %d\n", irq);
+ return IRQ_NONE;
+}
+
static int irq_wait_for_interrupt(struct irqaction *action)
{
while (!kthread_should_stop()) {
@@ -451,6 +476,23 @@ static int irq_wait_for_interrupt(struct irqaction *action)
return -1;
}
+/*
+ * Oneshot interrupts keep the irq line masked until the threaded
+ * handler finished. unmask if the interrupt has not been disabled and
+ * is marked MASKED.
+ */
+static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
+{
+ chip_bus_lock(irq, desc);
+ spin_lock_irq(&desc->lock);
+ if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
+ desc->status &= ~IRQ_MASKED;
+ desc->chip->unmask(irq);
+ }
+ spin_unlock_irq(&desc->lock);
+ chip_bus_sync_unlock(irq, desc);
+}
+
#ifdef CONFIG_SMP
/*
* Check whether we need to change the affinity of the interrupt thread.
@@ -492,7 +534,7 @@ static int irq_thread(void *data)
struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
struct irqaction *action = data;
struct irq_desc *desc = irq_to_desc(action->irq);
- int wake;
+ int wake, oneshot = desc->status & IRQ_ONESHOT;
sched_setscheduler(current, SCHED_FIFO, &param);
current->irqaction = action;
@@ -518,6 +560,9 @@ static int irq_thread(void *data)
spin_unlock_irq(&desc->lock);
action->thread_fn(action->irq, action->dev_id);
+
+ if (oneshot)
+ irq_finalize_oneshot(action->irq, desc);
}
wake = atomic_dec_and_test(&desc->threads_active);
@@ -565,7 +610,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
struct irqaction *old, **old_ptr;
const char *old_name = NULL;
unsigned long flags;
- int shared = 0;
+ int nested, shared = 0;
int ret;
if (!desc)
@@ -590,10 +635,32 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
rand_initialize_irq(irq);
}
+ /* Oneshot interrupts are not allowed with shared */
+ if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
+ return -EINVAL;
+
+ /*
+ * Check whether the interrupt nests into another interrupt
+ * thread.
+ */
+ nested = desc->status & IRQ_NESTED_THREAD;
+ if (nested) {
+ if (!new->thread_fn)
+ return -EINVAL;
+ /*
+ * Replace the primary handler which was provided from
+ * the driver for non nested interrupt handling by the
+ * dummy function which warns when called.
+ */
+ new->handler = irq_nested_primary_handler;
+ }
+
/*
- * Threaded handler ?
+ * Create a handler thread when a thread function is supplied
+ * and the interrupt does not nest into another interrupt
+ * thread.
*/
- if (new->thread_fn) {
+ if (new->thread_fn && !nested) {
struct task_struct *t;
t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
@@ -662,9 +729,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
desc->status |= IRQ_PER_CPU;
#endif
- desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
+ desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
+ if (new->flags & IRQF_ONESHOT)
+ desc->status |= IRQ_ONESHOT;
+
if (!(desc->status & IRQ_NOAUTOEN)) {
desc->depth = 0;
desc->status &= ~IRQ_DISABLED;
@@ -875,7 +945,14 @@ EXPORT_SYMBOL_GPL(remove_irq);
*/
void free_irq(unsigned int irq, void *dev_id)
{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc)
+ return;
+
+ chip_bus_lock(irq, desc);
kfree(__free_irq(irq, dev_id));
+ chip_bus_sync_unlock(irq, desc);
}
EXPORT_SYMBOL(free_irq);
@@ -884,6 +961,8 @@ EXPORT_SYMBOL(free_irq);
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs.
* Primary handler for threaded interrupts
+ * If NULL and thread_fn != NULL the default
+ * primary handler is installed
* @thread_fn: Function called from the irq handler thread
* If NULL, no irq thread is created
* @irqflags: Interrupt type flags
@@ -963,8 +1042,12 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
if (desc->status & IRQ_NOREQUEST)
return -EINVAL;
- if (!handler)
- return -EINVAL;
+
+ if (!handler) {
+ if (!thread_fn)
+ return -EINVAL;
+ handler = irq_default_primary_handler;
+ }
action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
if (!action)
@@ -976,7 +1059,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
action->name = devname;
action->dev_id = dev_id;
+ chip_bus_lock(irq, desc);
retval = __setup_irq(irq, desc, action);
+ chip_bus_sync_unlock(irq, desc);
+
if (retval)
kfree(action);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 638d8bedec1..a0bb09e7986 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -15,10 +15,10 @@
/**
* suspend_device_irqs - disable all currently enabled interrupt lines
*
- * During system-wide suspend or hibernation device interrupts need to be
- * disabled at the chip level and this function is provided for this purpose.
- * It disables all interrupt lines that are enabled at the moment and sets the
- * IRQ_SUSPENDED flag for them.
+ * During system-wide suspend or hibernation device drivers need to be prevented
+ * from receiving interrupts and this function is provided for this purpose.
+ * It marks all interrupt lines in use, except for the timer ones, as disabled
+ * and sets the IRQ_SUSPENDED flag for each of them.
*/
void suspend_device_irqs(void)
{
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 89c7117acf2..090c3763f3a 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -70,8 +70,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
- if (!desc->chip || !desc->chip->retrigger ||
- !desc->chip->retrigger(irq)) {
+ if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
#ifdef CONFIG_HARDIRQS_SW_RESEND
/* Set it pending and activate the softirq: */
set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 4d568294de3..114e704760f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -297,7 +297,6 @@ static int __init irqfixup_setup(char *str)
__setup("irqfixup", irqfixup_setup);
module_param(irqfixup, int, 0644);
-MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
static int __init irqpoll_setup(char *str)
{
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 4e8cae2e914..9fcb53a11f8 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -37,6 +37,8 @@
#include <linux/suspend.h>
#include <asm/uaccess.h>
+#include <trace/events/module.h>
+
extern int max_threads;
static struct workqueue_struct *khelper_wq;
@@ -112,6 +114,8 @@ int __request_module(bool wait, const char *fmt, ...)
return -ENOMEM;
}
+ trace_module_request(module_name, wait, _RET_IP_);
+
ret = call_usermodehelper(modprobe_path, argv, envp,
wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
atomic_dec(&kmod_concurrent);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0540948e29a..ef177d653b2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
struct kprobe_insn_page {
- struct hlist_node hlist;
+ struct list_head list;
kprobe_opcode_t *insns; /* Page of instruction slots */
char slot_used[INSNS_PER_PAGE];
int nused;
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
};
static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
-static struct hlist_head kprobe_insn_pages;
+static LIST_HEAD(kprobe_insn_pages);
static int kprobe_garbage_slots;
static int collect_garbage_slots(void);
@@ -152,10 +152,9 @@ loop_end:
static kprobe_opcode_t __kprobes *__get_insn_slot(void)
{
struct kprobe_insn_page *kip;
- struct hlist_node *pos;
retry:
- hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
+ list_for_each_entry(kip, &kprobe_insn_pages, list) {
if (kip->nused < INSNS_PER_PAGE) {
int i;
for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
kfree(kip);
return NULL;
}
- INIT_HLIST_NODE(&kip->hlist);
- hlist_add_head(&kip->hlist, &kprobe_insn_pages);
+ INIT_LIST_HEAD(&kip->list);
+ list_add(&kip->list, &kprobe_insn_pages);
memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
kip->slot_used[0] = SLOT_USED;
kip->nused = 1;
@@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
* so as not to have to set it up again the
* next time somebody inserts a probe.
*/
- hlist_del(&kip->hlist);
- if (hlist_empty(&kprobe_insn_pages)) {
- INIT_HLIST_NODE(&kip->hlist);
- hlist_add_head(&kip->hlist,
- &kprobe_insn_pages);
- } else {
+ if (!list_is_singular(&kprobe_insn_pages)) {
+ list_del(&kip->list);
module_free(NULL, kip->insns);
kfree(kip);
}
@@ -235,14 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
static int __kprobes collect_garbage_slots(void)
{
- struct kprobe_insn_page *kip;
- struct hlist_node *pos, *next;
+ struct kprobe_insn_page *kip, *next;
/* Ensure no-one is preepmted on the garbages */
if (check_safety())
return -EAGAIN;
- hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
+ list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
int i;
if (kip->ngarbage == 0)
continue;
@@ -260,19 +254,17 @@ static int __kprobes collect_garbage_slots(void)
void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
{
struct kprobe_insn_page *kip;
- struct hlist_node *pos;
mutex_lock(&kprobe_insn_mutex);
- hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
+ list_for_each_entry(kip, &kprobe_insn_pages, list) {
if (kip->insns <= slot &&
slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
int i = (slot - kip->insns) / MAX_INSN_SIZE;
if (dirty) {
kip->slot_used[i] = SLOT_DIRTY;
kip->ngarbage++;
- } else {
+ } else
collect_one_slot(kip, i);
- }
break;
}
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index eb8751aa041..5fe709982ca 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,8 +16,6 @@
#include <linux/mutex.h>
#include <trace/events/sched.h>
-#define KTHREAD_NICE_LEVEL (-5)
-
static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
* The kernel thread should not inherit these properties.
*/
sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
- set_user_nice(create.result, KTHREAD_NICE_LEVEL);
set_cpus_allowed_ptr(create.result, cpu_all_mask);
}
return create.result;
@@ -221,7 +218,6 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
- set_user_nice(tsk, KTHREAD_NICE_LEVEL);
set_cpus_allowed_ptr(tsk, cpu_all_mask);
set_mems_allowed(node_possible_map);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8bbeef996c7..f74d2d7aa60 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,6 +42,7 @@
#include <linux/hash.h>
#include <linux/ftrace.h>
#include <linux/stringify.h>
+#include <linux/bitops.h>
#include <asm/sections.h>
@@ -366,11 +367,21 @@ static int save_trace(struct stack_trace *trace)
save_stack_trace(trace);
+ /*
+ * Some daft arches put -1 at the end to indicate its a full trace.
+ *
+ * <rant> this is buggy anyway, since it takes a whole extra entry so a
+ * complete trace that maxes out the entries provided will be reported
+ * as incomplete, friggin useless </rant>
+ */
+ if (trace->entries[trace->nr_entries-1] == ULONG_MAX)
+ trace->nr_entries--;
+
trace->max_entries = trace->nr_entries;
nr_stack_trace_entries += trace->nr_entries;
- if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
+ if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
if (!debug_locks_off_graph_unlock())
return 0;
@@ -388,20 +399,6 @@ unsigned int nr_hardirq_chains;
unsigned int nr_softirq_chains;
unsigned int nr_process_chains;
unsigned int max_lockdep_depth;
-unsigned int max_recursion_depth;
-
-static unsigned int lockdep_dependency_gen_id;
-
-static bool lockdep_dependency_visit(struct lock_class *source,
- unsigned int depth)
-{
- if (!depth)
- lockdep_dependency_gen_id++;
- if (source->dep_gen_id == lockdep_dependency_gen_id)
- return true;
- source->dep_gen_id = lockdep_dependency_gen_id;
- return false;
-}
#ifdef CONFIG_DEBUG_LOCKDEP
/*
@@ -431,11 +428,8 @@ atomic_t redundant_softirqs_on;
atomic_t redundant_softirqs_off;
atomic_t nr_unused_locks;
atomic_t nr_cyclic_checks;
-atomic_t nr_cyclic_check_recursions;
atomic_t nr_find_usage_forwards_checks;
-atomic_t nr_find_usage_forwards_recursions;
atomic_t nr_find_usage_backwards_checks;
-atomic_t nr_find_usage_backwards_recursions;
#endif
/*
@@ -551,58 +545,6 @@ static void lockdep_print_held_locks(struct task_struct *curr)
}
}
-static void print_lock_class_header(struct lock_class *class, int depth)
-{
- int bit;
-
- printk("%*s->", depth, "");
- print_lock_name(class);
- printk(" ops: %lu", class->ops);
- printk(" {\n");
-
- for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
- if (class->usage_mask & (1 << bit)) {
- int len = depth;
-
- len += printk("%*s %s", depth, "", usage_str[bit]);
- len += printk(" at:\n");
- print_stack_trace(class->usage_traces + bit, len);
- }
- }
- printk("%*s }\n", depth, "");
-
- printk("%*s ... key at: ",depth,"");
- print_ip_sym((unsigned long)class->key);
-}
-
-/*
- * printk all lock dependencies starting at <entry>:
- */
-static void __used
-print_lock_dependencies(struct lock_class *class, int depth)
-{
- struct lock_list *entry;
-
- if (lockdep_dependency_visit(class, depth))
- return;
-
- if (DEBUG_LOCKS_WARN_ON(depth >= 20))
- return;
-
- print_lock_class_header(class, depth);
-
- list_for_each_entry(entry, &class->locks_after, entry) {
- if (DEBUG_LOCKS_WARN_ON(!entry->class))
- return;
-
- print_lock_dependencies(entry->class, depth + 1);
-
- printk("%*s ... acquired at:\n",depth,"");
- print_stack_trace(&entry->trace, 2);
- printk("\n");
- }
-}
-
static void print_kernel_version(void)
{
printk("%s %.*s\n", init_utsname()->release,
@@ -898,22 +840,203 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
}
/*
+ * For good efficiency of modular, we use power of 2
+ */
+#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
+#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
+
+/*
+ * The circular_queue and helpers is used to implement the
+ * breadth-first search(BFS)algorithem, by which we can build
+ * the shortest path from the next lock to be acquired to the
+ * previous held lock if there is a circular between them.
+ */
+struct circular_queue {
+ unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
+ unsigned int front, rear;
+};
+
+static struct circular_queue lock_cq;
+
+unsigned int max_bfs_queue_depth;
+
+static unsigned int lockdep_dependency_gen_id;
+
+static inline void __cq_init(struct circular_queue *cq)
+{
+ cq->front = cq->rear = 0;
+ lockdep_dependency_gen_id++;
+}
+
+static inline int __cq_empty(struct circular_queue *cq)
+{
+ return (cq->front == cq->rear);
+}
+
+static inline int __cq_full(struct circular_queue *cq)
+{
+ return ((cq->rear + 1) & CQ_MASK) == cq->front;
+}
+
+static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
+{
+ if (__cq_full(cq))
+ return -1;
+
+ cq->element[cq->rear] = elem;
+ cq->rear = (cq->rear + 1) & CQ_MASK;
+ return 0;
+}
+
+static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
+{
+ if (__cq_empty(cq))
+ return -1;
+
+ *elem = cq->element[cq->front];
+ cq->front = (cq->front + 1) & CQ_MASK;
+ return 0;
+}
+
+static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
+{
+ return (cq->rear - cq->front) & CQ_MASK;
+}
+
+static inline void mark_lock_accessed(struct lock_list *lock,
+ struct lock_list *parent)
+{
+ unsigned long nr;
+
+ nr = lock - list_entries;
+ WARN_ON(nr >= nr_list_entries);
+ lock->parent = parent;
+ lock->class->dep_gen_id = lockdep_dependency_gen_id;
+}
+
+static inline unsigned long lock_accessed(struct lock_list *lock)
+{
+ unsigned long nr;
+
+ nr = lock - list_entries;
+ WARN_ON(nr >= nr_list_entries);
+ return lock->class->dep_gen_id == lockdep_dependency_gen_id;
+}
+
+static inline struct lock_list *get_lock_parent(struct lock_list *child)
+{
+ return child->parent;
+}
+
+static inline int get_lock_depth(struct lock_list *child)
+{
+ int depth = 0;
+ struct lock_list *parent;
+
+ while ((parent = get_lock_parent(child))) {
+ child = parent;
+ depth++;
+ }
+ return depth;
+}
+
+static int __bfs(struct lock_list *source_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry,
+ int forward)
+{
+ struct lock_list *entry;
+ struct list_head *head;
+ struct circular_queue *cq = &lock_cq;
+ int ret = 1;
+
+ if (match(source_entry, data)) {
+ *target_entry = source_entry;
+ ret = 0;
+ goto exit;
+ }
+
+ if (forward)
+ head = &source_entry->class->locks_after;
+ else
+ head = &source_entry->class->locks_before;
+
+ if (list_empty(head))
+ goto exit;
+
+ __cq_init(cq);
+ __cq_enqueue(cq, (unsigned long)source_entry);
+
+ while (!__cq_empty(cq)) {
+ struct lock_list *lock;
+
+ __cq_dequeue(cq, (unsigned long *)&lock);
+
+ if (!lock->class) {
+ ret = -2;
+ goto exit;
+ }
+
+ if (forward)
+ head = &lock->class->locks_after;
+ else
+ head = &lock->class->locks_before;
+
+ list_for_each_entry(entry, head, entry) {
+ if (!lock_accessed(entry)) {
+ unsigned int cq_depth;
+ mark_lock_accessed(entry, lock);
+ if (match(entry, data)) {
+ *target_entry = entry;
+ ret = 0;
+ goto exit;
+ }
+
+ if (__cq_enqueue(cq, (unsigned long)entry)) {
+ ret = -1;
+ goto exit;
+ }
+ cq_depth = __cq_get_elem_count(cq);
+ if (max_bfs_queue_depth < cq_depth)
+ max_bfs_queue_depth = cq_depth;
+ }
+ }
+ }
+exit:
+ return ret;
+}
+
+static inline int __bfs_forwards(struct lock_list *src_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
+{
+ return __bfs(src_entry, data, match, target_entry, 1);
+
+}
+
+static inline int __bfs_backwards(struct lock_list *src_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
+{
+ return __bfs(src_entry, data, match, target_entry, 0);
+
+}
+
+/*
* Recursive, forwards-direction lock-dependency checking, used for
* both noncyclic checking and for hardirq-unsafe/softirq-unsafe
* checking.
- *
- * (to keep the stackframe of the recursive functions small we
- * use these global variables, and we also mark various helper
- * functions as noinline.)
*/
-static struct held_lock *check_source, *check_target;
/*
* Print a dependency chain entry (this is only done when a deadlock
* has been detected):
*/
static noinline int
-print_circular_bug_entry(struct lock_list *target, unsigned int depth)
+print_circular_bug_entry(struct lock_list *target, int depth)
{
if (debug_locks_silent)
return 0;
@@ -930,11 +1053,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
* header first:
*/
static noinline int
-print_circular_bug_header(struct lock_list *entry, unsigned int depth)
+print_circular_bug_header(struct lock_list *entry, unsigned int depth,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
{
struct task_struct *curr = current;
- if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ if (debug_locks_silent)
return 0;
printk("\n=======================================================\n");
@@ -943,9 +1068,9 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
printk( "-------------------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
- print_lock(check_source);
+ print_lock(check_src);
printk("\nbut task is already holding lock:\n");
- print_lock(check_target);
+ print_lock(check_tgt);
printk("\nwhich lock already depends on the new lock.\n\n");
printk("\nthe existing dependency chain (in reverse order) is:\n");
@@ -954,19 +1079,36 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
return 0;
}
-static noinline int print_circular_bug_tail(void)
+static inline int class_equal(struct lock_list *entry, void *data)
+{
+ return entry->class == data;
+}
+
+static noinline int print_circular_bug(struct lock_list *this,
+ struct lock_list *target,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
{
struct task_struct *curr = current;
- struct lock_list this;
+ struct lock_list *parent;
+ int depth;
- if (debug_locks_silent)
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- this.class = hlock_class(check_source);
- if (!save_trace(&this.trace))
+ if (!save_trace(&this->trace))
return 0;
- print_circular_bug_entry(&this, 0);
+ depth = get_lock_depth(target);
+
+ print_circular_bug_header(target, depth, check_src, check_tgt);
+
+ parent = get_lock_parent(target);
+
+ while (parent) {
+ print_circular_bug_entry(parent, --depth);
+ parent = get_lock_parent(parent);
+ }
printk("\nother info that might help us debug this:\n\n");
lockdep_print_held_locks(curr);
@@ -977,73 +1119,69 @@ static noinline int print_circular_bug_tail(void)
return 0;
}
-#define RECURSION_LIMIT 40
-
-static int noinline print_infinite_recursion_bug(void)
+static noinline int print_bfs_bug(int ret)
{
if (!debug_locks_off_graph_unlock())
return 0;
- WARN_ON(1);
+ WARN(1, "lockdep bfs error:%d\n", ret);
return 0;
}
-unsigned long __lockdep_count_forward_deps(struct lock_class *class,
- unsigned int depth)
+static int noop_count(struct lock_list *entry, void *data)
{
- struct lock_list *entry;
- unsigned long ret = 1;
+ (*(unsigned long *)data)++;
+ return 0;
+}
- if (lockdep_dependency_visit(class, depth))
- return 0;
+unsigned long __lockdep_count_forward_deps(struct lock_list *this)
+{
+ unsigned long count = 0;
+ struct lock_list *uninitialized_var(target_entry);
- /*
- * Recurse this class's dependency list:
- */
- list_for_each_entry(entry, &class->locks_after, entry)
- ret += __lockdep_count_forward_deps(entry->class, depth + 1);
+ __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
- return ret;
+ return count;
}
-
unsigned long lockdep_count_forward_deps(struct lock_class *class)
{
unsigned long ret, flags;
+ struct lock_list this;
+
+ this.parent = NULL;
+ this.class = class;
local_irq_save(flags);
__raw_spin_lock(&lockdep_lock);
- ret = __lockdep_count_forward_deps(class, 0);
+ ret = __lockdep_count_forward_deps(&this);
__raw_spin_unlock(&lockdep_lock);
local_irq_restore(flags);
return ret;
}
-unsigned long __lockdep_count_backward_deps(struct lock_class *class,
- unsigned int depth)
+unsigned long __lockdep_count_backward_deps(struct lock_list *this)
{
- struct lock_list *entry;
- unsigned long ret = 1;
+ unsigned long count = 0;
+ struct lock_list *uninitialized_var(target_entry);
- if (lockdep_dependency_visit(class, depth))
- return 0;
- /*
- * Recurse this class's dependency list:
- */
- list_for_each_entry(entry, &class->locks_before, entry)
- ret += __lockdep_count_backward_deps(entry->class, depth + 1);
+ __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
- return ret;
+ return count;
}
unsigned long lockdep_count_backward_deps(struct lock_class *class)
{
unsigned long ret, flags;
+ struct lock_list this;
+
+ this.parent = NULL;
+ this.class = class;
local_irq_save(flags);
__raw_spin_lock(&lockdep_lock);
- ret = __lockdep_count_backward_deps(class, 0);
+ ret = __lockdep_count_backward_deps(&this);
__raw_spin_unlock(&lockdep_lock);
local_irq_restore(flags);
@@ -1055,29 +1193,16 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
* lead to <target>. Print an error and return 0 if it does.
*/
static noinline int
-check_noncircular(struct lock_class *source, unsigned int depth)
+check_noncircular(struct lock_list *root, struct lock_class *target,
+ struct lock_list **target_entry)
{
- struct lock_list *entry;
+ int result;
- if (lockdep_dependency_visit(source, depth))
- return 1;
+ debug_atomic_inc(&nr_cyclic_checks);
- debug_atomic_inc(&nr_cyclic_check_recursions);
- if (depth > max_recursion_depth)
- max_recursion_depth = depth;
- if (depth >= RECURSION_LIMIT)
- return print_infinite_recursion_bug();
- /*
- * Check this lock's dependency list:
- */
- list_for_each_entry(entry, &source->locks_after, entry) {
- if (entry->class == hlock_class(check_target))
- return print_circular_bug_header(entry, depth+1);
- debug_atomic_inc(&nr_cyclic_checks);
- if (!check_noncircular(entry->class, depth+1))
- return print_circular_bug_entry(entry, depth+1);
- }
- return 1;
+ result = __bfs_forwards(root, target, class_equal, target_entry);
+
+ return result;
}
#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
@@ -1086,103 +1211,121 @@ check_noncircular(struct lock_class *source, unsigned int depth)
* proving that two subgraphs can be connected by a new dependency
* without creating any illegal irq-safe -> irq-unsafe lock dependency.
*/
-static enum lock_usage_bit find_usage_bit;
-static struct lock_class *forwards_match, *backwards_match;
+
+static inline int usage_match(struct lock_list *entry, void *bit)
+{
+ return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
+}
+
+
/*
* Find a node in the forwards-direction dependency sub-graph starting
- * at <source> that matches <find_usage_bit>.
+ * at @root->class that matches @bit.
*
- * Return 2 if such a node exists in the subgraph, and put that node
- * into <forwards_match>.
+ * Return 0 if such a node exists in the subgraph, and put that node
+ * into *@target_entry.
*
- * Return 1 otherwise and keep <forwards_match> unchanged.
- * Return 0 on error.
+ * Return 1 otherwise and keep *@target_entry unchanged.
+ * Return <0 on error.
*/
-static noinline int
-find_usage_forwards(struct lock_class *source, unsigned int depth)
+static int
+find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
+ struct lock_list **target_entry)
{
- struct lock_list *entry;
- int ret;
-
- if (lockdep_dependency_visit(source, depth))
- return 1;
-
- if (depth > max_recursion_depth)
- max_recursion_depth = depth;
- if (depth >= RECURSION_LIMIT)
- return print_infinite_recursion_bug();
+ int result;
debug_atomic_inc(&nr_find_usage_forwards_checks);
- if (source->usage_mask & (1 << find_usage_bit)) {
- forwards_match = source;
- return 2;
- }
- /*
- * Check this lock's dependency list:
- */
- list_for_each_entry(entry, &source->locks_after, entry) {
- debug_atomic_inc(&nr_find_usage_forwards_recursions);
- ret = find_usage_forwards(entry->class, depth+1);
- if (ret == 2 || ret == 0)
- return ret;
- }
- return 1;
+ result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
+
+ return result;
}
/*
* Find a node in the backwards-direction dependency sub-graph starting
- * at <source> that matches <find_usage_bit>.
+ * at @root->class that matches @bit.
*
- * Return 2 if such a node exists in the subgraph, and put that node
- * into <backwards_match>.
+ * Return 0 if such a node exists in the subgraph, and put that node
+ * into *@target_entry.
*
- * Return 1 otherwise and keep <backwards_match> unchanged.
- * Return 0 on error.
+ * Return 1 otherwise and keep *@target_entry unchanged.
+ * Return <0 on error.
*/
-static noinline int
-find_usage_backwards(struct lock_class *source, unsigned int depth)
+static int
+find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
+ struct lock_list **target_entry)
{
- struct lock_list *entry;
- int ret;
+ int result;
- if (lockdep_dependency_visit(source, depth))