From 07d209bd092d023976fdb881ba6d4b30fe18aebe Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Sep 2014 14:06:55 +0200 Subject: udf: Avoid infinite loop when processing indirect ICBs commit c03aa9f6e1f938618e6db2e23afef0574efeeb65 upstream. We did not implement any bound on number of indirect ICBs we follow when loading inode. Thus corrupted medium could cause kernel to go into an infinite loop, possibly causing a stack overflow. Fix the possible stack overflow by removing recursion from __udf_read_inode() and limit number of indirect ICBs we follow to avoid infinite loops. Signed-off-by: Jan Kara Cc: Chuck Ebbert Signed-off-by: Greg Kroah-Hartman --- fs/udf/inode.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index b6d15d349810..aa023283cc8a 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1270,13 +1270,22 @@ update_time: return 0; } +/* + * Maximum length of linked list formed by ICB hierarchy. The chosen number is + * arbitrary - just that we hopefully don't limit any real use of rewritten + * inode on write-once media but avoid looping for too long on corrupted media. + */ +#define UDF_MAX_ICB_NESTING 1024 + static void __udf_read_inode(struct inode *inode) { struct buffer_head *bh = NULL; struct fileEntry *fe; uint16_t ident; struct udf_inode_info *iinfo = UDF_I(inode); + unsigned int indirections = 0; +reread: /* * Set defaults, but the inode is still incomplete! * Note: get_new_inode() sets the following on a new inode: @@ -1313,28 +1322,26 @@ static void __udf_read_inode(struct inode *inode) ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1, &ident); if (ident == TAG_IDENT_IE && ibh) { - struct buffer_head *nbh = NULL; struct kernel_lb_addr loc; struct indirectEntry *ie; ie = (struct indirectEntry *)ibh->b_data; loc = lelb_to_cpu(ie->indirectICB.extLocation); - if (ie->indirectICB.extLength && - (nbh = udf_read_ptagged(inode->i_sb, &loc, 0, - &ident))) { - if (ident == TAG_IDENT_FE || - ident == TAG_IDENT_EFE) { - memcpy(&iinfo->i_location, - &loc, - sizeof(struct kernel_lb_addr)); - brelse(bh); - brelse(ibh); - brelse(nbh); - __udf_read_inode(inode); + if (ie->indirectICB.extLength) { + brelse(bh); + brelse(ibh); + memcpy(&iinfo->i_location, &loc, + sizeof(struct kernel_lb_addr)); + if (++indirections > UDF_MAX_ICB_NESTING) { + udf_err(inode->i_sb, + "too many ICBs in ICB hierarchy" + " (max %d supported)\n", + UDF_MAX_ICB_NESTING); + make_bad_inode(inode); return; } - brelse(nbh); + goto reread; } } brelse(ibh); -- cgit v1.2.3 From bee870fc1af7c5109a0f167af3bfe7002a02e7f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Oct 2014 16:17:02 -0700 Subject: perf: fix perf bug in fork() commit 6c72e3501d0d62fc064d3680e5234f3463ec5a86 upstream. Oleg noticed that a cleanup by Sylvain actually uncovered a bug; by calling perf_event_free_task() when failing sched_fork() we will not yet have done the memset() on ->perf_event_ctxp[] and will therefore try and 'free' the inherited contexts, which are still in use by the parent process. This is bad.. 
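In outline, the hazard is freeing through pointers the child merely inherited: the child's task_struct starts out as a copy of the parent's, so ->perf_event_ctxp[] still points at the parent's live contexts until perf_event_init_task() resets it. The self-contained userspace sketch below illustrates that aliasing pattern with invented names (struct task, early_init_step and setup_child are placeholders, not the kernel code changed by this patch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct task { int *ctx; };			/* stand-in for ->perf_event_ctxp[] */

/* Placeholder for an init step that fails before the child owns its own ctx. */
static int early_init_step(void) { return -1; }

static int setup_child(const struct task *parent, struct task *child)
{
	memcpy(child, parent, sizeof(*child));	/* like dup_task_struct(): child->ctx still aliases the parent's */

	if (early_init_step()) {
		/* Wrong: free(child->ctx) here would free the parent's live object.
		 * Right: leave it alone; it is not the child's until replaced below. */
		return -1;
	}

	child->ctx = calloc(1, sizeof(int));	/* from here on the child owns ctx and may free it on error */
	return child->ctx ? 0 : -1;
}

int main(void)
{
	struct task parent = { .ctx = calloc(1, sizeof(int)) };
	struct task child;

	if (setup_child(&parent, &child))
		printf("child setup failed; parent.ctx is still intact\n");

	free(parent.ctx);
	return 0;
}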
Suggested-by: Oleg Nesterov Reported-by: Oleg Nesterov Reported-by: Sylvain 'ythier' Hitier Signed-off-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 4 +++- kernel/fork.c | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bf387a60399..0b4733447151 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7482,8 +7482,10 @@ int perf_event_init_task(struct task_struct *child) for_each_task_context_nr(ctxn) { ret = perf_event_init_context(child, ctxn); - if (ret) + if (ret) { + perf_event_free_task(child); return ret; + } } return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 2c76e11ba939..514dbc40f98f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1324,7 +1324,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_policy; retval = audit_alloc(p); if (retval) - goto bad_fork_cleanup_policy; + goto bad_fork_cleanup_perf; /* copy all the process information */ retval = copy_semundo(clone_flags, p); if (retval) @@ -1522,8 +1522,9 @@ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_audit: audit_free(p); -bad_fork_cleanup_policy: +bad_fork_cleanup_perf: perf_event_free_task(p); +bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: -- cgit v1.2.3 From 0000372a96216b393bacfa50fb0253c40f8cf3d1 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Fri, 3 Oct 2014 16:19:24 -0700 Subject: init/Kconfig: Fix HAVE_FUTEX_CMPXCHG to not break up the EXPERT menu commit 62b4d2041117f35ab2409c9f5c4b8d3dc8e59d0f upstream. commit 03b8c7b623c80af264c4c8d6111e5c6289933666 ("futex: Allow architectures to skip futex_atomic_cmpxchg_inatomic() test") added the HAVE_FUTEX_CMPXCHG symbol right below FUTEX. This placed it right in the middle of the options for the EXPERT menu. However, HAVE_FUTEX_CMPXCHG does not depend on EXPERT or FUTEX, so Kconfig stops placing items in the EXPERT menu, and displays the remaining several EXPERT items (starting with EPOLL) directly in the General Setup menu. Since both users of HAVE_FUTEX_CMPXCHG only select it "if FUTEX", make HAVE_FUTEX_CMPXCHG itself depend on FUTEX. With this change, the subsequent items display as part of the EXPERT menu again; the EMBEDDED menu now appears as the next top-level item in the General Setup menu, which makes General Setup much shorter and more usable. Signed-off-by: Josh Triplett Acked-by: Randy Dunlap Signed-off-by: Greg Kroah-Hartman --- init/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/init/Kconfig b/init/Kconfig index 5d6febaea56d..8fa4f758821a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1367,6 +1367,7 @@ config FUTEX config HAVE_FUTEX_CMPXCHG bool + depends on FUTEX help Architectures should select this if futex_atomic_cmpxchg_inatomic() is implemented and always working. This removes a couple of runtime -- cgit v1.2.3 From 78a3db11cb0e9521572c7d0effbc63f2bd5dac12 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 2 Oct 2014 16:51:18 -0400 Subject: ring-buffer: Fix infinite spin in reading buffer commit 24607f114fd14f2f37e3e0cb3d47bce96e81e848 upstream. Commit 651e22f2701b "ring-buffer: Always reset iterator to reader page" fixed one bug but in the process caused another one. The reset is to update the header page, but that fix also changed the way the cached reads were updated. 
The cache reads are used to test if an iterator needs to be updated or not. A ring buffer iterator, when created, disables writes to the ring buffer but does not stop other readers or consuming reads from happening. Although all readers are synchronized via a lock, they are only synchronized when in the ring buffer functions. Those functions may be called by any number of readers. The iterator continues down when it's not interrupted by a consuming reader. If a consuming read occurs, the iterator starts from the beginning of the buffer. The way the iterator sees that a consuming read has happened since its last read is by checking the reader "cache". The cache holds the last counts of the read and the reader page itself. Commit 651e22f2701b changed what was saved by the cache_read when the rb_iter_reset() occurred, making the iterator never match the cache. Then if the iterator calls rb_iter_reset(), it will go into an infinite loop by checking if the cache doesn't match, doing the reset and retrying, just to see that the cache still doesn't match! This should never happen, as the reset is supposed to set the cache to the current value and there are locks that keep a consuming reader from having access to the data. Fixes: 651e22f2701b "ring-buffer: Always reset iterator to reader page" Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0bc181b0524c..3d9fee3a80b3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3371,7 +3371,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->head = cpu_buffer->reader_page->read; iter->cache_reader_page = iter->head_page; - iter->cache_read = iter->head; + iter->cache_read = cpu_buffer->read; if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; -- cgit v1.2.3 From 183c062c51c7e7663752b94ac399c830e4cb3c44 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 6 Aug 2014 16:05:36 -0700 Subject: mm, thp: move invariant bug check out of loop in __split_huge_page_map commit f8303c2582b889351e261ff18c4d8eb197a77db2 upstream. In __split_huge_page_map(), the check for page_mapcount(page) is invariant within the for loop. Because the macro is implemented using atomic_read(), the redundant check cannot be optimized away by the compiler, leading to an unnecessary read of the page structure. This patch moves the invariant bug check out of the loop so that it will be done only once. On a 3.16-rc1 based kernel, a microbenchmark that broke up 1000 transparent huge pages using munmap() had an execution time of 38,245us with the patch and 38,548us without it. The performance gain is about 1%. Signed-off-by: Waiman Long Acked-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Rik van Riel Cc: Scott J Norton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index eb00e81601a5..b9a096a36edc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1733,6 +1733,8 @@ static int __split_huge_page_map(struct page *page, if (pmd) { pgtable = pgtable_trans_huge_withdraw(mm); pmd_populate(mm, &_pmd, pgtable); + if (pmd_write(*pmd)) + BUG_ON(page_mapcount(page) != 1); haddr = address; for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1742,8 +1744,6 @@ static int __split_huge_page_map(struct page *page, entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (!pmd_write(*pmd)) entry = pte_wrprotect(entry); - else - BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); if (pmd_numa(*pmd)) -- cgit v1.2.3 From f35407acce23bab3727190a94468362dc8f030a1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 2 Oct 2014 19:47:42 +0100 Subject: mm: numa: Do not mark PTEs pte_numa when splitting huge pages commit abc40bd2eeb77eb7c2effcaf63154aad929a1d5f upstream. This patch reverts 1ba6e0b50b ("mm: numa: split_huge_page: transfer the NUMA type from the pmd to the pte"). If a huge page is being split due to a protection change and the tail will be in a PROT_NONE vma then NUMA hinting PTEs are temporarily created in the protected VMA. VM_RW|VM_PROTNONE |-----------------| ^ split here In the specific case above, it should get fixed up by change_pte_range() but there is a window of opportunity for weirdness to happen. Similarly, if a huge page is shrunk and split during a protection update but before pmd_numa is cleared then a pte_numa can be left behind. Instead of adding complexity trying to deal with the case, this patch will not mark PTEs NUMA when splitting a huge page. NUMA hinting faults will not be triggered which is marginal in comparison to the complexity in dealing with the corner cases during THP split. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/huge_memory.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b9a096a36edc..d21c9ef0943c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1740,14 +1740,17 @@ static int __split_huge_page_map(struct page *page, for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { pte_t *pte, entry; BUG_ON(PageCompound(page+i)); + /* + * Note that pmd_numa is not transferred deliberately + * to avoid any possibility that pte_numa leaks to + * a PROT_NONE VMA by accident. + */ entry = mk_pte(page + i, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (!pmd_write(*pmd)) entry = pte_wrprotect(entry); if (!pmd_young(*pmd)) entry = pte_mkold(entry); - if (pmd_numa(*pmd)) - entry = pte_mknuma(entry); pte = pte_offset_map(&_pmd, haddr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); -- cgit v1.2.3 From f5d34b7cae6c6ddddb1797ebc0d0918954544108 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sat, 20 Sep 2014 16:16:35 -0300 Subject: media: vb2: fix VBI/poll regression commit 58d75f4b1ce26324b4d809b18f94819843a98731 upstream. The recent conversion of saa7134 to vb2 uncovered a poll() bug that broke the teletext applications alevt and mtt. 
These applications expect that calling poll() without having called VIDIOC_STREAMON will cause poll() to return POLLERR. That did not happen in vb2. This patch fixes that behavior. It also fixes what should happen when poll() is called when STREAMON is called but no buffers have been queued. In that case poll() will also return POLLERR, but only for capture queues since output queues will always return POLLOUT anyway in that situation. This brings the vb2 behavior in line with the old videobuf behavior. Signed-off-by: Hans Verkuil Acked-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/v4l2-core/videobuf2-core.c | 15 +++++++++++++-- include/media/videobuf2-core.h | 4 ++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/media/v4l2-core/videobuf2-core.c b/drivers/media/v4l2-core/videobuf2-core.c index e3bdc3be91e1..5e47ba479e53 100644 --- a/drivers/media/v4l2-core/videobuf2-core.c +++ b/drivers/media/v4l2-core/videobuf2-core.c @@ -666,6 +666,7 @@ static int __reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req) * to the userspace. */ req->count = allocated_buffers; + q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type); return 0; } @@ -714,6 +715,7 @@ static int __create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create memset(q->plane_sizes, 0, sizeof(q->plane_sizes)); memset(q->alloc_ctx, 0, sizeof(q->alloc_ctx)); q->memory = create->memory; + q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type); } num_buffers = min(create->count, VIDEO_MAX_FRAME - q->num_buffers); @@ -1355,6 +1357,7 @@ int vb2_qbuf(struct vb2_queue *q, struct v4l2_buffer *b) * dequeued in dqbuf. */ list_add_tail(&vb->queued_entry, &q->queued_list); + q->waiting_for_buffers = false; vb->state = VB2_BUF_STATE_QUEUED; /* @@ -1724,6 +1727,7 @@ int vb2_streamoff(struct vb2_queue *q, enum v4l2_buf_type type) * and videobuf, effectively returning control over them to userspace. */ __vb2_queue_cancel(q); + q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type); dprintk(3, "Streamoff successful\n"); return 0; @@ -2009,9 +2013,16 @@ unsigned int vb2_poll(struct vb2_queue *q, struct file *file, poll_table *wait) } /* - * There is nothing to wait for if no buffers have already been queued. + * There is nothing to wait for if the queue isn't streaming. */ - if (list_empty(&q->queued_list)) + if (!vb2_is_streaming(q)) + return res | POLLERR; + /* + * For compatibility with vb1: if QBUF hasn't been called yet, then + * return POLLERR as well. This only affects capture queues, output + * queues will always initialize waiting_for_buffers to false. + */ + if (q->waiting_for_buffers) return res | POLLERR; if (list_empty(&q->done_list)) diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h index d88a098d1aff..2cc4e0df9c5d 100644 --- a/include/media/videobuf2-core.h +++ b/include/media/videobuf2-core.h @@ -318,6 +318,9 @@ struct v4l2_fh; * @done_wq: waitqueue for processes waiting for buffers ready to be dequeued * @alloc_ctx: memory type/allocator-specific contexts for each plane * @streaming: current streaming state + * @waiting_for_buffers: used in poll() to check if vb2 is still waiting for + * buffers. Only set for capture queues if qbuf has not yet been + * called since poll() needs to return POLLERR in that situation. 
* @fileio: file io emulator internal data, used only if emulator is active */ struct vb2_queue { @@ -350,6 +353,7 @@ struct vb2_queue { unsigned int plane_sizes[VIDEO_MAX_PLANES]; unsigned int streaming:1; + unsigned int waiting_for_buffers:1; struct vb2_fileio_data *fileio; }; -- cgit v1.2.3 From 06905ff8a6f07cb59b20311694f5d1454654808f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 2 Oct 2014 13:45:00 +1000 Subject: md/raid5: disable 'DISCARD' by default due to safety concerns. commit 8e0e99ba64c7ba46133a7c8a3e3f7de01f23bd93 upstream. It has come to my attention (thanks Martin) that 'discard_zeroes_data' is only a hint. Some devices in some cases don't do what it says on the label. The use of DISCARD in RAID5 depends on reads from discarded regions being predictably zero. If a write to a previously discarded region performs a read-modify-write cycle it assumes that the parity block was consistent with the data blocks. If all were zero, this would be the case. If some are and some aren't, this would not be the case. This could lead to data corruption after a device failure when data needs to be reconstructed from the parity. As we cannot trust 'discard_zeroes_data', ignore it by default and so disallow DISCARD on all raid4/5/6 arrays. As many devices are trustworthy, and as there are benefits to using DISCARD, add a module parameter to override this caution and cause DISCARD to work if discard_zeroes_data is set. If a site wants to enable DISCARD on some arrays but not on others, it should select DISCARD support at the filesystem level and set the raid456 module parameter. raid456.devices_handle_discard_safely=Y As this is a data-safety issue, I believe this patch is suitable for -stable. DISCARD support for RAID456 was added in 3.7. Cc: Shaohua Li Cc: "Martin K. Petersen" Cc: Mike Snitzer Cc: Heinz Mauelshagen Acked-by: Martin K. Petersen Acked-by: Mike Snitzer Fixes: 620125f2bf8ff0c4969b79653b54d7bcc9d40637 Signed-off-by: NeilBrown [bwh: Backported to 3.10: adjust context] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- drivers/md/raid5.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 774f81423d78..2332b5ced0dd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -60,6 +60,10 @@ #include "raid0.h" #include "bitmap.h" +static bool devices_handle_discard_safely = false; +module_param(devices_handle_discard_safely, bool, 0644); +MODULE_PARM_DESC(devices_handle_discard_safely, + "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); /* * Stripe cache */ @@ -5611,7 +5615,7 @@ static int run(struct mddev *mddev) mddev->queue->limits.discard_granularity = stripe; /* * unaligned part of discard request will be ignored, so can't - * guarantee discard_zerors_data + * guarantee discard_zeroes_data */ mddev->queue->limits.discard_zeroes_data = 0; @@ -5636,6 +5640,18 @@ static int run(struct mddev *mddev) !bdev_get_queue(rdev->bdev)-> limits.discard_zeroes_data) discard_supported = false; + /* Unfortunately, discard_zeroes_data is not currently + * a guarantee - just a hint. So we only allow DISCARD + * if the sysadmin has confirmed that only safe devices + * are in use by setting a module parameter. 
+ */ + if (!devices_handle_discard_safely) { + if (discard_supported) { + pr_info("md/raid456: discard support disabled due to uncertainty.\n"); + pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); + } + discard_supported = false; + } } if (discard_supported && -- cgit v1.2.3 From 00790d4526bd88e711999b9af04a0e896cfbf5a8 Mon Sep 17 00:00:00 2001 From: Andrew Hunter Date: Thu, 4 Sep 2014 14:17:16 -0700 Subject: jiffies: Fix timeval conversion to jiffies commit d78c9300c51d6ceed9f6d078d4e9366f259de28c upstream. timeval_to_jiffies tried to round a timeval up to an integral number of jiffies, but the logic for doing so was incorrect: intervals corresponding to exactly N jiffies would become N+1. This manifested itself particularly repeatedly stopping/starting an itimer: setitimer(ITIMER_PROF, &val, NULL); setitimer(ITIMER_PROF, NULL, &val); would add a full tick to val, _even if it was exactly representable in terms of jiffies_ (say, the result of a previous rounding.) Doing this repeatedly would cause unbounded growth in val. So fix the math. Here's what was wrong with the conversion: we essentially computed (eliding seconds) jiffies = usec * (NSEC_PER_USEC/TICK_NSEC) by using scaling arithmetic, which took the best approximation of NSEC_PER_USEC/TICK_NSEC with denominator of 2^USEC_JIFFIE_SC = x/(2^USEC_JIFFIE_SC), and computed: jiffies = (usec * x) >> USEC_JIFFIE_SC and rounded this calculation up in the intermediate form (since we can't necessarily exactly represent TICK_NSEC in usec.) But the scaling arithmetic is a (very slight) *over*approximation of the true value; that is, instead of dividing by (1 usec/ 1 jiffie), we effectively divided by (1 usec/1 jiffie)-epsilon (rounding down). This would normally be fine, but we want to round timeouts up, and we did so by adding 2^USEC_JIFFIE_SC - 1 before the shift; this would be fine if our division was exact, but dividing this by the slightly smaller factor was equivalent to adding just _over_ 1 to the final result (instead of just _under_ 1, as desired.) In particular, with HZ=1000, we consistently computed that 10000 usec was 11 jiffies; the same was true for any exact multiple of TICK_NSEC. We could possibly still round in the intermediate form, adding something less than 2^USEC_JIFFIE_SC - 1, but easier still is to convert usec->nsec, round in nanoseconds, and then convert using time*spec*_to_jiffies. This adds one constant multiplication, and is not observably slower in microbenchmarks on recent x86 hardware. Tested: the following program: int main() { struct itimerval zero = {{0, 0}, {0, 0}}; /* Initially set to 10 ms. */ struct itimerval initial = zero; initial.it_interval.tv_usec = 10000; setitimer(ITIMER_PROF, &initial, NULL); /* Save and restore several times. 
*/ for (size_t i = 0; i < 10; ++i) { struct itimerval prev; setitimer(ITIMER_PROF, &zero, &prev); /* on old kernels, this goes up by TICK_USEC every iteration */ printf("previous value: %ld %ld %ld %ld\n", prev.it_interval.tv_sec, prev.it_interval.tv_usec, prev.it_value.tv_sec, prev.it_value.tv_usec); setitimer(ITIMER_PROF, &prev, NULL); } return 0; } Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Paul Turner Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Paul Turner Reported-by: Aaron Jacobs Signed-off-by: Andrew Hunter [jstultz: Tweaked to apply to 3.17-rc] Signed-off-by: John Stultz [bwh: Backported to 3.16: adjust filename] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- include/linux/jiffies.h | 12 ----------- kernel/time.c | 54 +++++++++++++++++++++++++++---------------------- 2 files changed, 30 insertions(+), 36 deletions(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 7b5d4a8ab199..c039fe1315eb 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -254,23 +254,11 @@ extern unsigned long preset_lpj; #define SEC_JIFFIE_SC (32 - SHIFT_HZ) #endif #define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29) -#define USEC_JIFFIE_SC (SEC_JIFFIE_SC + 19) #define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) -#define USEC_CONVERSION \ - ((unsigned long)((((u64)NSEC_PER_USEC << USEC_JIFFIE_SC) +\ - TICK_NSEC -1) / (u64)TICK_NSEC)) -/* - * USEC_ROUND is used in the timeval to jiffie conversion. See there - * for more details. It is the scaled resolution rounding value. Note - * that it is a 64-bit value. Since, when it is applied, we are already - * in jiffies (albit scaled), it is nothing but the bits we will shift - * off. - */ -#define USEC_ROUND (u64)(((u64)1 << USEC_JIFFIE_SC) - 1) /* * The maximum jiffie value is (MAX_INT >> 1). Here we translate that * into seconds. The 64-bit case will overflow if we are not careful, diff --git a/kernel/time.c b/kernel/time.c index d3617dbd3dca..d21398e6da87 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -496,17 +496,20 @@ EXPORT_SYMBOL(usecs_to_jiffies); * that a remainder subtract here would not do the right thing as the * resolution values don't fall on second boundries. I.e. the line: * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * Note that due to the small error in the multiplier here, this + * rounding is incorrect for sufficiently large values of tv_nsec, but + * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're + * OK. * * Rather, we just shift the bits off the right. * * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value. 
*/ -unsigned long -timespec_to_jiffies(const struct timespec *value) +static unsigned long +__timespec_to_jiffies(unsigned long sec, long nsec) { - unsigned long sec = value->tv_sec; - long nsec = value->tv_nsec + TICK_NSEC - 1; + nsec = nsec + TICK_NSEC - 1; if (sec >= MAX_SEC_IN_JIFFIES){ sec = MAX_SEC_IN_JIFFIES; @@ -517,6 +520,13 @@ timespec_to_jiffies(const struct timespec *value) (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } + +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); +} + EXPORT_SYMBOL(timespec_to_jiffies); void @@ -533,31 +543,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) } EXPORT_SYMBOL(jiffies_to_timespec); -/* Same for "timeval" +/* + * We could use a similar algorithm to timespec_to_jiffies (with a + * different multiplier for usec instead of nsec). But this has a + * problem with rounding: we can't exactly add TICK_NSEC - 1 to the + * usec value, since it's not necessarily integral. * - * Well, almost. The problem here is that the real system resolution is - * in nanoseconds and the value being converted is in micro seconds. - * Also for some machines (those that use HZ = 1024, in-particular), - * there is a LARGE error in the tick size in microseconds. - - * The solution we use is to do the rounding AFTER we convert the - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. - * Instruction wise, this should cost only an additional add with carry - * instruction above the way it was done above. + * We could instead round in the intermediate scaled representation + * (i.e. in units of 1/2^(large scale) jiffies) but that's also + * perilous: the scaling introduces a small positive error, which + * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1 + * units to the intermediate before shifting) leads to accidental + * overflow and overestimates. + * + * At the cost of one additional multiplication by a constant, just + * use the timespec implementation. */ unsigned long timeval_to_jiffies(const struct timeval *value) { - unsigned long sec = value->tv_sec; - long usec = value->tv_usec; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - usec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + return __timespec_to_jiffies(value->tv_sec, + value->tv_usec * NSEC_PER_USEC); } EXPORT_SYMBOL(timeval_to_jiffies); -- cgit v1.2.3 From 6353c97aa7c7dd6b0c3fe717eeacb39e3873259e Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 9 Jul 2014 21:18:32 +0200 Subject: drbd: fix regression 'out of mem, failed to invoke fence-peer helper' commit bbc1c5e8ad6dfebf9d13b8a4ccdf66c92913eac9 upstream. Since linux kernel 3.13, kthread_run() internally uses wait_for_completion_killable(). We sometimes may use kthread_run() while we still have a signal pending, which we used to kick our threads out of potentially blocking network functions, causing kthread_run() to mistake that as a new fatal signal and fail. Fix: flush_signals() before kthread_run(). 
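In outline, a minimal sketch of the ordering the fix establishes (illustrative only; spawn_helper, thread_fn and arg are placeholders rather than the drbd functions touched by the diff below):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Spawn a helper thread after discarding any signal we previously raised to
 * kick our own threads out of blocking network calls.  Without the
 * flush_signals(), kthread_run()'s internal killable wait would mistake the
 * still-pending signal for a new fatal signal and fail. */
static int spawn_helper(int (*thread_fn)(void *), void *arg)
{
	struct task_struct *t;

	flush_signals(current);
	t = kthread_run(thread_fn, arg, "helper");
	return IS_ERR(t) ? PTR_ERR(t) : 0;
}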
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/block/drbd/drbd_nl.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 9e3f441e7e84..9c37f3d896a2 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -514,6 +514,12 @@ void conn_try_outdate_peer_async(struct drbd_tconn *tconn) struct task_struct *opa; kref_get(&tconn->kref); + /* We may just have force_sig()'ed this thread + * to get it out of some blocking network function. + * Clear signals; otherwise kthread_run(), which internally uses + * wait_on_completion_killable(), will mistake our pending signal + * for a new fatal signal and fail. */ + flush_signals(current); opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h"); if (IS_ERR(opa)) { conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n"); -- cgit v1.2.3 From 7dd311128022551d7876b26b7193157883494cd3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 30 Jul 2014 14:55:26 +0200 Subject: nl80211: clear skb cb before passing to netlink commit bd8c78e78d5011d8111bc2533ee73b13a3bd6c42 upstream. In testmode and vendor command reply/event SKBs we use the skb cb data to store nl80211 parameters between allocation and sending. This causes the code for CONFIG_NETLINK_MMAP to get confused, because it takes ownership of the skb cb data when the SKB is handed off to netlink, and it doesn't explicitly clear it. Clear the skb cb explicitly when we're done and before it gets passed to netlink to avoid this issue. Reported-by: Assaf Azulay Reported-by: David Spinadel Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/wireless/nl80211.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 448c034184e2..62aebed7c6e2 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6568,6 +6568,9 @@ int cfg80211_testmode_reply(struct sk_buff *skb) void *hdr = ((void **)skb->cb)[1]; struct nlattr *data = ((void **)skb->cb)[2]; + /* clear CB data for netlink core to own from now on */ + memset(skb->cb, 0, sizeof(skb->cb)); + if (WARN_ON(!rdev->testmode_info)) { kfree_skb(skb); return -EINVAL; @@ -6594,6 +6597,9 @@ void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp) void *hdr = ((void **)skb->cb)[1]; struct nlattr *data = ((void **)skb->cb)[2]; + /* clear CB data for netlink core to own from now on */ + memset(skb->cb, 0, sizeof(skb->cb)); + nla_nest_end(skb, data); genlmsg_end(skb, hdr); genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0, -- cgit v1.2.3 From 35c239149f6e5794da2285f30bdeb3b4dd4df3b6 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Sat, 7 Sep 2013 18:35:08 +0200 Subject: cpufreq: Fix wrong time unit conversion commit a857c0b9e24e39fe5be82451b65377795f9538d8 upstream. The time spent by a CPU under a given frequency is stored in jiffies unit in the cpu var cpufreq_stats_table->time_in_state[i], i being the index of the frequency. This is what is displayed in the following file on the right column: cat /sys/devices/system/cpu/cpuX/cpufreq/stats/time_in_state 2301000 19835820 2300000 3172 [...] Now cpufreq converts this jiffies unit delta to clock_t before returning it to the user as in the above file. And that conversion is achieved using the API cputime64_to_clock_t(). 
Although it accidentally works on traditional tick-based cputime accounting, where cputime_t maps directly to jiffies, it doesn't work with other types of cputime accounting such as CONFIG_VIRT_CPU_ACCOUNTING_* where cputime_t can map to nsecs or any granularity preferred by the architecture. For example, we get a buggy zero delta on full dyntick configurations: cat /sys/devices/system/cpu/cpuX/cpufreq/stats/time_in_state 2301000 0 2300000 0 [...] Fix this by using the proper jiffies_64_t to clock_t conversion. Reported-and-tested-by: Carsten Emde Signed-off-by: Andreas Schwab Signed-off-by: Frederic Weisbecker Acked-by: Paul E. McKenney Signed-off-by: Rafael J. Wysocki Cc: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/cpufreq/cpufreq_stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index bfd6273fd873..7fb600239059 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -81,7 +81,7 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) for (i = 0; i < stat->state_num; i++) { len += sprintf(buf + len, "%u %llu\n", stat->freq_table[i], (unsigned long long) - cputime64_to_clock_t(stat->time_in_state[i])); + jiffies_64_to_clock_t(stat->time_in_state[i])); } return len; } -- cgit v1.2.3 From bed5396573366682b2e07d79a08aefde1c5a8f52 Mon Sep 17 00:00:00 2001 From: Stratos Karafotis Date: Wed, 5 Jun 2013 19:01:25 +0300 Subject: cpufreq: ondemand: Change the calculation of target frequency commit dfa5bb622555d9da0df21b50f46ebdeef390041b upstream. The ondemand governor calculates load in terms of frequency and increases it only if load_freq is greater than up_threshold multiplied by the current or average frequency. This appears to produce oscillations of frequency between min and max because, for example, a relatively small load can easily saturate minimum frequency and lead the CPU to the max. Then, it will decrease back to the min due to small load_freq. Change the calculation method of load and target frequency on the basis of the following two observations: - Load computation should not depend on the current or average measured frequency. For example, absolute load of 80% at 100MHz is not necessarily equivalent to 8% at 1000MHz in the next sampling interval. - It should be possible to increase the target frequency to any value present in the frequency table proportional to the absolute load, rather than to the max only, so that: Target frequency = C * load where we take C = policy->cpuinfo.max_freq / 100. Tested on Intel i7-3770 CPU @ 3.40GHz and on Quad core 1500MHz Krait. Phoronix benchmark of Linux Kernel Compilation 3.1 test shows an increase of ~1.5% in performance. cpufreq_stats (time_in_state) shows that middle frequencies are used more, with this patch. Highest and lowest frequencies were used less by ~9%. [rjw: We have run multiple other tests on kernels with this change applied and in the vast majority of cases it turns out that the resulting performance improvement also leads to reduced consumption of energy. The change is additionally justified by the overall simplification of the code in question.] Signed-off-by: Stratos Karafotis Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki Cc: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/cpufreq/cpufreq_governor.c | 10 +--------- drivers/cpufreq/cpufreq_governor.h | 1 - drivers/cpufreq/cpufreq_ondemand.c | 39 +++++++------------------------------- 3 files changed, 8 insertions(+), 42 deletions(-) diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 28a0b32c73b3..27b0e2a295ea 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -97,7 +97,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) policy = cdbs->cur_policy; - /* Get Absolute Load (in terms of freq for ondemand gov) */ + /* Get Absolute Load */ for_each_cpu(j, policy->cpus) { struct cpu_dbs_common_info *j_cdbs; u64 cur_wall_time, cur_idle_time; @@ -148,14 +148,6 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) load = 100 * (wall_time - idle_time) / wall_time; - if (dbs_data->cdata->governor == GOV_ONDEMAND) { - int freq_avg = __cpufreq_driver_getavg(policy, j); - if (freq_avg <= 0) - freq_avg = policy->cur; - - load *= freq_avg; - } - if (load > max_load) max_load = load; } diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 0d9e6befe1d5..4a9058aeb57e 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -169,7 +169,6 @@ struct od_dbs_tuners { unsigned int sampling_rate; unsigned int sampling_down_factor; unsigned int up_threshold; - unsigned int adj_up_threshold; unsigned int powersave_bias; unsigned int io_is_busy; }; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index c087347d6688..25438bbf96bb 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -29,11 +29,9 @@ #include "cpufreq_governor.h" /* On-demand governor macros */ -#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (10) #define DEF_FREQUENCY_UP_THRESHOLD (80) #define DEF_SAMPLING_DOWN_FACTOR (1) #define MAX_SAMPLING_DOWN_FACTOR (100000) -#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL (3) #define MICRO_FREQUENCY_UP_THRESHOLD (95) #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) #define MIN_FREQUENCY_UP_THRESHOLD (11) @@ -161,14 +159,10 @@ static void dbs_freq_increase(struct cpufreq_policy *p, unsigned int freq) /* * Every sampling_rate, we check, if current idle time is less than 20% - * (default), then we try to increase frequency. Every sampling_rate, we look - * for the lowest frequency which can sustain the load while keeping idle time - * over 30%. If such a frequency exist, we try to decrease to this frequency. - * - * Any frequency increase takes it to the maximum frequency. Frequency reduction - * happens at minimum steps of 5% (default) of current frequency + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. 
*/ -static void od_check_cpu(int cpu, unsigned int load_freq) +static void od_check_cpu(int cpu, unsigned int load) { struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); struct cpufreq_policy *policy = dbs_info->cdbs.cur_policy; @@ -178,29 +172,17 @@ static void od_check_cpu(int cpu, unsigned int load_freq) dbs_info->freq_lo = 0; /* Check for frequency increase */ - if (load_freq > od_tuners->up_threshold * policy->cur) { + if (load > od_tuners->up_threshold) { /* If switching to max speed, apply sampling_down_factor */ if (policy->cur < policy->max) dbs_info->rate_mult = od_tuners->sampling_down_factor; dbs_freq_increase(policy, policy->max); return; - } - - /* Check for frequency decrease */ - /* if we cannot reduce the frequency anymore, break out early */ - if (policy->cur == policy->min) - return; - - /* - * The optimal frequency is the frequency that is the lowest that can - * support the current CPU usage without triggering the up policy. To be - * safe, we focus 10 points under the threshold. - */ - if (load_freq < od_tuners->adj_up_threshold - * policy->cur) { + } else { + /* Calculate the next frequency proportional to load */ unsigned int freq_next; - freq_next = load_freq / od_tuners->adj_up_threshold; + freq_next = load * policy->cpuinfo.max_freq / 100; /* No longer fully busy, reset rate_mult */ dbs_info->rate_mult = 1; @@ -374,9 +356,6 @@ static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf, input < MIN_FREQUENCY_UP_THRESHOLD) { return -EINVAL; } - /* Calculate the new adj_up_threshold */ - od_tuners->adj_up_threshold += input; - od_tuners->adj_up_threshold -= od_tuners->up_threshold; od_tuners->up_threshold = input; return count; @@ -525,8 +504,6 @@ static int od_init(struct dbs_data *dbs_data) if (idle_time != -1ULL) { /* Idle micro accounting is supported. Use finer thresholds */ tuners->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; - tuners->adj_up_threshold = MICRO_FREQUENCY_UP_THRESHOLD - - MICRO_FREQUENCY_DOWN_DIFFERENTIAL; /* * In nohz/micro accounting case we set the minimum frequency * not depending on HZ, but fixed (very low). The deferred @@ -535,8 +512,6 @@ static int od_init(struct dbs_data *dbs_data) dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE; } else { tuners->up_threshold = DEF_FREQUENCY_UP_THRESHOLD; - tuners->adj_up_threshold = DEF_FREQUENCY_UP_THRESHOLD - - DEF_FREQUENCY_DOWN_DIFFERENTIAL; /* For correct statistics, we need 10 ticks for each measure */ dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO * -- cgit v1.2.3 From f41c15f2c9a00489735036846ec7e474e52b14a6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 9 Oct 2014 12:18:54 -0700 Subject: Linux 3.10.57 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 03bd927522f7..9df630a513b7 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 10 -SUBLEVEL = 56 +SUBLEVEL = 57 EXTRAVERSION = NAME = TOSSUG Baby Fish -- cgit v1.2.3