From dccb31be7ef8984b8fa636b65f74b662db6b3cb3 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 18 Aug 2017 14:40:53 +0200 Subject: ipv6: accept 64k - 1 packet length in ip6_find_1stfragopt() [ Upstream commit 3de33e1ba0506723ab25734e098cf280ecc34756 ] A packet length of exactly IPV6_MAXPLEN is allowed, we should refuse parsing options only if the size is 64KiB or more. While at it, remove one extra variable and one assignment which were also introduced by the commit that introduced the size check. Checking the sum 'offset + len' and only later adding 'len' to 'offset' doesn't provide any advantage over directly summing to 'offset' and checking it. Fixes: 6399f1fae4ec ("ipv6: avoid overflow of offset in ip6_find_1stfragopt") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/output_core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index abb2c307fbe8..a338bbc33cf3 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -86,7 +86,6 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) while (offset <= packet_len) { struct ipv6_opt_hdr *exthdr; - unsigned int len; switch (**nexthdr) { @@ -112,10 +111,9 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); - len = ipv6_optlen(exthdr); - if (len + offset >= IPV6_MAXPLEN) + offset += ipv6_optlen(exthdr); + if (offset > IPV6_MAXPLEN) return -EINVAL; - offset += len; *nexthdr = &exthdr->nexthdr; } -- cgit v1.2.3 From 7f8f23fc8026a7a4f29f49c18a2ebbb529ee3916 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 21 Aug 2017 09:47:10 -0700 Subject: ipv6: add rcu grace period before freeing fib6_node [ Upstream commit c5cff8561d2d0006e972bd114afd51f082fee77c ] We currently keep rt->rt6i_node pointing to the fib6_node for the route. And some functions make use of this pointer to dereference the fib6_node from rt structure, e.g. rt6_check(). However, as there is neither refcount nor rcu taken when dereferencing rt->rt6i_node, it could potentially cause crashes as rt->rt6i_node could be set to NULL by other CPUs when doing a route deletion. This patch introduces an rcu grace period before freeing fib6_node and makes sure the functions that dereference it takes rcu_read_lock(). Note: there is no "Fixes" tag because this bug was there in a very early stage. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/ip6_fib.h | 30 +++++++++++++++++++++++++++++- net/ipv6/ip6_fib.c | 20 ++++++++++++++++---- net/ipv6/route.c | 14 +++++++++++--- 3 files changed, 56 insertions(+), 8 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index a74e2aa40ef4..c17180118897 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -68,6 +68,7 @@ struct fib6_node { __u16 fn_flags; int fn_sernum; struct rt6_info *rr_ptr; + struct rcu_head rcu; }; #ifndef CONFIG_IPV6_SUBTREES @@ -165,13 +166,40 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) rt0->rt6i_flags |= RTF_EXPIRES; } +/* Function to safely get fn->sernum for passed in rt + * and store result in passed in cookie. 
+ * Return true if we can get cookie safely + * Return false if not + */ +static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, + u32 *cookie) +{ + struct fib6_node *fn; + bool status = false; + + rcu_read_lock(); + fn = rcu_dereference(rt->rt6i_node); + + if (fn) { + *cookie = fn->fn_sernum; + status = true; + } + + rcu_read_unlock(); + return status; +} + static inline u32 rt6_get_cookie(const struct rt6_info *rt) { + u32 cookie = 0; + if (rt->rt6i_flags & RTF_PCPU || (unlikely(rt->dst.flags & DST_NOCACHE) && rt->dst.from)) rt = (struct rt6_info *)(rt->dst.from); - return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + rt6_get_cookie_safe(rt, &cookie); + + return cookie; } static inline void ip6_rt_put(struct rt6_info *rt) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ff389591a340..ed832b3d9b70 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -148,11 +148,23 @@ static struct fib6_node *node_alloc(void) return fn; } -static void node_free(struct fib6_node *fn) +static void node_free_immediate(struct fib6_node *fn) +{ + kmem_cache_free(fib6_node_kmem, fn); +} + +static void node_free_rcu(struct rcu_head *head) { + struct fib6_node *fn = container_of(head, struct fib6_node, rcu); + kmem_cache_free(fib6_node_kmem, fn); } +static void node_free(struct fib6_node *fn) +{ + call_rcu(&fn->rcu, node_free_rcu); +} + static void rt6_rcu_free(struct rt6_info *rt) { call_rcu(&rt->dst.rcu_head, dst_rcu_free); @@ -589,9 +601,9 @@ insert_above: if (!in || !ln) { if (in) - node_free(in); + node_free_immediate(in); if (ln) - node_free(ln); + node_free_immediate(ln); return ERR_PTR(-ENOMEM); } @@ -1020,7 +1032,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, root, and then (in failure) stale node in main tree. */ - node_free(sfn); + node_free_immediate(sfn); err = PTR_ERR(sn); goto failure; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5764a84465f8..632987ffc07d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1267,7 +1267,9 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt) static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) + u32 rt_cookie; + + if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) return NULL; if (rt6_check_expired(rt)) @@ -1335,8 +1337,14 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt->rt6i_flags & RTF_CACHE) { dst_hold(&rt->dst); ip6_del_rt(rt); - } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { - rt->rt6i_node->fn_sernum = -1; + } else { + struct fib6_node *fn; + + rcu_read_lock(); + fn = rcu_dereference(rt->rt6i_node); + if (fn && (rt->rt6i_flags & RTF_DEFAULT)) + fn->fn_sernum = -1; + rcu_read_unlock(); } } } -- cgit v1.2.3 From 43c792a8488087668f7e1052201e2eeb32150141 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 25 Aug 2017 15:03:10 -0700 Subject: ipv6: fix sparse warning on rt6i_node [ Upstream commit 4e587ea71bf924f7dac621f1351653bd41e446cb ] Commit c5cff8561d2d adds rcu grace period before freeing fib6_node. This generates a new sparse warning on rt->rt6i_node related code: net/ipv6/route.c:1394:30: error: incompatible types in comparison expression (different address spaces) ./include/net/ip6_fib.h:187:14: error: incompatible types in comparison expression (different address spaces) This commit adds "__rcu" tag for rt6i_node and makes sure corresponding rcu API is used for it. After this fix, sparse no longer generates the above warning. 
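For readers unfamiliar with the annotation, the pattern sparse enforces is roughly the following (a minimal sketch with made-up struct and function names, not the actual fib6 code): an __rcu pointer may only be published with rcu_assign_pointer(), read under rcu_read_lock() with rcu_dereference() (or rcu_dereference_protected() when the update-side lock is held), and NULL-checked with rcu_access_pointer().

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct bar { int val; };

struct foo {
        struct bar __rcu *ptr;  /* sparse warns if accessed without the RCU API */
        spinlock_t lock;
};

/* Writer: publish a new object under the update-side lock. */
static void foo_publish(struct foo *f, struct bar *newb)
{
        spin_lock(&f->lock);
        rcu_assign_pointer(f->ptr, newb);
        spin_unlock(&f->lock);
}

/* Reader: dereference only inside an RCU read-side critical section. */
static int foo_read_val(struct foo *f)
{
        struct bar *b;
        int val = 0;

        rcu_read_lock();
        b = rcu_dereference(f->ptr);
        if (b)
                val = b->val;
        rcu_read_unlock();
        return val;
}

The rt6i_node accesses converted in the hunks below follow this same shape.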
Fixes: c5cff8561d2d ("ipv6: add rcu grace period before freeing fib6_node") Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/ip6_fib.h | 2 +- net/ipv6/addrconf.c | 2 +- net/ipv6/ip6_fib.c | 11 +++++++---- net/ipv6/route.c | 3 ++- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c17180118897..a6bcb18ac4c3 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -103,7 +103,7 @@ struct rt6_info { * the same cache line. */ struct fib6_table *rt6i_table; - struct fib6_node *rt6i_node; + struct fib6_node __rcu *rt6i_node; struct in6_addr rt6i_gateway; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b2cabda72320..cc101b1be903 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5443,7 +5443,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) * our DAD process, so we don't need * to do it again */ - if (!(ifp->rt->rt6i_node)) + if (!rcu_access_pointer(ifp->rt->rt6i_node)) ip6_ins_rt(ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ed832b3d9b70..af7442211ffb 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -874,7 +874,7 @@ add: rt->dst.rt6_next = iter; *ins = rt; - rt->rt6i_node = fn; + rcu_assign_pointer(rt->rt6i_node, fn); atomic_inc(&rt->rt6i_ref); inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; @@ -899,7 +899,7 @@ add: return err; *ins = rt; - rt->rt6i_node = fn; + rcu_assign_pointer(rt->rt6i_node, fn); rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); @@ -1459,8 +1459,9 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, int fib6_del(struct rt6_info *rt, struct nl_info *info) { + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct net *net = info->nl_net; - struct fib6_node *fn = rt->rt6i_node; struct rt6_info **rtp; #if RT6_DEBUG >= 2 @@ -1649,7 +1650,9 @@ static int fib6_clean_node(struct fib6_walker *w) if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", - __func__, rt, rt->rt6i_node, res); + __func__, rt, + rcu_access_pointer(rt->rt6i_node), + res); #endif continue; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 632987ffc07d..9c2dd3f77cdb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1361,7 +1361,8 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { return !(rt->rt6i_flags & RTF_CACHE) && - (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node); + (rt->rt6i_flags & RTF_PCPU || + rcu_access_pointer(rt->rt6i_node)); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, -- cgit v1.2.3 From 4b4a194a10e2a2dd7bf3f90016b56ac495a1d37e Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 22 Aug 2017 15:36:08 +0200 Subject: macsec: add genl family module alias [ Upstream commit 78362998f58c7c271e2719dcd0aaced435c801f9 ] This helps tools such as wpa_supplicant can start even if the macsec module isn't loaded yet. Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/macsec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index a5d66e205bb2..2caac0c37059 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3510,6 +3510,7 @@ module_init(macsec_init); module_exit(macsec_exit); MODULE_ALIAS_RTNL_LINK("macsec"); +MODULE_ALIAS_GENL_FAMILY("macsec"); MODULE_DESCRIPTION("MACsec IEEE 802.1AE"); MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 1e39e5c6a2ea1f488ad13d351d6c55a5ef530666 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Aug 2017 09:39:28 -0700 Subject: udp: on peeking bad csum, drop packets even if not at head [ Upstream commit fd6055a806edc4019be1b9fb7d25262599bca5b1 ] When peeking, if a bad csum is discovered, the skb is unlinked from the queue with __sk_queue_drop_skb and the peek operation restarted. __sk_queue_drop_skb only drops packets that match the queue head. This fails if the skb was found after the head, using SO_PEEK_OFF socket option. This causes an infinite loop. We MUST drop this problematic skb, and we can simply check if skb was already removed by another thread, by looking at skb->next : This pointer is set to NULL by the __skb_unlink() operation, that might have happened only under the spinlock protection. Many thanks to syzkaller team (and particularly Dmitry Vyukov who provided us nice C reproducers exhibiting the lockup) and Willem de Bruijn who provided first version for this patch and a test program. Fixes: 627d2d6b5500 ("udp: enable MSG_PEEK at non-zero offset") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Willem de Bruijn Acked-by: Paolo Abeni Acked-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/datagram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/datagram.c b/net/core/datagram.c index 58dfa23d12ca..4fa4011feec1 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -351,7 +351,7 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) if (flags & MSG_PEEK) { err = -ENOENT; spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { + if (skb->next) { __skb_unlink(skb, &sk->sk_receive_queue); atomic_dec(&skb->users); err = 0; -- cgit v1.2.3 From 4d8ee1935bcd666360311dfdadeee235d682d69a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 22 Aug 2017 15:24:47 -0700 Subject: fsl/man: Inherit parent device and of_node [ Upstream commit a1a50c8e4c241a505b7270e1a3c6e50d94e794b1 ] Junote Cai reported that he was not able to get a DSA setup involving the Freescale DPAA/FMAN driver to work and narrowed it down to of_find_net_device_by_node(). This function requires the network device's device reference to be correctly set which is the case here, though we have lost any device_node association there. The problem is that dpaa_eth_add_device() allocates a "dpaa-ethernet" platform device, and later on dpaa_eth_probe() is called but SET_NETDEV_DEV() won't be propagating &pdev->dev.of_node properly. Fix this by inherenting both the parent device and the of_node when dpaa_eth_add_device() creates the platform device. Fixes: 3933961682a3 ("fsl/fman: Add FMan MAC driver") Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/freescale/fman/mac.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c index 736db9d9b0ad..81021f87e4f3 100644 --- a/drivers/net/ethernet/freescale/fman/mac.c +++ b/drivers/net/ethernet/freescale/fman/mac.c @@ -622,6 +622,9 @@ static struct platform_device *dpaa_eth_add_device(int fman_id, goto no_mem; } + pdev->dev.of_node = node; + pdev->dev.parent = priv->dev; + ret = platform_device_add_data(pdev, &data, sizeof(data)); if (ret) goto err; -- cgit v1.2.3 From 08d56d8a99bb82e134ba7704e4cfdabbcc16fc4f Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 23 Aug 2017 13:27:13 +0200 Subject: sctp: Avoid out-of-bounds reads from address storage [ Upstream commit ee6c88bb754e3d363e568da78086adfedb692447 ] inet_diag_msg_sctp{,l}addr_fill() and sctp_get_sctp_info() copy sizeof(sockaddr_storage) bytes to fill in sockaddr structs used to export diagnostic information to userspace. However, the memory allocated to store sockaddr information is smaller than that and depends on the address family, so we leak up to 100 uninitialized bytes to userspace. Just use the size of the source structs instead, in all the three cases this is what userspace expects. Zero out the remaining memory. Unused bytes (i.e. when IPv4 addresses are used) in source structs sctp_sockaddr_entry and sctp_transport are already cleared by sctp_add_bind_addr() and sctp_transport_new(), respectively. Noticed while testing KASAN-enabled kernel with 'ss': [ 2326.885243] BUG: KASAN: slab-out-of-bounds in inet_sctp_diag_fill+0x42c/0x6c0 [sctp_diag] at addr ffff881be8779800 [ 2326.896800] Read of size 128 by task ss/9527 [ 2326.901564] CPU: 0 PID: 9527 Comm: ss Not tainted 4.11.0-22.el7a.x86_64 #1 [ 2326.909236] Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.4.3 01/17/2017 [ 2326.917585] Call Trace: [ 2326.920312] dump_stack+0x63/0x8d [ 2326.924014] kasan_object_err+0x21/0x70 [ 2326.928295] kasan_report+0x288/0x540 [ 2326.932380] ? inet_sctp_diag_fill+0x42c/0x6c0 [sctp_diag] [ 2326.938500] ? skb_put+0x8b/0xd0 [ 2326.942098] ? memset+0x31/0x40 [ 2326.945599] check_memory_region+0x13c/0x1a0 [ 2326.950362] memcpy+0x23/0x50 [ 2326.953669] inet_sctp_diag_fill+0x42c/0x6c0 [sctp_diag] [ 2326.959596] ? inet_diag_msg_sctpasoc_fill+0x460/0x460 [sctp_diag] [ 2326.966495] ? __lock_sock+0x102/0x150 [ 2326.970671] ? sock_def_wakeup+0x60/0x60 [ 2326.975048] ? remove_wait_queue+0xc0/0xc0 [ 2326.979619] sctp_diag_dump+0x44a/0x760 [sctp_diag] [ 2326.985063] ? sctp_ep_dump+0x280/0x280 [sctp_diag] [ 2326.990504] ? memset+0x31/0x40 [ 2326.994007] ? mutex_lock+0x12/0x40 [ 2326.997900] __inet_diag_dump+0x57/0xb0 [inet_diag] [ 2327.003340] ? __sys_sendmsg+0x150/0x150 [ 2327.007715] inet_diag_dump+0x4d/0x80 [inet_diag] [ 2327.012979] netlink_dump+0x1e6/0x490 [ 2327.017064] __netlink_dump_start+0x28e/0x2c0 [ 2327.021924] inet_diag_handler_cmd+0x189/0x1a0 [inet_diag] [ 2327.028045] ? inet_diag_rcv_msg_compat+0x1b0/0x1b0 [inet_diag] [ 2327.034651] ? inet_diag_dump_compat+0x190/0x190 [inet_diag] [ 2327.040965] ? __netlink_lookup+0x1b9/0x260 [ 2327.045631] sock_diag_rcv_msg+0x18b/0x1e0 [ 2327.050199] netlink_rcv_skb+0x14b/0x180 [ 2327.054574] ? sock_diag_bind+0x60/0x60 [ 2327.058850] sock_diag_rcv+0x28/0x40 [ 2327.062837] netlink_unicast+0x2e7/0x3b0 [ 2327.067212] ? netlink_attachskb+0x330/0x330 [ 2327.071975] ? kasan_check_write+0x14/0x20 [ 2327.076544] netlink_sendmsg+0x5be/0x730 [ 2327.080918] ? 
netlink_unicast+0x3b0/0x3b0 [ 2327.085486] ? kasan_check_write+0x14/0x20 [ 2327.090057] ? selinux_socket_sendmsg+0x24/0x30 [ 2327.095109] ? netlink_unicast+0x3b0/0x3b0 [ 2327.099678] sock_sendmsg+0x74/0x80 [ 2327.103567] ___sys_sendmsg+0x520/0x530 [ 2327.107844] ? __get_locked_pte+0x178/0x200 [ 2327.112510] ? copy_msghdr_from_user+0x270/0x270 [ 2327.117660] ? vm_insert_page+0x360/0x360 [ 2327.122133] ? vm_insert_pfn_prot+0xb4/0x150 [ 2327.126895] ? vm_insert_pfn+0x32/0x40 [ 2327.131077] ? vvar_fault+0x71/0xd0 [ 2327.134968] ? special_mapping_fault+0x69/0x110 [ 2327.140022] ? __do_fault+0x42/0x120 [ 2327.144008] ? __handle_mm_fault+0x1062/0x17a0 [ 2327.148965] ? __fget_light+0xa7/0xc0 [ 2327.153049] __sys_sendmsg+0xcb/0x150 [ 2327.157133] ? __sys_sendmsg+0xcb/0x150 [ 2327.161409] ? SyS_shutdown+0x140/0x140 [ 2327.165688] ? exit_to_usermode_loop+0xd0/0xd0 [ 2327.170646] ? __do_page_fault+0x55d/0x620 [ 2327.175216] ? __sys_sendmsg+0x150/0x150 [ 2327.179591] SyS_sendmsg+0x12/0x20 [ 2327.183384] do_syscall_64+0xe3/0x230 [ 2327.187471] entry_SYSCALL64_slow_path+0x25/0x25 [ 2327.192622] RIP: 0033:0x7f41d18fa3b0 [ 2327.196608] RSP: 002b:00007ffc3b731218 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 2327.205055] RAX: ffffffffffffffda RBX: 00007ffc3b731380 RCX: 00007f41d18fa3b0 [ 2327.213017] RDX: 0000000000000000 RSI: 00007ffc3b731340 RDI: 0000000000000003 [ 2327.220978] RBP: 0000000000000002 R08: 0000000000000004 R09: 0000000000000040 [ 2327.228939] R10: 00007ffc3b730f30 R11: 0000000000000246 R12: 0000000000000003 [ 2327.236901] R13: 00007ffc3b731340 R14: 00007ffc3b7313d0 R15: 0000000000000084 [ 2327.244865] Object at ffff881be87797e0, in cache kmalloc-64 size: 64 [ 2327.251953] Allocated: [ 2327.254581] PID = 9484 [ 2327.257215] save_stack_trace+0x1b/0x20 [ 2327.261485] save_stack+0x46/0xd0 [ 2327.265179] kasan_kmalloc+0xad/0xe0 [ 2327.269165] kmem_cache_alloc_trace+0xe6/0x1d0 [ 2327.274138] sctp_add_bind_addr+0x58/0x180 [sctp] [ 2327.279400] sctp_do_bind+0x208/0x310 [sctp] [ 2327.284176] sctp_bind+0x61/0xa0 [sctp] [ 2327.288455] inet_bind+0x5f/0x3a0 [ 2327.292151] SYSC_bind+0x1a4/0x1e0 [ 2327.295944] SyS_bind+0xe/0x10 [ 2327.299349] do_syscall_64+0xe3/0x230 [ 2327.303433] return_from_SYSCALL_64+0x0/0x6a [ 2327.308194] Freed: [ 2327.310434] PID = 4131 [ 2327.313065] save_stack_trace+0x1b/0x20 [ 2327.317344] save_stack+0x46/0xd0 [ 2327.321040] kasan_slab_free+0x73/0xc0 [ 2327.325220] kfree+0x96/0x1a0 [ 2327.328530] dynamic_kobj_release+0x15/0x40 [ 2327.333195] kobject_release+0x99/0x1e0 [ 2327.337472] kobject_put+0x38/0x70 [ 2327.341266] free_notes_attrs+0x66/0x80 [ 2327.345545] mod_sysfs_teardown+0x1a5/0x270 [ 2327.350211] free_module+0x20/0x2a0 [ 2327.354099] SyS_delete_module+0x2cb/0x2f0 [ 2327.358667] do_syscall_64+0xe3/0x230 [ 2327.362750] return_from_SYSCALL_64+0x0/0x6a [ 2327.367510] Memory state around the buggy address: [ 2327.372855] ffff881be8779700: fc fc fc fc 00 00 00 00 00 00 00 00 fc fc fc fc [ 2327.380914] ffff881be8779780: fb fb fb fb fb fb fb fb fc fc fc fc 00 00 00 00 [ 2327.388972] >ffff881be8779800: 00 00 00 00 fc fc fc fc fb fb fb fb fb fb fb fb [ 2327.397031] ^ [ 2327.401792] ffff881be8779880: fc fc fc fc fb fb fb fb fb fb fb fb fc fc fc fc [ 2327.409850] ffff881be8779900: 00 00 00 00 00 04 fc fc fc fc fc fc 00 00 00 00 [ 2327.417907] ================================================================== This fixes CVE-2017-7558. 
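The resulting copy pattern is easy to reuse elsewhere. Roughly (a hedged sketch with placeholder names that mirrors the hunks below rather than quoting them): copy only sizeof(source) bytes from the per-family address union, then zero-fill the tail of the fixed-size destination, assuming the destination slot is at least as large as the source.

#include <string.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Stand-in for a per-family address union (e.g. union sctp_addr). */
union example_addr {
        struct sockaddr_in  v4;
        struct sockaddr_in6 v6;
};

/*
 * Fill a sockaddr_storage-sized slot without leaking stale bytes:
 * copy only what the source union actually holds, then zero the
 * remainder.  Assumes dstlen >= sizeof(*src).
 */
static void fill_addr_slot(void *dst, size_t dstlen,
                           const union example_addr *src)
{
        memcpy(dst, src, sizeof(*src));
        memset((char *)dst + sizeof(*src), 0, dstlen - sizeof(*src));
}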
References: https://bugzilla.redhat.com/show_bug.cgi?id=1480266 Fixes: 8f840e47f190 ("sctp: add the sctp_diag.c file") Cc: Xin Long Cc: Vlad Yasevich Cc: Neil Horman Signed-off-by: Stefano Brivio Acked-by: Marcelo Ricardo Leitner Reviewed-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/sctp_diag.c | 7 +++++-- net/sctp/socket.c | 3 +-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index 048954eee984..e8f56b7c5afb 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -70,7 +70,8 @@ static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, info = nla_data(attr); list_for_each_entry_rcu(laddr, address_list, list) { - memcpy(info, &laddr->a, addrlen); + memcpy(info, &laddr->a, sizeof(laddr->a)); + memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a)); info += addrlen; } @@ -93,7 +94,9 @@ static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb, info = nla_data(attr); list_for_each_entry(from, &asoc->peer.transport_addr_list, transports) { - memcpy(info, &from->ipaddr, addrlen); + memcpy(info, &from->ipaddr, sizeof(from->ipaddr)); + memset(info + sizeof(from->ipaddr), 0, + addrlen - sizeof(from->ipaddr)); info += addrlen; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9647e314d4fc..3ef725229449 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4373,8 +4373,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, info->sctpi_ictrlchunks = asoc->stats.ictrlchunks; prim = asoc->peer.primary_path; - memcpy(&info->sctpi_p_address, &prim->ipaddr, - sizeof(struct sockaddr_storage)); + memcpy(&info->sctpi_p_address, &prim->ipaddr, sizeof(prim->ipaddr)); info->sctpi_p_state = prim->state; info->sctpi_p_cwnd = prim->cwnd; info->sctpi_p_srtt = prim->srtt; -- cgit v1.2.3 From 64dfc67548da52fe7891decf725342a8e87e32d8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 23 Aug 2017 15:59:49 +0200 Subject: qlge: avoid memcpy buffer overflow [ Upstream commit e58f95831e7468d25eb6e41f234842ecfe6f014f ] gcc-8.0.0 (snapshot) points out that we copy a variable-length string into a fixed length field using memcpy() with the destination length, and that ends up copying whatever follows the string: inlined from 'ql_core_dump' at drivers/net/ethernet/qlogic/qlge/qlge_dbg.c:1106:2: drivers/net/ethernet/qlogic/qlge/qlge_dbg.c:708:2: error: 'memcpy' reading 15 bytes from a region of size 14 [-Werror=stringop-overflow=] memcpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1); Changing it to use strncpy() will instead zero-pad the destination, which seems to be the right thing to do here. The bug is probably harmless, but it seems like a good idea to address it in stable kernels as well, if only for the purpose of building with gcc-8 without warnings. Fixes: a61f80261306 ("qlge: Add ethtool register dump function.") Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/qlogic/qlge/qlge_dbg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c index 829be21f97b2..be258d90de9e 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c @@ -724,7 +724,7 @@ static void ql_build_coredump_seg_header( seg_hdr->cookie = MPI_COREDUMP_COOKIE; seg_hdr->segNum = seg_number; seg_hdr->segSize = seg_size; - memcpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1); + strncpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1); } /* -- cgit v1.2.3 From de2ecec26dba848c729e51faaf2b4daf35096330 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 24 Aug 2017 16:49:16 -0700 Subject: netvsc: fix deadlock betwen link status and removal [ Upstream commit 9b4e946ce14e20d7addbfb7d9139e604f9fda107 ] There is a deadlock possible when canceling the link status delayed work queue. The removal process is run with RTNL held, and the link status callback is acquring RTNL. Resolve the issue by using trylock and rescheduling. If cancel is in process, that block it from happening. Fixes: 122a5f6410f4 ("staging: hv: use delayed_work for netvsc_send_garp()") Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/hyperv/netvsc_drv.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index ff038e507fd6..36a04e182af1 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1084,7 +1084,12 @@ static void netvsc_link_change(struct work_struct *w) bool notify = false, reschedule = false; unsigned long flags, next_reconfig, delay; - rtnl_lock(); + /* if changes are happening, comeback later */ + if (!rtnl_trylock()) { + schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT); + return; + } + if (ndev_ctx->start_remove) goto out_unlock; -- cgit v1.2.3 From 2b3bd5972a5ce434b3f2211181e72033efe018d9 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 25 Aug 2017 22:48:48 +0200 Subject: cxgb4: Fix stack out-of-bounds read due to wrong size to t4_record_mbox() [ Upstream commit 0f3086868e8889a823a6e0f3d299102aa895d947 ] Passing commands for logging to t4_record_mbox() with size MBOX_LEN, when the actual command size is actually smaller, causes out-of-bounds stack accesses in t4_record_mbox() while copying command words here: for (i = 0; i < size / 8; i++) entry->cmd[i] = be64_to_cpu(cmd[i]); Up to 48 bytes from the stack are then leaked to debugfs. This happens whenever we send (and log) commands described by structs fw_sched_cmd (32 bytes leaked), fw_vi_rxmode_cmd (48), fw_hello_cmd (48), fw_bye_cmd (48), fw_initialize_cmd (48), fw_reset_cmd (48), fw_pfvf_cmd (32), fw_eq_eth_cmd (16), fw_eq_ctrl_cmd (32), fw_eq_ofld_cmd (32), fw_acl_mac_cmd(16), fw_rss_glb_config_cmd(32), fw_rss_vi_config_cmd(32), fw_devlog_cmd(32), fw_vi_enable_cmd(48), fw_port_cmd(32), fw_sched_cmd(32), fw_devlog_cmd(32). The cxgb4vf driver got this right instead. When we call t4_record_mbox() to log a command reply, a MBOX_LEN size can be used though, as get_mbox_rpl() will fill cmd_rpl up completely. Fixes: 7f080c3f2ff0 ("cxgb4: Add support to enable logging of firmware mailbox commands") Signed-off-by: Stefano Brivio Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index e8139514d32c..9e073fb6870a 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -317,12 +317,12 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, if (v != MBOX_OWNER_DRV) { ret = (v == MBOX_OWNER_FW) ? -EBUSY : -ETIMEDOUT; - t4_record_mbox(adap, cmd, MBOX_LEN, access, ret); + t4_record_mbox(adap, cmd, size, access, ret); return ret; } /* Copy in the new mailbox command and send it on its way ... */ - t4_record_mbox(adap, cmd, MBOX_LEN, access, 0); + t4_record_mbox(adap, cmd, size, access, 0); for (i = 0; i < size; i += 8) t4_write_reg64(adap, data_reg + i, be64_to_cpu(*p++)); @@ -371,7 +371,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, } ret = (pcie_fw & PCIE_FW_ERR_F) ? -ENXIO : -ETIMEDOUT; - t4_record_mbox(adap, cmd, MBOX_LEN, access, ret); + t4_record_mbox(adap, cmd, size, access, ret); dev_err(adap->pdev_dev, "command %#x in mailbox %d timed out\n", *(const u8 *)cmd, mbox); t4_report_fw_error(adap); -- cgit v1.2.3 From 8c623e5d03692dc478277185a0b907d53aea1b43 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Mon, 28 Aug 2017 14:29:41 -0400 Subject: packet: Don't write vnet header beyond end of buffer [ Upstream commit edbd58be15a957f6a760c4a514cd475217eb97fd ] ... which may happen with certain values of tp_reserve and maclen. Fixes: 58d19b19cd99 ("packet: vnet_hdr support for tpacket_rcv") Signed-off-by: Benjamin Poirier Cc: Willem de Bruijn Acked-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/packet/af_packet.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ae7bfd26cd91..35ba4b60d927 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2151,6 +2151,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct timespec ts; __u32 ts_status; bool is_drop_n_account = false; + bool do_vnet = false; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. * We may add members to them until current aligned size without forcing @@ -2201,8 +2202,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 
16 : maclen)) + po->tp_reserve; - if (po->has_vnet_hdr) + if (po->has_vnet_hdr) { netoff += sizeof(struct virtio_net_hdr); + do_vnet = true; + } macoff = netoff - maclen; } if (po->tp_version <= TPACKET_V2) { @@ -2219,8 +2222,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, skb_set_owner_r(copy_skb, sk); } snaplen = po->rx_ring.frame_size - macoff; - if ((int)snaplen < 0) + if ((int)snaplen < 0) { snaplen = 0; + do_vnet = false; + } } } else if (unlikely(macoff + snaplen > GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { @@ -2233,6 +2238,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely((int)snaplen < 0)) { snaplen = 0; macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; + do_vnet = false; } } spin_lock(&sk->sk_receive_queue.lock); @@ -2258,7 +2264,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, } spin_unlock(&sk->sk_receive_queue.lock); - if (po->has_vnet_hdr) { + if (do_vnet) { if (__packet_rcv_vnet(skb, h.raw + macoff - sizeof(struct virtio_net_hdr))) { spin_lock(&sk->sk_receive_queue.lock); -- cgit v1.2.3 From af33da0ed95f6a7b652f774fbb07fb52d2c21a97 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Aug 2017 09:29:31 -0700 Subject: kcm: do not attach PF_KCM sockets to avoid deadlock [ Upstream commit 351050ecd6523374b370341cc29fe61e2201556b ] syzkaller had no problem to trigger a deadlock, attaching a KCM socket to another one (or itself). (original syzkaller report was a very confusing lockdep splat during a sendmsg()) It seems KCM claims to only support TCP, but no enforcement is done, so we might need to add additional checks. Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Acked-by: Tom Herbert Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/kcm/kcmsock.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index fecad1098cf8..7eb0e8fe3ca8 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1381,6 +1381,10 @@ static int kcm_attach(struct socket *sock, struct socket *csock, if (!csk) return -EINVAL; + /* We must prevent loops or risk deadlock ! */ + if (csk->sk_family == PF_KCM) + return -EOPNOTSUPP; + psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); if (!psock) return -ENOMEM; -- cgit v1.2.3 From a6e51fda71a205fbd8f7b98da799c46e563c3db1 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 30 Aug 2017 17:49:29 -0700 Subject: Revert "net: phy: Correctly process PHY_HALTED in phy_stop_machine()" [ Upstream commit ebc8254aeae34226d0bc8fda309fd9790d4dccfe ] This reverts commit 7ad813f208533cebfcc32d3d7474dc1677d1b09a ("net: phy: Correctly process PHY_HALTED in phy_stop_machine()") because it is creating the possibility for a NULL pointer dereference. David Daney provide the following call trace and diagram of events: When ndo_stop() is called we call: phy_disconnect() +---> phy_stop_interrupts() implies: phydev->irq = PHY_POLL; +---> phy_stop_machine() | +---> phy_state_machine() | +----> queue_delayed_work(): Work queued. +--->phy_detach() implies: phydev->attached_dev = NULL; Now at a later time the queued work does: phy_state_machine() +---->netif_carrier_off(phydev->attached_dev): Oh no! 
It is NULL: CPU 12 Unable to handle kernel paging request at virtual address 0000000000000048, epc == ffffffff80de37ec, ra == ffffffff80c7c Oops[#1]: CPU: 12 PID: 1502 Comm: kworker/12:1 Not tainted 4.9.43-Cavium-Octeon+ #1 Workqueue: events_power_efficient phy_state_machine task: 80000004021ed100 task.stack: 8000000409d70000 $ 0 : 0000000000000000 ffffffff84720060 0000000000000048 0000000000000004 $ 4 : 0000000000000000 0000000000000001 0000000000000004 0000000000000000 $ 8 : 0000000000000000 0000000000000000 00000000ffff98f3 0000000000000000 $12 : 8000000409d73fe0 0000000000009c00 ffffffff846547c8 000000000000af3b $16 : 80000004096bab68 80000004096babd0 0000000000000000 80000004096ba800 $20 : 0000000000000000 0000000000000000 ffffffff81090000 0000000000000008 $24 : 0000000000000061 ffffffff808637b0 $28 : 8000000409d70000 8000000409d73cf0 80000000271bd300 ffffffff80c7804c Hi : 000000000000002a Lo : 000000000000003f epc : ffffffff80de37ec netif_carrier_off+0xc/0x58 ra : ffffffff80c7804c phy_state_machine+0x48c/0x4f8 Status: 14009ce3 KX SX UX KERNEL EXL IE Cause : 00800008 (ExcCode 02) BadVA : 0000000000000048 PrId : 000d9501 (Cavium Octeon III) Modules linked in: Process kworker/12:1 (pid: 1502, threadinfo=8000000409d70000, task=80000004021ed100, tls=0000000000000000) Stack : 8000000409a54000 80000004096bab68 80000000271bd300 80000000271c1e00 0000000000000000 ffffffff808a1708 8000000409a54000 80000000271bd300 80000000271bd320 8000000409a54030 ffffffff80ff0f00 0000000000000001 ffffffff81090000 ffffffff808a1ac0 8000000402182080 ffffffff84650000 8000000402182080 ffffffff84650000 ffffffff80ff0000 8000000409a54000 ffffffff808a1970 0000000000000000 80000004099e8000 8000000402099240 0000000000000000 ffffffff808a8598 0000000000000000 8000000408eeeb00 8000000409a54000 00000000810a1d00 0000000000000000 8000000409d73de8 8000000409d73de8 0000000000000088 000000000c009c00 8000000409d73e08 8000000409d73e08 8000000402182080 ffffffff808a84d0 8000000402182080 ... Call Trace: [] netif_carrier_off+0xc/0x58 [] phy_state_machine+0x48c/0x4f8 [] process_one_work+0x158/0x368 [] worker_thread+0x150/0x4c0 [] kthread+0xc8/0xe0 [] ret_from_kernel_thread+0x14/0x1c The original motivation for this change originated from Marc Gonzales indicating that his network driver did not have its adjust_link callback executing with phydev->link = 0 while he was expecting it. PHYLIB has never made any such guarantees ever because phy_stop() merely just tells the workqueue to move into PHY_HALTED state which will happen asynchronously. Reported-by: Geert Uytterhoeven Reported-by: David Daney Fixes: 7ad813f20853 ("net: phy: Correctly process PHY_HALTED in phy_stop_machine()") Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/phy.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 775a6e1fdef9..6e12401b5102 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -674,9 +674,6 @@ void phy_stop_machine(struct phy_device *phydev) if (phydev->state > PHY_UP && phydev->state != PHY_HALTED) phydev->state = PHY_UP; mutex_unlock(&phydev->lock); - - /* Now we can run the state machine synchronously */ - phy_state_machine(&phydev->state_queue.work); } /** -- cgit v1.2.3 From a10c510179b369f7d1e8cf77f43ee2db900c1ac9 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 18 May 2017 11:22:33 -0700 Subject: tcp: initialize rcv_mss to TCP_MIN_MSS instead of 0 [ Upstream commit 499350a5a6e7512d9ed369ed63a4244b6536f4f8 ] When tcp_disconnect() is called, inet_csk_delack_init() sets icsk->icsk_ack.rcv_mss to 0. This could potentially cause tcp_recvmsg() => tcp_cleanup_rbuf() => __tcp_select_window() call path to have division by 0 issue. So this patch initializes rcv_mss to TCP_MIN_MSS instead of 0. Reported-by: Andrey Konovalov Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1a4db27f5833..6b3d27e50317 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2297,6 +2297,10 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 + * issue in __tcp_select_window() + */ + icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; tcp_init_send_head(sk); memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); -- cgit v1.2.3 From 73ee5a73e75f3c0e5d4ca0c5a362424e93413bb0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 1 Sep 2017 10:52:31 +0200 Subject: mlxsw: spectrum: Forbid linking to devices that have uppers [ Upstream commit 25cc72a33835ed8a6f53180a822cadab855852ac ] The mlxsw driver relies on NETDEV_CHANGEUPPER events to configure the device in case a port is enslaved to a master netdev such as bridge or bond. Since the driver ignores events unrelated to its ports and their uppers, it's possible to engineer situations in which the device's data path differs from the kernel's. One example to such a situation is when a port is enslaved to a bond that is already enslaved to a bridge. When the bond was enslaved the driver ignored the event - as the bond wasn't one of its uppers - and therefore a bridge port instance isn't created in the device. Until such configurations are supported forbid them by checking that the upper device doesn't have uppers of its own. Fixes: 0d65fc13042f ("mlxsw: spectrum: Implement LAG port join/leave") Signed-off-by: Ido Schimmel Reported-by: Nogah Frankel Tested-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 6 ++++++ include/linux/netdevice.h | 2 ++ net/core/dev.c | 3 ++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index f902c4d3de99..1806b1fc6e4c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4172,6 +4172,8 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *dev, return -EINVAL; if (!info->linking) break; + if (netdev_has_any_upper_dev(upper_dev)) + return -EINVAL; /* HW limitation forbids to put ports to multiple bridges. */ if (netif_is_bridge_master(upper_dev) && !mlxsw_sp_master_bridge_check(mlxsw_sp, upper_dev)) @@ -4185,6 +4187,10 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *dev, if (netif_is_lag_port(dev) && is_vlan_dev(upper_dev) && !netif_is_lag_master(vlan_dev_real_dev(upper_dev))) return -EINVAL; + if (!info->linking) + break; + if (netdev_has_any_upper_dev(upper_dev)) + return -EINVAL; break; case NETDEV_CHANGEUPPER: upper_dev = info->upper_dev; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 780e7171f548..23db1ae37464 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3901,6 +3901,8 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, updev; \ updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter))) +bool netdev_has_any_upper_dev(struct net_device *dev); + void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter); void *netdev_lower_get_next_private_rcu(struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index 1d0a7369d5a2..ba7b8121a414 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5337,12 +5337,13 @@ EXPORT_SYMBOL(netdev_has_upper_dev); * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. */ -static bool netdev_has_any_upper_dev(struct net_device *dev) +bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->all_adj_list.upper); } +EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device -- cgit v1.2.3 From b5a3ae8b127e692d6ebf4707c4ec6db68c413024 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 1 Sep 2017 12:22:25 +0300 Subject: bridge: switchdev: Clear forward mark when transmitting packet [ Upstream commit 79e99bdd60b484af9afe0147e85a13e66d5c1cdb ] Commit 6bc506b4fb06 ("bridge: switchdev: Add forward mark support for stacked devices") added the 'offload_fwd_mark' bit to the skb in order to allow drivers to indicate to the bridge driver that they already forwarded the packet in L2. In case the bit is set, before transmitting the packet from each port, the port's mark is compared with the mark stored in the skb's control block. If both marks are equal, we know the packet arrived from a switch device that already forwarded the packet and it's not re-transmitted. However, if the packet is transmitted from the bridge device itself (e.g., br0), we should clear the 'offload_fwd_mark' bit as the mark stored in the skb's control block isn't valid. This scenario can happen in rare cases where a packet was trapped during L3 forwarding and forwarded by the kernel to a bridge device. 
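To make the mark semantics concrete, the egress check described above behaves roughly like this (an illustrative sketch; the struct and helper names are placeholders, not the exact bridge internals, and it assumes a kernel built with CONFIG_NET_SWITCHDEV so skb->offload_fwd_mark exists):

#include <linux/skbuff.h>
#include <linux/types.h>

/* Placeholder for the per-port switchdev mark kept by the bridge. */
struct example_bridge_port {
        u32 offload_fwd_mark;
};

/*
 * A port may transmit the skb unless the frame was already forwarded in
 * hardware by the switch this port belongs to.  skb->offload_fwd_mark == 0
 * means "the mark in the control block is not valid", which is what the
 * hunk below makes br_dev_xmit() guarantee for frames originated by the
 * bridge device itself.
 */
static bool example_allowed_egress(const struct example_bridge_port *p,
                                   const struct sk_buff *skb, u32 cb_mark)
{
        return !skb->offload_fwd_mark || cb_mark != p->offload_fwd_mark;
}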
Fixes: 6bc506b4fb06 ("bridge: switchdev: Add forward mark support for stacked devices") Signed-off-by: Ido Schimmel Reported-by: Yotam Gigi Tested-by: Yotam Gigi Reviewed-by: Jiri Pirko Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_device.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 89a687f3c0a3..5f5e28f210e0 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -53,6 +53,9 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) brstats->tx_bytes += skb->len; u64_stats_update_end(&brstats->syncp); +#ifdef CONFIG_NET_SWITCHDEV + skb->offload_fwd_mark = 0; +#endif BR_INPUT_SKB_CB(skb)->brdev = dev; skb_reset_mac_header(skb); -- cgit v1.2.3 From 5a7a40bad254d2571d93059ba4b3963dc448cdb0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 1 Sep 2017 11:26:08 +0200 Subject: Revert "net: use lib/percpu_counter API for fragmentation mem accounting" [ Upstream commit fb452a1aa3fd4034d7999e309c5466ff2d7005aa ] This reverts commit 6d7b857d541ecd1d9bd997c97242d4ef94b19de2. There is a bug in fragmentation codes use of the percpu_counter API, that can cause issues on systems with many CPUs. The frag_mem_limit() just reads the global counter (fbc->count), without considering other CPUs can have upto batch size (130K) that haven't been subtracted yet. Due to the 3MBytes lower thresh limit, this become dangerous at >=24 CPUs (3*1024*1024/130000=24). The correct API usage would be to use __percpu_counter_compare() which does the right thing, and takes into account the number of (online) CPUs and batch size, to account for this and call __percpu_counter_sum() when needed. We choose to revert the use of the lib/percpu_counter API for frag memory accounting for several reasons: 1) On systems with CPUs > 24, the heavier fully locked __percpu_counter_sum() is always invoked, which will be more expensive than the atomic_t that is reverted to. Given systems with more than 24 CPUs are becoming common this doesn't seem like a good option. To mitigate this, the batch size could be decreased and thresh be increased. 2) The add_frag_mem_limit+sub_frag_mem_limit pairs happen on the RX CPU, before SKBs are pushed into sockets on remote CPUs. Given NICs can only hash on L2 part of the IP-header, the NIC-RXq's will likely be limited. Thus, a fair chance that atomic add+dec happen on the same CPU. Revert note that commit 1d6119baf061 ("net: fix percpu memory leaks") removed init_frag_mem_limit() and instead use inet_frags_init_net(). After this revert, inet_frags_uninit_net() becomes empty. Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting") Fixes: 1d6119baf061 ("net: fix percpu memory leaks") Signed-off-by: Jesper Dangaard Brouer Acked-by: Florian Westphal Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/inet_frag.h | 36 +++++++++--------------------------- net/ipv4/inet_fragment.c | 4 +--- 2 files changed, 10 insertions(+), 30 deletions(-) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 909972aa3acd..3bb8dfec7725 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -1,14 +1,9 @@ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ -#include - struct netns_frags { - /* The percpu_counter "mem" need to be cacheline aligned. 
- * mem.count must not share cacheline with other writers - */ - struct percpu_counter mem ____cacheline_aligned_in_smp; - + /* Keep atomic mem on separate cachelines in structs that include it */ + atomic_t mem ____cacheline_aligned_in_smp; /* sysctls */ int timeout; int high_thresh; @@ -110,11 +105,11 @@ void inet_frags_fini(struct inet_frags *); static inline int inet_frags_init_net(struct netns_frags *nf) { - return percpu_counter_init(&nf->mem, 0, GFP_KERNEL); + atomic_set(&nf->mem, 0); + return 0; } static inline void inet_frags_uninit_net(struct netns_frags *nf) { - percpu_counter_destroy(&nf->mem); } void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); @@ -140,37 +135,24 @@ static inline bool inet_frag_evicting(struct inet_frag_queue *q) /* Memory Tracking Functions. */ -/* The default percpu_counter batch size is not big enough to scale to - * fragmentation mem acct sizes. - * The mem size of a 64K fragment is approx: - * (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes - */ -static unsigned int frag_percpu_counter_batch = 130000; - static inline int frag_mem_limit(struct netns_frags *nf) { - return percpu_counter_read(&nf->mem); + return atomic_read(&nf->mem); } static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) { - __percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch); + atomic_sub(i, &nf->mem); } static inline void add_frag_mem_limit(struct netns_frags *nf, int i) { - __percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch); + atomic_add(i, &nf->mem); } -static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) +static inline int sum_frag_mem_limit(struct netns_frags *nf) { - unsigned int res; - - local_bh_disable(); - res = percpu_counter_sum_positive(&nf->mem); - local_bh_enable(); - - return res; + return atomic_read(&nf->mem); } /* RFC 3168 support : diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index b5e9317eaf9e..631c0d0d7cf8 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -234,10 +234,8 @@ evict_again: cond_resched(); if (read_seqretry(&f->rnd_seqlock, seq) || - percpu_counter_sum(&nf->mem)) + sum_frag_mem_limit(nf)) goto evict_again; - - percpu_counter_destroy(&nf->mem); } EXPORT_SYMBOL(inet_frags_exit_net); -- cgit v1.2.3 From 1bcf18718ec63ad5fb025b75a5d2439e1dcf1213 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 1 Sep 2017 11:26:13 +0200 Subject: Revert "net: fix percpu memory leaks" [ Upstream commit 5a63643e583b6a9789d7a225ae076fb4e603991c ] This reverts commit 1d6119baf0610f813eb9d9580eb4fd16de5b4ceb. After reverting commit 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting") then here is no need for this fix-up patch. As percpu_counter is no longer used, it cannot memory leak it any-longer. Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting") Fixes: 1d6119baf061 ("net: fix percpu memory leaks") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- include/net/inet_frag.h | 7 +------ net/ieee802154/6lowpan/reassembly.c | 11 +++-------- net/ipv4/ip_fragment.c | 12 +++--------- net/ipv6/netfilter/nf_conntrack_reasm.c | 12 +++--------- net/ipv6/reassembly.c | 12 +++--------- 5 files changed, 13 insertions(+), 41 deletions(-) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 3bb8dfec7725..634d19203e7d 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -103,15 +103,10 @@ struct inet_frags { int inet_frags_init(struct inet_frags *); void inet_frags_fini(struct inet_frags *); -static inline int inet_frags_init_net(struct netns_frags *nf) +static inline void inet_frags_init_net(struct netns_frags *nf) { atomic_set(&nf->mem, 0); - return 0; } -static inline void inet_frags_uninit_net(struct netns_frags *nf) -{ -} - void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 30d875dff6b5..f85b08baff16 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -580,19 +580,14 @@ static int __net_init lowpan_frags_init_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); - int res; ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; - res = inet_frags_init_net(&ieee802154_lowpan->frags); - if (res) - return res; - res = lowpan_frags_ns_sysctl_register(net); - if (res) - inet_frags_uninit_net(&ieee802154_lowpan->frags); - return res; + inet_frags_init_net(&ieee802154_lowpan->frags); + + return lowpan_frags_ns_sysctl_register(net); } static void __net_exit lowpan_frags_exit_net(struct net *net) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index bbe7f72db9c1..453db950dc9f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -835,8 +835,6 @@ static void __init ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { - int res; - /* Fragment cache limits. 
* * The fragment memory accounting code, (tries to) account for @@ -862,13 +860,9 @@ static int __net_init ipv4_frags_init_net(struct net *net) net->ipv4.frags.max_dist = 64; - res = inet_frags_init_net(&net->ipv4.frags); - if (res) - return res; - res = ip4_frags_ns_ctl_register(net); - if (res) - inet_frags_uninit_net(&net->ipv4.frags); - return res; + inet_frags_init_net(&net->ipv4.frags); + + return ip4_frags_ns_ctl_register(net); } static void __net_exit ipv4_frags_exit_net(struct net *net) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 986d4ca38832..b263bf3a19f7 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -622,18 +622,12 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); static int nf_ct_net_init(struct net *net) { - int res; - net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; - res = inet_frags_init_net(&net->nf_frag.frags); - if (res) - return res; - res = nf_ct_frag6_sysctl_register(net); - if (res) - inet_frags_uninit_net(&net->nf_frag.frags); - return res; + inet_frags_init_net(&net->nf_frag.frags); + + return nf_ct_frag6_sysctl_register(net); } static void nf_ct_net_exit(struct net *net) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 3815e8505ed2..e585c0a2591c 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -709,19 +709,13 @@ static void ip6_frags_sysctl_unregister(void) static int __net_init ipv6_frags_init_net(struct net *net) { - int res; - net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; - res = inet_frags_init_net(&net->ipv6.frags); - if (res) - return res; - res = ip6_frags_ns_sysctl_register(net); - if (res) - inet_frags_uninit_net(&net->ipv6.frags); - return res; + inet_frags_init_net(&net->ipv6.frags); + + return ip6_frags_ns_sysctl_register(net); } static void __net_exit ipv6_frags_exit_net(struct net *net) -- cgit v1.2.3 From 90406e68e42fa50c41b69a5d607fa979d0ab562b Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Mon, 4 Sep 2017 10:45:28 +0300 Subject: gianfar: Fix Tx flow control deactivation [ Upstream commit 5d621672bc1a1e5090c1ac5432a18c79e0e13e03 ] The wrong register is checked for the Tx flow control bit, it should have been maccfg1 not maccfg2. This went unnoticed for so long probably because the impact is hardly visible, not to mention the tangled code from adjust_link(). First, link flow control (i.e. handling of Rx/Tx link level pause frames) is disabled by default (needs to be enabled via 'ethtool -A'). Secondly, maccfg2 always returns 0 for tx_flow_oldval (except for a few old boards), which results in Tx flow control remaining always on once activated. Fixes: 45b679c9a3ccd9e34f28e6ec677b812a860eb8eb ("gianfar: Implement PAUSE frame generation support") Signed-off-by: Claudiu Manoil Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/freescale/gianfar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 3f4e71148808..fd206889a433 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -3690,7 +3690,7 @@ static noinline void gfar_update_link_state(struct gfar_private *priv) u32 tempval1 = gfar_read(®s->maccfg1); u32 tempval = gfar_read(®s->maccfg2); u32 ecntrl = gfar_read(®s->ecntrl); - u32 tx_flow_oldval = (tempval & MACCFG1_TX_FLOW); + u32 tx_flow_oldval = (tempval1 & MACCFG1_TX_FLOW); if (phydev->duplex != priv->oldduplex) { if (!(phydev->duplex)) -- cgit v1.2.3 From f5755c0e870056dd35c95a0b5c0a038cdb4382ee Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 5 Sep 2017 09:22:05 +0800 Subject: vhost_net: correctly check tx avail during rx busy polling [ Upstream commit 8b949bef9172ca69d918e93509a4ecb03d0355e0 ] We check tx avail through vhost_enable_notify() in the past which is wrong since it only checks whether or not guest has filled more available buffer since last avail idx synchronization which was just done by vhost_vq_avail_empty() before. What we really want is checking pending buffers in the avail ring. Fix this by calling vhost_vq_avail_empty() instead. This issue could be noticed by doing netperf TCP_RR benchmark as client from guest (but not host). With this fix, TCP_RR from guest to localhost restores from 1375.91 trans per sec to 55235.28 trans per sec on my laptop (Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz). Fixes: 030881372460 ("vhost_net: basic polling support") Signed-off-by: Jason Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/vhost/net.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 5dc128a8da83..96a0661011fd 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -537,8 +537,13 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk) preempt_enable(); - if (vhost_enable_notify(&net->dev, vq)) + if (!vhost_vq_avail_empty(&net->dev, vq)) vhost_poll_queue(&vq->poll); + else if (unlikely(vhost_enable_notify(&net->dev, vq))) { + vhost_disable_notify(&net->dev, vq); + vhost_poll_queue(&vq->poll); + } + mutex_unlock(&vq->mutex); len = peek_head_len(sk); -- cgit v1.2.3 From ca7d8a337bd3e3eda49ab1b4dfa09ac9b335a56b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 5 Sep 2017 17:26:33 +0800 Subject: ip6_gre: update mtu properly in ip6gre_err [ Upstream commit 5c25f30c93fdc5bf25e62101aeaae7a4f9b421b3 ] Now when probessing ICMPV6_PKT_TOOBIG, ip6gre_err only subtracts the offset of gre header from mtu info. The expected mtu of gre device should also subtract gre header. Otherwise, the next packets still can't be sent out. Jianlin found this issue when using the topo: client(ip6gre)<---->(nic1)route(nic2)<----->(ip6gre)server and reducing nic2's mtu, then both tcp and sctp's performance with big size data became 0. This patch is to fix it by also subtracting grehdr (tun->tun_hlen) from mtu info when updating gre device's mtu in ip6gre_err(). It also needs to subtract ETH_HLEN if gre dev'type is ARPHRD_ETHER. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_gre.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index d2844ee469cb..f78afe43bdff 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } break; case ICMPV6_PKT_TOOBIG: - mtu = be32_to_cpu(info) - offset; + mtu = be32_to_cpu(info) - offset - t->tun_hlen; + if (t->dev->type == ARPHRD_ETHER) + mtu -= ETH_HLEN; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; t->dev->mtu = mtu; -- cgit v1.2.3 From c9335db792c04be68e553c6d0537c9df8b20e557 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 8 Sep 2017 10:26:19 +0200 Subject: ipv6: fix memory leak with multiple tables during netns destruction [ Upstream commit ba1cc08d9488c94cb8d94f545305688b72a2a300 ] fib6_net_exit only frees the main and local tables. If another table was created with fib6_alloc_table, we leak it when the netns is destroyed. Fix this in the same way ip_fib_net_exit cleans up tables, by walking through the whole hashtable of fib6_table's. We can get rid of the special cases for local and main, since they're also part of the hashtable. Reproducer: ip netns add x ip -net x -6 rule add from 6003:1::/64 table 100 ip netns del x Reported-by: Jianlin Shi Fixes: 58f09b78b730 ("[NETNS][IPV6] ip6_fib - make it per network namespace") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_fib.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index af7442211ffb..291ec5e2d3cb 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -201,6 +201,12 @@ static void rt6_release(struct rt6_info *rt) } } +static void fib6_free_table(struct fib6_table *table) +{ + inetpeer_invalidate_tree(&table->tb6_peers); + kfree(table); +} + static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; @@ -1893,15 +1899,22 @@ out_timer: static void fib6_net_exit(struct net *net) { + unsigned int i; + rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); -#ifdef CONFIG_IPV6_MULTIPLE_TABLES - inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers); - kfree(net->ipv6.fib6_local_tbl); -#endif - inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers); - kfree(net->ipv6.fib6_main_tbl); + for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + struct hlist_head *head = &net->ipv6.fib_table_hash[i]; + struct hlist_node *tmp; + struct fib6_table *tb; + + hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { + hlist_del(&tb->tb6_hlist); + fib6_free_table(tb); + } + } + kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); } -- cgit v1.2.3 From bf8ed95d2ca9c99f0237fb3cf56c381b19130610 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Sep 2017 15:48:47 -0700 Subject: ipv6: fix typo in fib6_net_exit() [ Upstream commit 32a805baf0fb70b6dbedefcd7249ac7f580f9e3b ] IPv6 FIB should use FIB6_TABLE_HASHSZ, not FIB_TABLE_HASHSZ. Fixes: ba1cc08d9488 ("ipv6: fix memory leak with multiple tables during netns destruction") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 291ec5e2d3cb..5da864997495 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1904,7 +1904,7 @@ static void fib6_net_exit(struct net *net) rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); - for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; -- cgit v1.2.3 From 3f60dadbe1781e292b560dd353d4a5a637ed192d Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 8 Sep 2017 11:35:21 -0300 Subject: sctp: fix missing wake ups in some situations [ Upstream commit 7906b00f5cd1cd484fced7fcda892176e3202c8a ] Commit fb586f25300f ("sctp: delay calls to sk_data_ready() as much as possible") minimized the number of wake ups that are triggered in case the association receives a packet with multiple data chunks on it and/or when io_events are enabled and then commit 0970f5b36659 ("sctp: signal sk_data_ready earlier on data chunks reception") moved the wake up to as soon as possible. It thus relies on the state machine running later to clean the flag that the event was already generated. The issue is that there are 2 call paths that calls sctp_ulpq_tail_event() outside of the state machine, causing the flag to linger and possibly omitting a needed wake up in the sequence. One of the call paths is when enabling SCTP_SENDER_DRY_EVENTS via setsockopt(SCTP_EVENTS), as noticed by Harald Welte. The other is when partial reliability triggers removal of chunks from the send queue when the application calls sendmsg(). This commit fixes it by not setting the flag in case the socket is not owned by the user, as it won't be cleaned later. This works for user-initiated calls and also for rx path processing. Fixes: fb586f25300f ("sctp: delay calls to sk_data_ready() as much as possible") Reported-by: Harald Welte Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/ulpqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 84d0fdaf7de9..d3cfbf2f407d 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -265,7 +265,8 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) sctp_ulpq_clear_pd(ulpq); if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) { - sp->data_ready_signalled = 1; + if (!sock_owned_by_user(sk)) + sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } return 1; -- cgit v1.2.3 From 60b94125a1fe4988f5392d8537305dad441ef43d Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Thu, 7 Sep 2017 14:08:34 +0800 Subject: ip_tunnel: fix setting ttl and tos value in collect_md mode [ Upstream commit 0f693f1995cf002432b70f43ce73f79bf8d0b6c9 ] ttl and tos variables are declared and assigned, but are not used in iptunnel_xmit() function. Fixes: cfc7381b3002 ("ip_tunnel: add collect_md mode to IPIP tunnel") Cc: Alexei Starovoitov Signed-off-by: Haishuang Yan Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 5719d6ba0824..bd7f1836bb70 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -609,8 +609,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) ip_rt_put(rt); goto tx_dropped; } - iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos, - key->ttl, df, !net_eq(tunnel->net, dev_net(dev))); + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, + df, !net_eq(tunnel->net, dev_net(dev))); return; tx_error: dev->stats.tx_errors++; -- cgit v1.2.3 From 0f90297cba9ba37eb37723423c2df022ce77704a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 10 Aug 2017 17:35:04 -0700 Subject: f2fs: let fill_super handle roll-forward errors commit afd2b4da40b3b567ef8d8e6881479345a2312a03 upstream. If we set CP_ERROR_FLAG in roll-forward error, f2fs is no longer to proceed any IOs due to f2fs_cp_error(). But, for example, if some stale data is involved on roll-forward process, we're able to get -ENOENT, getting fs stuck. If we get any error, let fill_super set SBI_NEED_FSCK and try to recover back to stable point. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/recovery.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2fc84a991325..66395f7c0309 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -626,8 +626,6 @@ out: } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) - set_ckpt_flags(sbi, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ -- cgit v1.2.3 From cc9618c9fffe6bd362f048928e15effe04e5b6cd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 12 Aug 2017 21:33:23 -0700 Subject: f2fs: check hot_data for roll-forward recovery commit 125c9fb1ccb53eb2ea9380df40f3c743f3fb2fed upstream. We need to check HOT_DATA to truncate any previous data block when doing roll-forward recovery. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 66395f7c0309..98c1a63a4614 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -316,7 +316,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, return 0; /* Get the previous summary */ - for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; -- cgit v1.2.3 From c7d1ddec251d39415cd488c29e9d60b22d4b61b7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 1 Aug 2017 07:11:34 -0700 Subject: x86/fsgsbase/64: Fully initialize FS and GS state in start_thread_common commit 767d035d838f4fd6b5a5bbd7a3f6d293b7f65a49 upstream. execve used to leak FSBASE and GSBASE on AMD CPUs. Fix it. The security impact of this bug is small but not quite zero -- it could weaken ASLR when a privileged task execs a less privileged program, but only if program changed bitness across the exec, or the child binary was highly unusual or actively malicious. A child program that was compromised after the exec would not have access to the leaked base. 
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chang Seok Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/process_64.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3760b3c1ca0..02fa4701cc2e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -216,10 +216,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, unsigned int _cs, unsigned int _ss, unsigned int _ds) { + WARN_ON_ONCE(regs != current_pt_regs()); + + if (static_cpu_has(X86_BUG_NULL_SEG)) { + /* Loading zero below won't clear the base. */ + loadsegment(fs, __USER_DS); + load_gs_index(__USER_DS); + } + loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); + regs->ip = new_ip; regs->sp = new_sp; regs->cs = _cs; -- cgit v1.2.3 From 0caec70692a0f19538ed4ebb816df0d5585c8bd0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 1 Aug 2017 07:11:35 -0700 Subject: x86/fsgsbase/64: Report FSBASE and GSBASE correctly in core dumps commit 9584d98bed7a7a904d0702ad06bbcc94703cb5b4 upstream. In ELF_COPY_CORE_REGS, we're copying from the current task, so accessing thread.fsbase and thread.gsbase makes no sense. Just read the values from the CPU registers. In practice, the old code would have been correct most of the time simply because thread.fsbase and thread.gsbase usually matched the CPU registers. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chang Seok Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/elf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index b31761ecce63..7bcd138c3aa9 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -204,6 +204,7 @@ void set_personality_ia32(bool); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ + unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -226,8 +227,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fsbase; \ - (pr_reg)[22] = current->thread.gsbase; \ + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ -- cgit v1.2.3 From 3fddeb80034b2be27179cdc4e23167bc78d304d1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 1 Aug 2017 07:11:37 -0700 Subject: x86/switch_to/64: Rewrite FS/GS switching yet again to fix AMD CPUs commit e137a4d8f4dd2e277e355495b6b2cb241a8693c3 upstream. Switching FS and GS is a mess, and the current code is still subtly wrong: it assumes that "Loading a nonzero value into FS sets the index and base", which is false on AMD CPUs if the value being loaded is 1, 2, or 3. (The current code came from commit 3e2b68d752c9 ("x86/asm, sched/x86: Rewrite the FS and GS context switch code"), which made it better but didn't fully fix it.) 
Rewrite it to be much simpler and more obviously correct. This should fix it fully on AMD CPUs and shouldn't adversely affect performance. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chang Seok Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/process_64.c | 227 +++++++++++++++++++++++-------------------- 1 file changed, 122 insertions(+), 105 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 02fa4701cc2e..0887d2ae3797 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -136,6 +136,123 @@ void release_thread(struct task_struct *dead_task) } } +enum which_selector { + FS, + GS +}; + +/* + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are + * not available. The goal is to be reasonably fast on non-FSGSBASE systems. + * It's forcibly inlined because it'll generate better code and this function + * is hot. + */ +static __always_inline void save_base_legacy(struct task_struct *prev_p, + unsigned short selector, + enum which_selector which) +{ + if (likely(selector == 0)) { + /* + * On Intel (without X86_BUG_NULL_SEG), the segment base could + * be the pre-existing saved base or it could be zero. On AMD + * (with X86_BUG_NULL_SEG), the segment base could be almost + * anything. + * + * This branch is very hot (it's hit twice on almost every + * context switch between 64-bit programs), and avoiding + * the RDMSR helps a lot, so we just assume that whatever + * value is already saved is correct. This matches historical + * Linux behavior, so it won't break existing applications. + * + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we + * report that the base is zero, it needs to actually be zero: + * see the corresponding logic in load_seg_legacy. + */ + } else { + /* + * If the selector is 1, 2, or 3, then the base is zero on + * !X86_BUG_NULL_SEG CPUs and could be anything on + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux + * has never attempted to preserve the base across context + * switches. + * + * If selector > 3, then it refers to a real segment, and + * saving the base isn't necessary. + */ + if (which == FS) + prev_p->thread.fsbase = 0; + else + prev_p->thread.gsbase = 0; + } +} + +static __always_inline void save_fsgs(struct task_struct *task) +{ + savesegment(fs, task->thread.fsindex); + savesegment(gs, task->thread.gsindex); + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); +} + +static __always_inline void loadseg(enum which_selector which, + unsigned short sel) +{ + if (which == FS) + loadsegment(fs, sel); + else + load_gs_index(sel); +} + +static __always_inline void load_seg_legacy(unsigned short prev_index, + unsigned long prev_base, + unsigned short next_index, + unsigned long next_base, + enum which_selector which) +{ + if (likely(next_index <= 3)) { + /* + * The next task is using 64-bit TLS, is not using this + * segment at all, or is having fun with arcane CPU features. + */ + if (next_base == 0) { + /* + * Nasty case: on AMD CPUs, we need to forcibly zero + * the base. 
+ */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + loadseg(which, __USER_DS); + loadseg(which, next_index); + } else { + /* + * We could try to exhaustively detect cases + * under which we can skip the segment load, + * but there's really only one case that matters + * for performance: if both the previous and + * next states are fully zeroed, we can skip + * the load. + * + * (This assumes that prev_base == 0 has no + * false positives. This is the case on + * Intel-style CPUs.) + */ + if (likely(prev_index | next_index | prev_base)) + loadseg(which, next_index); + } + } else { + if (prev_index != next_index) + loadseg(which, next_index); + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + next_base); + } + } else { + /* + * The next task is using a real segment. Loading the selector + * is sufficient. + */ + loadseg(which, next_index); + } +} + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -273,7 +390,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - unsigned prev_fsindex, prev_gsindex; fpu_switch_t fpu_switch; fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); @@ -283,8 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * * (e.g. xen_load_tls()) */ - savesegment(fs, prev_fsindex); - savesegment(gs, prev_gsindex); + save_fsgs(prev_p); /* * Load TLS before restoring any segments so that segment loads @@ -323,108 +438,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); - /* - * Switch FS and GS. - * - * These are even more complicated than DS and ES: they have - * 64-bit bases are that controlled by arch_prctl. The bases - * don't necessarily match the selectors, as user code can do - * any number of things to cause them to be inconsistent. - * - * We don't promise to preserve the bases if the selectors are - * nonzero. We also don't promise to preserve the base if the - * selector is zero and the base doesn't match whatever was - * most recently passed to ARCH_SET_FS/GS. (If/when the - * FSGSBASE instructions are enabled, we'll need to offer - * stronger guarantees.) - * - * As an invariant, - * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is - * impossible. - */ - if (next->fsindex) { - /* Loading a nonzero value into FS sets the index and base. */ - loadsegment(fs, next->fsindex); - } else { - if (next->fsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_fsindex) - loadsegment(fs, 0); - wrmsrl(MSR_FS_BASE, next->fsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - */ - loadsegment(fs, __USER_DS); - loadsegment(fs, 0); - } else { - /* - * If the previous index is zero and ARCH_SET_FS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->fsbase || prev_fsindex) - loadsegment(fs, 0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_fsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. 
- */ - if (prev_fsindex) - prev->fsbase = 0; - prev->fsindex = prev_fsindex; - - if (next->gsindex) { - /* Loading a nonzero value into GS sets the index and base. */ - load_gs_index(next->gsindex); - } else { - if (next->gsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_gsindex) - load_gs_index(0); - wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - * - * This contains a pointless SWAPGS pair. - * Fixing it would involve an explicit check - * for Xen or a new pvop. - */ - load_gs_index(__USER_DS); - load_gs_index(0); - } else { - /* - * If the previous index is zero and ARCH_SET_GS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->gsbase || prev_gsindex) - load_gs_index(0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_gsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_gsindex) - prev->gsbase = 0; - prev->gsindex = prev_gsindex; + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); switch_fpu_finish(next_fpu, fpu_switch); -- cgit v1.2.3 From 4c1d33c4cf864cd1fa14868440daa300a8494900 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 May 2017 16:36:24 -0700 Subject: xfs: Move handling of missing page into one place in xfs_find_get_desired_pgoff() commit a54fba8f5a0dc36161cacdf2aa90f007f702ec1a upstream. Currently several places in xfs_find_get_desired_pgoff() handle the case of a missing page. Make them all handled in one place after the loop has terminated. Signed-off-by: Jan Kara Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_file.c | 38 ++++++++------------------------------ 1 file changed, 8 insertions(+), 30 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index df206cfc21f7..2e04b1cdb0d2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1139,29 +1139,8 @@ xfs_find_get_desired_pgoff( want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); - /* - * No page mapped into given range. If we are searching holes - * and if this is the first time we got into the loop, it means - * that the given offset is landed in a hole, return it. - * - * If we have already stepped through some block buffers to find - * holes but they all contains data. In this case, the last - * offset is already updated and pointed to the end of the last - * mapped page, if it does not reach the endpoint to search, - * that means there should be a hole between them. - */ - if (nr_pages == 0) { - /* Data search found nothing */ - if (type == DATA_OFF) - break; - - ASSERT(type == HOLE_OFF); - if (lastoff == startoff || lastoff < endoff) { - found = true; - *offset = lastoff; - } + if (nr_pages == 0) break; - } for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1227,21 +1206,20 @@ xfs_find_get_desired_pgoff( /* * The number of returned pages less than our desired, search - * done. 
In this case, nothing was found for searching data, - * but we found a hole behind the last offset. + * done. */ - if (nr_pages < want) { - if (type == HOLE_OFF) { - *offset = lastoff; - found = true; - } + if (nr_pages < want) break; - } index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + /* No page at lastoff and we are not done - we found a hole. */ + if (type == HOLE_OFF && lastoff < endoff) { + *offset = lastoff; + found = true; + } out: pagevec_release(&pvec); return found; -- cgit v1.2.3 From 85ab1b23d2d865049299f3c197ce550e80228fac Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 8 Jun 2017 08:23:07 -0700 Subject: xfs: fix spurious spin_is_locked() assert failures on non-smp kernels commit 95989c46d2a156365867b1d795fdefce71bce378 upstream. The 0-day kernel test robot reports assertion failures on !CONFIG_SMP kernels due to failed spin_is_locked() checks. As it turns out, spin_is_locked() is hardcoded to return zero on !CONFIG_SMP kernels and so this function cannot be relied on to verify spinlock state in this configuration. To avoid this problem, replace the associated asserts with lockdep variants that do the right thing regardless of kernel configuration. Drop the one assert that checks for an unlocked lock as there is no suitable lockdep variant for that case. This moves the spinlock checks from XFS debug code to lockdep, but generally provides the same level of protection. Reported-by: kbuild test robot Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_buf.c | 2 +- fs/xfs/xfs_icache.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 16269271ebd6..24940dd3baa8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -116,7 +116,7 @@ static inline void __xfs_buf_ioacct_dec( struct xfs_buf *bp) { - ASSERT(spin_is_locked(&bp->b_lock)); + lockdep_assert_held(&bp->b_lock); if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 74304b6ce84b..e279882de427 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -66,7 +66,6 @@ xfs_inode_alloc( XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); @@ -192,7 +191,7 @@ xfs_perag_set_reclaim_tag( { struct xfs_mount *mp = pag->pag_mount; - ASSERT(spin_is_locked(&pag->pag_ici_lock)); + lockdep_assert_held(&pag->pag_ici_lock); if (pag->pag_ici_reclaimable++) return; @@ -214,7 +213,7 @@ xfs_perag_clear_reclaim_tag( { struct xfs_mount *mp = pag->pag_mount; - ASSERT(spin_is_locked(&pag->pag_ici_lock)); + lockdep_assert_held(&pag->pag_ici_lock); if (--pag->pag_ici_reclaimable) return; -- cgit v1.2.3 From 7cb011bbacef6fcf1d26fe8cd8cc8079404b01f8 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 14 Jun 2017 21:21:45 -0700 Subject: xfs: push buffer of flush locked dquot to avoid quotacheck deadlock commit 7912e7fef2aebe577f0b46d3cba261f2783c5695 upstream. Reclaim during quotacheck can lead to deadlocks on the dquot flush lock: - Quotacheck populates a local delwri queue with the physical dquot buffers. - Quotacheck performs the xfs_qm_dqusage_adjust() bulkstat and dirties all of the dquots. - Reclaim kicks in and attempts to flush a dquot whose buffer is already queud on the quotacheck queue. 
The flush succeeds but queueing to the reclaim delwri queue fails as the backing buffer is already queued. The flush unlock is now deferred to I/O completion of the buffer from the quotacheck queue. - The dqadjust bulkstat continues and dirties the recently flushed dquot once again. - Quotacheck proceeds to the xfs_qm_flush_one() walk which requires the flush lock to update the backing buffers with the in-core recalculated values. It deadlocks on the redirtied dquot as the flush lock was already acquired by reclaim, but the buffer resides on the local delwri queue which isn't submitted until the end of quotacheck. This is reproduced by running quotacheck on a filesystem with a couple million inodes in low memory (512MB-1GB) situations. This is a regression as of commit 43ff2122e6 ("xfs: on-stack delayed write buffer lists"), which removed a trylock and buffer I/O submission from the quotacheck dquot flush sequence. Quotacheck first resets and collects the physical dquot buffers in a delwri queue. Then, it traverses the filesystem inodes via bulkstat, updates the in-core dquots, flushes the corrected dquots to the backing buffers and finally submits the delwri queue for I/O. Since the backing buffers are queued across the entire quotacheck operation, dquot reclaim cannot possibly complete a dquot flush before quotacheck completes. Therefore, quotacheck must submit the buffer for I/O in order to cycle the flush lock and flush the dirty in-core dquot to the buffer. Add a delwri queue buffer push mechanism to submit an individual buffer for I/O without losing the delwri queue status and use it from quotacheck to avoid the deadlock. This restores quotacheck behavior to as before the regression was introduced. Reported-by: Martin Svec Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_buf.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_buf.h | 1 + fs/xfs/xfs_qm.c | 28 ++++++++++++++++++++++++- fs/xfs/xfs_trace.h | 1 + 4 files changed, 89 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 24940dd3baa8..eca7baecc9f0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2022,6 +2022,66 @@ xfs_buf_delwri_submit( return error; } +/* + * Push a single buffer on a delwri queue. + * + * The purpose of this function is to submit a single buffer of a delwri queue + * and return with the buffer still on the original queue. The waiting delwri + * buffer submission infrastructure guarantees transfer of the delwri queue + * buffer reference to a temporary wait list. We reuse this infrastructure to + * transfer the buffer back to the original queue. + * + * Note the buffer transitions from the queued state, to the submitted and wait + * listed state and back to the queued state during this call. The buffer + * locking and queue management logic between _delwri_pushbuf() and + * _delwri_queue() guarantee that the buffer cannot be queued to another list + * before returning. + */ +int +xfs_buf_delwri_pushbuf( + struct xfs_buf *bp, + struct list_head *buffer_list) +{ + LIST_HEAD (submit_list); + int error; + + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + + trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); + + /* + * Isolate the buffer to a new local list so we can submit it for I/O + * independently from the rest of the original list. 
+ */ + xfs_buf_lock(bp); + list_move(&bp->b_list, &submit_list); + xfs_buf_unlock(bp); + + /* + * Delwri submission clears the DELWRI_Q buffer flag and returns with + * the buffer on the wait list with an associated reference. Rather than + * bounce the buffer from a local wait list back to the original list + * after I/O completion, reuse the original list as the wait list. + */ + xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); + + /* + * The buffer is now under I/O and wait listed as during typical delwri + * submission. Lock the buffer to wait for I/O completion. Rather than + * remove the buffer from the wait list and release the reference, we + * want to return with the buffer queued to the original list. The + * buffer already sits on the original list with a wait list reference, + * however. If we let the queue inherit that wait list reference, all we + * need to do is reset the DELWRI_Q flag. + */ + xfs_buf_lock(bp); + error = bp->b_error; + bp->b_flags |= _XBF_DELWRI_Q; + xfs_buf_unlock(bp); + + return error; +} + int __init xfs_buf_init(void) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index ad514a8025dd..f961b19b9cc2 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -333,6 +333,7 @@ extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); +extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 8b9a9f15f022..8068867a8183 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1247,6 +1247,7 @@ xfs_qm_flush_one( struct xfs_dquot *dqp, void *data) { + struct xfs_mount *mp = dqp->q_mount; struct list_head *buffer_list = data; struct xfs_buf *bp = NULL; int error = 0; @@ -1257,7 +1258,32 @@ xfs_qm_flush_one( if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; - xfs_dqflock(dqp); + /* + * The only way the dquot is already flush locked by the time quotacheck + * gets here is if reclaim flushed it before the dqadjust walk dirtied + * it for the final time. Quotacheck collects all dquot bufs in the + * local delwri queue before dquots are dirtied, so reclaim can't have + * possibly queued it for I/O. The only way out is to push the buffer to + * cycle the flush lock. + */ + if (!xfs_dqflock_nowait(dqp)) { + /* buf is pinned in-core by delwri list */ + DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen); + bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL); + if (!bp) { + error = -EINVAL; + goto out_unlock; + } + xfs_buf_unlock(bp); + + xfs_buf_delwri_pushbuf(bp, buffer_list); + xfs_buf_rele(bp); + + error = -EAGAIN; + goto out_unlock; + } + error = xfs_qm_dqflush(dqp, &bp); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 828f383df121..2df73f3a73c1 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -366,6 +366,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); +DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_bdstrat_shut); DEFINE_BUF_EVENT(xfs_buf_item_relse); -- cgit v1.2.3 From ce83e494d1bbbdd045aae236dcbb412cdd721319 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Wed, 14 Jun 2017 21:25:57 -0700 Subject: xfs: try to avoid blowing out the transaction reservation when bunmaping a shared extent commit e1a4e37cc7b665b6804fba812aca2f4d7402c249 upstream. In a pathological scenario where we are trying to bunmapi a single extent in which every other block is shared, it's possible that trying to unmap the entire large extent in a single transaction can generate so many EFIs that we overflow the transaction reservation. Therefore, use a heuristic to guess at the number of blocks we can safely unmap from a reflink file's data fork in an single transaction. This should prevent problems such as the log head slamming into the tail and ASSERTs that trigger because we've exceeded the transaction reservation. Note that since bunmapi can fail to unmap the entire range, we must also teach the deferred unmap code to roll into a new transaction whenever we get low on reservation. Signed-off-by: Darrick J. Wong [hch: random edits, all bugs are my fault] Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_bmap.c | 37 ++++++++++++++++++++++++++++--------- fs/xfs/libxfs/xfs_bmap.h | 2 +- fs/xfs/libxfs/xfs_refcount.c | 10 +--------- fs/xfs/libxfs/xfs_refcount.h | 16 ++++++++++++++++ fs/xfs/xfs_bmap_item.c | 17 +++++++++++++++-- fs/xfs/xfs_trans.h | 2 +- fs/xfs/xfs_trans_bmap.c | 11 +++++++++-- 7 files changed, 71 insertions(+), 24 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2a8cbd15d5d1..b79719a87638 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5555,6 +5555,7 @@ __xfs_bunmapi( int whichfork; /* data or attribute fork */ xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ + xfs_fileoff_t max_len; trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); @@ -5576,6 +5577,16 @@ __xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); + /* + * Guesstimate how many blocks we can unmap without running the risk of + * blowing out the transaction with a mix of EFIs and reflink + * adjustments. + */ + if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) + max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); + else + max_len = len; + if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; @@ -5621,7 +5632,7 @@ __xfs_bunmapi( extno = 0; while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && - (nexts == 0 || extno < nexts)) { + (nexts == 0 || extno < nexts) && max_len > 0) { /* * Is the found extent after a hole in which bno lives? * Just back up to the previous extent, if so. @@ -5655,6 +5666,15 @@ __xfs_bunmapi( } if (del.br_startoff + del.br_blockcount > bno + 1) del.br_blockcount = bno + 1 - del.br_startoff; + + /* How much can we safely unmap? 
*/ + if (max_len < del.br_blockcount) { + del.br_startoff += del.br_blockcount - max_len; + if (!wasdel) + del.br_startblock += del.br_blockcount - max_len; + del.br_blockcount = max_len; + } + sum = del.br_startblock + del.br_blockcount; if (isrt && (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { @@ -5835,6 +5855,7 @@ __xfs_bunmapi( if (!isrt && wasdel) xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false); + max_len -= del.br_blockcount; bno = del.br_startoff - 1; nodelete: /* @@ -6604,25 +6625,24 @@ xfs_bmap_finish_one( int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, + xfs_filblks_t *blockcount, xfs_exntst_t state) { struct xfs_bmbt_irec bmap; int nimaps = 1; xfs_fsblock_t firstfsb; int flags = XFS_BMAPI_REMAP; - int done; int error = 0; bmap.br_startblock = startblock; bmap.br_startoff = startoff; - bmap.br_blockcount = blockcount; + bmap.br_blockcount = *blockcount; bmap.br_state = state; trace_xfs_bmap_deferred(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), - ip->i_ino, whichfork, startoff, blockcount, state); + ip->i_ino, whichfork, startoff, *blockcount, state); if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) return -EFSCORRUPTED; @@ -6641,12 +6661,11 @@ xfs_bmap_finish_one( bmap.br_blockcount, flags, &firstfsb, bmap.br_blockcount, &bmap, &nimaps, dfops); + *blockcount = 0; break; case XFS_BMAP_UNMAP: - error = xfs_bunmapi(tp, ip, bmap.br_startoff, - bmap.br_blockcount, flags, 1, &firstfsb, - dfops, &done); - ASSERT(done); + error = __xfs_bunmapi(tp, ip, startoff, blockcount, + XFS_BMAPI_REMAP, 1, &firstfsb, dfops); break; default: ASSERT(0); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e7d40b39f18f..db53ac7ff6df 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -265,7 +265,7 @@ struct xfs_bmap_intent { int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops, struct xfs_inode *ip, enum xfs_bmap_intent_type type, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, xfs_exntst_t state); + xfs_filblks_t *blockcount, xfs_exntst_t state); int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 82a38d86ebad..e17016163542 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -784,14 +784,6 @@ xfs_refcount_merge_extents( } /* - * While we're adjusting the refcounts records of an extent, we have - * to keep an eye on the number of extents we're dirtying -- run too - * many in a single transaction and we'll exceed the transaction's - * reservation and crash the fs. Each record adds 12 bytes to the - * log (plus any key updates) so we'll conservatively assume 24 bytes - * per record. We must also leave space for btree splits on both ends - * of the range and space for the CUD and a new CUI. - * * XXX: This is a pretty hand-wavy estimate. The penalty for guessing * true incorrectly is a shutdown FS; the penalty for guessing false * incorrectly is more transaction rolls than might be necessary. 
@@ -822,7 +814,7 @@ xfs_refcount_still_have_space( else if (overhead > cur->bc_tp->t_log_res) return false; return cur->bc_tp->t_log_res - overhead > - cur->bc_private.a.priv.refc.nr_ops * 32; + cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } /* diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 098dc668ab2c..eafb9d1f3b37 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp, extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, xfs_agnumber_t agno); +/* + * While we're adjusting the refcounts records of an extent, we have + * to keep an eye on the number of extents we're dirtying -- run too + * many in a single transaction and we'll exceed the transaction's + * reservation and crash the fs. Each record adds 12 bytes to the + * log (plus any key updates) so we'll conservatively assume 32 bytes + * per record. We must also leave space for btree splits on both ends + * of the range and space for the CUD and a new CUI. + */ +#define XFS_REFCOUNT_ITEM_OVERHEAD 32 + +static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) +{ + return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; +} + #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index c4b90e794e41..5a54dcd7e7b1 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -395,6 +395,7 @@ xfs_bui_recover( struct xfs_map_extent *bmap; xfs_fsblock_t startblock_fsb; xfs_fsblock_t inode_fsb; + xfs_filblks_t count; bool op_ok; struct xfs_bud_log_item *budp; enum xfs_bmap_intent_type type; @@ -403,6 +404,7 @@ xfs_bui_recover( struct xfs_trans *tp; struct xfs_inode *ip = NULL; struct xfs_defer_ops dfops; + struct xfs_bmbt_irec irec; xfs_fsblock_t firstfsb; ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); @@ -480,13 +482,24 @@ xfs_bui_recover( } xfs_trans_ijoin(tp, ip, 0); + count = bmap->me_len; error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, ip, whichfork, bmap->me_startoff, - bmap->me_startblock, bmap->me_len, - state); + bmap->me_startblock, &count, state); if (error) goto err_dfops; + if (count > 0) { + ASSERT(type == XFS_BMAP_UNMAP); + irec.br_startblock = bmap->me_startblock; + irec.br_blockcount = count; + irec.br_startoff = bmap->me_startoff; + irec.br_state = state; + error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); + if (error) + goto err_dfops; + } + /* Finish transaction, free inodes. 
*/ error = xfs_defer_finish(&tp, &dfops, NULL); if (error) diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 98024cb933ef..c0e72ab57741 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -277,6 +277,6 @@ int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops, enum xfs_bmap_intent_type type, struct xfs_inode *ip, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, xfs_exntst_t state); + xfs_filblks_t *blockcount, xfs_exntst_t state); #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c index 6408e7d7c08c..14543d93cd4b 100644 --- a/fs/xfs/xfs_trans_bmap.c +++ b/fs/xfs/xfs_trans_bmap.c @@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update( int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, + xfs_filblks_t *blockcount, xfs_exntst_t state) { int error; @@ -196,16 +196,23 @@ xfs_bmap_update_finish_item( void **state) { struct xfs_bmap_intent *bmap; + xfs_filblks_t count; int error; bmap = container_of(item, struct xfs_bmap_intent, bi_list); + count = bmap->bi_bmap.br_blockcount; error = xfs_trans_log_finish_bmap_update(tp, done_item, dop, bmap->bi_type, bmap->bi_owner, bmap->bi_whichfork, bmap->bi_bmap.br_startoff, bmap->bi_bmap.br_startblock, - bmap->bi_bmap.br_blockcount, + &count, bmap->bi_bmap.br_state); + if (!error && count > 0) { + ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); + bmap->bi_bmap.br_blockcount = count; + return -EAGAIN; + } kmem_free(bmap); return error; } -- cgit v1.2.3 From 6c0ecde201d796363b92de79553b75089760d9a4 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 14 Jun 2017 21:35:35 -0700 Subject: xfs: release bli from transaction properly on fs shutdown commit 79e641ce29cfae5b8fc55fb77ac62d11d2d849c0 upstream. If a filesystem shutdown occurs with a buffer log item in the CIL and a log force occurs, the ->iop_unpin() handler is generally expected to tear down the bli properly. This entails freeing the bli memory and releasing the associated hold on the buffer so it can be released and the filesystem unmounted. If this sequence occurs while ->bli_refcount is elevated (i.e., another transaction is open and attempting to modify the buffer), however, ->iop_unpin() may not be responsible for releasing the bli. Instead, the transaction may release the final ->bli_refcount reference and thus xfs_trans_brelse() is responsible for tearing down the bli. While xfs_trans_brelse() does drop the reference count, it only attempts to release the bli if it is clean (i.e., not in the CIL/AIL). If the filesystem is shutdown and the bli is sitting dirty in the CIL as noted above, this ends up skipping the last opportunity to release the bli. In turn, this leaves the hold on the buffer and causes an unmount hang. This can be reproduced by running generic/388 in repetition. Update xfs_trans_brelse() to handle this shutdown corner case correctly. If the final bli reference is dropped and the filesystem is shutdown, remove the bli from the AIL (if necessary) and release the bli to drop the buffer hold and ensure an unmount does not hang. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_trans_buf.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 8ee29ca132dc..86987d823d76 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t *tp, xfs_buf_t *bp) { xfs_buf_log_item_t *bip; + int freed; /* * Default to a normal brelse() call if the tp is NULL. @@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t *tp, /* * Drop our reference to the buf log item. */ - atomic_dec(&bip->bli_refcount); + freed = atomic_dec_and_test(&bip->bli_refcount); /* - * If the buf item is not tracking data in the log, then - * we must free it before releasing the buffer back to the - * free pool. Before releasing the buffer to the free pool, - * clear the transaction pointer in b_fsprivate2 to dissolve - * its relation to this transaction. + * If the buf item is not tracking data in the log, then we must free it + * before releasing the buffer back to the free pool. + * + * If the fs has shutdown and we dropped the last reference, it may fall + * on us to release a (possibly dirty) bli if it never made it to the + * AIL (e.g., the aborted unpin already happened and didn't release it + * due to our reference). Since we're already shutdown and need xa_lock, + * just force remove from the AIL and release the bli here. */ - if (!xfs_buf_item_dirty(bip)) { + if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { + xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bp); + } else if (!xfs_buf_item_dirty(bip)) { /*** ASSERT(bp->b_pincount == 0); ***/ -- cgit v1.2.3 From 8913492d12b1e71bd89bb234408483b7c56700e0 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 14 Jun 2017 21:35:35 -0700 Subject: xfs: remove bli from AIL before release on transaction abort commit 3d4b4a3e30ae7a949c31e1e10268a3da4723d290 upstream. When a buffer is modified, logged and committed, it ultimately ends up sitting on the AIL with a dirty bli waiting for metadata writeback. If another transaction locks and invalidates the buffer (freeing an inode chunk, for example) in the meantime, the bli is flagged as stale, the dirty state is cleared and the bli remains in the AIL. If a shutdown occurs before the transaction that has invalidated the buffer is committed, the transaction is ultimately aborted. The log items are flagged as such and ->iop_unlock() handles the aborted items. Because the bli is clean (due to the invalidation), ->iop_unlock() unconditionally releases it. The log item may still reside in the AIL, however, which means the I/O completion handler may still run and attempt to access it. This results in assert failure due to the release of the bli while still present in the AIL and a subsequent NULL dereference and panic in the buffer I/O completion handling. This can be reproduced by running generic/388 in repetition. To avoid this problem, update xfs_buf_item_unlock() to first check whether the bli is aborted and if so, remove it from the AIL before it is released. This ensures that the bli is no longer accessed during the shutdown sequence after it has been freed. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_buf_item.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 0306168af332..f6a8422e9562 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -636,20 +636,23 @@ xfs_buf_item_unlock( /* * Clean buffers, by definition, cannot be in the AIL. However, aborted - * buffers may be dirty and hence in the AIL. Therefore if we are - * aborting a buffer and we've just taken the last refernce away, we - * have to check if it is in the AIL before freeing it. We need to free - * it in this case, because an aborted transaction has already shut the - * filesystem down and this is the last chance we will have to do so. + * buffers may be in the AIL regardless of dirty state. An aborted + * transaction that invalidates a buffer already in the AIL may have + * marked it stale and cleared the dirty state, for example. + * + * Therefore if we are aborting a buffer and we've just taken the last + * reference away, we have to check if it is in the AIL before freeing + * it. We need to free it in this case, because an aborted transaction + * has already shut the filesystem down and this is the last chance we + * will have to do so. */ if (atomic_dec_and_test(&bip->bli_refcount)) { - if (clean) - xfs_buf_item_relse(bp); - else if (aborted) { + if (aborted) { ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - } + } else if (clean) + xfs_buf_item_relse(bp); } if (!(flags & XFS_BLI_HOLD)) -- cgit v1.2.3 From 621d0b75a3476bce5f1d4e13bb99deaf57b9289d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 19 Jun 2017 13:19:08 -0700 Subject: xfs: don't allow bmap on rt files commit 61d819e7bcb7f33da710bf3f5dcb2bcf1e48203c upstream. bmap returns a dumb LBA address but not the block device that goes with that LBA. Swapfiles don't care about this and will blindly assume that the data volume is the correct blockdev, which is totally bogus for files on the rt subvolume. This results in the swap code doing IOs to arbitrary locations on the data device(!) if the passed in mapping is a realtime file, so just turn off bmap for rt files. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_aops.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 578981412615..f750d888bd17 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1566,9 +1566,12 @@ xfs_vm_bmap( * The swap code (ab-)uses ->bmap to get a block mapping and then * bypasses the file system for actual I/O. We really can't allow * that on reflinks inodes, so we have to skip out here. And yes, - * 0 is the magic code for a bmap error.. + * 0 is the magic code for a bmap error. + * + * Since we don't pass back blockdev info, we can't return bmap + * information for rt files either. */ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); return 0; } -- cgit v1.2.3 From 171192c92da616d5848e0e70c6cab4f14351d275 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:06:33 -0700 Subject: xfs: free uncommitted transactions during log recovery commit 39775431f82f890f4aaa08860a30883d081bffc7 upstream. Log recovery allocates in-core transaction and member item data structures on-demand as it processes the on-disk log. 
Transactions are allocated on first encounter on-disk and stored in a hash table structure where they are easily accessible for subsequent lookups. Transaction items are also allocated on demand and are attached to the associated transactions. When a commit record is encountered in the log, the transaction is committed to the fs and the in-core structures are freed. If a filesystem crashes or shuts down before all in-core log buffers are flushed to the log, however, not all transactions may have commit records in the log. As expected, the modifications in such an incomplete transaction are not replayed to the fs. The in-core data structures for the partial transaction are never freed, however, resulting in a memory leak. Update xlog_do_recovery_pass() to first correctly initialize the hash table array so empty lists can be distinguished from populated lists on function exit. Update xlog_recover_free_trans() to always remove the transaction from the list prior to freeing the associated memory. Finally, walk the hash table of transaction lists as the last step before it goes out of scope and free any transactions that may remain on the lists. This prevents a memory leak of partial transactions in the log. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_log_recover.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9b3d7c76915d..e06aa2827ecb 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4152,7 +4152,7 @@ xlog_recover_commit_trans( #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 - hlist_del(&trans->r_list); + hlist_del_init(&trans->r_list); error = xlog_recover_reorder_trans(log, trans, pass); if (error) @@ -4354,6 +4354,8 @@ xlog_recover_free_trans( xlog_recover_item_t *item, *n; int i; + hlist_del_init(&trans->r_list); + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { /* Free the regions in the item. */ list_del(&item->ri_list); @@ -5222,12 +5224,16 @@ xlog_do_recovery_pass( int error2 = 0; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; + int i; struct hlist_head rhash[XLOG_RHASH_SIZE]; LIST_HEAD (buffer_list); ASSERT(head_blk != tail_blk); rhead_blk = 0; + for (i = 0; i < XLOG_RHASH_SIZE; i++) + INIT_HLIST_HEAD(&rhash[i]); + /* * Read the header of the tail block and get the iclog buffer size from * h_size. Use this to tell how many sectors make up the log header. @@ -5464,6 +5470,19 @@ xlog_do_recovery_pass( if (error && first_bad) *first_bad = rhead_blk; + /* + * Transactions are freed at commit time but transactions without commit + * records on disk are never committed. Free any that may be left in the + * hash table. + */ + for (i = 0; i < XLOG_RHASH_SIZE; i++) { + struct hlist_node *tmp; + struct xlog_recover *trans; + + hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list) + xlog_recover_free_trans(trans); + } + return error ? error : error2; } -- cgit v1.2.3 From b46382f02aff8d9ac141714bc6ae4f972836816f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:06:34 -0700 Subject: xfs: free cowblocks and retry on buffered write ENOSPC commit cf2cb7845d6e101cb17bd62f8aa08cd514fc8988 upstream. XFS runs an eofblocks reclaim scan before returning an ENOSPC error to userspace for buffered writes. 
This facilitates aggressive speculative preallocation without causing user visible side effects such as premature ENOSPC. Run a cowblocks scan in the same situation to reclaim lingering COW fork preallocation throughout the filesystem. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2e04b1cdb0d2..586b398f268d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -729,6 +729,7 @@ write_retry: xfs_rw_iunlock(ip, iolock); eofb.eof_flags = XFS_EOF_FLAGS_SYNC; xfs_icache_free_eofblocks(ip->i_mount, &eofb); + xfs_icache_free_cowblocks(ip->i_mount, &eofb); goto write_retry; } -- cgit v1.2.3 From e76496fa85543c48858c537c1a6465068e18db8b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 17 Sep 2017 14:06:35 -0700 Subject: xfs: don't crash on unexpected holes in dir/attr btrees commit cd87d867920155911d0d2e6485b769d853547750 upstream. In quite a few places we call xfs_da_read_buf with a mappedbno that we don't control, then assume that the function passes back either an error code or a buffer pointer. Unfortunately, if mappedbno == -2 and bno maps to a hole, we get a return code of zero and a NULL buffer, which means that we crash if we actually try to use that buffer pointer. This happens immediately when we set the buffer type for transaction context. Therefore, check that we have no error code and a non-NULL bp before trying to use bp. This patch is a follow-up to an incomplete fix in 96a3aefb8ffde231 ("xfs: don't crash if reading a directory results in an unexpected hole"). Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_attr_leaf.c | 2 +- fs/xfs/libxfs/xfs_da_btree.c | 2 +- fs/xfs/libxfs/xfs_dir2_block.c | 2 +- fs/xfs/libxfs/xfs_dir2_leaf.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2852521fc8ec..c6c15e5717e4 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -351,7 +351,7 @@ xfs_attr3_leaf_read( err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 1bdf2888295b..b305dbfd81c4 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -263,7 +263,7 @@ xfs_da3_node_read( err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, which_fork, &xfs_da3_node_buf_ops); - if (!err && tp) { + if (!err && tp && *bpp) { struct xfs_da_blkinfo *info = (*bpp)->b_addr; int type; diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index aa17cb788946..43c902f7a68d 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -139,7 +139,7 @@ xfs_dir3_block_read( err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index b887fb2a2bcf..f2e342e05365 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -268,7 +268,7 @@ xfs_dir3_leaf_read( err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, 
XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); return err; } @@ -285,7 +285,7 @@ xfs_dir3_leafn_read( err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); return err; } -- cgit v1.2.3 From a6247b0189fab0edbe065ab42e76eddb2a03a631 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 17 Sep 2017 14:06:36 -0700 Subject: xfs: check _btree_check_block value commit 1e86eabe73b73c82e1110c746ed3ec6d5e1c0a0d upstream. Check the _btree_check_block return value for the firstrec and lastrec functions, since we have the ability to signal that the repositioning did not succeed. Fixes-coverity-id: 114067 Fixes-coverity-id: 114068 Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_btree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 91c68913d495..e9f26a09a0be 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -714,7 +714,8 @@ xfs_btree_firstrec( * Get the block pointer for this level. */ block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); + if (xfs_btree_check_block(cur, block, level, bp)) + return 0; /* * It's empty, there is no such record. */ @@ -743,7 +744,8 @@ xfs_btree_lastrec( * Get the block pointer for this level. */ block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); + if (xfs_btree_check_block(cur, block, level, bp)) + return 0; /* * It's empty, there is no such record. */ -- cgit v1.2.3 From c32b1ec8a266476494f04843434538cdb25d9190 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 17 Sep 2017 14:06:37 -0700 Subject: xfs: set firstfsb to NULLFSBLOCK before feeding it to _bmapi_write commit 4c1a67bd3606540b9b42caff34a1d5cd94b1cf65 upstream. We must initialize the firstfsb parameter to _bmapi_write so that it doesn't incorrectly treat stack garbage as a restriction on which AGs it can search for free space. Fixes-coverity-id: 1402025 Fixes-coverity-id: 1415167 Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_bmap.c | 9 +++++++++ fs/xfs/xfs_reflink.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index b79719a87638..73571fb4dfed 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6639,6 +6639,15 @@ xfs_bmap_finish_one( bmap.br_blockcount = *blockcount; bmap.br_state = state; + /* + * firstfsb is tied to the transaction lifetime and is used to + * ensure correct AG locking order and schedule work item + * continuations. XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us + * to only making one bmap call per transaction, so it should + * be safe to have it as a local variable here. 
+ */ + firstfsb = NULLFSBLOCK; + trace_xfs_bmap_deferred(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 29a75ecb2425..350fc64441b1 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -333,7 +333,7 @@ xfs_reflink_convert_cow_extent( struct xfs_defer_ops *dfops) { struct xfs_bmbt_irec irec = *imap; - xfs_fsblock_t first_block; + xfs_fsblock_t first_block = NULLFSBLOCK; int nimaps = 1; if (imap->br_state == XFS_EXT_NORM) -- cgit v1.2.3 From 01bc132048cf9505ed49152cc82e583b18c5538d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 17 Sep 2017 14:06:38 -0700 Subject: xfs: check _alloc_read_agf buffer pointer before using commit 10479e2dea83d4c421ad05dfc55d918aa8dfc0cd upstream. In some circumstances, _alloc_read_agf can return an error code of zero but also a null AGF buffer pointer. Check for this and jump out. Fixes-coverity-id: 1415250 Fixes-coverity-id: 1415320 Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_refcount.c | 4 ++++ fs/xfs/xfs_reflink.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index e17016163542..d71cb63cdea3 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1640,6 +1640,10 @@ xfs_refcount_recover_cow_leftovers( error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) goto out_trans; + if (!agbp) { + error = -ENOMEM; + goto out_trans; + } cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); /* Find all the leftover CoW staging extents. */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 350fc64441b1..0015c19c7455 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -169,6 +169,8 @@ xfs_reflink_find_shared( error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); if (error) return error; + if (!agbp) + return -ENOMEM; cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); -- cgit v1.2.3 From 81e27c94f9ab86c04ba4ca5f1d2bcf9e61f7b5af Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:06:39 -0700 Subject: xfs: fix quotacheck dquot id overflow infinite loop commit cfaf2d034360166e569a4929dd83ae9698bed856 upstream. If a dquot has an id of U32_MAX, the next lookup index increment overflows the uint32_t back to 0. This starts the lookup sequence over from the beginning, repeats indefinitely and results in a livelock. Update xfs_qm_dquot_walk() to explicitly check for the lookup overflow and exit the loop. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_qm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 8068867a8183..1fdd3face2d9 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -111,6 +111,9 @@ restart: skipped = 0; break; } + /* we're done if id overflows back to zero */ + if (!next_index) + break; } if (skipped) { -- cgit v1.2.3 From 229980158f95098ba82e7bec91ce8ada18335bdc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 17 Sep 2017 14:06:40 -0700 Subject: xfs: fix multi-AG deadlock in xfs_bunmapi commit 5b094d6dac0451ad89b1dc088395c7b399b7e9e8 upstream. Just like in the allocator we must avoid touching multiple AGs out of order when freeing blocks, as freeing still locks the AGF and can cause the same AB-BA deadlocks as in the allocation path. 
Signed-off-by: Christoph Hellwig Reported-by: Nikolay Borisov Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_bmap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 73571fb4dfed..2ab50caca14c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5556,6 +5556,7 @@ __xfs_bunmapi( xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ xfs_fileoff_t max_len; + xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); @@ -5658,6 +5659,17 @@ __xfs_bunmapi( ASSERT(ep != NULL); del = got; wasdel = isnullstartblock(del.br_startblock); + + /* + * Make sure we don't touch multiple AGF headers out of order + * in a single transaction, as that could cause AB-BA deadlocks. + */ + if (!wasdel) { + agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); + if (prev_agno != NULLAGNUMBER && prev_agno > agno) + break; + prev_agno = agno; + } if (got.br_startoff < start) { del.br_startoff = start; del.br_blockcount -= start - got.br_startoff; -- cgit v1.2.3 From f90756d75d69cb05d82a061c799c54dc46e1db1b Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 17 Sep 2017 14:06:41 -0700 Subject: xfs: Fix per-inode DAX flag inheritance commit 56bdf855e676f1f2ed7033f288f57dfd315725ba upstream. According to the commit that implemented per-inode DAX flag: commit 58f88ca2df72 ("xfs: introduce per-inode DAX enablement") the flag is supposed to act as "inherit flag". Currently this only works in the situations where parent directory already has a flag in di_flags set, otherwise inheritance does not work. This is because setting the XFS_DIFLAG2_DAX flag is done in a wrong branch designated for di_flags, not di_flags2. Fix this by moving the code to branch designated for setting di_flags2, which does test for flags in di_flags2. Fixes: 58f88ca2df72 ("xfs: introduce per-inode DAX enablement") Signed-off-by: Lukas Czerner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_inode.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7a0b4eeb99e4..98cd905eadca 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -881,7 +881,6 @@ xfs_ialloc( case S_IFREG: case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint64_t di_flags2 = 0; uint di_flags = 0; if (S_ISDIR(mode)) { @@ -918,20 +917,23 @@ xfs_ialloc( di_flags |= XFS_DIFLAG_NODEFRAG; if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; - if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - di_flags2 |= XFS_DIFLAG2_DAX; ip->i_d.di_flags |= di_flags; - ip->i_d.di_flags2 |= di_flags2; } if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && pip->i_d.di_version == 3 && ip->i_d.di_version == 3) { + uint64_t di_flags2 = 0; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; } + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + + ip->i_d.di_flags2 |= di_flags2; } /* FALLTHROUGH */ case S_IFLNK: -- cgit v1.2.3 From 8edd73a13dc03d4bdcb25d9273908a901f880d09 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sun, 17 Sep 2017 14:06:42 -0700 Subject: xfs: fix inobt inode allocation search optimization commit c44245b3d5435f533ca8346ece65918f84c057f9 upstream. When we try to allocate a free inode by searching the inobt, we try to find the inode nearest the parent inode by searching chunks both left and right of the chunk containing the parent. As an optimization, we cache the leftmost and rightmost records that we previously searched; if we do another allocation with the same parent inode, we'll pick up the search where it last left off. There's a bug in the case where we found a free inode to the left of the parent's chunk: we need to update the cached left and right records, but because we already reassigned the right record to point to the left, we end up assigning the left record to both the cached left and right records. This isn't a correctness problem strictly, but it can result in the next allocation rechecking chunks unnecessarily or allocating inodes further away from the parent than it needs to. Fix it by swapping the record pointer after we update the cached left and right records. Fixes: bd169565993b ("xfs: speed up free inode search") Signed-off-by: Omar Sandoval Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index a2818f6e8598..af6acd5f276c 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1236,13 +1236,13 @@ xfs_dialloc_ag_inobt( /* free inodes to the left? */ if (useleft && trec.ir_freecount) { - rec = trec; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); cur = tcur; pag->pagl_leftrec = trec.ir_startino; pag->pagl_rightrec = rec.ir_startino; pag->pagl_pagino = pagino; + rec = trec; goto alloc_inode; } -- cgit v1.2.3 From 7fb3e5e373bb45342c6909ea8320010c461b4082 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 17 Sep 2017 14:06:43 -0700 Subject: xfs: clear MS_ACTIVE after finishing log recovery commit 8204f8ddaafafcae074746fcf2a05a45e6827603 upstream. 
Way back when we established inode block-map redo log items, it was discovered that we needed to prevent the VFS from evicting inodes during log recovery because any given inode might have bmap redo items to replay even if the inode has no link count and is ultimately deleted, and any eviction of an unlinked inode causes the inode to be truncated and freed too early. To make this possible, we set MS_ACTIVE so that inodes would not be torn down immediately upon release. Unfortunately, this also results in the quota inodes not being released at all if a later part of the mount process should fail, because we never reclaim the inodes. So, set MS_ACTIVE right before we do the last part of log recovery and clear it immediately after we finish the log recovery so that everything will be torn down properly if we abort the mount. Fixes: 17c12bcd30 ("xfs: when replaying bmap operations, don't let unlinked inodes get reaped") Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_log.c | 11 +++++++++++ fs/xfs/xfs_mount.c | 10 ---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b57ab34fbf3c..c235170f3a07 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -749,9 +749,20 @@ xfs_log_mount_finish( return 0; } + /* + * During the second phase of log recovery, we need iget and + * iput to behave like they do for an active filesystem. + * xfs_fs_drop_inode needs to be able to prevent the deletion + * of inodes before we're done replaying log items on those + * inodes. Turn it off immediately after recovery finishes + * so that we don't leak the quota inodes if subsequent mount + * activities fail. + */ + mp->m_super->s_flags |= MS_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); + mp->m_super->s_flags &= ~MS_ACTIVE; return error; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 13796f212f98..ab058c7dc598 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -924,15 +924,6 @@ xfs_mountfs( } } - /* - * During the second phase of log recovery, we need iget and - * iput to behave like they do for an active filesystem. - * xfs_fs_drop_inode needs to be able to prevent the deletion - * of inodes before we're done replaying log items on those - * inodes. - */ - mp->m_super->s_flags |= MS_ACTIVE; - /* * Finish recovering the file system. This part needed to be delayed * until after the root and real-time bitmap inodes were consistently @@ -1008,7 +999,6 @@ xfs_mountfs( out_quota: xfs_qm_unmount_quotas(mp); out_rtunmount: - mp->m_super->s_flags &= ~MS_ACTIVE; xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); -- cgit v1.2.3 From e1a7b7e1f6c294f64602b9cb1c15d44432f48561 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 17 Sep 2017 14:06:44 -0700 Subject: xfs: don't leak quotacheck dquots when cow recovery commit 77aff8c76425c8f49b50d0b9009915066739e7d2 upstream. If we fail a mount on account of cow recovery errors, it's possible that a previous quotacheck left some dquots in memory. The bailout clause of xfs_mountfs forgets to purge these, and so we leak them. Fix that. Signed-off-by: Darrick J. 
Wong Reviewed-by: Brian Foster Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_mount.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index ab058c7dc598..d4ce8d277992 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1004,6 +1004,8 @@ xfs_mountfs( IRELE(rip); cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); + /* Clean out dquots that might be in memory after quotacheck. */ + xfs_qm_unmount(mp); out_log_dealloc: mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); -- cgit v1.2.3 From ec0d46ef8b7e35b4f7c82bcf12afbe96b711350f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 17 Sep 2017 14:06:45 -0700 Subject: iomap: fix integer truncation issues in the zeroing and dirtying helpers commit e28ae8e428fefe2facd72cea9f29906ecb9c861d upstream. Fix the min_t calls in the zeroing and dirtying helpers to perform the comparisons on 64-bit types, which prevents them from incorrectly being truncated, and larger zeroing operations being stuck in a never ending loop. Special thanks to Markus Stockhausen for spotting the bug. Reported-by: Paul Menzel Tested-by: Paul Menzel Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/iomap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/iomap.c b/fs/iomap.c index 798c291cbc75..a49db8806a3a 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -281,7 +281,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, unsigned long bytes; /* Bytes to write to page */ offset = (pos & (PAGE_SIZE - 1)); - bytes = min_t(unsigned long, PAGE_SIZE - offset, length); + bytes = min_t(loff_t, PAGE_SIZE - offset, length); rpage = __iomap_read_page(inode, pos); if (IS_ERR(rpage)) @@ -376,7 +376,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, unsigned offset, bytes; offset = pos & (PAGE_SIZE - 1); /* Within page */ - bytes = min_t(unsigned, PAGE_SIZE - offset, count); + bytes = min_t(loff_t, PAGE_SIZE - offset, count); if (IS_DAX(inode)) status = iomap_dax_zero(pos, offset, bytes, iomap); -- cgit v1.2.3 From 01d38e380746e5880d744c634f0c459ea6646dd9 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 17 Sep 2017 14:06:46 -0700 Subject: xfs: write unmount record for ro mounts commit 757a69ef6cf2bf839bd4088e5609ddddd663b0c4 upstream. There are dueling comments in the xfs code about intent for log writes when unmounting a readonly filesystem. In xfs_mountfs, we see the intent: /* * Now the log is fully replayed, we can transition to full read-only * mode for read-only mounts. This will sync all the metadata and clean * the log so that the recovery we just performed does not have to be * replayed again on the next mount. */ and it calls xfs_quiesce_attr(), but by the time we get to xfs_log_unmount_write(), it returns early for a RDONLY mount: * Don't write out unmount record on read-only mounts. Because of this, sequential ro mounts of a filesystem with a dirty log will replay the log each time, which seems odd. Fix this by writing an unmount record even for RO mounts, as long as norecovery wasn't specified (don't write a clean log record if a dirty log may still be there!) and the log device is writable. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_log.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c235170f3a07..4f59cbc0b185 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -812,11 +812,14 @@ xfs_log_unmount_write(xfs_mount_t *mp) int error; /* - * Don't write out unmount record on read-only mounts. + * Don't write out unmount record on norecovery mounts or ro devices. * Or, if we are doing a forced umount (typically because of IO errors). */ - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (mp->m_flags & XFS_MOUNT_NORECOVERY || + xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); return 0; + } error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); -- cgit v1.2.3 From 9a3f752290907e7bfa80a333e4965574932f5670 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 17 Sep 2017 14:06:47 -0700 Subject: xfs: toggle readonly state around xfs_log_mount_finish commit 6f4a1eefdd0ad4561543270a7fceadabcca075dd upstream. When we do log recovery on a readonly mount, unlinked inode processing does not happen due to the readonly checks in xfs_inactive(), which are trying to prevent any I/O on a readonly mount. This is misguided - we do I/O on readonly mounts all the time, for consistency; for example, log recovery. So do the same RDONLY flag twiddling around xfs_log_mount_finish() as we do around xfs_log_mount(), for the same reason. This all cries out for a big rework but for now this is a simple fix to an obvious problem. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_log.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4f59cbc0b185..ebe20f1591f1 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -743,10 +743,14 @@ xfs_log_mount_finish( struct xfs_mount *mp) { int error = 0; + bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); if (mp->m_flags & XFS_MOUNT_NORECOVERY) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); return 0; + } else if (readonly) { + /* Allow unlinked processing to proceed */ + mp->m_flags &= ~XFS_MOUNT_RDONLY; } /* @@ -764,6 +768,9 @@ xfs_log_mount_finish( xfs_log_work_queue(mp); mp->m_super->s_flags &= ~MS_ACTIVE; + if (readonly) + mp->m_flags |= XFS_MOUNT_RDONLY; + return error; } -- cgit v1.2.3 From 1ba04933408e4b4567f557d363f7bdecfabe9399 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 17 Sep 2017 14:06:48 -0700 Subject: xfs: remove xfs_trans_ail_delete_bulk commit 27af1bbf524459962d1477a38ac6e0b7f79aaecc upstream. xfs_iflush_done uses an on-stack variable length array to pass the log items to be deleted to xfs_trans_ail_delete_bulk. On-stack VLAs are a nasty gcc extension that can lead to unbounded stack allocations, but fortunately we can easily avoid them by simply open coding xfs_trans_ail_delete_bulk in xfs_iflush_done, which is the only caller of it except for the single-item xfs_trans_ail_delete. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_inode_item.c | 29 +++++++++++--------- fs/xfs/xfs_trans_ail.c | 71 ++++++++++++++++++++++++------------------------- fs/xfs/xfs_trans_priv.h | 15 +++-------- 3 files changed, 55 insertions(+), 60 deletions(-) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d90e7811ccdd..08cb7d1a4a3a 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -731,22 +731,27 @@ xfs_iflush_done( * holding the lock before removing the inode from the AIL. */ if (need_ail) { - struct xfs_log_item *log_items[need_ail]; - int i = 0; + bool mlip_changed = false; + + /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->xa_lock); for (blip = lip; blip; blip = blip->li_bio_list) { - iip = INODE_ITEM(blip); - if (iip->ili_logged && - blip->li_lsn == iip->ili_flush_lsn) { - log_items[i++] = blip; - } - ASSERT(i <= need_ail); + if (INODE_ITEM(blip)->ili_logged && + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) + mlip_changed |= xfs_ail_delete_one(ailp, blip); } - /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ - xfs_trans_ail_delete_bulk(ailp, log_items, i, - SHUTDOWN_CORRUPT_INCORE); - } + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (list_empty(&ailp->xa_ail)) + wake_up_all(&ailp->xa_empty); + } + spin_unlock(&ailp->xa_lock); + + if (mlip_changed) + xfs_log_space_wake(ailp->xa_mount); + } /* * clean up and unlock the flush lock now we are done. We can clear the diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d6c9c3e9e02b..9056c0f34a3c 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -684,8 +684,23 @@ xfs_trans_ail_update_bulk( } } -/* - * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL +bool +xfs_ail_delete_one( + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_log_item *mlip = xfs_ail_min(ailp); + + trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); + xfs_ail_delete(ailp, lip); + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + + return mlip == lip; +} + +/** + * Remove a log items from the AIL * * @xfs_trans_ail_delete_bulk takes an array of log items that all need to * removed from the AIL. The caller is already holding the AIL lock, and done @@ -706,52 +721,36 @@ xfs_trans_ail_update_bulk( * before returning. 
*/ void -xfs_trans_ail_delete_bulk( +xfs_trans_ail_delete( struct xfs_ail *ailp, - struct xfs_log_item **log_items, - int nr_items, + struct xfs_log_item *lip, int shutdown_type) __releases(ailp->xa_lock) { - xfs_log_item_t *mlip; - int mlip_changed = 0; - int i; + struct xfs_mount *mp = ailp->xa_mount; + bool mlip_changed; - mlip = xfs_ail_min(ailp); - - for (i = 0; i < nr_items; i++) { - struct xfs_log_item *lip = log_items[i]; - if (!(lip->li_flags & XFS_LI_IN_AIL)) { - struct xfs_mount *mp = ailp->xa_mount; - - spin_unlock(&ailp->xa_lock); - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_alert_tag(mp, XFS_PTAG_AILDELETE, - "%s: attempting to delete a log item that is not in the AIL", - __func__); - xfs_force_shutdown(mp, shutdown_type); - } - return; + if (!(lip->li_flags & XFS_LI_IN_AIL)) { + spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + "%s: attempting to delete a log item that is not in the AIL", + __func__); + xfs_force_shutdown(mp, shutdown_type); } - - trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); - xfs_ail_delete(ailp, lip); - lip->li_flags &= ~XFS_LI_IN_AIL; - lip->li_lsn = 0; - if (mlip == lip) - mlip_changed = 1; + return; } + mlip_changed = xfs_ail_delete_one(ailp, lip); if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) - xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (!XFS_FORCED_SHUTDOWN(mp)) + xlog_assign_tail_lsn_locked(mp); if (list_empty(&ailp->xa_ail)) wake_up_all(&ailp->xa_empty); - spin_unlock(&ailp->xa_lock); + } + spin_unlock(&ailp->xa_lock); + if (mlip_changed) xfs_log_space_wake(ailp->xa_mount); - } else { - spin_unlock(&ailp->xa_lock); - } } int diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 49931b72da8a..d91706c56c63 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -106,18 +106,9 @@ xfs_trans_ail_update( xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } -void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, - struct xfs_log_item **log_items, int nr_items, - int shutdown_type) - __releases(ailp->xa_lock); -static inline void -xfs_trans_ail_delete( - struct xfs_ail *ailp, - xfs_log_item_t *lip, - int shutdown_type) __releases(ailp->xa_lock) -{ - xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); -} +bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, + int shutdown_type) __releases(ailp->xa_lock); static inline void xfs_trans_ail_remove( -- cgit v1.2.3 From 7942f605c3086abe6c9f61f29e9326c48d5c8095 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Sun, 17 Sep 2017 14:06:49 -0700 Subject: xfs: Add infrastructure needed for error propagation during buffer IO failure commit 0b80ae6ed13169bd3a244e71169f2cc020b0c57a upstream. With the current code, XFS never re-submit a failed buffer for IO, because the failed item in the buffer is kept in the flush locked state forever. To be able to resubmit an log item for IO, we need a way to mark an item as failed, if, for any reason the buffer which the item belonged to failed during writeback. Add a new log item callback to be used after an IO completion failure and make the needed clean ups. Reviewed-by: Brian Foster Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_buf_item.c | 32 +++++++++++++++++++++++++++++++- fs/xfs/xfs_trans.h | 7 +++++-- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index f6a8422e9562..7573a1f0bc9a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -29,6 +29,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_inode.h" kmem_zone_t *xfs_buf_item_zone; @@ -1054,6 +1055,31 @@ xfs_buf_do_callbacks( } } +/* + * Invoke the error state callback for each log item affected by the failed I/O. + * + * If a metadata buffer write fails with a non-permanent error, the buffer is + * eventually resubmitted and so the completion callbacks are not run. The error + * state may need to be propagated to the log items attached to the buffer, + * however, so the next AIL push of the item knows hot to handle it correctly. + */ +STATIC void +xfs_buf_do_callbacks_fail( + struct xfs_buf *bp) +{ + struct xfs_log_item *next; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_ail *ailp = lip->li_ailp; + + spin_lock(&ailp->xa_lock); + for (; lip; lip = next) { + next = lip->li_bio_list; + if (lip->li_ops->iop_error) + lip->li_ops->iop_error(lip, bp); + } + spin_unlock(&ailp->xa_lock); +} + static bool xfs_buf_iodone_callback_error( struct xfs_buf *bp) @@ -1123,7 +1149,11 @@ xfs_buf_iodone_callback_error( if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) goto permanent_error; - /* still a transient error, higher layers will retry */ + /* + * Still a transient error, run IO completion failure callbacks and let + * the higher layers retry the buffer. + */ + xfs_buf_do_callbacks_fail(bp); xfs_buf_ioerror(bp, 0); xfs_buf_relse(bp); return true; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c0e72ab57741..22fddad9fe11 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -65,11 +65,13 @@ typedef struct xfs_log_item { } xfs_log_item_t; #define XFS_LI_IN_AIL 0x1 -#define XFS_LI_ABORTED 0x2 +#define XFS_LI_ABORTED 0x2 +#define XFS_LI_FAILED 0x4 #define XFS_LI_FLAGS \ { XFS_LI_IN_AIL, "IN_AIL" }, \ - { XFS_LI_ABORTED, "ABORTED" } + { XFS_LI_ABORTED, "ABORTED" }, \ + { XFS_LI_FAILED, "FAILED" } struct xfs_item_ops { void (*iop_size)(xfs_log_item_t *, int *, int *); @@ -80,6 +82,7 @@ struct xfs_item_ops { void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); + void (*iop_error)(xfs_log_item_t *, xfs_buf_t *); }; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, -- cgit v1.2.3 From 0800356def7f3ede34986adeeb03235176297eb0 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Sun, 17 Sep 2017 14:06:50 -0700 Subject: xfs: Properly retry failed inode items in case of error during buffer writeback commit d3a304b6292168b83b45d624784f973fdc1ca674 upstream. When a buffer has been failed during writeback, the inode items into it are kept flush locked, and are never resubmitted due the flush lock, so, if any buffer fails to be written, the items in AIL are never written to disk and never unlocked. This causes unmount operation to hang due these items flush locked in AIL, but this also causes the items in AIL to never be written back, even when the IO device comes back to normal. I've been testing this patch with a DM-thin device, creating a filesystem larger than the real device. 
When writing enough data to fill the DM-thin device, XFS receives ENOSPC errors from the device, and keep spinning on xfsaild (when 'retry forever' configuration is set). At this point, the filesystem can not be unmounted because of the flush locked items in AIL, but worse, the items in AIL are never retried at all (once xfs_inode_item_push() will skip the items that are flush locked), even if the underlying DM-thin device is expanded to the proper size. This patch fixes both cases, retrying any item that has been failed previously, using the infra-structure provided by the previous patch. Reviewed-by: Brian Foster Signed-off-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_buf_item.c | 28 ++++++++++++++++++++++++++++ fs/xfs/xfs_buf_item.h | 3 +++ fs/xfs/xfs_inode_item.c | 47 +++++++++++++++++++++++++++++++++++++++++++---- fs/xfs/xfs_trans.h | 1 + fs/xfs/xfs_trans_ail.c | 3 ++- fs/xfs/xfs_trans_priv.h | 31 +++++++++++++++++++++++++++++++ 6 files changed, 108 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 7573a1f0bc9a..573fc72c3f23 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1234,3 +1234,31 @@ xfs_buf_iodone( xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } + +/* + * Requeue a failed buffer for writeback + * + * Return true if the buffer has been re-queued properly, false otherwise + */ +bool +xfs_buf_resubmit_failed_buffers( + struct xfs_buf *bp, + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + struct xfs_log_item *next; + + /* + * Clear XFS_LI_FAILED flag from all items before resubmit + * + * XFS_LI_FAILED set/clear is protected by xa_lock, caller this + * function already have it acquired + */ + for (; lip; lip = next) { + next = lip->li_bio_list; + xfs_clear_li_failed(lip); + } + + /* Add this buffer back to the delayed write list */ + return xfs_buf_delwri_queue(bp, buffer_list); +} diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index f7eba99d19dd..530686e1afb9 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -70,6 +70,9 @@ void xfs_buf_attach_iodone(struct xfs_buf *, xfs_log_item_t *); void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); +bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, + struct xfs_log_item *, + struct list_head *); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 08cb7d1a4a3a..94915747042c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -27,6 +27,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_trans_priv.h" +#include "xfs_buf_item.h" #include "xfs_log.h" @@ -475,6 +476,23 @@ xfs_inode_item_unpin( wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } +/* + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer + * have been failed during writeback + * + * This informs the AIL that the inode is already flush locked on the next push, + * and acquires a hold on the buffer to ensure that it isn't reclaimed before + * dirty data makes it to disk. 
+ */ +STATIC void +xfs_inode_item_error( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode)); + xfs_set_li_failed(lip, bp); +} + STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, @@ -484,13 +502,28 @@ xfs_inode_item_push( { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp = lip->li_buf; uint rval = XFS_ITEM_SUCCESS; int error; if (xfs_ipincount(ip) > 0) return XFS_ITEM_PINNED; + /* + * The buffer containing this item failed to be written back + * previously. Resubmit the buffer for IO. + */ + if (lip->li_flags & XFS_LI_FAILED) { + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) + rval = XFS_ITEM_FLUSHING; + + xfs_buf_unlock(bp); + return rval; + } + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED; @@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = { .iop_unlock = xfs_inode_item_unlock, .iop_committed = xfs_inode_item_committed, .iop_push = xfs_inode_item_push, - .iop_committing = xfs_inode_item_committing + .iop_committing = xfs_inode_item_committing, + .iop_error = xfs_inode_item_error }; @@ -710,7 +744,8 @@ xfs_iflush_done( * the AIL lock. */ iip = INODE_ITEM(blip); - if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) + if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || + lip->li_flags & XFS_LI_FAILED) need_ail++; blip = next; @@ -718,7 +753,8 @@ xfs_iflush_done( /* make sure we capture the state of the initial inode. */ iip = INODE_ITEM(lip); - if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) + if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || + lip->li_flags & XFS_LI_FAILED) need_ail++; /* @@ -739,6 +775,9 @@ xfs_iflush_done( if (INODE_ITEM(blip)->ili_logged && blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) mlip_changed |= xfs_ail_delete_one(ailp, blip); + else { + xfs_clear_li_failed(blip); + } } if (mlip_changed) { diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 22fddad9fe11..0318e92aed66 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -50,6 +50,7 @@ typedef struct xfs_log_item { struct xfs_ail *li_ailp; /* ptr to AIL */ uint li_type; /* item type */ uint li_flags; /* misc flags */ + struct xfs_buf *li_buf; /* real buffer pointer */ struct xfs_log_item *li_bio_list; /* buffer item list */ void (*li_cb)(struct xfs_buf *, struct xfs_log_item *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 9056c0f34a3c..70f5ab017323 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -687,12 +687,13 @@ xfs_trans_ail_update_bulk( bool xfs_ail_delete_one( struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip) { struct xfs_log_item *mlip = xfs_ail_min(ailp); trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); + xfs_clear_li_failed(lip); lip->li_flags &= ~XFS_LI_IN_AIL; lip->li_lsn = 0; diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index d91706c56c63..b317a3644c00 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -164,4 +164,35 @@ xfs_trans_ail_copy_lsn( *dst = *src; } #endif + +static inline void +xfs_clear_li_failed( + struct xfs_log_item *lip) +{ + struct xfs_buf *bp = lip->li_buf; + + ASSERT(lip->li_flags & XFS_LI_IN_AIL); + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (lip->li_flags & XFS_LI_FAILED) { + lip->li_flags &= ~XFS_LI_FAILED; + 
lip->li_buf = NULL; + xfs_buf_rele(bp); + } +} + +static inline void +xfs_set_li_failed( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (!(lip->li_flags & XFS_LI_FAILED)) { + xfs_buf_hold(bp); + lip->li_flags |= XFS_LI_FAILED; + lip->li_buf = bp; + } +} + #endif /* __XFS_TRANS_PRIV_H__ */ -- cgit v1.2.3 From 35093926c2f8bd259e50b73685f638095cc59c89 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:06:51 -0700 Subject: xfs: fix recovery failure when log record header wraps log end commit 284f1c2c9bebf871861184b0e2c40fa921dd380b upstream. The high-level log recovery algorithm consists of two loops that walk the physical log and process log records from the tail to the head. The first loop handles the case where the tail is beyond the head and processes records up to the end of the physical log. The subsequent loop processes records from the beginning of the physical log to the head. Because log records can wrap around the end of the physical log, the first loop mentioned above must handle this case appropriately. Records are processed from in-core buffers, which means that this algorithm must split the reads of such records into two partial I/Os: 1.) from the beginning of the record to the end of the log and 2.) from the beginning of the log to the end of the record. This is further complicated by the fact that the log record header and log record data are read into independent buffers. The current handling of each buffer correctly splits the reads when either the header or data starts before the end of the log and wraps around the end. The data read does not correctly handle the case where the prior header read wrapped or ends on the physical log end boundary. blk_no is incremented to or beyond the log end after the header read to point to the record data, but the split data read logic triggers, attempts to read from an invalid log block and ultimately causes log recovery to fail. This can be reproduced fairly reliably via xfstests tests generic/047 and generic/388 with large iclog sizes (256k) and small (10M) logs. If the record header read has pushed beyond the end of the physical log, the subsequent data read is actually contiguous. Update the data read logic to detect the case where blk_no has wrapped, mod it against the log size to read from the correct address and issue one contiguous read for the log data buffer. The log record is processed as normal from the buffer(s), the loop exits after the current iteration and the subsequent loop picks up with the first new record after the start of the log. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_log_recover.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e06aa2827ecb..9cef8916314a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -5216,7 +5216,7 @@ xlog_do_recovery_pass( xfs_daddr_t *first_bad) /* out: first bad log rec */ { xlog_rec_header_t *rhead; - xfs_daddr_t blk_no; + xfs_daddr_t blk_no, rblk_no; xfs_daddr_t rhead_blk; char *offset; xfs_buf_t *hbp, *dbp; @@ -5369,9 +5369,19 @@ xlog_do_recovery_pass( bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); blk_no += hblks; - /* Read in data for log record */ - if (blk_no + bblks <= log->l_logBBsize) { - error = xlog_bread(log, blk_no, bblks, dbp, + /* + * Read the log record data in multiple reads if it + * wraps around the end of the log. Note that if the + * header already wrapped, blk_no could point past the + * end of the log. The record data is contiguous in + * that case. + */ + if (blk_no + bblks <= log->l_logBBsize || + blk_no >= log->l_logBBsize) { + /* mod blk_no in case the header wrapped and + * pushed it beyond the end of the log */ + rblk_no = do_mod(blk_no, log->l_logBBsize); + error = xlog_bread(log, rblk_no, bblks, dbp, &offset); if (error) goto bread_err2; -- cgit v1.2.3 From e34b72a2381e6432b9eab07a3ec285b59a80e45f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:06:52 -0700 Subject: xfs: always verify the log tail during recovery commit 5297ac1f6d7cbf45464a49b9558831f271dfc559 upstream. Log tail verification currently only occurs when torn writes are detected at the head of the log. This was introduced because a change in the head block due to torn writes can lead to a change in the tail block (each log record header references the current tail) and the tail block should be verified before log recovery proceeds. Tail corruption is possible outside of torn write scenarios, however. For example, partial log writes can be detected and cleared during the initial head/tail block discovery process. If the partial write coincides with a tail overwrite, the log tail is corrupted and recovery fails. To facilitate correct handling of log tail overwites, update log recovery to always perform tail verification. This is necessary to detect potential tail overwrite conditions when torn writes may not have occurred. This changes normal (i.e., no torn writes) recovery behavior slightly to detect and return CRC related errors near the tail before actual recovery starts. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_log_recover.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9cef8916314a..1457fa0f8637 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1183,31 +1183,11 @@ xlog_verify_head( ASSERT(0); return 0; } - - /* - * Now verify the tail based on the updated head. This is - * required because the torn writes trimmed from the head could - * have been written over the tail of a previous record. Return - * any errors since recovery cannot proceed if the tail is - * corrupt. - * - * XXX: This leaves a gap in truly robust protection from torn - * writes in the log. 
If the head is behind the tail, the tail - * pushes forward to create some space and then a crash occurs - * causing the writes into the previous record's tail region to - * tear, log recovery isn't able to recover. - * - * How likely is this to occur? If possible, can we do something - * more intelligent here? Is it safe to push the tail forward if - * we can determine that the tail is within the range of the - * torn write (e.g., the kernel can only overwrite the tail if - * it has actually been pushed forward)? Alternatively, could we - * somehow prevent this condition at runtime? - */ - error = xlog_verify_tail(log, *head_blk, *tail_blk); } + if (error) + return error; - return error; + return xlog_verify_tail(log, *head_blk, *tail_blk); } /* -- cgit v1.2.3 From 47db1fc608b89820f712ab7806b0bd4d4ed69c16 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:06:53 -0700 Subject: xfs: fix log recovery corruption error due to tail overwrite commit 4a4f66eac4681378996a1837ad1ffec3a2e2981f upstream. If we consider the case where the tail (T) of the log is pinned long enough for the head (H) to push and block behind the tail, we can end up blocked in the following state without enough free space (f) in the log to satisfy a transaction reservation: 0 phys. log N [-------HffT---H'--T'---] The last good record in the log (before H) refers to T. The tail eventually pushes forward (T') leaving more free space in the log for writes to H. At this point, suppose space frees up in the log for the maximum of 8 in-core log buffers to start flushing out to the log. If this pushes the head from H to H', these next writes overwrite the previous tail T. This is safe because the items logged from T to T' have been written back and removed from the AIL. If the next log writes (H -> H') happen to fail and result in partial records in the log, the filesystem shuts down having overwritten T with invalid data. Log recovery correctly locates H on the subsequent mount, but H still refers to the now corrupted tail T. This results in log corruption errors and recovery failure. Since the tail overwrite results from otherwise correct runtime behavior, it is up to log recovery to try and deal with this situation. Update log recovery tail verification to run a CRC pass from the first record past the tail to the head. This facilitates error detection at T and moves the recovery tail to the first good record past H' (similar to truncating the head on torn write detection). If corruption is detected beyond the range possibly affected by the max number of iclogs, the log is legitimately corrupted and log recovery failure is expected. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_log_recover.c | 108 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 77 insertions(+), 31 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1457fa0f8637..fdad8c9ecbb3 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1029,61 +1029,106 @@ out_error: } /* - * Check the log tail for torn writes. This is required when torn writes are - * detected at the head and the head had to be walked back to a previous record. - * The tail of the previous record must now be verified to ensure the torn - * writes didn't corrupt the previous tail. + * Calculate distance from head to tail (i.e., unused space in the log). 
+ */ +static inline int +xlog_tail_distance( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + if (head_blk < tail_blk) + return tail_blk - head_blk; + + return tail_blk + (log->l_logBBsize - head_blk); +} + +/* + * Verify the log tail. This is particularly important when torn or incomplete + * writes have been detected near the front of the log and the head has been + * walked back accordingly. + * + * We also have to handle the case where the tail was pinned and the head + * blocked behind the tail right before a crash. If the tail had been pushed + * immediately prior to the crash and the subsequent checkpoint was only + * partially written, it's possible it overwrote the last referenced tail in the + * log with garbage. This is not a coherency problem because the tail must have + * been pushed before it can be overwritten, but appears as log corruption to + * recovery because we have no way to know the tail was updated if the + * subsequent checkpoint didn't write successfully. * - * Return an error if CRC verification fails as recovery cannot proceed. + * Therefore, CRC check the log from tail to head. If a failure occurs and the + * offending record is within max iclog bufs from the head, walk the tail + * forward and retry until a valid tail is found or corruption is detected out + * of the range of a possible overwrite. */ STATIC int xlog_verify_tail( struct xlog *log, xfs_daddr_t head_blk, - xfs_daddr_t tail_blk) + xfs_daddr_t *tail_blk, + int hsize) { struct xlog_rec_header *thead; struct xfs_buf *bp; xfs_daddr_t first_bad; - int count; int error = 0; bool wrapped; - xfs_daddr_t tmp_head; + xfs_daddr_t tmp_tail; + xfs_daddr_t orig_tail = *tail_blk; bp = xlog_get_bp(log, 1); if (!bp) return -ENOMEM; /* - * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get - * a temporary head block that points after the last possible - * concurrently written record of the tail. + * Make sure the tail points to a record (returns positive count on + * success). */ - count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, - XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, - &wrapped); - if (count < 0) { - error = count; + error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp, + &tmp_tail, &thead, &wrapped); + if (error < 0) goto out; - } + if (*tail_blk != tmp_tail) + *tail_blk = tmp_tail; /* - * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran - * into the actual log head. tmp_head points to the start of the record - * so update it to the actual head block. + * Run a CRC check from the tail to the head. We can't just check + * MAX_ICLOGS records past the tail because the tail may point to stale + * blocks cleared during the search for the head/tail. These blocks are + * overwritten with zero-length records and thus record count is not a + * reliable indicator of the iclog state before a crash. */ - if (count < XLOG_MAX_ICLOGS + 1) - tmp_head = head_blk; - - /* - * We now have a tail and temporary head block that covers at least - * XLOG_MAX_ICLOGS records from the tail. We need to verify that these - * records were completely written. Run a CRC verification pass from - * tail to head and return the result. - */ - error = xlog_do_recovery_pass(log, tmp_head, tail_blk, + first_bad = 0; + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, XLOG_RECOVER_CRCPASS, &first_bad); + while (error == -EFSBADCRC && first_bad) { + int tail_distance; + + /* + * Is corruption within range of the head? If so, retry from + * the next record. 
Otherwise return an error. + */ + tail_distance = xlog_tail_distance(log, head_blk, first_bad); + if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize)) + break; + /* skip to the next record; returns positive count on success */ + error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp, + &tmp_tail, &thead, &wrapped); + if (error < 0) + goto out; + + *tail_blk = tmp_tail; + first_bad = 0; + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, + XLOG_RECOVER_CRCPASS, &first_bad); + } + + if (!error && *tail_blk != orig_tail) + xfs_warn(log->l_mp, + "Tail block (0x%llx) overwrite detected. Updated to 0x%llx", + orig_tail, *tail_blk); out: xlog_put_bp(bp); return error; @@ -1187,7 +1232,8 @@ xlog_verify_head( if (error) return error; - return xlog_verify_tail(log, *head_blk, *tail_blk); + return xlog_verify_tail(log, *head_blk, tail_blk, + be32_to_cpu((*rhead)->h_size)); } /* -- cgit v1.2.3 From 7549e7c01fb0220e47515ad3ee52f46e2742f178 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:06:54 -0700 Subject: xfs: handle -EFSCORRUPTED during head/tail verification commit a4c9b34d6a17081005ec459b57b8effc08f4c731 upstream. Torn write and tail overwrite detection both trigger only on -EFSBADCRC errors. While this is the most likely failure scenario for each condition, -EFSCORRUPTED is still possible in certain cases depending on what ends up on disk when a torn write or partial tail overwrite occurs. For example, an invalid log record h_len can lead to an -EFSCORRUPTED error when running the log recovery CRC pass. Therefore, update log head and tail verification to trigger the associated head/tail fixups in the event of -EFSCORRUPTED errors along with -EFSBADCRC. Also, -EFSCORRUPTED can currently be returned from xlog_do_recovery_pass() before rhead_blk is initialized if the first record encountered happens to be corrupted. This leads to an incorrect 'first_bad' return value. Initialize rhead_blk earlier in the function to address that problem as well. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_log_recover.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index fdad8c9ecbb3..83e90bf1899f 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1102,7 +1102,7 @@ xlog_verify_tail( first_bad = 0; error = xlog_do_recovery_pass(log, head_blk, *tail_blk, XLOG_RECOVER_CRCPASS, &first_bad); - while (error == -EFSBADCRC && first_bad) { + while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { int tail_distance; /* @@ -1188,7 +1188,7 @@ xlog_verify_head( */ error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, XLOG_RECOVER_CRCPASS, &first_bad); - if (error == -EFSBADCRC) { + if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { /* * We've hit a potential torn write. Reset the error and warn * about it. @@ -5255,7 +5255,7 @@ xlog_do_recovery_pass( LIST_HEAD (buffer_list); ASSERT(head_blk != tail_blk); - rhead_blk = 0; + blk_no = rhead_blk = tail_blk; for (i = 0; i < XLOG_RHASH_SIZE; i++) INIT_HLIST_HEAD(&rhash[i]); @@ -5333,7 +5333,6 @@ xlog_do_recovery_pass( } memset(rhash, 0, sizeof(rhash)); - blk_no = rhead_blk = tail_blk; if (tail_blk > head_blk) { /* * Perform recovery around the end of the physical log. 
-- cgit v1.2.3 From 6b6505d90b77f98b0ce08a8332f03cb62f97c78f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:06:55 -0700 Subject: xfs: add log recovery tracepoint for head/tail commit e67d3d4246e5fbb0c7c700426d11241ca9c6f473 upstream. Torn write detection and tail overwrite detection can shift the log head and tail respectively in the event of CRC mismatch or corruption errors. Add a high-level log recovery tracepoint to dump the final log head/tail and make those values easily attainable in debug/diagnostic situations. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_log_recover.c | 2 ++ fs/xfs/xfs_trace.h | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 83e90bf1899f..edd849b8f14d 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -5596,6 +5596,8 @@ xlog_do_recover( xfs_buf_t *bp; xfs_sb_t *sbp; + trace_xfs_log_recover(log, head_blk, tail_blk); + /* * First replay the images in the log. */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 2df73f3a73c1..6221c3818c6e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1991,6 +1991,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); +TRACE_EVENT(xfs_log_recover, + TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk), + TP_ARGS(log, headblk, tailblk), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, headblk) + __field(xfs_daddr_t, tailblk) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->headblk = headblk; + __entry->tailblk = tailblk; + ), + TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk, + __entry->tailblk) +) + TRACE_EVENT(xfs_log_recover_record, TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), TP_ARGS(log, rhead, pass), -- cgit v1.2.3 From 536932f39e93411c48a165c9c859e806c8989301 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Sun, 17 Sep 2017 14:06:56 -0700 Subject: xfs: stop searching for free slots in an inode chunk when there are none commit 2d32311cf19bfb8c1d2b4601974ddd951f9cfd0b upstream. In a filesystem without finobt, the Space manager selects an AG to alloc a new inode, where xfs_dialloc_ag_inobt() will search the AG for the free slot chunk. When the new inode is in the same AG as its parent, the btree will be searched starting on the parent's record, and then retried from the top if no slot is available beyond the parent's record. To exit this loop though, xfs_dialloc_ag_inobt() relies on the fact that the btree must have a free slot available, once its callers relied on the agi->freecount when deciding how/where to allocate this new inode. In the case when the agi->freecount is corrupted, showing available inodes in an AG, when in fact there is none, this becomes an infinite loop. Add a way to stop the loop when a free slot is not found in the btree, making the function to fall into the whole AG scan which will then, be able to detect the corruption and shut the filesystem down. 
As pointed by Brian, this might impact performance, giving the fact we don't reset the search distance anymore when we reach the end of the tree, giving it fewer tries before falling back to the whole AG search, but it will only affect searches that start within 10 records to the end of the tree. Signed-off-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_ialloc.c | 55 +++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index af6acd5f276c..4536ac588fa3 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1123,6 +1123,7 @@ xfs_dialloc_ag_inobt( int error; int offset; int i, j; + int searchdistance = 10; pag = xfs_perag_get(mp, agno); @@ -1149,7 +1150,6 @@ xfs_dialloc_ag_inobt( if (pagno == agno) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ - int searchdistance = 10; error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) @@ -1210,21 +1210,9 @@ xfs_dialloc_ag_inobt( /* * Loop until we find an inode chunk with a free inode. */ - while (!doneleft || !doneright) { + while (--searchdistance > 0 && (!doneleft || !doneright)) { int useleft; /* using left inode chunk this time */ - if (!--searchdistance) { - /* - * Not in range - save last search - * location and allocate a new inode - */ - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - pag->pagl_leftrec = trec.ir_startino; - pag->pagl_rightrec = rec.ir_startino; - pag->pagl_pagino = pagino; - goto newino; - } - /* figure out the closer block if both are valid. */ if (!doneleft && !doneright) { useleft = pagino - @@ -1268,26 +1256,37 @@ xfs_dialloc_ag_inobt( goto error1; } - /* - * We've reached the end of the btree. because - * we are only searching a small chunk of the - * btree each search, there is obviously free - * inodes closer to the parent inode than we - * are now. restart the search again. - */ - pag->pagl_pagino = NULLAGINO; - pag->pagl_leftrec = NULLAGINO; - pag->pagl_rightrec = NULLAGINO; - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - goto restart_pagno; + if (searchdistance <= 0) { + /* + * Not in range - save last search + * location and allocate a new inode + */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + + } else { + /* + * We've reached the end of the btree. because + * we are only searching a small chunk of the + * btree each search, there is obviously free + * inodes closer to the parent inode than we + * are now. restart the search again. + */ + pag->pagl_pagino = NULLAGINO; + pag->pagl_leftrec = NULLAGINO; + pag->pagl_rightrec = NULLAGINO; + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + goto restart_pagno; + } } /* * In a different AG from the parent. * See if the most recently allocated block has any free. */ -newino: if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), XFS_LOOKUP_EQ, &i); -- cgit v1.2.3 From 63d184d2955bab0584acc10b502e415ce23394b1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 17 Sep 2017 14:06:57 -0700 Subject: xfs: evict all inodes involved with log redo item commit 799ea9e9c59949008770aab4e1da87f10e99dbe4 upstream. 
When we introduced the bmap redo log items, we set MS_ACTIVE on the mountpoint and XFS_IRECOVERY on the inode to prevent unlinked inodes from being truncated prematurely during log recovery. This also had the effect of putting linked inodes on the lru instead of evicting them. Unfortunately, we neglected to find all those unreferenced lru inodes and evict them after finishing log recovery, which means that we leak them if anything goes wrong in the rest of xfs_mountfs, because the lru is only cleaned out on unmount. Therefore, evict unreferenced inodes in the lru list immediately after clearing MS_ACTIVE. Fixes: 17c12bcd30 ("xfs: when replaying bmap operations, don't let unlinked inodes get reaped") Signed-off-by: Darrick J. Wong Cc: viro@ZenIV.linux.org.uk Reviewed-by: Brian Foster Signed-off-by: Greg Kroah-Hartman --- fs/inode.c | 1 + fs/internal.h | 1 - fs/xfs/xfs_log.c | 12 ++++++++++++ include/linux/fs.h | 1 + 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 88110fd0b282..920aa0b1c6b0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -637,6 +637,7 @@ again: dispose_list(&dispose); } +EXPORT_SYMBOL_GPL(evict_inodes); /** * invalidate_inodes - attempt to free all inodes on a superblock diff --git a/fs/internal.h b/fs/internal.h index f4da3341b4a3..8b7143b0211c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -136,7 +136,6 @@ extern bool atime_needs_update_rcu(const struct path *, struct inode *); extern void inode_io_list_del(struct inode *inode); extern long get_nr_dirty_inodes(void); -extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); /* diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ebe20f1591f1..fe5f3df8b253 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -761,12 +761,24 @@ xfs_log_mount_finish( * inodes. Turn it off immediately after recovery finishes * so that we don't leak the quota inodes if subsequent mount * activities fail. + * + * We let all inodes involved in redo item processing end up on + * the LRU instead of being evicted immediately so that if we do + * something to an unlinked inode, the irele won't cause + * premature truncation and freeing of the inode, which results + * in log recovery failure. We have to evict the unreferenced + * lru inodes after clearing MS_ACTIVE because we don't + * otherwise clean up the lru if there's a subsequent failure in + * xfs_mountfs, which leads to us leaking the inodes if nothing + * else (e.g. quotacheck) references the inodes before the + * mount failure occurs. */ mp->m_super->s_flags |= MS_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); mp->m_super->s_flags &= ~MS_ACTIVE; + evict_inodes(mp->m_super); if (readonly) mp->m_flags |= XFS_MOUNT_RDONLY; diff --git a/include/linux/fs.h b/include/linux/fs.h index dd88ded27fc8..d705ae084edd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2760,6 +2760,7 @@ static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif extern void unlock_new_inode(struct inode *); extern unsigned int get_next_ino(void); +extern void evict_inodes(struct super_block *sb); extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); -- cgit v1.2.3 From 81286ade81f73e895fe2edf89f3e8054a595ebe5 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sun, 17 Sep 2017 14:06:58 -0700 Subject: xfs: check for race with xfs_reclaim_inode() in xfs_ifree_cluster() commit f2e9ad212def50bcf4c098c6288779dd97fff0f0 upstream. 
After xfs_ifree_cluster() finds an inode in the radix tree and verifies that the inode number is what it expected, xfs_reclaim_inode() can swoop in and free it. xfs_ifree_cluster() will then happily continue working on the freed inode. Most importantly, it will mark the inode stale, which will probably be overwritten when the inode slab object is reallocated, but if it has already been reallocated then we can end up with an inode spuriously marked stale. In 8a17d7ddedb4 ("xfs: mark reclaimed inodes invalid earlier") we added a second check to xfs_iflush_cluster() to detect this race, but the similar RCU lookup in xfs_ifree_cluster() needs the same treatment. Signed-off-by: Omar Sandoval Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_icache.c | 10 +++++----- fs/xfs/xfs_inode.c | 23 ++++++++++++++++++----- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e279882de427..86a4911520cc 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1078,11 +1078,11 @@ reclaim: * Because we use RCU freeing we need to ensure the inode always appears * to be reclaimed with an invalid inode number when in the free state. * We do this as early as possible under the ILOCK so that - * xfs_iflush_cluster() can be guaranteed to detect races with us here. - * By doing this, we guarantee that once xfs_iflush_cluster has locked - * XFS_ILOCK that it will see either a valid, flushable inode that will - * serialise correctly, or it will see a clean (and invalid) inode that - * it can skip. + * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to + * detect races with us here. By doing this, we guarantee that once + * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that + * it will see either a valid inode that will serialise correctly, or it + * will see an invalid inode that it can skip. */ spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 98cd905eadca..9e795ab08a53 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2368,11 +2368,24 @@ retry: * already marked stale. If we can't lock it, back off * and retry. */ - if (ip != free_ip && - !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); - delay(1); - goto retry; + if (ip != free_ip) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + + /* + * Check the inode number again in case we're + * racing with freeing in xfs_reclaim_inode(). + * See the comments in that function for more + * information as to why the initial check is + * not sufficient. + */ + if (ip->i_ino != inum + i) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; + } } rcu_read_unlock(); -- cgit v1.2.3 From 0f5af7eae8846fd73d01ecbe0d60309560084a74 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:06:59 -0700 Subject: xfs: open-code xfs_buf_item_dirty() commit a4f6cf6b2b6b60ec2a05a33a32e65caa4149aa2b upstream. It checks a single flag and has one caller. It probably isn't worth its own function. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. 
Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_buf_item.c | 11 ----------- fs/xfs/xfs_buf_item.h | 1 - fs/xfs/xfs_trans_buf.c | 2 +- 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 573fc72c3f23..cdae0ad5e0a5 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -945,17 +945,6 @@ xfs_buf_item_log( } -/* - * Return 1 if the buffer has been logged or ordered in a transaction (at any - * point, not just the current transaction) and 0 if not. - */ -uint -xfs_buf_item_dirty( - xfs_buf_log_item_t *bip) -{ - return (bip->bli_flags & XFS_BLI_DIRTY); -} - STATIC void xfs_buf_item_free( xfs_buf_log_item_t *bip) diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 530686e1afb9..e0e744aefaa8 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -64,7 +64,6 @@ typedef struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); -uint xfs_buf_item_dirty(xfs_buf_log_item_t *); void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, xfs_log_item_t *), xfs_log_item_t *); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 86987d823d76..cac8abbeca3f 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -435,7 +435,7 @@ xfs_trans_brelse(xfs_trans_t *tp, if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - } else if (!xfs_buf_item_dirty(bip)) { + } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { /*** ASSERT(bp->b_pincount == 0); ***/ -- cgit v1.2.3 From ba986b3c84987bbc5e52d8ab83a851e613ce4001 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:07:00 -0700 Subject: xfs: remove unnecessary dirty bli format check for ordered bufs commit 6453c65d3576bc3e602abb5add15f112755c08ca upstream. xfs_buf_item_unlock() historically checked the dirty state of the buffer by manually checking the buffer log formats for dirty segments. The introduction of ordered buffers invalidated this check because ordered buffers have dirty bli's but no dirty (logged) segments. The check was updated to accommodate ordered buffers by looking at the bli state first and considering the blf only if the bli is clean. This logic is safe but unnecessary. There is no valid case where the bli is clean yet the blf has dirty segments. The bli is set dirty whenever the blf is logged (via xfs_trans_log_buf()) and the blf is cleared in the only place BLI_DIRTY is cleared (xfs_trans_binval()). Remove the conditional blf dirty checks and replace with an assert that should catch any discrepencies between bli and blf dirty states. Refactor the old blf dirty check into a helper function to be used by the assert. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_buf_item.c | 62 ++++++++++++++++++++++++++------------------------- fs/xfs/xfs_buf_item.h | 1 + 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index cdae0ad5e0a5..ff076d11804a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -575,26 +575,18 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool clean; - bool aborted; - int flags; + bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); + bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); + bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); + bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; /* - * If this is a transaction abort, don't return early. Instead, allow - * the brelse to happen. Normally it would be done for stale - * (cancelled) buffers at unpin time, but we'll never go through the - * pin/unpin cycle if we abort inside commit. + * The per-transaction state has been copied above so clear it from the + * bli. */ - aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; - /* - * Before possibly freeing the buf item, copy the per-transaction state - * so we can reference it safely later after clearing it from the - * buffer log item. - */ - flags = bip->bli_flags; bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* @@ -602,7 +594,7 @@ xfs_buf_item_unlock( * unlock the buffer and free the buf item when the buffer is unpinned * for the last time. */ - if (flags & XFS_BLI_STALE) { + if (bip->bli_flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { @@ -620,20 +612,11 @@ xfs_buf_item_unlock( * regardless of whether it is dirty or not. A dirty abort implies a * shutdown, anyway. * - * Ordered buffers are dirty but may have no recorded changes, so ensure - * we only release clean items here. + * The bli dirty state should match whether the blf has logged segments + * except for ordered buffers, where only the bli should be dirty. */ - clean = (flags & XFS_BLI_DIRTY) ? false : true; - if (clean) { - int i; - for (i = 0; i < bip->bli_format_count; i++) { - if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, - bip->bli_formats[i].blf_map_size)) { - clean = false; - break; - } - } - } + ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || + (ordered && dirty && !xfs_buf_item_dirty_format(bip))); /* * Clean buffers, by definition, cannot be in the AIL. However, aborted @@ -652,11 +635,11 @@ xfs_buf_item_unlock( ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - } else if (clean) + } else if (!dirty) xfs_buf_item_relse(bp); } - if (!(flags & XFS_BLI_HOLD)) + if (!hold) xfs_buf_relse(bp); } @@ -945,6 +928,25 @@ xfs_buf_item_log( } +/* + * Return true if the buffer has any ranges logged/dirtied by a transaction, + * false otherwise. 
+ */ +bool +xfs_buf_item_dirty_format( + struct xfs_buf_log_item *bip) +{ + int i; + + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) + return true; + } + + return false; +} + STATIC void xfs_buf_item_free( xfs_buf_log_item_t *bip) diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index e0e744aefaa8..9690ce62c9a7 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -64,6 +64,7 @@ typedef struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); +bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, xfs_log_item_t *), xfs_log_item_t *); -- cgit v1.2.3 From 93b64516019249fa196cc3cf4c9040270cf4106f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:07:01 -0700 Subject: xfs: ordered buffer log items are never formatted commit e9385cc6fb7edf23702de33a2dc82965d92d9392 upstream. Ordered buffers pass through the logging infrastructure without ever being written to the log. The way this works is that the ordered buffer status is transferred to the log vector at commit time via the ->iop_size() callback. In xlog_cil_insert_format_items(), ordered log vectors bypass ->iop_format() processing altogether. Therefore it is unnecessary for xfs_buf_item_format() to handle ordered buffers. Remove the unnecessary logic and assert that an ordered buffer never reaches this point. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_buf_item.c | 12 ++---------- fs/xfs/xfs_trace.h | 1 - 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index ff076d11804a..ef2c1375f092 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -323,6 +323,8 @@ xfs_buf_item_format( ASSERT((bip->bli_flags & XFS_BLI_STALE) || (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) || + (bip->bli_flags & XFS_BLI_STALE)); /* @@ -347,16 +349,6 @@ xfs_buf_item_format( bip->bli_flags &= ~XFS_BLI_INODE_BUF; } - if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == - XFS_BLI_ORDERED) { - /* - * The buffer has been logged just to order it. It is not being - * included in the transaction commit, so don't format it. 
- */ - trace_xfs_buf_item_format_ordered(bip); - return; - } - for (i = 0; i < bip->bli_format_count; i++) { xfs_buf_item_format_segment(bip, lv, &vecp, offset, &bip->bli_formats[i]); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 6221c3818c6e..bdf69e1c7410 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -520,7 +520,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); -- cgit v1.2.3 From 19a87a9407654b6e46fff9f325cac0a11dec75f7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:07:02 -0700 Subject: xfs: refactor buffer logging into buffer dirtying helper commit 9684010d38eccda733b61106765e9357cf436f65 upstream. xfs_trans_log_buf() is responsible for logging the dirty segments of a buffer along with setting all of the necessary state on the transaction, buffer, bli, etc., to ensure that the associated items are marked as dirty and prepared for I/O. We have a couple use cases that need to to dirty a buffer in a transaction without actually logging dirty ranges of the buffer. One existing use case is ordered buffers, which are currently logged with arbitrary ranges to accomplish this even though the content of ordered buffers is never written to the log. Another pending use case is to relog an already dirty buffer across rolled transactions within the deferred operations infrastructure. This is required to prevent a held (XFS_BLI_HOLD) buffer from pinning the tail of the log. Refactor xfs_trans_log_buf() into a new function that contains all of the logic responsible to dirty the transaction, lidp, buffer and bli. This new function can be used in the future for the use cases outlined above. This patch does not introduce functional changes. Signed-off-by: Brian Foster Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_trans.h | 4 +++- fs/xfs/xfs_trans_buf.c | 46 ++++++++++++++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 0318e92aed66..40555bcaa277 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -222,7 +222,9 @@ void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); -void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); +void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint, + uint); +void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); void xfs_extent_free_init_defer_op(void); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index cac8abbeca3f..8c99813e5377 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -493,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp, } /* - * This is called to mark bytes first through last inclusive of the given - * buffer as needing to be logged when the transaction is committed. 
- * The buffer must already be associated with the given transaction. - * - * First and last are numbers relative to the beginning of this buffer, - * so the first byte in the buffer is numbered 0 regardless of the - * value of b_blkno. + * Mark a buffer dirty in the transaction. */ void -xfs_trans_log_buf(xfs_trans_t *tp, - xfs_buf_t *bp, - uint first, - uint last) +xfs_trans_dirty_buf( + struct xfs_trans *tp, + struct xfs_buf *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_fspriv; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); - ASSERT(first <= last && last < BBTOB(bp->b_length)); ASSERT(bp->b_iodone == NULL || bp->b_iodone == xfs_buf_iodone_callbacks); @@ -531,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; - trace_xfs_trans_log_buf(bip); - /* * If we invalidated the buffer within this transaction, then * cancel the invalidation now that we're dirtying the buffer @@ -545,15 +535,39 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_flags &= ~XBF_STALE; bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; } + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; tp->t_flags |= XFS_TRANS_DIRTY; bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} + +/* + * This is called to mark bytes first through last inclusive of the given + * buffer as needing to be logged when the transaction is committed. + * The buffer must already be associated with the given transaction. + * + * First and last are numbers relative to the beginning of this buffer, + * so the first byte in the buffer is numbered 0 regardless of the + * value of b_blkno. + */ +void +xfs_trans_log_buf( + struct xfs_trans *tp, + struct xfs_buf *bp, + uint first, + uint last) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(first <= last && last < BBTOB(bp->b_length)); + + xfs_trans_dirty_buf(tp, bp); /* * If we have an ordered buffer we are not logging any dirty range but * it still needs to be marked dirty and that it has been logged. */ - bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; + trace_xfs_trans_log_buf(bip); if (!(bip->bli_flags & XFS_BLI_ORDERED)) xfs_buf_item_log(bip, first, last); } -- cgit v1.2.3 From fe211e1744db41fb23b0a85f7cda87de8fab5ea2 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:07:03 -0700 Subject: xfs: don't log dirty ranges for ordered buffers commit 8dc518dfa7dbd079581269e51074b3c55a65a880 upstream. Ordered buffers are attached to transactions and pushed through the logging infrastructure just like normal buffers with the exception that they are not actually written to the log. Therefore, we don't need to log dirty ranges of ordered buffers. xfs_trans_log_buf() is called on ordered buffers to set up all of the dirty state on the transaction, buffer and log item and prepare the buffer for I/O. Now that xfs_trans_dirty_buf() is available, call it from xfs_trans_ordered_buf() so the latter is now mutually exclusive with xfs_trans_log_buf(). This reflects the implementation of ordered buffers and helps eliminate confusion over the need to log ranges of ordered buffers just to set up internal log state. Signed-off-by: Brian Foster Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. 
Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_btree.c | 6 ++---- fs/xfs/libxfs/xfs_ialloc.c | 2 -- fs/xfs/xfs_trans_buf.c | 26 ++++++++++++++------------ 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index e9f26a09a0be..69c3f428b582 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4447,12 +4447,10 @@ xfs_btree_block_change_owner( * though, so everything is consistent in memory. */ if (bp) { - if (cur->bc_tp) { + if (cur->bc_tp) xfs_trans_ordered_buf(cur->bc_tp, bp); - xfs_btree_log_block(cur, bp, XFS_BB_OWNER); - } else { + else xfs_buf_delwri_queue(bp, bbcoi->buffer_list); - } } else { ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); ASSERT(level == cur->bc_nlevels - 1); diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 4536ac588fa3..42fef0731e2a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -368,8 +368,6 @@ xfs_ialloc_inode_init( * transaction and pin the log appropriately. */ xfs_trans_ordered_buf(tp, fbuf); - xfs_trans_log_buf(tp, fbuf, 0, - BBTOB(fbuf->b_length) - 1); } } else { fbuf->b_flags |= XBF_DONE; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 8c99813e5377..3089e8015369 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -560,16 +560,12 @@ xfs_trans_log_buf( struct xfs_buf_log_item *bip = bp->b_fspriv; ASSERT(first <= last && last < BBTOB(bp->b_length)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); xfs_trans_dirty_buf(tp, bp); - /* - * If we have an ordered buffer we are not logging any dirty range but - * it still needs to be marked dirty and that it has been logged. - */ trace_xfs_trans_log_buf(bip); - if (!(bip->bli_flags & XFS_BLI_ORDERED)) - xfs_buf_item_log(bip, first, last); + xfs_buf_item_log(bip, first, last); } @@ -722,12 +718,11 @@ xfs_trans_inode_alloc_buf( } /* - * Mark the buffer as ordered for this transaction. This means - * that the contents of the buffer are not recorded in the transaction - * but it is tracked in the AIL as though it was. This allows us - * to record logical changes in transactions rather than the physical - * changes we make to the buffer without changing writeback ordering - * constraints of metadata buffers. + * Mark the buffer as ordered for this transaction. This means that the contents + * of the buffer are not recorded in the transaction but it is tracked in the + * AIL as though it was. This allows us to record logical changes in + * transactions rather than the physical changes we make to the buffer without + * changing writeback ordering constraints of metadata buffers. */ void xfs_trans_ordered_buf( @@ -739,9 +734,16 @@ xfs_trans_ordered_buf( ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT(!xfs_buf_item_dirty_format(bip)); bip->bli_flags |= XFS_BLI_ORDERED; trace_xfs_buf_item_ordered(bip); + + /* + * We don't log a dirty range of an ordered buffer but it still needs + * to be marked dirty and that it has been logged. + */ + xfs_trans_dirty_buf(tp, bp); } /* -- cgit v1.2.3 From f9e583edf1a71b7b40d5c5c492319a07ebe82d71 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:07:04 -0700 Subject: xfs: skip bmbt block ino validation during owner change commit 99c794c639a65cc7b74f30a674048fd100fe9ac8 upstream. Extent swap uses xfs_btree_visit_blocks() to fix up bmbt block owners on v5 (!rmapbt) filesystems. 
The bmbt scan uses xfs_btree_lookup_get_block() to read bmbt blocks which verifies the current owner of the block against the parent inode of the bmbt. This works during extent swap because the bmbt owners are updated to the opposite inode number before the inode extent forks are swapped. The modified bmbt blocks are marked as ordered buffers which allows everything to commit in a single transaction. If the transaction commits to the log and the system crashes such that recovery of the extent swap is required, log recovery restarts the bmbt scan to fix up any bmbt blocks that may have not been written back before the crash. The log recovery bmbt scan occurs after the inode forks have been swapped, however. This causes the bmbt block owner verification to fail, leads to log recovery failure and requires xfs_repair to zap the log to recover. Define a new invalid inode owner flag to inform the btree block lookup mechanism that the current inode may be invalid with respect to the current owner of the bmbt block. Set this flag on the cursor used for change owner scans to allow this operation to work at runtime and during log recovery. Signed-off-by: Brian Foster Fixes: bb3be7e7c ("xfs: check for bogus values in btree block headers") Cc: stable@vger.kernel.org Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_bmap_btree.c | 1 + fs/xfs/libxfs/xfs_btree.c | 1 + fs/xfs/libxfs/xfs_btree.h | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 5c3918678bb6..9968a746c649 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -888,6 +888,7 @@ xfs_bmbt_change_owner( cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); if (!cur) return -ENOMEM; + cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 69c3f428b582..1df747fadc3a 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1774,6 +1774,7 @@ xfs_btree_lookup_get_block( /* Check the inode owner since the verifiers don't. */ if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) && (cur->bc_flags & XFS_BTREE_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != cur->bc_private.b.ip->i_ino) diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 3b0fc1afada5..33c7be2357b9 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -268,7 +268,8 @@ typedef struct xfs_btree_cur short forksize; /* fork's inode space */ char whichfork; /* data or attr fork */ char flags; /* flags */ -#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ +#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ +#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ } b; } bc_private; /* per-btree type data */ } xfs_btree_cur_t; -- cgit v1.2.3 From a51e3e2cf3cbb306faa16784fd4f1791ee304816 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:07:05 -0700 Subject: xfs: move bmbt owner change to last step of extent swap commit 6fb10d6d22094bc4062f92b9ccbcee2f54033d04 upstream. The extent swap operation currently resets bmbt block owners before the inode forks are swapped. 
The bmbt buffers are marked as ordered so they do not have to be physically logged in the transaction. This use of ordered buffers is not safe as bmbt buffers may have been previously physically logged. The bmbt owner change algorithm needs to be updated to physically log buffers that are already dirty when/if they are encountered. This means that an extent swap will eventually require multiple rolling transactions to handle large btrees. In addition, all inode related changes must be logged before the bmbt owner change scan begins and can roll the transaction for the first time to preserve fs consistency via log recovery. In preparation for such fixes to the bmbt owner change algorithm, refactor the bmbt scan out of the extent fork swap code to the last operation before the transaction is committed. Update xfs_swap_extent_forks() to only set the inode log flags when an owner change scan is necessary. Update xfs_swap_extents() to trigger the owner change based on the inode log flags. Note that since the owner change now occurs after the extent fork swap, the inode btrees must be fixed up with the inode number of the current inode (similar to log recovery). Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_bmap_util.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 87b495e2f15a..15cd36f29fc4 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1825,29 +1825,18 @@ xfs_swap_extent_forks( } /* - * Before we've swapped the forks, lets set the owners of the forks - * appropriately. We have to do this as we are demand paging the btree - * buffers, and so the validation done on read will expect the owner - * field to be correctly set. Once we change the owners, we can swap the - * inode forks. + * Btree format (v3) inodes have the inode number stamped in the bmbt + * block headers. We can't start changing the bmbt blocks until the + * inode owner change is logged so recovery does the right thing in the + * event of a crash. Set the owner change log flags now and leave the + * bmbt scan as the last step. */ if (ip->i_d.di_version == 3 && - ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + ip->i_d.di_format == XFS_DINODE_FMT_BTREE) (*target_log_flags) |= XFS_ILOG_DOWNER; - error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, - tip->i_ino, NULL); - if (error) - return error; - } - if (tip->i_d.di_version == 3 && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) (*src_log_flags) |= XFS_ILOG_DOWNER; - error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, - ip->i_ino, NULL); - if (error) - return error; - } /* * Swap the data forks of the inodes @@ -2076,6 +2065,25 @@ xfs_swap_extents( xfs_trans_log_inode(tp, ip, src_log_flags); xfs_trans_log_inode(tp, tip, target_log_flags); + /* + * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems + * have inode number owner values in the bmbt blocks that still refer to + * the old inode. Scan each bmbt to fix up the owner values with the + * inode number of the current inode. 
+ */ + if (src_log_flags & XFS_ILOG_DOWNER) { + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, + ip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + if (target_log_flags & XFS_ILOG_DOWNER) { + error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, + tip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. -- cgit v1.2.3 From e2bb92633615ad801c4ab56fdb3eba3c701b2a3c Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:07:06 -0700 Subject: xfs: disallow marking previously dirty buffers as ordered commit a5814bceea48ee1c57c4db2bd54b0c0246daf54a upstream. Ordered buffers are used in situations where the buffer is not physically logged but must pass through the transaction/logging pipeline for a particular transaction. As a result, ordered buffers are not unpinned and written back until the transaction commits to the log. Ordered buffers have a strict requirement that the target buffer must not be currently dirty and resident in the log pipeline at the time it is marked ordered. If a dirty+ordered buffer is committed, the buffer is reinserted to the AIL but not physically relogged at the LSN of the associated checkpoint. The buffer log item is assigned the LSN of the latest checkpoint and the AIL effectively releases the previously logged buffer content from the active log before the buffer has been written back. If the tail pushes forward and a filesystem crash occurs while in this state, an inconsistent filesystem could result. It is currently the caller responsibility to ensure an ordered buffer is not already dirty from a previous modification. This is unclear and error prone when not used in situations where it is guaranteed a buffer has not been previously modified (such as new metadata allocations). To facilitate general purpose use of ordered buffers, update xfs_trans_ordered_buf() to conditionally order the buffer based on state of the log item and return the status of the result. If the bli is dirty, do not order the buffer and return false. The caller must either physically log the buffer (having acquired the appropriate log reservation) or push it from the AIL to clean it before it can be marked ordered in the current transaction. Note that ordered buffers are currently only used in two situations: 1.) inode chunk allocation where previously logged buffers are not possible and 2.) extent swap which will be updated to handle ordered buffer failures in a separate patch. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. 
Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_trans.h | 2 +- fs/xfs/xfs_trans_buf.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 40555bcaa277..5669cf00bae0 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -217,7 +217,7 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); -void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); +bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 3089e8015369..3ba7a96a8abd 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -724,7 +724,7 @@ xfs_trans_inode_alloc_buf( * transactions rather than the physical changes we make to the buffer without * changing writeback ordering constraints of metadata buffers. */ -void +bool xfs_trans_ordered_buf( struct xfs_trans *tp, struct xfs_buf *bp) @@ -734,7 +734,9 @@ xfs_trans_ordered_buf( ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); - ASSERT(!xfs_buf_item_dirty_format(bip)); + + if (xfs_buf_item_dirty_format(bip)) + return false; bip->bli_flags |= XFS_BLI_ORDERED; trace_xfs_buf_item_ordered(bip); @@ -744,6 +746,7 @@ xfs_trans_ordered_buf( * to be marked dirty and that it has been logged. */ xfs_trans_dirty_buf(tp, bp); + return true; } /* -- cgit v1.2.3 From a46cf59265cf5282be0a488abc913e94db924e87 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 17 Sep 2017 14:07:07 -0700 Subject: xfs: relog dirty buffers during swapext bmbt owner change commit 2dd3d709fc4338681a3aa61658122fa8faa5a437 upstream. The owner change bmbt scan that occurs during extent swap operations does not handle ordered buffer failures. Buffers that cannot be marked ordered must be physically logged so previously dirty ranges of the buffer can be relogged in the transaction. Since the bmbt scan may need to process and potentially log a large number of blocks, we can't expect to complete this operation in a single transaction. Update extent swap to use a permanent transaction with enough log reservation to physically log a buffer. Update the bmbt scan to physically log any buffers that cannot be ordered and to terminate the scan with -EAGAIN. On -EAGAIN, the caller rolls the transaction and restarts the scan. Finally, update the bmbt scan helper function to skip bmbt blocks that already match the expected owner so they are not reprocessed after scan restarts. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig [darrick: fix the xfs_trans_roll call] Signed-off-by: Darrick J. 
Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_btree.c | 26 ++++++++++++++------- fs/xfs/xfs_bmap_util.c | 59 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 66 insertions(+), 19 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 1df747fadc3a..4ad1e214b1b2 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4435,10 +4435,15 @@ xfs_btree_block_change_owner( /* modify the owner */ block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) + return 0; block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); - else + } else { + if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) + return 0; block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); + } /* * If the block is a root block hosted in an inode, we might not have a @@ -4447,14 +4452,19 @@ xfs_btree_block_change_owner( * block is formatted into the on-disk inode fork. We still change it, * though, so everything is consistent in memory. */ - if (bp) { - if (cur->bc_tp) - xfs_trans_ordered_buf(cur->bc_tp, bp); - else - xfs_buf_delwri_queue(bp, bbcoi->buffer_list); - } else { + if (!bp) { ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); ASSERT(level == cur->bc_nlevels - 1); + return 0; + } + + if (cur->bc_tp) { + if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { + xfs_btree_log_block(cur, bp, XFS_BB_OWNER); + return -EAGAIN; + } + } else { + xfs_buf_delwri_queue(bp, bbcoi->buffer_list); } return 0; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 15cd36f29fc4..5ffefac081f7 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1914,6 +1914,48 @@ xfs_swap_extent_forks( return 0; } +/* + * Fix up the owners of the bmbt blocks to refer to the current inode. The + * change owner scan attempts to order all modified buffers in the current + * transaction. In the event of ordered buffer failure, the offending buffer is + * physically logged as a fallback and the scan returns -EAGAIN. We must roll + * the transaction in this case to replenish the fallback log reservation and + * restart the scan. This process repeats until the scan completes. + */ +static int +xfs_swap_change_owner( + struct xfs_trans **tpp, + struct xfs_inode *ip, + struct xfs_inode *tmpip) +{ + int error; + struct xfs_trans *tp = *tpp; + + do { + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino, + NULL); + /* success or fatal error */ + if (error != -EAGAIN) + break; + + error = xfs_trans_roll(tpp, NULL); + if (error) + break; + tp = *tpp; + + /* + * Redirty both inodes so they can relog and keep the log tail + * moving forward. 
+ */ + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, tmpip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE); + } while (true); + + return error; +} + int xfs_swap_extents( struct xfs_inode *ip, /* target inode */ @@ -1927,8 +1969,8 @@ xfs_swap_extents( int error = 0; int lock_flags; struct xfs_ifork *cowfp; - __uint64_t f; - int resblks; + uint64_t f; + int resblks = 0; /* * Lock the inodes against other IO, page faults and truncate to @@ -1976,11 +2018,8 @@ xfs_swap_extents( XFS_SWAP_RMAP_SPACE_RES(mp, XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), XFS_DATA_FORK); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, - 0, 0, &tp); - } else - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, - 0, 0, &tp); + } + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) goto out_unlock; @@ -2072,14 +2111,12 @@ xfs_swap_extents( * inode number of the current inode. */ if (src_log_flags & XFS_ILOG_DOWNER) { - error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, - ip->i_ino, NULL); + error = xfs_swap_change_owner(&tp, ip, tip); if (error) goto out_trans_cancel; } if (target_log_flags & XFS_ILOG_DOWNER) { - error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, - tip->i_ino, NULL); + error = xfs_swap_change_owner(&tp, tip, ip); if (error) goto out_trans_cancel; } -- cgit v1.2.3 From 0e8d7e364ec546c44762664d30f4b1f6fd912197 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 17 Sep 2017 14:07:08 -0700 Subject: xfs: disable per-inode DAX flag commit 742d84290739ae908f1b61b7d17ea382c8c0073a upstream. Currently flag switching can be used to easily crash the kernel. Disable the per-inode DAX flag until that is sorted out. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 73cfc7179124..be54216027b6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1005,11 +1005,12 @@ xfs_diflags_to_linux( inode->i_flags |= S_NOATIME; else inode->i_flags &= ~S_NOATIME; +#if 0 /* disabled until the flag switching races are sorted out */ if (xflags & FS_XFLAG_DAX) inode->i_flags |= S_DAX; else inode->i_flags &= ~S_DAX; - +#endif } static int -- cgit v1.2.3 From f46a61f686b0a8042ee4b7cb108ece81e3fb9401 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 17 Sep 2017 14:07:09 -0700 Subject: xfs: fix incorrect log_flushed on fsync commit 47c7d0b19502583120c3f396c7559e7a77288a68 upstream. When calling into _xfs_log_force{,_lsn}() with a pointer to log_flushed variable, log_flushed will be set to 1 if: 1. xlog_sync() is called to flush the active log buffer AND/OR 2. xlog_wait() is called to wait on a syncing log buffers xfs_file_fsync() checks the value of log_flushed after _xfs_log_force_lsn() call to optimize away an explicit PREFLUSH request to the data block device after writing out all the file's pages to disk. 
This optimization is incorrect in the following sequence of events: Task A Task B ------------------------------------------------------- xfs_file_fsync() _xfs_log_force_lsn() xlog_sync() [submit PREFLUSH] xfs_file_fsync() file_write_and_wait_range() [submit WRITE X] [endio WRITE X] _xfs_log_force_lsn() xlog_wait() [endio PREFLUSH] The write X is not guarantied to be on persistent storage when PREFLUSH request in completed, because write A was submitted after the PREFLUSH request, but xfs_file_fsync() of task A will be notified of log_flushed=1 and will skip explicit flush. If the system crashes after fsync of task A, write X may not be present on disk after reboot. This bug was discovered and demonstrated using Josef Bacik's dm-log-writes target, which can be used to record block io operations and then replay a subset of these operations onto the target device. The test goes something like this: - Use fsx to execute ops of a file and record ops on log device - Every now and then fsync the file, store md5 of file and mark the location in the log - Then replay log onto device for each mark, mount fs and compare md5 of file to stored value Cc: Christoph Hellwig Cc: Josef Bacik Cc: Signed-off-by: Amir Goldstein Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_log.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index fe5f3df8b253..33c9a3aae948 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3337,8 +3337,6 @@ maybe_sleep: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return -EIO; - if (log_flushed) - *log_flushed = 1; } else { no_sleep: @@ -3442,8 +3440,6 @@ try_again: xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); - if (log_flushed) - *log_flushed = 1; already_slept = 1; goto try_again; } @@ -3477,9 +3473,6 @@ try_again: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return -EIO; - - if (log_flushed) - *log_flushed = 1; } else { /* just return */ spin_unlock(&log->l_icloglock); } -- cgit v1.2.3 From bb69e8a228a74c9aa7b70f6624e5c4fa1af70533 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 17 Sep 2017 14:07:10 -0700 Subject: xfs: don't set v3 xflags for v2 inodes commit dd60687ee541ca3f6df8758f38e6f22f57c42a37 upstream. Reject attempts to set XFLAGS that correspond to di_flags2 inode flags if the inode isn't a v3 inode, because di_flags2 only exists on v3. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_ioctl.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index be54216027b6..bce2e260f55e 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -928,16 +928,15 @@ xfs_ioc_fsgetxattr( return 0; } -STATIC void -xfs_set_diflags( +STATIC uint16_t +xfs_flags2diflags( struct xfs_inode *ip, unsigned int xflags) { - unsigned int di_flags; - uint64_t di_flags2; - /* can't set PREALLOC this way, just preserve it */ - di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + uint16_t di_flags = + (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & FS_XFLAG_IMMUTABLE) di_flags |= XFS_DIFLAG_IMMUTABLE; if (xflags & FS_XFLAG_APPEND) @@ -967,19 +966,24 @@ xfs_set_diflags( if (xflags & FS_XFLAG_EXTSIZE) di_flags |= XFS_DIFLAG_EXTSIZE; } - ip->i_d.di_flags = di_flags; - /* diflags2 only valid for v3 inodes. 
*/ - if (ip->i_d.di_version < 3) - return; + return di_flags; +} + +STATIC uint64_t +xfs_flags2diflags2( + struct xfs_inode *ip, + unsigned int xflags) +{ + uint64_t di_flags2 = + (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); - di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; if (xflags & FS_XFLAG_COWEXTSIZE) di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; - ip->i_d.di_flags2 = di_flags2; + return di_flags2; } STATIC void @@ -1020,6 +1024,7 @@ xfs_ioctl_setattr_xflags( struct fsxattr *fa) { struct xfs_mount *mp = ip->i_mount; + uint64_t di_flags2; /* Can't change realtime flag if any extents are allocated. */ if ((ip->i_d.di_nextents || ip->i_delayed_blks) && @@ -1050,7 +1055,14 @@ xfs_ioctl_setattr_xflags( !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; - xfs_set_diflags(ip, fa->fsx_xflags); + /* diflags2 only valid for v3 inodes. */ + di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); + if (di_flags2 && ip->i_d.di_version < 3) + return -EINVAL; + + ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); + ip->i_d.di_flags2 = di_flags2; + xfs_diflags_to_linux(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); -- cgit v1.2.3 From 772003c6a4282211487c9d33958594d7f2be7dd2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 17 Sep 2017 14:07:11 -0700 Subject: xfs: open code end_buffer_async_write in xfs_finish_page_writeback commit 8353a814f2518dcfa79a5bb77afd0e7dfa391bb1 upstream. Our loop in xfs_finish_page_writeback, which iterates over all buffer heads in a page and then calls end_buffer_async_write, which also iterates over all buffers in the page to check if any I/O is in flight is not only inefficient, but also potentially dangerous as end_buffer_async_write can cause the page and all buffers to be freed. Replace it with a single loop that does the work of end_buffer_async_write on a per-page basis. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_aops.c | 72 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f750d888bd17..d23889e0bedc 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -90,11 +90,11 @@ xfs_find_bdev_for_inode( * associated buffer_heads, paying attention to the start and end offsets that * we need to process on the page. * - * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last - * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or - * the page at all, as we may be racing with memory reclaim and it can free both - * the bufferhead chain and the page as it will see the page as clean and - * unused. + * Note that we open code the action in end_buffer_async_write here so that we + * only have to iterate over the buffers attached to the page once. This is not + * only more efficient, but also ensures that we only calls end_page_writeback + * at the end of the iteration, and thus avoids the pitfall of having the page + * and buffers potentially freed after every call to end_buffer_async_write. 
*/ static void xfs_finish_page_writeback( @@ -102,29 +102,45 @@ xfs_finish_page_writeback( struct bio_vec *bvec, int error) { - unsigned int end = bvec->bv_offset + bvec->bv_len - 1; - struct buffer_head *head, *bh, *next; + struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head; + bool busy = false; unsigned int off = 0; - unsigned int bsize; + unsigned long flags; ASSERT(bvec->bv_offset < PAGE_SIZE); ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0); - ASSERT(end < PAGE_SIZE); + ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE); ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0); - bh = head = page_buffers(bvec->bv_page); - - bsize = bh->b_size; + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); do { - if (off > end) - break; - next = bh->b_this_page; - if (off < bvec->bv_offset) - goto next_bh; - bh->b_end_io(bh, !error); -next_bh: - off += bsize; - } while ((bh = next) != head); + if (off >= bvec->bv_offset && + off < bvec->bv_offset + bvec->bv_len) { + ASSERT(buffer_async_write(bh)); + ASSERT(bh->b_end_io == NULL); + + if (error) { + mapping_set_error(bvec->bv_page->mapping, -EIO); + set_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + SetPageError(bvec->bv_page); + } else { + set_buffer_uptodate(bh); + } + clear_buffer_async_write(bh); + unlock_buffer(bh); + } else if (buffer_async_write(bh)) { + ASSERT(buffer_locked(bh)); + busy = true; + } + off += bh->b_size; + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + + if (!busy) + end_page_writeback(bvec->bv_page); } /* @@ -138,8 +154,10 @@ xfs_destroy_ioend( int error) { struct inode *inode = ioend->io_inode; - struct bio *last = ioend->io_bio; - struct bio *bio, *next; + struct bio *bio = &ioend->io_inline_bio; + struct bio *last = ioend->io_bio, *next; + u64 start = bio->bi_iter.bi_sector; + bool quiet = bio_flagged(bio, BIO_QUIET); for (bio = &ioend->io_inline_bio; bio; bio = next) { struct bio_vec *bvec; @@ -160,6 +178,11 @@ xfs_destroy_ioend( bio_put(bio); } + + if (unlikely(error && !quiet)) { + xfs_err_ratelimited(XFS_I(inode)->i_mount, + "writeback error on sector %llu", start); + } } /* @@ -427,7 +450,8 @@ xfs_start_buffer_writeback( ASSERT(!buffer_delay(bh)); ASSERT(!buffer_unwritten(bh)); - mark_buffer_async_write(bh); + bh->b_end_io = NULL; + set_buffer_async_write(bh); set_buffer_uptodate(bh); clear_buffer_dirty(bh); } -- cgit v1.2.3 From 81cb6f1a2a1964ed4d93604d1a3d49d92db2a01b Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Sun, 17 Sep 2017 14:07:12 -0700 Subject: xfs: use kmem_free to free return value of kmem_zalloc commit 6c370590cfe0c36bcd62d548148aa65c984540b7 upstream. In function xfs_test_remount_options(), kfree() is used to free memory allocated by kmem_zalloc(). But it is better to use kmem_free(). Signed-off-by: Pan Bian Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
From 81cb6f1a2a1964ed4d93604d1a3d49d92db2a01b Mon Sep 17 00:00:00 2001
From: Pan Bian
Date: Sun, 17 Sep 2017 14:07:12 -0700
Subject: xfs: use kmem_free to free return value of kmem_zalloc

commit 6c370590cfe0c36bcd62d548148aa65c984540b7 upstream.

In function xfs_test_remount_options(), kfree() is used to free memory allocated by kmem_zalloc(), but it is better to use kmem_free().

Signed-off-by: Pan Bian
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
Signed-off-by: Greg Kroah-Hartman
---
 fs/xfs/xfs_super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 882fb8524fcb..67d589e0a49f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1214,7 +1214,7 @@ xfs_test_remount_options( tmp_mp->m_super = sb; error = xfs_parseargs(tmp_mp, options); xfs_free_fsname(tmp_mp); - kfree(tmp_mp); + kmem_free(tmp_mp); return error; }
-- cgit v1.2.3
From 7b5fcb7fc05bdbce87e5bec9e358b059317ffb5f Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Thu, 24 Aug 2017 09:53:59 -0700
Subject: md/raid5: release/flush io in raid5_do_work()

commit 9c72a18e46ebe0f09484cce8ebf847abdab58498 upstream.

In raid5, there are scenarios where some IOs are deferred to a later time, and some IOs need a flush to complete. To make sure we make progress with these IOs, we need to call the following functions:

  flush_deferred_bios(conf);
  r5l_flush_stripe_to_raid(conf->log);

Both of these functions are called in raid5d(), but are missing in raid5_do_work(). As a result, these functions are not called when multi-threading (group_thread_cnt > 0) is enabled. This patch adds calls to these functions to raid5_do_work().

Note for stable branches:

  r5l_flush_stripe_to_raid(conf->log) is needed for 4.4+
  flush_deferred_bios(conf) is only needed for 4.11+

Signed-off-by: Song Liu
Signed-off-by: Shaohua Li
Signed-off-by: Greg Kroah-Hartman
---
 drivers/md/raid5.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 383f19c6bf24..549b4afd12e1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5844,6 +5844,8 @@ static void raid5_do_work(struct work_struct *work) spin_unlock_irq(&conf->device_lock); + r5l_flush_stripe_to_raid(conf->log); + async_tx_issue_pending_all(); blk_finish_plug(&plug);
-- cgit v1.2.3
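For context on the raid5 patch above: this 4.9 backport only adds the r5l_flush_stripe_to_raid() call, because the deferred-bios machinery named in the commit message does not exist in this branch. On a 4.11 or later kernel the same spot at the tail of raid5_do_work() needs both calls. The sketch below shows how that tail would look there; it is based only on the two function names given in the commit message, not on code carried in this patch.

	/* tail of raid5_do_work() on a 4.11+ kernel, per the stable note above */
	spin_unlock_irq(&conf->device_lock);

	flush_deferred_bios(conf);		/* 4.11+ only: push out deferred bios */
	r5l_flush_stripe_to_raid(conf->log);	/* 4.4+: flush the r5l log, if present */

	async_tx_issue_pending_all();
	blk_finish_plug(&plug);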
From ae04a8c4c6fc5b4aabfb166588045e2845b4d4e7 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Thu, 31 Aug 2017 15:11:06 -0700
Subject: xfs: fix compiler warnings

commit 7bf7a193a90cadccaad21c5970435c665c40fe27 upstream.

Fix up all the compiler warnings that have crept in.

Signed-off-by: Darrick J. Wong
Reviewed-by: Christoph Hellwig
Cc: Arnd Bergmann
Signed-off-by: Greg Kroah-Hartman
---
 fs/xfs/libxfs/xfs_bmap.c | 2 +-
 fs/xfs/libxfs/xfs_inode_fork.c | 9 +++------
 fs/xfs/xfs_buf_item.c | 2 ++
 fs/xfs/xfs_iops.c | 2 +-
 fs/xfs/xfs_log_recover.c | 4 ++++
 5 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 2ab50caca14c..d2f4ab175096 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -579,7 +579,7 @@ xfs_bmap_validate_ret( #else #define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) -#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0) #endif /* DEBUG */ /*

diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 8a37efe04de3..4e30448c4465 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -1539,14 +1539,11 @@ xfs_iext_realloc_indirect( xfs_ifork_t *ifp, /* inode fork pointer */ int new_size) /* new indirection array size */ { - int nlists; /* number of irec's (ex lists) */ - int size; /* current indirection array size */ - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - size = nlists * sizeof(xfs_ext_irec_t); ASSERT(ifp->if_real_bytes); - ASSERT((new_size >= 0) && (new_size != size)); + ASSERT((new_size >= 0) && + (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) * + sizeof(xfs_ext_irec_t)))); if (new_size == 0) { xfs_iext_destroy(ifp); } else {

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index ef2c1375f092..e0a0af0946f2 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -570,7 +570,9 @@ xfs_buf_item_unlock( bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); +#if defined(DEBUG) || defined(XFS_WARN) bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); +#endif /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL;

diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a1247c3c1efb..5b81f7f41b80 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -802,7 +802,7 @@ xfs_vn_setattr_nonsize( * Caution: The caller of this function is responsible for calling * setattr_prepare() or otherwise verifying the change is fine. */ -int +STATIC int xfs_setattr_size( struct xfs_inode *ip, struct iattr *iattr)

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index edd849b8f14d..05909269f973 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -4827,12 +4827,16 @@ xlog_recover_process_intents( int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; +#if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; +#endif ailp = log->l_ailp; spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); +#if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); +#endif while (lip != NULL) { /* * We're done when we see something other than an intent.
-- cgit v1.2.3
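Two of the hunks above (xfs_buf_item.c and xfs_log_recover.c) quiet set-but-unused warnings for locals whose only reader is ASSERT(), which compiles away unless DEBUG or XFS_WARN is defined, so the declaration and the assignment are wrapped in the same preprocessor guard. Below is a standalone illustration of that pattern; the ASSERT stand-in, struct example, and example_verify() are invented for this sketch and are not taken from XFS.

/* Stand-in for the XFS macro: it likewise drops its argument entirely
 * unless DEBUG or XFS_WARN is set, so 'expected' may not even exist then.
 */
#include <assert.h>

#if defined(DEBUG) || defined(XFS_WARN)
#define ASSERT(expr)	assert(expr)
#else
#define ASSERT(expr)	((void)0)
#endif

struct example {
	int	input;
	int	result;
};

static void example_verify(struct example *ex)
{
#if defined(DEBUG) || defined(XFS_WARN)
	int	expected = ex->input * 2;	/* only consumed by ASSERT() */
#endif

	ex->result = ex->input * 2;

	/* without the guard above, 'expected' would be set but unused here */
	ASSERT(ex->result == expected);
}

int main(void)
{
	struct example ex = { .input = 21 };

	example_verify(&ex);
	return ex.result != 42;
}

The same file compiles cleanly both with and without -DDEBUG, which is exactly what the guarded declarations buy the kernel build.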
From 7829684088a216b8b53894768cd4f483c246cb94 Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Fri, 25 Aug 2017 09:05:42 +0200
Subject: ipv6: Fix may be used uninitialized warning in rt6_check

commit 3614364527daa870264f6dde77f02853cdecd02c upstream.

rt_cookie might be used uninitialized; fix this by initializing it.

Fixes: c5cff8561d2d ("ipv6: add rcu grace period before freeing fib6_node")
Signed-off-by: Steffen Klassert
Signed-off-by: David S. Miller
Cc: Arnd Bergmann
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9c2dd3f77cdb..61729641e027 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1267,7 +1267,7 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt) static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { - u32 rt_cookie; + u32 rt_cookie = 0; if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) return NULL;
-- cgit v1.2.3
From 089d7720383d7bc9ca6b8824a05dfa66f80d1f41 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Wed, 20 Sep 2017 08:20:15 +0200
Subject: Linux 4.9.51

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 038d126a15fc..b48aebbe187f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 50 +SUBLEVEL = 51 EXTRAVERSION = NAME = Roaring Lionus
-- cgit v1.2.3
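A closing note on the rt6_check change above: the caller only reads rt_cookie after rt6_get_cookie_safe() has returned true, so the value is never actually used uninitialized; the compiler simply cannot always prove that across the short-circuited condition, and initializing the variable silences the false positive without changing behaviour. The small self-contained sketch below reproduces the same shape; get_value() and check() are invented for the illustration.

#include <stdbool.h>
#include <stdio.h>

/* Writes *out only on success, mirroring rt6_get_cookie_safe(). */
static bool get_value(int select, int *out)
{
	if (!select)
		return false;	/* *out intentionally left untouched */
	*out = 42;
	return true;
}

/* 'v' is read only when get_value() returned true, but some compilers
 * still warn; 'int v = 0;' plays the role of 'u32 rt_cookie = 0;'.
 */
static int check(int select, int expected)
{
	int v = 0;

	if (!get_value(select, &v) || v != expected)
		return -1;
	return 0;
}

int main(void)
{
	printf("%d %d\n", check(0, 42), check(1, 42));
	return 0;
}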