/*
 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

struct scrub_block;
struct scrub_ctx;

/*
 * the following three values only influence the performance.
 * The last one configures the number of parallel and outstanding I/O
 * operations. The first two values configure an upper limit for the number
 * of (dynamically allocated) pages that are added to a bio.
 */
#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */

/*
 * the following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 * Values larger than BTRFS_STRIPE_LEN are not supported.
 */
#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */

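/*
 * one scrub_recover holds the block mapping (btrfs_bio) looked up for a
 * block that is being re-checked; it is shared by the scrub_pages of that
 * block and freed by scrub_put_recover() when the last reference is dropped
 */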
struct scrub_recover {
	atomic_t		refs;
	struct btrfs_bio	*bbio;
	u64			map_length;
};

struct scrub_page {
	struct scrub_block	*sblock;
	struct page		*page;
	struct btrfs_device	*dev;
	struct list_head	list;
	u64			flags;  /* extent flags */
	u64			generation;
	u64			logical;
	u64			physical;
	u64			physical_for_dev_replace;
	atomic_t		ref_count;
	struct {
		unsigned int	mirror_num:8;
		unsigned int	have_csum:1;
		unsigned int	io_error:1;
	};
	u8			csum[BTRFS_CSUM_SIZE];

	struct scrub_recover	*recover;
};

struct scrub_bio {
	int			index;
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	struct bio		*bio;
	int			err;
	u64			logical;
	u64			physical;
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
#else
	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
#endif
	int			page_count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_block {
	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
	int			page_count;
	atomic_t		outstanding_pages;
	atomic_t		ref_count; /* free mem on transition to zero */
	struct scrub_ctx	*sctx;
	struct scrub_parity	*sparity;
	struct {
		unsigned int	header_error:1;
		unsigned int	checksum_error:1;
		unsigned int	no_io_error_seen:1;
		unsigned int	generation_error:1; /* also sets header_error */

		/*
		 * The following is for the data used to check parity.
		 * It only applies to data that has a checksum.
		 */
		unsigned int	data_corrected:1;
	};
};

/* Used for the chunks with a parity stripe, such as RAID5/6 */
struct scrub_parity {
	struct scrub_ctx	*sctx;

	struct btrfs_device	*scrub_dev;

	u64			logic_start;

	u64			logic_end;

	int			nsectors;

	int			stripe_len;

	atomic_t		ref_count;

	struct list_head	spages;

	/* Work of parity check and repair */
	struct btrfs_work	work;

	/* Mark the parity blocks which have data */
	unsigned long		*dbitmap;

	/*
	 * Mark the parity blocks which have data, but for which an error
	 * occurred while reading or checking that data
	 */
	unsigned long		*ebitmap;

	unsigned long		bitmap[0];
};

struct scrub_wr_ctx {
	struct scrub_bio *wr_curr_bio;
	struct btrfs_device *tgtdev;
	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
	atomic_t flush_all_writes;
	struct mutex wr_lock;
};

struct scrub_ctx {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
	struct btrfs_root	*dev_root;
	int			first_free;
	int			curr;
	atomic_t		bios_in_flight;
	atomic_t		workers_pending;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	int			pages_per_rd_bio;
	u32			sectorsize;
	u32			nodesize;

	int			is_dev_replace;
	struct scrub_wr_ctx	wr_ctx;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;
};

struct scrub_fixup_nodatasum {
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	u64			logical;
	struct btrfs_root	*root;
	struct btrfs_work	work;
	int			mirror_num;
};

struct scrub_nocow_inode {
	u64			inum;
	u64			offset;
	u64			root;
	struct list_head	list;
};

struct scrub_copy_nocow_ctx {
	struct scrub_ctx	*sctx;
	u64			logical;
	u64			len;
	int			mirror_num;
	u64			physical_for_dev_replace;
	struct list_head	inodes;
	struct btrfs_work	work;
};

struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
	sector_t		sector;
	u64			logical;
	struct btrfs_device	*dev;
};

static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
				     struct btrfs_fs_info *fs_info,
				     struct scrub_block *original_sblock,
				     u64 length, u64 logical,
				     struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock, int is_metadata,
				int have_csum, u8 *csum, u64 generation,
				u16 csum_size, int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
					 struct scrub_block *sblock,
					 int is_metadata, int have_csum,
					 const u8 *csum, u64 generation,
					 u16 csum_size);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
		       u64 physical, struct btrfs_device *dev, u64 flags,
		       u64 gen, int mirror_num, u8 *csum, int force,
		       u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
			       u64 extent_logical, u64 extent_len,
			       u64 *extent_physical,
			       struct btrfs_device **extent_dev,
			       int *extent_mirror_num);
static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
			      struct scrub_wr_ctx *wr_ctx,
			      struct btrfs_fs_info *fs_info,
			      struct btrfs_device *dev,
			      int is_dev_replace);
static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio, int err);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static int write_page_nocow(struct scrub_ctx *sctx,
			    u64 physical_for_dev_replace, struct page *page);
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
				      struct scrub_copy_nocow_ctx *ctx);
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
			    int mirror_num, u64 physical_for_dev_replace);
static void copy_nocow_pages_worker(struct btrfs_work *work);
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);


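/*
 * scrub_pending_bio_inc()/scrub_pending_bio_dec() track the number of scrub
 * bios currently in flight for one scrub context; the decrement also wakes
 * up waiters on sctx->list_wait
 */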
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
	atomic_inc(&sctx->bios_in_flight);
}

static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
	atomic_dec(&sctx->bios_in_flight);
	wake_up(&sctx->list_wait);
}

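/*
 * caller must hold fs_info->scrub_lock; the lock is dropped and re-taken
 * while waiting for a pending scrub pause request to go away
 */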
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);

	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

/*
 * used for workers that require transaction commits (i.e., for the
 * NOCOW case)
 */
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

	/*
	 * increment scrubs_running to prevent cancel requests from
	 * completing as long as a worker is running. we must also
	 * increment scrubs_paused to prevent deadlocking on pause
	 * requests used for transaction commits (as the worker uses a
	 * transaction context). it is safe to regard the worker
	 * as paused for all practical purposes. effectively, we only
	 * avoid cancellation requests from completing.
	 */
	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrubs_running);
	atomic_inc(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	/*
	 * the check of the @scrubs_running == @scrubs_paused condition
	 * inside wait_event() is not an atomic operation, which means we
	 * may inc/dec @scrubs_running/paused at any time. Wake up
	 * @scrub_pause_wait as often as we can so that the committing
	 * transaction is blocked for as short a time as possible.
	 */
	wake_up(&fs_info->scrub_pause_wait);

	atomic_inc(&sctx->workers_pending);
}

/* used for workers that require transaction commits */
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

	/*
	 * see scrub_pending_trans_workers_inc() for why we're pretending
	 * to be paused in the scrub counters
	 */
	mutex_lock(&fs_info->scrub_lock);
	atomic_dec(&fs_info->scrubs_running);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	atomic_dec(&sctx->workers_pending);
	wake_up(&fs_info->scrub_pause_wait);
	wake_up(&sctx->list_wait);
}

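/* release all checksums that are still queued on sctx->csum_list */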
static void scrub_free_csums(struct scrub_ctx *sctx)
{
	while (!list_empty(&sctx->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sctx->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	scrub_free_wr_ctx(&sctx->wr_ctx);

	/* this can happen when scrub is cancelled */
	if (sctx->curr != -1) {
		struct scrub_bio *sbio = sctx->bios[sctx->curr];

		for (i = 0; i < sbio->page_count; i++) {
			WARN_ON(!sbio->pagev[i]->page);
			scrub_block_put(sbio->pagev[i]->sblock);
		}
		bio_put(sbio->bio);
	}

	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio = sctx->bios[i];

		if (!sbio)
			break;
		kfree(sbio);
	}

	scrub_free_csums(sctx);
	kfree(sctx);
}

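/*
 * allocate and initialize a scrub context for @dev: set up the fixed pool of
 * SCRUB_BIOS_PER_SCTX scrub bios and the write context that is used when the
 * scrub runs as part of a device replace
 */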
static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
{
	struct scrub_ctx *sctx;
	int		i;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
	int pages_per_rd_bio;
	int ret;

	/*
	 * the setting of pages_per_rd_bio is correct for scrub but might
	 * be wrong for the dev_replace code where we might read from
	 * different devices in the initial huge bios. However, that
	 * code is able to correctly handle the case when adding a page
	 * to a bio fails.
	 */
	if (dev->bdev)
		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
					 bio_get_nr_vecs(dev->bdev));
	else
		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
	if (!sctx)
		goto nomem;
	sctx->is_dev_replace = is_dev_replace;
	sctx->pages_per_rd_bio = pages_per_rd_bio;
	sctx->curr = -1;
	sctx->dev_root = dev->dev_root;
	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
		if (!sbio)
			goto nomem;
		sctx->bios[i] = sbio;

		sbio->index = i;
		sbio->sctx = sctx;
		sbio->page_count = 0;
		btrfs_init_work(&sbio->work, btrfs_scrub_helper,
				scrub_bio_end_io_worker, NULL, NULL);

		if (i != SCRUB_BIOS_PER_SCTX - 1)
			sctx->bios[i]->next_free = i + 1;
		else
			sctx->bios[i]->next_free = -1;
	}
	sctx->first_free = 0;
	sctx->nodesize = dev->dev_root->nodesize;
	sctx->sectorsize = dev->dev_root->sectorsize;
	atomic_set(&sctx->bios_in_flight, 0);
	atomic_set(&sctx->workers_pending, 0);
	atomic_set(&sctx->cancel_req, 0);
	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
	INIT_LIST_HEAD(&sctx->csum_list);

	spin_lock_init(&sctx->list_lock);
	spin_lock_init(&sctx->stat_lock);
	init_waitqueue_head(&sctx->list_wait);

	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
				 fs_info->dev_replace.tgtdev, is_dev_replace);
	if (ret) {
		scrub_free_ctx(sctx);
		return ERR_PTR(ret);
	}
	return sctx;

nomem:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}

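/*
 * callback for iterate_extent_inodes(): resolve the file paths of one inode
 * that references the errored extent and print one warning line per path
 */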
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
				     void *warn_ctx)
{
	u64 isize;
	u32 nlink;
	int ret;
	int i;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key root_key;
	struct btrfs_key key;

	root_key.objectid = root;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * this makes the path point to (inum INODE_ITEM ioff)
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
					struct btrfs_inode_item);
	isize = btrfs_inode_size(eb, inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	ipath = init_ipath(4096, local_root, swarn->path);
	if (IS_ERR(ipath)) {
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
			"length %llu, links %u (path: %s)\n", swarn->errstr,
			swarn->logical, rcu_str_deref(swarn->dev->name),
			(unsigned long long)swarn->sector, root, inum, offset,
			min(isize - offset, (u64)PAGE_SIZE), nlink,
			(char *)(unsigned long)ipath->fspath->val[i]);

	free_ipath(ipath);
	return 0;

err:
	printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
		"resolving failed with ret=%d\n", swarn->errstr,
		swarn->logical, rcu_str_deref(swarn->dev->name),
		(unsigned long long)swarn->sector, root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}

static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	unsigned long ptr = 0;
	u64 extent_item_pos;
	u64 flags = 0;
	u64 ref_root;
	u32 item_size;
	u8 ref_level;
	int ret;

	WARN_ON(sblock->page_count < 1);
	dev = sblock->pagev[0]->dev;
	fs_info = sblock->sctx->dev_root->fs_info;

	path = btrfs_alloc_path();
	if (!path)
		return;

	swarn.sector = (sblock->pagev[0]->physical) >> 9;
	swarn.logical = sblock->pagev[0]->logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	extent_item_pos = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size_nr(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		do {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			printk_in_rcu(KERN_WARNING
				"BTRFS: %s at logical %llu on dev %s, "
				"sector %llu: metadata %s (level %d) in tree "
				"%llu\n", errstr, swarn.logical,
				rcu_str_deref(dev->name),
				(unsigned long long)swarn.sector,
				ref_level ? "node" : "leaf",
				ret < 0 ? -1 : ref_level,
				ret < 0 ? -1 : ref_root);
		} while (ret != 1);
		btrfs_release_path(path);
	} else {
		btrfs_release_path(path);
		swarn.path = path;
		swarn.dev = dev;
		iterate_extent_inodes(fs_info, found_key.objectid,
				      extent_item_pos, 1,
				      scrub_print_warning_inode, &swarn);
	}

out:
	btrfs_free_path(path);
}

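/*
 * callback for iterate_inodes_from_logical() used by the nodatasum fixup
 * worker: try to repair one page either by rewriting the good cached copy
 * or by forcing a read of the bad mirror so that the generic read-repair
 * path can fix it; returning 1 stops the iteration once a copy is corrected
 */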
static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
{
	struct page *page = NULL;
	unsigned long index;
	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
	int ret;
	int corrected = 0;
	struct btrfs_key key;
	struct inode *inode = NULL;
	struct btrfs_fs_info *fs_info;
	u64 end = offset + PAGE_SIZE - 1;
	struct btrfs_root *local_root;
	int srcu_index;

	key.objectid = root;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	fs_info = fixup->root->fs_info;
	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);

	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(local_root)) {
		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
		return PTR_ERR(local_root);
	}

	key.type = BTRFS_INODE_ITEM_KEY;
	key.objectid = inum;
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	index = offset >> PAGE_CACHE_SHIFT;

	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	if (PageUptodate(page)) {
		if (PageDirty(page)) {
			/*
			 * we need to write the data to the defect sector. the
			 * data that was in that sector is not in memory,
			 * because the page was modified. we must not write the
			 * modified page to that sector.
			 *
			 * TODO: what could be done here: wait for the delalloc
			 *       runner to write out that page (might involve
			 *       COW) and see whether the sector is still
			 *       referenced afterwards.
			 *
			 * For the meantime, we'll treat this error as
			 * uncorrectable, although there is a chance that a
			 * later scrub will find the bad sector again and that
			 * there's no dirty page in memory, then.
			 */
			ret = -EIO;
			goto out;
		}
		ret = repair_io_failure(inode, offset, PAGE_SIZE,
					fixup->logical, page,
					offset - page_offset(page),
					fixup->mirror_num);
		unlock_page(page);
		corrected = !ret;
	} else {
		/*
		 * we need to get good data first. the general readpage path
		 * will call repair_io_failure for us, we just have to make
		 * sure we read the bad mirror.
		 */
		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
					EXTENT_DAMAGED, GFP_NOFS);
		if (ret) {
			/* set_extent_bits should give proper error */
			WARN_ON(ret > 0);
			if (ret > 0)
				ret = -EFAULT;
			goto out;
		}

		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
						btrfs_get_extent,
						fixup->mirror_num);
		wait_on_page_locked(page);

		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
					end, EXTENT_DAMAGED, 0, NULL);
		if (!corrected)
			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
						EXTENT_DAMAGED, GFP_NOFS);
	}

out:
	if (page)
		put_page(page);

	iput(inode);

	if (ret < 0)
		return ret;

	if (ret == 0 && corrected) {
		/*
		 * we only need to call readpage for one of the inodes belonging
		 * to this extent. so make iterate_extent_inodes stop
		 */
		return 1;
	}

	return -EIO;
}

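/*
 * worker that tries to repair a data extent without checksum (nodatasum):
 * it joins a transaction and triggers a regular read of the bad copy for
 * each inode referencing the extent, so that the read-repair code can
 * rewrite the sector from a good mirror
 */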
static void scrub_fixup_nodatasum(struct btrfs_work *work)
{
	int ret;
	struct scrub_fixup_nodatasum *fixup;
	struct scrub_ctx *sctx;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_path *path;
	int uncorrectable = 0;

	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
	sctx = fixup->sctx;

	path = btrfs_alloc_path();
	if (!path) {
		spin_lock(&sctx->stat_lock);
		++sctx->stat.malloc_errors;
		spin_unlock(&sctx->stat_lock);
		uncorrectable = 1;
		goto out;
	}

	trans = btrfs_join_transaction(fixup->root);
	if (IS_ERR(trans)) {
		uncorrectable = 1;
		goto out;
	}

	/*
	 * the idea is to trigger a regular read through the standard path. we
	 * read a page from the (failed) logical address by specifying the
	 * corresponding copynum of the failed sector. thus, that readpage is
	 * expected to fail.
	 * that is the point where on-the-fly error correction will kick in
	 * (once it's finished) and rewrite the failed sector if a good copy
	 * can be found.
	 */
	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
						path, scrub_fixup_readpage,
						fixup);
	if (ret < 0) {
		uncorrectable = 1;
		goto out;
	}
	WARN_ON(ret != 1);

	spin_lock(&sctx->stat_lock);
	++sctx->stat.corrected_errors;
	spin_unlock(&sctx->stat_lock);

out:
	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans, fixup->root);
	if (uncorrectable) {
		spin_lock(&sctx->stat_lock);
		++sctx->stat.uncorrectable_errors;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_replace_stats_inc(
			&sctx->dev_root->fs_info->dev_replace.
			num_uncorrectable_read_errors);
		printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
		    "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
			fixup->logical, rcu_str_deref(fixup->dev->name));
	}

	btrfs_free_path(path);
	kfree(fixup);

	scrub_pending_trans_workers_dec(sctx);
}

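/* reference counting helpers for struct scrub_recover */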
static inline void scrub_get_recover(struct scrub_recover *recover)
{
	atomic_inc(&recover->refs);
}

static inline void scrub_put_recover(struct scrub_recover *recover)
{
	if (atomic_dec_and_test(&recover->refs)) {
		btrfs_put_bbio(recover->bbio);
		kfree(recover);
	}
}

/*
 * scrub_handle_errored_block gets called when either verification of the
 * pages failed or the bio failed to read, e.g. with EIO. In the latter
 * case, this function handles all pages in the bio, even though only one
 * may be bad.
 * The goal of this function is to repair the errored block by using the
 * contents of one of the mirrors.
 */
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
	struct scrub_ctx *sctx = sblock_to_check->sctx;
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	u64 length;
	u64 logical;
	u64 generation;
	unsigned int failed_mirror_index;
	unsigned int is_metadata;
	unsigned int have_csum;
	u8 *csum;
	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
	struct scrub_block *sblock_bad;
	int ret;
	int mirror_index;
	int page_num;
	int success;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	BUG_ON(sblock_to_check->page_count < 1);
	fs_info = sctx->dev_root->fs_info;
	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sctx->stat_lock);
		++sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);
		return 0;
	}
	length = sblock_to_check->page_count * PAGE_SIZE;
	logical = sblock_to_check->pagev[0]->logical;
	generation = sblock_to_check->pagev[0]->generation;
	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
	is_metadata = !(sblock_to_check->pagev[0]->flags &
			BTRFS_EXTENT_FLAG_DATA);
	have_csum = sblock_to_check->pagev[0]->have_csum;
	csum = sblock_to_check->pagev[0]->csum;
	dev = sblock_to_check->pagev[0]->dev;

	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
		sblocks_for_recheck = NULL;
		goto nodatasum_case;
	}

	/*
	 * read all mirrors one after the other. This includes to
	 * re-read the extent or metadata block that failed (that was
	 * the cause that this fixup code is called) another time,
	 * page by page this time in order to know which pages
	 * caused I/O errors and which ones are good (for all mirrors).
	 * It is the goal to handle the situation when more than one
	 * mirror contains I/O errors, but the errors do not
	 * overlap, i.e. the data can be repaired by selecting the
	 * pages from those mirrors without I/O error on the
	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
	 * would be that mirror #1 has an I/O error on the first page,
	 * the second page is good, and mirror #2 has an I/O error on
	 * the second page, but the first page is good.
	 * Then the first page of the first mirror can be repaired by
	 * taking the first page of the second mirror, and the
	 * second page of the second mirror can be repaired by
	 * copying the contents of the 2nd page of the 1st mirror.
	 * One more note: if the pages of one mirror contain I/O
	 * errors, the checksum cannot be verified. In order to get
	 * the best data for repairing, the first attempt is to find
	 * a mirror without I/O errors and with a validated checksum.
	 * Only if this is not possible, the pages are picked from
	 * mirrors with I/O errors without considering the checksum.
	 * If the latter is the case, at the end, the checksum of the
	 * repaired area is verified in order to correctly maintain
	 * the statistics.
	 */

	sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
				     sizeof(*sblocks_for_recheck),
				     GFP_NOFS);
	if (!sblocks_for_recheck) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}

	/* setup the context, map the logical blocks and alloc the pages */
	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
					logical, sblocks_for_recheck);
	if (ret) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}
	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
	sblock_bad = sblocks_for_recheck + failed_mirror_index;

	/* build and submit the bios for the failed mirror, check checksums */
	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
			    csum, generation, sctx->csum_size, 1);

	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
	    sblock_bad->no_io_error_seen) {
		/*
		 * the error disappeared after reading page by page, or
		 * the area was part of a huge bio and other parts of the
		 * bio caused I/O errors, or the block layer merged several
		 * read requests into one and the error is caused by a
		 * different bio (usually one of the two latter cases is
		 * the cause)
		 */
		spin_lock(&sctx->stat_lock);
		sctx->stat.unverified_errors++;
		sblock_to_check->data_corrected = 1;
		spin_unlock(&sctx->stat_lock);

		if (sctx->is_dev_replace)
			scrub_write_block_to_dev_replace(sblock_bad);
		goto out;
	}

	if (!sblock_bad->no_io_error_seen) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("i/o error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	} else if (sblock_bad->checksum_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.csum_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev,
					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
	} else if (sblock_bad->header_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.verify_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum/header error",
					    sblock_to_check);
		if (sblock_bad->generation_error)
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_GENERATION_ERRS);
		else
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_CORRUPTION_ERRS);
	}

	if (sctx->readonly) {
		ASSERT(!sctx->is_dev_replace);
		goto out;
	}

	if (!is_metadata && !have_csum) {
		struct scrub_fixup_nodatasum *fixup_nodatasum;

		WARN_ON(sctx->is_dev_replace);

nodatasum_case:

		/*
		 * !is_metadata and !have_csum, this means that the data
		 * might not be COW'ed, that it might be modified
		 * concurrently. The general strategy to work on the
		 * commit root does not help in the case when COW is not
		 * used.
		 */
		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
		if (!fixup_nodatasum)
			goto did_not_correct_error;
		fixup_nodatasum->sctx = sctx;
		fixup_nodatasum->dev = dev;
		fixup_nodatasum->logical = logical;
		fixup_nodatasum->root = fs_info->extent_root;
		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
		scrub_pending_trans_workers_inc(sctx);
		btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
				scrub_fixup_nodatasum, NULL, NULL);
		btrfs_queue_work(fs_info->scrub_workers,
				 &fixup_nodatasum->work);
		goto out;
	}

	/*
	 * now build and submit the bios for the other mirrors, check
	 * checksums.
	 * First try to pick the mirror which is completely without I/O
	 * errors and also does not have a checksum error.
	 * If one is found, and if a checksum is present, the full block
	 * that is known to contain an error is rewritten. Afterwards
	 * the block is known to be corrected.
	 * If a mirror is found which is completely correct, and no
	 * checksum is present, only those pages are rewritten that had
	 * an I/O error in the block to be repaired, since it cannot be
	 * determined, which copy of the other pages is better (and it
	 * could happen otherwise that a correct page would be
	 * overwritten by a bad one).
	 */
	for (mirror_index = 0;
	     mirror_index < BTRFS_MAX_MIRRORS &&
	     sblocks_for_recheck[mirror_index].page_count > 0;
	     mirror_index++) {
		struct scrub_block *sblock_other;

		if (mirror_index == failed_mirror_index)
			continue;
		sblock_other = sblocks_for_recheck + mirror_index;

		/* build and submit the bios, check checksums */
		scrub_recheck_block(fs_info, sblock_other, is_metadata,
				    have_csum, csum, generation,
				    sctx->csum_size, 0);

		if (!sblock_other->header_error &&
		    !sblock_other->checksum_error &&
		    sblock_other->no_io_error_seen) {
			if (sctx->is_dev_replace) {
				scrub_write_block_to_dev_replace(sblock_other);
				goto corrected_error;
			} else {
				ret = scrub_repair_block_from_good_copy(
						sblock_bad, sblock_other);
				if (!ret)
					goto corrected_error;
			}
		}
	}

	if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
		goto did_not_correct_error;

	/*
	 * In case of I/O errors in the area that is supposed to be
	 * repaired, continue by picking good copies of those pages.
	 * Select the good pages from mirrors to rewrite bad pages from
	 * the area to fix. Afterwards verify the checksum of the block
	 * that is supposed to be repaired. This verification step is
	 * only done for the purpose of statistic counting and for the
	 * final scrub report, whether errors remain.
	 * A perfect algorithm could make use of the checksum and try
	 * all possible combinations of pages from the different mirrors
	 * until the checksum verification succeeds. For example, when
	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
	 * of mirror #2 is readable but the final checksum test fails,
	 * then the 2nd page of mirror #3 could be tried, to check whether
	 * the final checksum now succeeds. But this would be a rare
	 * exception and is therefore not implemented. At least it is
	 * avoided that the good copy is overwritten.
	 * A more useful improvement would be to pick the sectors
	 * without I/O error based on sector sizes (512 bytes on legacy
	 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
	 * mirror could be repaired by taking 512 bytes of a different
	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
	 * area are unreadable.
	 */
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001137 success = 1;
Zhao Leib968fed2015-01-20 15:11:41 +08001138 for (page_num = 0; page_num < sblock_bad->page_count;
1139 page_num++) {
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001140 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
Zhao Leib968fed2015-01-20 15:11:41 +08001141 struct scrub_block *sblock_other = NULL;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001142
Zhao Leib968fed2015-01-20 15:11:41 +08001143 /* skip no-io-error page in scrub */
1144 if (!page_bad->io_error && !sctx->is_dev_replace)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001145 continue;
1146
Zhao Leib968fed2015-01-20 15:11:41 +08001147 /* try to find no-io-error page in mirrors */
1148 if (page_bad->io_error) {
1149 for (mirror_index = 0;
1150 mirror_index < BTRFS_MAX_MIRRORS &&
1151 sblocks_for_recheck[mirror_index].page_count > 0;
1152 mirror_index++) {
1153 if (!sblocks_for_recheck[mirror_index].
1154 pagev[page_num]->io_error) {
1155 sblock_other = sblocks_for_recheck +
1156 mirror_index;
1157 break;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001158 }
Jan Schmidt13db62b2011-06-13 19:56:13 +02001159 }
Zhao Leib968fed2015-01-20 15:11:41 +08001160 if (!sblock_other)
1161 success = 0;
Jan Schmidt13db62b2011-06-13 19:56:13 +02001162 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001163
Zhao Leib968fed2015-01-20 15:11:41 +08001164 if (sctx->is_dev_replace) {
1165 /*
1166 * did not find a mirror to fetch the page
1167 * from. scrub_write_page_to_dev_replace()
1168 * handles this case (page->io_error), by
1169 * filling the block with zeros before
1170 * submitting the write request
1171 */
1172 if (!sblock_other)
1173 sblock_other = sblock_bad;
1174
1175 if (scrub_write_page_to_dev_replace(sblock_other,
1176 page_num) != 0) {
1177 btrfs_dev_replace_stats_inc(
1178 &sctx->dev_root->
1179 fs_info->dev_replace.
1180 num_write_errors);
1181 success = 0;
1182 }
1183 } else if (sblock_other) {
1184 ret = scrub_repair_page_from_good_copy(sblock_bad,
1185 sblock_other,
1186 page_num, 0);
1187 if (0 == ret)
1188 page_bad->io_error = 0;
1189 else
1190 success = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001191 }
1192 }
1193
Zhao Leib968fed2015-01-20 15:11:41 +08001194 if (success && !sctx->is_dev_replace) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001195 if (is_metadata || have_csum) {
1196 /*
1197 * need to verify the checksum now that all
1198 * sectors on disk are repaired (the write
1199 * request for data to be repaired is on its way).
1200 * Just be lazy and use scrub_recheck_block()
1201 * which re-reads the data before the checksum
1202 * is verified, but most likely the data comes out
1203 * of the page cache.
1204 */
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001205 scrub_recheck_block(fs_info, sblock_bad,
1206 is_metadata, have_csum, csum,
Miao Xieaf8e2d12014-10-23 14:42:50 +08001207 generation, sctx->csum_size, 1);
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001208 if (!sblock_bad->header_error &&
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001209 !sblock_bad->checksum_error &&
1210 sblock_bad->no_io_error_seen)
1211 goto corrected_error;
1212 else
1213 goto did_not_correct_error;
1214 } else {
1215corrected_error:
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001216 spin_lock(&sctx->stat_lock);
1217 sctx->stat.corrected_errors++;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08001218 sblock_to_check->data_corrected = 1;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001219 spin_unlock(&sctx->stat_lock);
Josef Bacik606686e2012-06-04 14:03:51 -04001220 printk_ratelimited_in_rcu(KERN_ERR
Frank Holtonefe120a2013-12-20 11:37:06 -05001221 "BTRFS: fixed up error at logical %llu on dev %s\n",
Geert Uytterhoevenc1c9ff72013-08-20 13:20:07 +02001222 logical, rcu_str_deref(dev->name));
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001223 }
1224 } else {
1225did_not_correct_error:
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001226 spin_lock(&sctx->stat_lock);
1227 sctx->stat.uncorrectable_errors++;
1228 spin_unlock(&sctx->stat_lock);
Josef Bacik606686e2012-06-04 14:03:51 -04001229 printk_ratelimited_in_rcu(KERN_ERR
Frank Holtonefe120a2013-12-20 11:37:06 -05001230 "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
Geert Uytterhoevenc1c9ff72013-08-20 13:20:07 +02001231 logical, rcu_str_deref(dev->name));
Arne Jansena2de7332011-03-08 14:14:00 +01001232 }
1233
1234out:
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001235 if (sblocks_for_recheck) {
1236 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1237 mirror_index++) {
1238 struct scrub_block *sblock = sblocks_for_recheck +
1239 mirror_index;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001240 struct scrub_recover *recover;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001241 int page_index;
1242
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001243 for (page_index = 0; page_index < sblock->page_count;
1244 page_index++) {
1245 sblock->pagev[page_index]->sblock = NULL;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001246 recover = sblock->pagev[page_index]->recover;
1247 if (recover) {
1248 scrub_put_recover(recover);
1249 sblock->pagev[page_index]->recover =
1250 NULL;
1251 }
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001252 scrub_page_put(sblock->pagev[page_index]);
1253 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001254 }
1255 kfree(sblocks_for_recheck);
1256 }
1257
1258 return 0;
Arne Jansena2de7332011-03-08 14:14:00 +01001259}
1260
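/*
 * number of different ways the recheck code can read or rebuild a block:
 * 3 when the bbio has a RAID6 Q stripe, 2 for RAID5, otherwise the number
 * of stripes (each returned stripe represents one mirror here)
 */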
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001261static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
Miao Xieaf8e2d12014-10-23 14:42:50 +08001262{
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001263 if (bbio->raid_map) {
Zhao Leie34c3302015-01-20 15:11:31 +08001264 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
1265
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001266 if (bbio->raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
Miao Xieaf8e2d12014-10-23 14:42:50 +08001267 return 3;
1268 else
1269 return 2;
1270 } else {
1271 return (int)bbio->num_stripes;
1272 }
1273}
1274
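/*
 * map a logical address to a stripe: for RAID5/6 (raid_map given), find the
 * data stripe that covers @logical and the offset into it; for all other
 * profiles the mirror number is the stripe index and the offset is 0
 */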
1275static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
1276 u64 mapped_length,
1277 int nstripes, int mirror,
1278 int *stripe_index,
1279 u64 *stripe_offset)
1280{
1281 int i;
1282
1283 if (raid_map) {
1284 /* RAID5/6 */
1285 for (i = 0; i < nstripes; i++) {
1286 if (raid_map[i] == RAID6_Q_STRIPE ||
1287 raid_map[i] == RAID5_P_STRIPE)
1288 continue;
1289
1290 if (logical >= raid_map[i] &&
1291 logical < raid_map[i] + mapped_length)
1292 break;
1293 }
1294
1295 *stripe_index = i;
1296 *stripe_offset = logical - raid_map[i];
1297 } else {
1298 /* The other RAID type */
1299 *stripe_index = mirror;
1300 *stripe_offset = 0;
1301 }
1302}
1303
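/*
 * build one scrub_block per mirror for the range [logical, logical + length):
 * the range is mapped PAGE_SIZE at a time, a page is allocated for every
 * mirror, and each page holds a reference on a scrub_recover that keeps the
 * bbio around for a possible RAID5/6 rebuild
 */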
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001304static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
Stefan Behrens3ec706c2012-11-05 15:46:42 +01001305 struct btrfs_fs_info *fs_info,
Stefan Behrensff023aa2012-11-06 11:43:11 +01001306 struct scrub_block *original_sblock,
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001307 u64 length, u64 logical,
1308 struct scrub_block *sblocks_for_recheck)
Arne Jansena2de7332011-03-08 14:14:00 +01001309{
Miao Xieaf8e2d12014-10-23 14:42:50 +08001310 struct scrub_recover *recover;
1311 struct btrfs_bio *bbio;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001312 u64 sublen;
1313 u64 mapped_length;
1314 u64 stripe_offset;
1315 int stripe_index;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001316 int page_index;
1317 int mirror_index;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001318 int nmirrors;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001319 int ret;
1320
1321 /*
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001322 * note: the two members ref_count and outstanding_pages
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001323 * are not used (and not set) in the blocks that are used for
1324 * the recheck procedure
1325 */
1326
1327 page_index = 0;
1328 while (length > 0) {
Miao Xieaf8e2d12014-10-23 14:42:50 +08001329 sublen = min_t(u64, length, PAGE_SIZE);
1330 mapped_length = sublen;
1331 bbio = NULL;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001332
1333 /*
1334 * with a length of PAGE_SIZE, each returned stripe
1335 * represents one mirror
1336 */
Miao Xieaf8e2d12014-10-23 14:42:50 +08001337 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001338 &mapped_length, &bbio, 0, 1);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001339 if (ret || !bbio || mapped_length < sublen) {
Zhao Lei6e9606d2015-01-20 15:11:34 +08001340 btrfs_put_bbio(bbio);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001341 return -EIO;
1342 }
1343
Miao Xieaf8e2d12014-10-23 14:42:50 +08001344 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1345 if (!recover) {
Zhao Lei6e9606d2015-01-20 15:11:34 +08001346 btrfs_put_bbio(bbio);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001347 return -ENOMEM;
1348 }
1349
1350 atomic_set(&recover->refs, 1);
1351 recover->bbio = bbio;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001352 recover->map_length = mapped_length;
1353
Stefan Behrensff023aa2012-11-06 11:43:11 +01001354 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001355
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001356 nmirrors = scrub_nr_raid_mirrors(bbio);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001357 for (mirror_index = 0; mirror_index < nmirrors;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001358 mirror_index++) {
1359 struct scrub_block *sblock;
1360 struct scrub_page *page;
1361
1362 if (mirror_index >= BTRFS_MAX_MIRRORS)
Zhao Leidc5f7a32015-01-20 15:11:39 +08001363 break;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001364
1365 sblock = sblocks_for_recheck + mirror_index;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001366 sblock->sctx = sctx;
1367 page = kzalloc(sizeof(*page), GFP_NOFS);
1368 if (!page) {
1369leave_nomem:
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001370 spin_lock(&sctx->stat_lock);
1371 sctx->stat.malloc_errors++;
1372 spin_unlock(&sctx->stat_lock);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001373 scrub_put_recover(recover);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001374 return -ENOMEM;
1375 }
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001376 scrub_page_get(page);
1377 sblock->pagev[page_index] = page;
1378 page->logical = logical;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001379
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001380 scrub_stripe_index_and_offset(logical, bbio->raid_map,
Miao Xieaf8e2d12014-10-23 14:42:50 +08001381 mapped_length,
Zhao Leie34c3302015-01-20 15:11:31 +08001382 bbio->num_stripes -
1383 bbio->num_tgtdevs,
Miao Xieaf8e2d12014-10-23 14:42:50 +08001384 mirror_index,
1385 &stripe_index,
1386 &stripe_offset);
1387 page->physical = bbio->stripes[stripe_index].physical +
1388 stripe_offset;
1389 page->dev = bbio->stripes[stripe_index].dev;
1390
Stefan Behrensff023aa2012-11-06 11:43:11 +01001391 BUG_ON(page_index >= original_sblock->page_count);
1392 page->physical_for_dev_replace =
1393 original_sblock->pagev[page_index]->
1394 physical_for_dev_replace;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001395 /* for missing devices, dev->bdev is NULL */
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001396 page->mirror_num = mirror_index + 1;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001397 sblock->page_count++;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001398 page->page = alloc_page(GFP_NOFS);
1399 if (!page->page)
1400 goto leave_nomem;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001401
1402 scrub_get_recover(recover);
1403 page->recover = recover;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001404 }
Miao Xieaf8e2d12014-10-23 14:42:50 +08001405 scrub_put_recover(recover);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001406 length -= sublen;
1407 logical += sublen;
1408 page_index++;
1409 }
1410
1411 return 0;
1412}
1413
Miao Xieaf8e2d12014-10-23 14:42:50 +08001414struct scrub_bio_ret {
1415 struct completion event;
1416 int error;
1417};
1418
1419static void scrub_bio_wait_endio(struct bio *bio, int error)
1420{
1421 struct scrub_bio_ret *ret = bio->bi_private;
1422
1423 ret->error = error;
1424 complete(&ret->event);
1425}
1426
1427static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1428{
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001429 return page->recover && page->recover->bbio->raid_map;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001430}
1431
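/*
 * read one page that sits on a RAID5/6 stripe by handing the bio to the
 * raid56 recovery code and waiting synchronously for the result
 */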
1432static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1433 struct bio *bio,
1434 struct scrub_page *page)
1435{
1436 struct scrub_bio_ret done;
1437 int ret;
1438
1439 init_completion(&done.event);
1440 done.error = 0;
1441 bio->bi_iter.bi_sector = page->logical >> 9;
1442 bio->bi_private = &done;
1443 bio->bi_end_io = scrub_bio_wait_endio;
1444
1445 ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
Miao Xieaf8e2d12014-10-23 14:42:50 +08001446 page->recover->map_length,
Miao Xie42452152014-11-25 16:39:28 +08001447 page->mirror_num, 0);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001448 if (ret)
1449 return ret;
1450
1451 wait_for_completion(&done.event);
1452 if (done.error)
1453 return -EIO;
1454
1455 return 0;
1456}
1457
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001458/*
1459 * this function checks the on-disk data for checksum errors, header
1460 * errors and read I/O errors. If any I/O error happens, the exact pages
1461 * that failed are marked as bad. The goal is to enable scrub to take the
1462 * good pages from all the mirrors so that the failed pages of the mirror
1463 * just handled can be repaired.
1464 */
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001465static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1466 struct scrub_block *sblock, int is_metadata,
1467 int have_csum, u8 *csum, u64 generation,
Miao Xieaf8e2d12014-10-23 14:42:50 +08001468 u16 csum_size, int retry_failed_mirror)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001469{
1470 int page_num;
1471
1472 sblock->no_io_error_seen = 1;
1473 sblock->header_error = 0;
1474 sblock->checksum_error = 0;
1475
1476 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1477 struct bio *bio;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001478 struct scrub_page *page = sblock->pagev[page_num];
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001479
Stefan Behrens442a4f62012-05-25 16:06:08 +02001480 if (page->dev->bdev == NULL) {
Stefan Behrensea9947b2012-05-04 15:16:07 -04001481 page->io_error = 1;
1482 sblock->no_io_error_seen = 0;
1483 continue;
1484 }
1485
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001486 WARN_ON(!page->page);
Chris Mason9be33952013-05-17 18:30:14 -04001487 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001488 if (!bio) {
1489 page->io_error = 1;
1490 sblock->no_io_error_seen = 0;
1491 continue;
1492 }
Stefan Behrens442a4f62012-05-25 16:06:08 +02001493 bio->bi_bdev = page->dev->bdev;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001494
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001495 bio_add_page(bio, page->page, PAGE_SIZE, 0);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001496 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1497 if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1498 sblock->no_io_error_seen = 0;
1499 } else {
1500 bio->bi_iter.bi_sector = page->physical >> 9;
1501
1502 if (btrfsic_submit_bio_wait(READ, bio))
1503 sblock->no_io_error_seen = 0;
1504 }
Kent Overstreet33879d42013-11-23 22:33:32 -08001505
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001506 bio_put(bio);
1507 }
1508
1509 if (sblock->no_io_error_seen)
1510 scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1511 have_csum, csum, generation,
1512 csum_size);
1513
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001514 return;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001515}
1516
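/* returns 1 if @fsid matches the fsid of the device the page was read from */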
Miao Xie17a9be22014-07-24 11:37:08 +08001517static inline int scrub_check_fsid(u8 fsid[],
1518 struct scrub_page *spage)
1519{
1520 struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1521 int ret;
1522
1523 ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1524 return !ret;
1525}
1526
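/*
 * recompute the checksum of a freshly re-read block; for metadata the header
 * bytenr, fsid, chunk tree uuid and generation are verified as well. The
 * result is stored in the sblock's header_error, generation_error and
 * checksum_error flags.
 */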
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001527static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1528 struct scrub_block *sblock,
1529 int is_metadata, int have_csum,
1530 const u8 *csum, u64 generation,
1531 u16 csum_size)
1532{
1533 int page_num;
1534 u8 calculated_csum[BTRFS_CSUM_SIZE];
1535 u32 crc = ~(u32)0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001536 void *mapped_buffer;
1537
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001538 WARN_ON(!sblock->pagev[0]->page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001539 if (is_metadata) {
1540 struct btrfs_header *h;
1541
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001542 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001543 h = (struct btrfs_header *)mapped_buffer;
1544
Qu Wenruo3cae2102013-07-16 11:19:18 +08001545 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
Miao Xie17a9be22014-07-24 11:37:08 +08001546 !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001547 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
Stefan Behrens442a4f62012-05-25 16:06:08 +02001548 BTRFS_UUID_SIZE)) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001549 sblock->header_error = 1;
Qu Wenruo3cae2102013-07-16 11:19:18 +08001550 } else if (generation != btrfs_stack_header_generation(h)) {
Stefan Behrens442a4f62012-05-25 16:06:08 +02001551 sblock->header_error = 1;
1552 sblock->generation_error = 1;
1553 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001554 csum = h->csum;
1555 } else {
1556 if (!have_csum)
1557 return;
1558
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001559 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001560 }
1561
1562 for (page_num = 0;;) {
1563 if (page_num == 0 && is_metadata)
Liu Bob0496682013-03-14 14:57:45 +00001564 crc = btrfs_csum_data(
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001565 ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1566 crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1567 else
Liu Bob0496682013-03-14 14:57:45 +00001568 crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001569
Linus Torvalds9613beb2012-03-30 12:44:29 -07001570 kunmap_atomic(mapped_buffer);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001571 page_num++;
1572 if (page_num >= sblock->page_count)
1573 break;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001574 WARN_ON(!sblock->pagev[page_num]->page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001575
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001576 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001577 }
1578
1579 btrfs_csum_final(crc, calculated_csum);
1580 if (memcmp(calculated_csum, csum, csum_size))
1581 sblock->checksum_error = 1;
1582}
1583
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001584static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
Zhao Lei114ab502015-01-20 15:11:36 +08001585 struct scrub_block *sblock_good)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001586{
1587 int page_num;
1588 int ret = 0;
1589
1590 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1591 int ret_sub;
1592
1593 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1594 sblock_good,
Zhao Lei114ab502015-01-20 15:11:36 +08001595 page_num, 1);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001596 if (ret_sub)
1597 ret = ret_sub;
1598 }
1599
1600 return ret;
1601}
1602
1603static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1604 struct scrub_block *sblock_good,
1605 int page_num, int force_write)
1606{
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001607 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1608 struct scrub_page *page_good = sblock_good->pagev[page_num];
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001609
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001610 BUG_ON(page_bad->page == NULL);
1611 BUG_ON(page_good->page == NULL);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001612 if (force_write || sblock_bad->header_error ||
1613 sblock_bad->checksum_error || page_bad->io_error) {
1614 struct bio *bio;
1615 int ret;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001616
Stefan Behrensff023aa2012-11-06 11:43:11 +01001617 if (!page_bad->dev->bdev) {
Frank Holtonefe120a2013-12-20 11:37:06 -05001618 printk_ratelimited(KERN_WARNING "BTRFS: "
1619 "scrub_repair_page_from_good_copy(bdev == NULL) "
1620 "is unexpected!\n");
Stefan Behrensff023aa2012-11-06 11:43:11 +01001621 return -EIO;
1622 }
1623
Chris Mason9be33952013-05-17 18:30:14 -04001624 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
Tsutomu Itohe627ee72012-04-12 16:03:56 -04001625 if (!bio)
1626 return -EIO;
Stefan Behrens442a4f62012-05-25 16:06:08 +02001627 bio->bi_bdev = page_bad->dev->bdev;
Kent Overstreet4f024f32013-10-11 15:44:27 -07001628 bio->bi_iter.bi_sector = page_bad->physical >> 9;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001629
1630 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1631 if (PAGE_SIZE != ret) {
1632 bio_put(bio);
1633 return -EIO;
1634 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001635
Kent Overstreet33879d42013-11-23 22:33:32 -08001636 if (btrfsic_submit_bio_wait(WRITE, bio)) {
Stefan Behrens442a4f62012-05-25 16:06:08 +02001637 btrfs_dev_stat_inc_and_print(page_bad->dev,
1638 BTRFS_DEV_STAT_WRITE_ERRS);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001639 btrfs_dev_replace_stats_inc(
1640 &sblock_bad->sctx->dev_root->fs_info->
1641 dev_replace.num_write_errors);
Stefan Behrens442a4f62012-05-25 16:06:08 +02001642 bio_put(bio);
1643 return -EIO;
1644 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001645 bio_put(bio);
1646 }
1647
1648 return 0;
1649}
1650
Stefan Behrensff023aa2012-11-06 11:43:11 +01001651static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1652{
1653 int page_num;
1654
Miao Xie5a6ac9e2014-11-06 17:20:58 +08001655 /*
1656 * This block is used for the check of the parity on the source device,
1657 * so the data needn't be written into the destination device.
1658 */
1659 if (sblock->sparity)
1660 return;
1661
Stefan Behrensff023aa2012-11-06 11:43:11 +01001662 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1663 int ret;
1664
1665 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1666 if (ret)
1667 btrfs_dev_replace_stats_inc(
1668 &sblock->sctx->dev_root->fs_info->dev_replace.
1669 num_write_errors);
1670 }
1671}
1672
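/*
 * write one page to the dev-replace target; pages that could not be read are
 * zero-filled before being queued for writing
 */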
1673static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1674 int page_num)
1675{
1676 struct scrub_page *spage = sblock->pagev[page_num];
1677
1678 BUG_ON(spage->page == NULL);
1679 if (spage->io_error) {
1680 void *mapped_buffer = kmap_atomic(spage->page);
1681
1682 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1683 flush_dcache_page(spage->page);
1684 kunmap_atomic(mapped_buffer);
1685 }
1686 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1687}
1688
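/*
 * append @spage to the current dev-replace write bio; the bio is submitted
 * first if the page is not physically or logically contiguous with it, and
 * submitted as soon as it becomes full
 */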
1689static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1690 struct scrub_page *spage)
1691{
1692 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1693 struct scrub_bio *sbio;
1694 int ret;
1695
1696 mutex_lock(&wr_ctx->wr_lock);
1697again:
1698 if (!wr_ctx->wr_curr_bio) {
1699 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1700 GFP_NOFS);
1701 if (!wr_ctx->wr_curr_bio) {
1702 mutex_unlock(&wr_ctx->wr_lock);
1703 return -ENOMEM;
1704 }
1705 wr_ctx->wr_curr_bio->sctx = sctx;
1706 wr_ctx->wr_curr_bio->page_count = 0;
1707 }
1708 sbio = wr_ctx->wr_curr_bio;
1709 if (sbio->page_count == 0) {
1710 struct bio *bio;
1711
1712 sbio->physical = spage->physical_for_dev_replace;
1713 sbio->logical = spage->logical;
1714 sbio->dev = wr_ctx->tgtdev;
1715 bio = sbio->bio;
1716 if (!bio) {
Chris Mason9be33952013-05-17 18:30:14 -04001717 bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001718 if (!bio) {
1719 mutex_unlock(&wr_ctx->wr_lock);
1720 return -ENOMEM;
1721 }
1722 sbio->bio = bio;
1723 }
1724
1725 bio->bi_private = sbio;
1726 bio->bi_end_io = scrub_wr_bio_end_io;
1727 bio->bi_bdev = sbio->dev->bdev;
Kent Overstreet4f024f32013-10-11 15:44:27 -07001728 bio->bi_iter.bi_sector = sbio->physical >> 9;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001729 sbio->err = 0;
1730 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1731 spage->physical_for_dev_replace ||
1732 sbio->logical + sbio->page_count * PAGE_SIZE !=
1733 spage->logical) {
1734 scrub_wr_submit(sctx);
1735 goto again;
1736 }
1737
1738 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1739 if (ret != PAGE_SIZE) {
1740 if (sbio->page_count < 1) {
1741 bio_put(sbio->bio);
1742 sbio->bio = NULL;
1743 mutex_unlock(&wr_ctx->wr_lock);
1744 return -EIO;
1745 }
1746 scrub_wr_submit(sctx);
1747 goto again;
1748 }
1749
1750 sbio->pagev[sbio->page_count] = spage;
1751 scrub_page_get(spage);
1752 sbio->page_count++;
1753 if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1754 scrub_wr_submit(sctx);
1755 mutex_unlock(&wr_ctx->wr_lock);
1756
1757 return 0;
1758}
1759
1760static void scrub_wr_submit(struct scrub_ctx *sctx)
1761{
1762 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1763 struct scrub_bio *sbio;
1764
1765 if (!wr_ctx->wr_curr_bio)
1766 return;
1767
1768 sbio = wr_ctx->wr_curr_bio;
1769 wr_ctx->wr_curr_bio = NULL;
1770 WARN_ON(!sbio->bio->bi_bdev);
1771 scrub_pending_bio_inc(sctx);
1772	/* process all writes in a single worker thread, so that the block layer
1773	 * can order the requests before sending them to the driver; this
1774	 * doubled the write performance on spinning disks when measured
1775	 * with Linux 3.5 */
1776 btrfsic_submit_bio(WRITE, sbio->bio);
1777}
1778
1779static void scrub_wr_bio_end_io(struct bio *bio, int err)
1780{
1781 struct scrub_bio *sbio = bio->bi_private;
1782 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1783
1784 sbio->err = err;
1785 sbio->bio = bio;
1786
Liu Bo9e0af232014-08-15 23:36:53 +08001787 btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1788 scrub_wr_bio_end_io_worker, NULL, NULL);
Qu Wenruo0339ef22014-02-28 10:46:17 +08001789 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001790}
1791
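/*
 * completion work for a dev-replace write bio: on error every page in the
 * bio is marked with an I/O error and the dev-replace write error counter is
 * bumped; afterwards the page references and the sbio itself are released
 */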
1792static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1793{
1794 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1795 struct scrub_ctx *sctx = sbio->sctx;
1796 int i;
1797
1798 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1799 if (sbio->err) {
1800 struct btrfs_dev_replace *dev_replace =
1801 &sbio->sctx->dev_root->fs_info->dev_replace;
1802
1803 for (i = 0; i < sbio->page_count; i++) {
1804 struct scrub_page *spage = sbio->pagev[i];
1805
1806 spage->io_error = 1;
1807 btrfs_dev_replace_stats_inc(&dev_replace->
1808 num_write_errors);
1809 }
1810 }
1811
1812 for (i = 0; i < sbio->page_count; i++)
1813 scrub_page_put(sbio->pagev[i]);
1814
1815 bio_put(sbio->bio);
1816 kfree(sbio);
1817 scrub_pending_bio_dec(sctx);
1818}
1819
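/*
 * verify the block according to the flags of its first page (data, tree
 * block or super); a non-zero return means the block is bad and
 * scrub_handle_errored_block() has already been called on it
 */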
1820static int scrub_checksum(struct scrub_block *sblock)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001821{
1822 u64 flags;
1823 int ret;
1824
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001825 WARN_ON(sblock->page_count < 1);
1826 flags = sblock->pagev[0]->flags;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001827 ret = 0;
1828 if (flags & BTRFS_EXTENT_FLAG_DATA)
1829 ret = scrub_checksum_data(sblock);
1830 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1831 ret = scrub_checksum_tree_block(sblock);
1832 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1833 (void)scrub_checksum_super(sblock);
1834 else
1835 WARN_ON(1);
1836 if (ret)
1837 scrub_handle_errored_block(sblock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001838
1839 return ret;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001840}
1841
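/*
 * checksum one data block: the checksum is computed page by page over one
 * sector and compared against the csum stored with the first page; returns 1
 * on mismatch, 0 when it matches or no csum is available
 */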
1842static int scrub_checksum_data(struct scrub_block *sblock)
1843{
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001844 struct scrub_ctx *sctx = sblock->sctx;
Arne Jansena2de7332011-03-08 14:14:00 +01001845 u8 csum[BTRFS_CSUM_SIZE];
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001846 u8 *on_disk_csum;
1847 struct page *page;
1848 void *buffer;
Arne Jansena2de7332011-03-08 14:14:00 +01001849 u32 crc = ~(u32)0;
1850 int fail = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001851 u64 len;
1852 int index;
Arne Jansena2de7332011-03-08 14:14:00 +01001853
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001854 BUG_ON(sblock->page_count < 1);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001855 if (!sblock->pagev[0]->have_csum)
Arne Jansena2de7332011-03-08 14:14:00 +01001856 return 0;
1857
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001858 on_disk_csum = sblock->pagev[0]->csum;
1859 page = sblock->pagev[0]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07001860 buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001861
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001862 len = sctx->sectorsize;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001863 index = 0;
1864 for (;;) {
1865 u64 l = min_t(u64, len, PAGE_SIZE);
1866
Liu Bob0496682013-03-14 14:57:45 +00001867 crc = btrfs_csum_data(buffer, crc, l);
Linus Torvalds9613beb2012-03-30 12:44:29 -07001868 kunmap_atomic(buffer);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001869 len -= l;
1870 if (len == 0)
1871 break;
1872 index++;
1873 BUG_ON(index >= sblock->page_count);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001874 BUG_ON(!sblock->pagev[index]->page);
1875 page = sblock->pagev[index]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07001876 buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001877 }
1878
Arne Jansena2de7332011-03-08 14:14:00 +01001879 btrfs_csum_final(crc, csum);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001880 if (memcmp(csum, on_disk_csum, sctx->csum_size))
Arne Jansena2de7332011-03-08 14:14:00 +01001881 fail = 1;
1882
Arne Jansena2de7332011-03-08 14:14:00 +01001883 return fail;
1884}
1885
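/*
 * verify a tree block: bytenr, generation, fsid and chunk tree uuid are
 * checked against the header, then the checksum is computed over the block
 * minus the csum field and compared with the one stored in the header
 */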
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001886static int scrub_checksum_tree_block(struct scrub_block *sblock)
Arne Jansena2de7332011-03-08 14:14:00 +01001887{
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001888 struct scrub_ctx *sctx = sblock->sctx;
Arne Jansena2de7332011-03-08 14:14:00 +01001889 struct btrfs_header *h;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001890 struct btrfs_root *root = sctx->dev_root;
Arne Jansena2de7332011-03-08 14:14:00 +01001891 struct btrfs_fs_info *fs_info = root->fs_info;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001892 u8 calculated_csum[BTRFS_CSUM_SIZE];
1893 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1894 struct page *page;
1895 void *mapped_buffer;
1896 u64 mapped_size;
1897 void *p;
Arne Jansena2de7332011-03-08 14:14:00 +01001898 u32 crc = ~(u32)0;
1899 int fail = 0;
1900 int crc_fail = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001901 u64 len;
1902 int index;
1903
1904 BUG_ON(sblock->page_count < 1);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001905 page = sblock->pagev[0]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07001906 mapped_buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001907 h = (struct btrfs_header *)mapped_buffer;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001908 memcpy(on_disk_csum, h->csum, sctx->csum_size);
Arne Jansena2de7332011-03-08 14:14:00 +01001909
1910 /*
1911 * we don't use the getter functions here, as we
1912 * a) don't have an extent buffer and
1913 * b) the page is already kmapped
1914 */
Arne Jansena2de7332011-03-08 14:14:00 +01001915
Qu Wenruo3cae2102013-07-16 11:19:18 +08001916 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
Arne Jansena2de7332011-03-08 14:14:00 +01001917 ++fail;
1918
Qu Wenruo3cae2102013-07-16 11:19:18 +08001919 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
Arne Jansena2de7332011-03-08 14:14:00 +01001920 ++fail;
1921
Miao Xie17a9be22014-07-24 11:37:08 +08001922 if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
Arne Jansena2de7332011-03-08 14:14:00 +01001923 ++fail;
1924
1925 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1926 BTRFS_UUID_SIZE))
1927 ++fail;
1928
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001929 len = sctx->nodesize - BTRFS_CSUM_SIZE;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001930 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1931 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1932 index = 0;
1933 for (;;) {
1934 u64 l = min_t(u64, len, mapped_size);
1935
Liu Bob0496682013-03-14 14:57:45 +00001936 crc = btrfs_csum_data(p, crc, l);
Linus Torvalds9613beb2012-03-30 12:44:29 -07001937 kunmap_atomic(mapped_buffer);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001938 len -= l;
1939 if (len == 0)
1940 break;
1941 index++;
1942 BUG_ON(index >= sblock->page_count);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001943 BUG_ON(!sblock->pagev[index]->page);
1944 page = sblock->pagev[index]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07001945 mapped_buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001946 mapped_size = PAGE_SIZE;
1947 p = mapped_buffer;
1948 }
1949
1950 btrfs_csum_final(crc, calculated_csum);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001951 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
Arne Jansena2de7332011-03-08 14:14:00 +01001952 ++crc_fail;
1953
Arne Jansena2de7332011-03-08 14:14:00 +01001954 return fail || crc_fail;
1955}
1956
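/*
 * verify one super block copy: bytenr, generation, fsid and the checksum are
 * checked; errors are only counted and reflected in the device statistics
 */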
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001957static int scrub_checksum_super(struct scrub_block *sblock)
Arne Jansena2de7332011-03-08 14:14:00 +01001958{
1959 struct btrfs_super_block *s;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001960 struct scrub_ctx *sctx = sblock->sctx;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001961 u8 calculated_csum[BTRFS_CSUM_SIZE];
1962 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1963 struct page *page;
1964 void *mapped_buffer;
1965 u64 mapped_size;
1966 void *p;
Arne Jansena2de7332011-03-08 14:14:00 +01001967 u32 crc = ~(u32)0;
Stefan Behrens442a4f62012-05-25 16:06:08 +02001968 int fail_gen = 0;
1969 int fail_cor = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001970 u64 len;
1971 int index;
Arne Jansena2de7332011-03-08 14:14:00 +01001972
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001973 BUG_ON(sblock->page_count < 1);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001974 page = sblock->pagev[0]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07001975 mapped_buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001976 s = (struct btrfs_super_block *)mapped_buffer;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001977 memcpy(on_disk_csum, s->csum, sctx->csum_size);
Arne Jansena2de7332011-03-08 14:14:00 +01001978
Qu Wenruo3cae2102013-07-16 11:19:18 +08001979 if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
Stefan Behrens442a4f62012-05-25 16:06:08 +02001980 ++fail_cor;
Arne Jansena2de7332011-03-08 14:14:00 +01001981
Qu Wenruo3cae2102013-07-16 11:19:18 +08001982 if (sblock->pagev[0]->generation != btrfs_super_generation(s))
Stefan Behrens442a4f62012-05-25 16:06:08 +02001983 ++fail_gen;
Arne Jansena2de7332011-03-08 14:14:00 +01001984
Miao Xie17a9be22014-07-24 11:37:08 +08001985 if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
Stefan Behrens442a4f62012-05-25 16:06:08 +02001986 ++fail_cor;
Arne Jansena2de7332011-03-08 14:14:00 +01001987
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001988 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1989 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1990 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1991 index = 0;
1992 for (;;) {
1993 u64 l = min_t(u64, len, mapped_size);
1994
Liu Bob0496682013-03-14 14:57:45 +00001995 crc = btrfs_csum_data(p, crc, l);
Linus Torvalds9613beb2012-03-30 12:44:29 -07001996 kunmap_atomic(mapped_buffer);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001997 len -= l;
1998 if (len == 0)
1999 break;
2000 index++;
2001 BUG_ON(index >= sblock->page_count);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002002 BUG_ON(!sblock->pagev[index]->page);
2003 page = sblock->pagev[index]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07002004 mapped_buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002005 mapped_size = PAGE_SIZE;
2006 p = mapped_buffer;
2007 }
2008
2009 btrfs_csum_final(crc, calculated_csum);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002010 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
Stefan Behrens442a4f62012-05-25 16:06:08 +02002011 ++fail_cor;
Arne Jansena2de7332011-03-08 14:14:00 +01002012
Stefan Behrens442a4f62012-05-25 16:06:08 +02002013 if (fail_cor + fail_gen) {
Arne Jansena2de7332011-03-08 14:14:00 +01002014		/*
2015		 * if we find an error in a super block, we just report it;
2016		 * super blocks get rewritten with the next transaction commit
2017		 * anyway
2018		 */
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002019 spin_lock(&sctx->stat_lock);
2020 ++sctx->stat.super_errors;
2021 spin_unlock(&sctx->stat_lock);
Stefan Behrens442a4f62012-05-25 16:06:08 +02002022 if (fail_cor)
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002023 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
Stefan Behrens442a4f62012-05-25 16:06:08 +02002024 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2025 else
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002026 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
Stefan Behrens442a4f62012-05-25 16:06:08 +02002027 BTRFS_DEV_STAT_GENERATION_ERRS);
Arne Jansena2de7332011-03-08 14:14:00 +01002028 }
2029
Stefan Behrens442a4f62012-05-25 16:06:08 +02002030 return fail_cor + fail_gen;
Arne Jansena2de7332011-03-08 14:14:00 +01002031}
2032
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002033static void scrub_block_get(struct scrub_block *sblock)
2034{
2035 atomic_inc(&sblock->ref_count);
2036}
2037
2038static void scrub_block_put(struct scrub_block *sblock)
2039{
2040 if (atomic_dec_and_test(&sblock->ref_count)) {
2041 int i;
2042
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002043 if (sblock->sparity)
2044 scrub_parity_put(sblock->sparity);
2045
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002046 for (i = 0; i < sblock->page_count; i++)
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002047 scrub_page_put(sblock->pagev[i]);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002048 kfree(sblock);
2049 }
2050}
2051
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002052static void scrub_page_get(struct scrub_page *spage)
2053{
2054 atomic_inc(&spage->ref_count);
2055}
2056
2057static void scrub_page_put(struct scrub_page *spage)
2058{
2059 if (atomic_dec_and_test(&spage->ref_count)) {
2060 if (spage->page)
2061 __free_page(spage->page);
2062 kfree(spage);
2063 }
2064}
2065
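/* submit the currently assembled read bio, if any, and count it as in flight */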
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002066static void scrub_submit(struct scrub_ctx *sctx)
Arne Jansena2de7332011-03-08 14:14:00 +01002067{
2068 struct scrub_bio *sbio;
2069
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002070 if (sctx->curr == -1)
Stefan Behrens1623ede2012-03-27 14:21:26 -04002071 return;
Arne Jansena2de7332011-03-08 14:14:00 +01002072
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002073 sbio = sctx->bios[sctx->curr];
2074 sctx->curr = -1;
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01002075 scrub_pending_bio_inc(sctx);
Arne Jansena2de7332011-03-08 14:14:00 +01002076
Stefan Behrensff023aa2012-11-06 11:43:11 +01002077 if (!sbio->bio->bi_bdev) {
2078 /*
2079 * this case should not happen. If btrfs_map_block() is
2080 * wrong, it could happen for dev-replace operations on
2081 * missing devices when no mirrors are available, but in
2082 * this case it should already fail the mount.
2083 * This case is handled correctly (but _very_ slowly).
2084 */
2085 printk_ratelimited(KERN_WARNING
Frank Holtonefe120a2013-12-20 11:37:06 -05002086 "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
Stefan Behrensff023aa2012-11-06 11:43:11 +01002087 bio_endio(sbio->bio, -EIO);
2088 } else {
2089 btrfsic_submit_bio(READ, sbio->bio);
2090 }
Arne Jansena2de7332011-03-08 14:14:00 +01002091}
2092
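/*
 * append @spage to the current read bio, waiting for a free bio slot if none
 * is available; the bio is submitted first if the page is not contiguous with
 * it or targets a different device, and again as soon as it becomes full
 */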
Stefan Behrensff023aa2012-11-06 11:43:11 +01002093static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2094 struct scrub_page *spage)
Arne Jansena2de7332011-03-08 14:14:00 +01002095{
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002096 struct scrub_block *sblock = spage->sblock;
Arne Jansena2de7332011-03-08 14:14:00 +01002097 struct scrub_bio *sbio;
Arne Jansen69f4cb52011-11-11 08:17:10 -05002098 int ret;
Arne Jansena2de7332011-03-08 14:14:00 +01002099
2100again:
2101 /*
2102 * grab a fresh bio or wait for one to become available
2103 */
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002104 while (sctx->curr == -1) {
2105 spin_lock(&sctx->list_lock);
2106 sctx->curr = sctx->first_free;
2107 if (sctx->curr != -1) {
2108 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2109 sctx->bios[sctx->curr]->next_free = -1;
2110 sctx->bios[sctx->curr]->page_count = 0;
2111 spin_unlock(&sctx->list_lock);
Arne Jansena2de7332011-03-08 14:14:00 +01002112 } else {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002113 spin_unlock(&sctx->list_lock);
2114 wait_event(sctx->list_wait, sctx->first_free != -1);
Arne Jansena2de7332011-03-08 14:14:00 +01002115 }
2116 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002117 sbio = sctx->bios[sctx->curr];
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002118 if (sbio->page_count == 0) {
Arne Jansen69f4cb52011-11-11 08:17:10 -05002119 struct bio *bio;
2120
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002121 sbio->physical = spage->physical;
2122 sbio->logical = spage->logical;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002123 sbio->dev = spage->dev;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002124 bio = sbio->bio;
2125 if (!bio) {
Chris Mason9be33952013-05-17 18:30:14 -04002126 bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002127 if (!bio)
2128 return -ENOMEM;
2129 sbio->bio = bio;
2130 }
Arne Jansen69f4cb52011-11-11 08:17:10 -05002131
2132 bio->bi_private = sbio;
2133 bio->bi_end_io = scrub_bio_end_io;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002134 bio->bi_bdev = sbio->dev->bdev;
Kent Overstreet4f024f32013-10-11 15:44:27 -07002135 bio->bi_iter.bi_sector = sbio->physical >> 9;
Arne Jansen69f4cb52011-11-11 08:17:10 -05002136 sbio->err = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002137 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2138 spage->physical ||
2139 sbio->logical + sbio->page_count * PAGE_SIZE !=
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002140 spage->logical ||
2141 sbio->dev != spage->dev) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002142 scrub_submit(sctx);
Arne Jansen69f4cb52011-11-11 08:17:10 -05002143 goto again;
2144 }
2145
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002146 sbio->pagev[sbio->page_count] = spage;
2147 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2148 if (ret != PAGE_SIZE) {
2149 if (sbio->page_count < 1) {
2150 bio_put(sbio->bio);
2151 sbio->bio = NULL;
2152 return -EIO;
2153 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002154 scrub_submit(sctx);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002155 goto again;
Arne Jansena2de7332011-03-08 14:14:00 +01002156 }
Arne Jansen1bc87792011-05-28 21:57:55 +02002157
Stefan Behrensff023aa2012-11-06 11:43:11 +01002158 scrub_block_get(sblock); /* one for the page added to the bio */
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002159 atomic_inc(&sblock->outstanding_pages);
2160 sbio->page_count++;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002161 if (sbio->page_count == sctx->pages_per_rd_bio)
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002162 scrub_submit(sctx);
Arne Jansena2de7332011-03-08 14:14:00 +01002163
2164 return 0;
2165}
2166
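/*
 * split the range [logical, logical + len) into PAGE_SIZE pieces, collect
 * them in one scrub_block and queue every page for reading; if @force is set
 * the assembled bio is submitted right away
 */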
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002167static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002168 u64 physical, struct btrfs_device *dev, u64 flags,
Stefan Behrensff023aa2012-11-06 11:43:11 +01002169 u64 gen, int mirror_num, u8 *csum, int force,
2170 u64 physical_for_dev_replace)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002171{
2172 struct scrub_block *sblock;
2173 int index;
2174
2175 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2176 if (!sblock) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002177 spin_lock(&sctx->stat_lock);
2178 sctx->stat.malloc_errors++;
2179 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002180 return -ENOMEM;
2181 }
2182
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002183 /* one ref inside this function, plus one for each page added to
2184 * a bio later on */
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002185 atomic_set(&sblock->ref_count, 1);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002186 sblock->sctx = sctx;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002187 sblock->no_io_error_seen = 1;
2188
2189 for (index = 0; len > 0; index++) {
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002190 struct scrub_page *spage;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002191 u64 l = min_t(u64, len, PAGE_SIZE);
2192
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002193 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2194 if (!spage) {
2195leave_nomem:
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002196 spin_lock(&sctx->stat_lock);
2197 sctx->stat.malloc_errors++;
2198 spin_unlock(&sctx->stat_lock);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002199 scrub_block_put(sblock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002200 return -ENOMEM;
2201 }
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002202 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2203 scrub_page_get(spage);
2204 sblock->pagev[index] = spage;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002205 spage->sblock = sblock;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002206 spage->dev = dev;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002207 spage->flags = flags;
2208 spage->generation = gen;
2209 spage->logical = logical;
2210 spage->physical = physical;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002211 spage->physical_for_dev_replace = physical_for_dev_replace;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002212 spage->mirror_num = mirror_num;
2213 if (csum) {
2214 spage->have_csum = 1;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002215 memcpy(spage->csum, csum, sctx->csum_size);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002216 } else {
2217 spage->have_csum = 0;
2218 }
2219 sblock->page_count++;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002220 spage->page = alloc_page(GFP_NOFS);
2221 if (!spage->page)
2222 goto leave_nomem;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002223 len -= l;
2224 logical += l;
2225 physical += l;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002226 physical_for_dev_replace += l;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002227 }
2228
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002229 WARN_ON(sblock->page_count == 0);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002230 for (index = 0; index < sblock->page_count; index++) {
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002231 struct scrub_page *spage = sblock->pagev[index];
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002232 int ret;
2233
Stefan Behrensff023aa2012-11-06 11:43:11 +01002234 ret = scrub_add_page_to_rd_bio(sctx, spage);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002235 if (ret) {
2236 scrub_block_put(sblock);
2237 return ret;
2238 }
2239 }
2240
2241 if (force)
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002242 scrub_submit(sctx);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002243
2244 /* last one frees, either here or in bio completion for last page */
2245 scrub_block_put(sblock);
2246 return 0;
2247}
2248
2249static void scrub_bio_end_io(struct bio *bio, int err)
2250{
2251 struct scrub_bio *sbio = bio->bi_private;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002252 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002253
2254 sbio->err = err;
2255 sbio->bio = bio;
2256
Qu Wenruo0339ef22014-02-28 10:46:17 +08002257 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002258}
2259
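/*
 * completion work for a read bio: on error all its pages are marked bad, then
 * every scrub_block whose last outstanding page finished is completed, the
 * sbio is put back on the free list and, in dev-replace mode, pending writes
 * are flushed if requested
 */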
2260static void scrub_bio_end_io_worker(struct btrfs_work *work)
2261{
2262 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002263 struct scrub_ctx *sctx = sbio->sctx;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002264 int i;
2265
Stefan Behrensff023aa2012-11-06 11:43:11 +01002266 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002267 if (sbio->err) {
2268 for (i = 0; i < sbio->page_count; i++) {
2269 struct scrub_page *spage = sbio->pagev[i];
2270
2271 spage->io_error = 1;
2272 spage->sblock->no_io_error_seen = 0;
2273 }
2274 }
2275
2276 /* now complete the scrub_block items that have all pages completed */
2277 for (i = 0; i < sbio->page_count; i++) {
2278 struct scrub_page *spage = sbio->pagev[i];
2279 struct scrub_block *sblock = spage->sblock;
2280
2281 if (atomic_dec_and_test(&sblock->outstanding_pages))
2282 scrub_block_complete(sblock);
2283 scrub_block_put(sblock);
2284 }
2285
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002286 bio_put(sbio->bio);
2287 sbio->bio = NULL;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002288 spin_lock(&sctx->list_lock);
2289 sbio->next_free = sctx->first_free;
2290 sctx->first_free = sbio->index;
2291 spin_unlock(&sctx->list_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002292
2293 if (sctx->is_dev_replace &&
2294 atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2295 mutex_lock(&sctx->wr_ctx.wr_lock);
2296 scrub_wr_submit(sctx);
2297 mutex_unlock(&sctx->wr_ctx.wr_lock);
2298 }
2299
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01002300 scrub_pending_bio_dec(sctx);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002301}
2302
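/*
 * mark the sectors covered by [start, start + len) in the given per-stripe
 * bitmap; @start is first made relative to the logical start of the parity
 * stripe, and a range crossing the stripe end wraps around to the beginning
 */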
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002303static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2304 unsigned long *bitmap,
2305 u64 start, u64 len)
2306{
2307 int offset;
2308 int nsectors;
2309 int sectorsize = sparity->sctx->dev_root->sectorsize;
2310
2311 if (len >= sparity->stripe_len) {
2312 bitmap_set(bitmap, 0, sparity->nsectors);
2313 return;
2314 }
2315
2316 start -= sparity->logic_start;
2317 offset = (int)do_div(start, sparity->stripe_len);
2318 offset /= sectorsize;
2319 nsectors = (int)len / sectorsize;
2320
2321 if (offset + nsectors <= sparity->nsectors) {
2322 bitmap_set(bitmap, offset, nsectors);
2323 return;
2324 }
2325
2326 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2327 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2328}
2329
2330static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2331 u64 start, u64 len)
2332{
2333 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2334}
2335
2336static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2337 u64 start, u64 len)
2338{
2339 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2340}
2341
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002342static void scrub_block_complete(struct scrub_block *sblock)
2343{
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002344 int corrupted = 0;
2345
Stefan Behrensff023aa2012-11-06 11:43:11 +01002346 if (!sblock->no_io_error_seen) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002347 corrupted = 1;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002348 scrub_handle_errored_block(sblock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002349 } else {
2350		/*
2351		 * in the dev-replace case, a block with a checksum error is
2352		 * written via the repair mechanism; otherwise it is written
2353		 * to the replacement device right here
2354		 */
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002355 corrupted = scrub_checksum(sblock);
2356 if (!corrupted && sblock->sctx->is_dev_replace)
Stefan Behrensff023aa2012-11-06 11:43:11 +01002357 scrub_write_block_to_dev_replace(sblock);
2358 }
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002359
2360 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2361 u64 start = sblock->pagev[0]->logical;
2362 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2363 PAGE_SIZE;
2364
2365 scrub_parity_mark_sectors_error(sblock->sparity,
2366 start, end - start);
2367 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002368}
2369
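/*
 * look up the data checksum for @logical in sctx->csum_list, discarding
 * entries that end at or before it; returns 1 and copies the csum if one is
 * found, 0 otherwise
 */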
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002370static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
Arne Jansena2de7332011-03-08 14:14:00 +01002371 u8 *csum)
2372{
2373 struct btrfs_ordered_sum *sum = NULL;
Miao Xief51a4a12013-06-19 10:36:09 +08002374 unsigned long index;
Arne Jansena2de7332011-03-08 14:14:00 +01002375 unsigned long num_sectors;
Arne Jansena2de7332011-03-08 14:14:00 +01002376
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002377 while (!list_empty(&sctx->csum_list)) {
2378 sum = list_first_entry(&sctx->csum_list,
Arne Jansena2de7332011-03-08 14:14:00 +01002379 struct btrfs_ordered_sum, list);
2380 if (sum->bytenr > logical)
2381 return 0;
2382 if (sum->bytenr + sum->len > logical)
2383 break;
2384
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002385 ++sctx->stat.csum_discards;
Arne Jansena2de7332011-03-08 14:14:00 +01002386 list_del(&sum->list);
2387 kfree(sum);
2388 sum = NULL;
2389 }
2390 if (!sum)
2391 return 0;
2392
Miao Xief51a4a12013-06-19 10:36:09 +08002393 index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002394 num_sectors = sum->len / sctx->sectorsize;
Miao Xief51a4a12013-06-19 10:36:09 +08002395 memcpy(csum, sum->sums + index, sctx->csum_size);
2396 if (index == num_sectors - 1) {
Arne Jansena2de7332011-03-08 14:14:00 +01002397 list_del(&sum->list);
2398 kfree(sum);
2399 }
Miao Xief51a4a12013-06-19 10:36:09 +08002400 return 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002401}
2402
2403/* scrub extent tries to collect up to 64 kB for each bio */
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002404static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002405 u64 physical, struct btrfs_device *dev, u64 flags,
Stefan Behrensff023aa2012-11-06 11:43:11 +01002406 u64 gen, int mirror_num, u64 physical_for_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01002407{
2408 int ret;
2409 u8 csum[BTRFS_CSUM_SIZE];
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002410 u32 blocksize;
2411
2412 if (flags & BTRFS_EXTENT_FLAG_DATA) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002413 blocksize = sctx->sectorsize;
2414 spin_lock(&sctx->stat_lock);
2415 sctx->stat.data_extents_scrubbed++;
2416 sctx->stat.data_bytes_scrubbed += len;
2417 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002418 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002419 blocksize = sctx->nodesize;
2420 spin_lock(&sctx->stat_lock);
2421 sctx->stat.tree_extents_scrubbed++;
2422 sctx->stat.tree_bytes_scrubbed += len;
2423 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002424 } else {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002425 blocksize = sctx->sectorsize;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002426 WARN_ON(1);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002427 }
Arne Jansena2de7332011-03-08 14:14:00 +01002428
2429 while (len) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002430 u64 l = min_t(u64, len, blocksize);
Arne Jansena2de7332011-03-08 14:14:00 +01002431 int have_csum = 0;
2432
2433 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2434 /* push csums to sbio */
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002435 have_csum = scrub_find_csum(sctx, logical, l, csum);
Arne Jansena2de7332011-03-08 14:14:00 +01002436 if (have_csum == 0)
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002437 ++sctx->stat.no_csum;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002438 if (sctx->is_dev_replace && !have_csum) {
2439 ret = copy_nocow_pages(sctx, logical, l,
2440 mirror_num,
2441 physical_for_dev_replace);
2442 goto behind_scrub_pages;
2443 }
Arne Jansena2de7332011-03-08 14:14:00 +01002444 }
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002445 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
Stefan Behrensff023aa2012-11-06 11:43:11 +01002446 mirror_num, have_csum ? csum : NULL, 0,
2447 physical_for_dev_replace);
2448behind_scrub_pages:
Arne Jansena2de7332011-03-08 14:14:00 +01002449 if (ret)
2450 return ret;
2451 len -= l;
2452 logical += l;
2453 physical += l;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002454 physical_for_dev_replace += l;
Arne Jansena2de7332011-03-08 14:14:00 +01002455 }
2456 return 0;
2457}
2458
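/*
 * like scrub_pages(), but the block is tied to the scrub_parity and every
 * page gets an extra reference and is linked into the parity's page list for
 * the later parity check
 */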
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002459static int scrub_pages_for_parity(struct scrub_parity *sparity,
2460 u64 logical, u64 len,
2461 u64 physical, struct btrfs_device *dev,
2462 u64 flags, u64 gen, int mirror_num, u8 *csum)
2463{
2464 struct scrub_ctx *sctx = sparity->sctx;
2465 struct scrub_block *sblock;
2466 int index;
2467
2468 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2469 if (!sblock) {
2470 spin_lock(&sctx->stat_lock);
2471 sctx->stat.malloc_errors++;
2472 spin_unlock(&sctx->stat_lock);
2473 return -ENOMEM;
2474 }
2475
2476 /* one ref inside this function, plus one for each page added to
2477 * a bio later on */
2478 atomic_set(&sblock->ref_count, 1);
2479 sblock->sctx = sctx;
2480 sblock->no_io_error_seen = 1;
2481 sblock->sparity = sparity;
2482 scrub_parity_get(sparity);
2483
2484 for (index = 0; len > 0; index++) {
2485 struct scrub_page *spage;
2486 u64 l = min_t(u64, len, PAGE_SIZE);
2487
2488 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2489 if (!spage) {
2490leave_nomem:
2491 spin_lock(&sctx->stat_lock);
2492 sctx->stat.malloc_errors++;
2493 spin_unlock(&sctx->stat_lock);
2494 scrub_block_put(sblock);
2495 return -ENOMEM;
2496 }
2497 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2498 /* For scrub block */
2499 scrub_page_get(spage);
2500 sblock->pagev[index] = spage;
2501 /* For scrub parity */
2502 scrub_page_get(spage);
2503 list_add_tail(&spage->list, &sparity->spages);
2504 spage->sblock = sblock;
2505 spage->dev = dev;
2506 spage->flags = flags;
2507 spage->generation = gen;
2508 spage->logical = logical;
2509 spage->physical = physical;
2510 spage->mirror_num = mirror_num;
2511 if (csum) {
2512 spage->have_csum = 1;
2513 memcpy(spage->csum, csum, sctx->csum_size);
2514 } else {
2515 spage->have_csum = 0;
2516 }
2517 sblock->page_count++;
2518 spage->page = alloc_page(GFP_NOFS);
2519 if (!spage->page)
2520 goto leave_nomem;
2521 len -= l;
2522 logical += l;
2523 physical += l;
2524 }
2525
2526 WARN_ON(sblock->page_count == 0);
2527 for (index = 0; index < sblock->page_count; index++) {
2528 struct scrub_page *spage = sblock->pagev[index];
2529 int ret;
2530
2531 ret = scrub_add_page_to_rd_bio(sctx, spage);
2532 if (ret) {
2533 scrub_block_put(sblock);
2534 return ret;
2535 }
2536 }
2537
2538 /* last one frees, either here or in bio completion for last page */
2539 scrub_block_put(sblock);
2540 return 0;
2541}
2542
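/*
 * like scrub_extent(), but data sectors for which no checksum can be found
 * are simply skipped, and the pages are queued through
 * scrub_pages_for_parity()
 */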
2543static int scrub_extent_for_parity(struct scrub_parity *sparity,
2544 u64 logical, u64 len,
2545 u64 physical, struct btrfs_device *dev,
2546 u64 flags, u64 gen, int mirror_num)
2547{
2548 struct scrub_ctx *sctx = sparity->sctx;
2549 int ret;
2550 u8 csum[BTRFS_CSUM_SIZE];
2551 u32 blocksize;
2552
2553 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2554 blocksize = sctx->sectorsize;
2555 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2556 blocksize = sctx->nodesize;
2557 } else {
2558 blocksize = sctx->sectorsize;
2559 WARN_ON(1);
2560 }
2561
2562 while (len) {
2563 u64 l = min_t(u64, len, blocksize);
2564 int have_csum = 0;
2565
2566 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2567 /* push csums to sbio */
2568 have_csum = scrub_find_csum(sctx, logical, l, csum);
2569 if (have_csum == 0)
2570 goto skip;
2571 }
2572 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2573 flags, gen, mirror_num,
2574 have_csum ? csum : NULL);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002575 if (ret)
2576 return ret;
Dan Carpenter6b6d24b2014-12-12 22:30:00 +03002577skip:
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002578 len -= l;
2579 logical += l;
2580 physical += l;
2581 }
2582 return 0;
2583}
2584
Wang Shilong3b080b22014-04-01 18:01:43 +08002585/*
2586 * Given a physical address, this calculates its
2587 * logical offset. If this is a parity stripe, it returns
2588 * the leftmost data stripe's logical offset.
2589 *
2590 * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2591 */
2592static int get_raid56_logic_offset(u64 physical, int num,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002593 struct map_lookup *map, u64 *offset,
2594 u64 *stripe_start)
Wang Shilong3b080b22014-04-01 18:01:43 +08002595{
2596 int i;
2597 int j = 0;
2598 u64 stripe_nr;
2599 u64 last_offset;
2600 int stripe_index;
2601 int rot;
2602
2603 last_offset = (physical - map->stripes[num].physical) *
2604 nr_data_stripes(map);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002605 if (stripe_start)
2606 *stripe_start = last_offset;
2607
Wang Shilong3b080b22014-04-01 18:01:43 +08002608 *offset = last_offset;
2609 for (i = 0; i < nr_data_stripes(map); i++) {
2610 *offset = last_offset + i * map->stripe_len;
2611
2612 stripe_nr = *offset;
2613 do_div(stripe_nr, map->stripe_len);
2614 do_div(stripe_nr, nr_data_stripes(map));
2615
2616 /* Work out the disk rotation on this stripe-set */
2617 rot = do_div(stripe_nr, map->num_stripes);
2618 /* calculate which stripe this data locates */
2619 rot += i;
Wang Shilonge4fbaee2014-04-11 18:32:25 +08002620 stripe_index = rot % map->num_stripes;
Wang Shilong3b080b22014-04-01 18:01:43 +08002621 if (stripe_index == num)
2622 return 0;
2623 if (stripe_index < num)
2624 j++;
2625 }
2626 *offset = last_offset + j * map->stripe_len;
2627 return 1;
2628}
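/*
 * Worked example of the arithmetic above (layout values assumed for
 * illustration): take a 3-stripe RAID5 map (2 data stripes per
 * stripe-set), stripe_len = 64K, num = 1 and a physical offset of 128K
 * into the device extent, so last_offset = 128K * 2 = 256K.
 *   i = 0: *offset = 256K, stripe_nr = 256K / 64K / 2 = 2,
 *          rot = 2 % 3 = 2, stripe_index = (2 + 0) % 3 = 2 != num
 *   i = 1: *offset = 320K, stripe_nr = 320K / 64K / 2 = 2,
 *          rot = 2 % 3 = 2, stripe_index = (2 + 1) % 3 = 0 < num, so j = 1
 * Neither candidate lands on stripe 1, so that physical stripe holds
 * parity and the function returns 1 with *offset = 256K + 1 * 64K = 320K.
 */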
2629
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002630static void scrub_free_parity(struct scrub_parity *sparity)
2631{
2632 struct scrub_ctx *sctx = sparity->sctx;
2633 struct scrub_page *curr, *next;
2634 int nbits;
2635
2636 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2637 if (nbits) {
2638 spin_lock(&sctx->stat_lock);
2639 sctx->stat.read_errors += nbits;
2640 sctx->stat.uncorrectable_errors += nbits;
2641 spin_unlock(&sctx->stat_lock);
2642 }
2643
2644 list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2645 list_del_init(&curr->list);
2646 scrub_page_put(curr);
2647 }
2648
2649 kfree(sparity);
2650}
2651
2652static void scrub_parity_bio_endio(struct bio *bio, int error)
2653{
2654 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2655 struct scrub_ctx *sctx = sparity->sctx;
2656
2657 if (error)
2658 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2659 sparity->nsectors);
2660
2661 scrub_free_parity(sparity);
2662 scrub_pending_bio_dec(sctx);
2663 bio_put(bio);
2664}
2665
2666static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2667{
2668 struct scrub_ctx *sctx = sparity->sctx;
2669 struct bio *bio;
2670 struct btrfs_raid_bio *rbio;
2671 struct scrub_page *spage;
2672 struct btrfs_bio *bbio = NULL;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002673 u64 length;
2674 int ret;
2675
2676 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2677 sparity->nsectors))
2678 goto out;
2679
2680 length = sparity->logic_end - sparity->logic_start + 1;
Miao Xie76035972014-11-14 17:45:42 +08002681 ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002682 sparity->logic_start,
Zhao Lei8e5cfb52015-01-20 15:11:33 +08002683 &length, &bbio, 0, 1);
2684 if (ret || !bbio || !bbio->raid_map)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002685 goto bbio_out;
2686
2687 bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2688 if (!bio)
2689 goto bbio_out;
2690
2691 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2692 bio->bi_private = sparity;
2693 bio->bi_end_io = scrub_parity_bio_endio;
2694
2695 rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
Zhao Lei8e5cfb52015-01-20 15:11:33 +08002696 length, sparity->scrub_dev,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002697 sparity->dbitmap,
2698 sparity->nsectors);
2699 if (!rbio)
2700 goto rbio_out;
2701
2702 list_for_each_entry(spage, &sparity->spages, list)
2703 raid56_parity_add_scrub_pages(rbio, spage->page,
2704 spage->logical);
2705
2706 scrub_pending_bio_inc(sctx);
2707 raid56_parity_submit_scrub_rbio(rbio);
2708 return;
2709
2710rbio_out:
2711 bio_put(bio);
2712bbio_out:
Zhao Lei6e9606d2015-01-20 15:11:34 +08002713 btrfs_put_bbio(bbio);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002714 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2715 sparity->nsectors);
2716 spin_lock(&sctx->stat_lock);
2717 sctx->stat.malloc_errors++;
2718 spin_unlock(&sctx->stat_lock);
2719out:
2720 scrub_free_parity(sparity);
2721}
2722
2723static inline int scrub_calc_parity_bitmap_len(int nsectors)
2724{
2725 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
2726}
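/*
 * Example (values assumed for illustration): with stripe_len = 64K and a
 * 4K sectorsize, nsectors = 16, so on a 64-bit build this returns
 * DIV_ROUND_UP(16, 64) * 8 = 8 bytes, i.e. one unsigned long per bitmap.
 * scrub_raid56_parity() allocates 2 * bitmap_len so that dbitmap and
 * ebitmap each get a region of this size.
 */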
2727
2728static void scrub_parity_get(struct scrub_parity *sparity)
2729{
2730 atomic_inc(&sparity->ref_count);
2731}
2732
2733static void scrub_parity_put(struct scrub_parity *sparity)
2734{
2735 if (!atomic_dec_and_test(&sparity->ref_count))
2736 return;
2737
2738 scrub_parity_check_and_repair(sparity);
2739}
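/*
 * Lifetime sketch for struct scrub_parity: scrub_raid56_parity() holds
 * the initial reference and every block built by scrub_pages_for_parity()
 * takes another via scrub_parity_get(); whichever scrub_parity_put()
 * drops the last reference triggers scrub_parity_check_and_repair().
 */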
2740
2741static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2742 struct map_lookup *map,
2743 struct btrfs_device *sdev,
2744 struct btrfs_path *path,
2745 u64 logic_start,
2746 u64 logic_end)
2747{
2748 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2749 struct btrfs_root *root = fs_info->extent_root;
2750 struct btrfs_root *csum_root = fs_info->csum_root;
2751 struct btrfs_extent_item *extent;
2752 u64 flags;
2753 int ret;
2754 int slot;
2755 struct extent_buffer *l;
2756 struct btrfs_key key;
2757 u64 generation;
2758 u64 extent_logical;
2759 u64 extent_physical;
2760 u64 extent_len;
2761 struct btrfs_device *extent_dev;
2762 struct scrub_parity *sparity;
2763 int nsectors;
2764 int bitmap_len;
2765 int extent_mirror_num;
2766 int stop_loop = 0;
2767
2768 nsectors = map->stripe_len / root->sectorsize;
2769 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2770 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2771 GFP_NOFS);
2772 if (!sparity) {
2773 spin_lock(&sctx->stat_lock);
2774 sctx->stat.malloc_errors++;
2775 spin_unlock(&sctx->stat_lock);
2776 return -ENOMEM;
2777 }
2778
2779 sparity->stripe_len = map->stripe_len;
2780 sparity->nsectors = nsectors;
2781 sparity->sctx = sctx;
2782 sparity->scrub_dev = sdev;
2783 sparity->logic_start = logic_start;
2784 sparity->logic_end = logic_end;
2785 atomic_set(&sparity->ref_count, 1);
2786 INIT_LIST_HEAD(&sparity->spages);
2787 sparity->dbitmap = sparity->bitmap;
2788 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2789
2790 ret = 0;
2791 while (logic_start < logic_end) {
2792 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2793 key.type = BTRFS_METADATA_ITEM_KEY;
2794 else
2795 key.type = BTRFS_EXTENT_ITEM_KEY;
2796 key.objectid = logic_start;
2797 key.offset = (u64)-1;
2798
2799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2800 if (ret < 0)
2801 goto out;
2802
2803 if (ret > 0) {
2804 ret = btrfs_previous_extent_item(root, path, 0);
2805 if (ret < 0)
2806 goto out;
2807 if (ret > 0) {
2808 btrfs_release_path(path);
2809 ret = btrfs_search_slot(NULL, root, &key,
2810 path, 0, 0);
2811 if (ret < 0)
2812 goto out;
2813 }
2814 }
2815
2816 stop_loop = 0;
2817 while (1) {
2818 u64 bytes;
2819
2820 l = path->nodes[0];
2821 slot = path->slots[0];
2822 if (slot >= btrfs_header_nritems(l)) {
2823 ret = btrfs_next_leaf(root, path);
2824 if (ret == 0)
2825 continue;
2826 if (ret < 0)
2827 goto out;
2828
2829 stop_loop = 1;
2830 break;
2831 }
2832 btrfs_item_key_to_cpu(l, &key, slot);
2833
2834 if (key.type == BTRFS_METADATA_ITEM_KEY)
2835 bytes = root->nodesize;
2836 else
2837 bytes = key.offset;
2838
2839 if (key.objectid + bytes <= logic_start)
2840 goto next;
2841
2842 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2843 key.type != BTRFS_METADATA_ITEM_KEY)
2844 goto next;
2845
2846 if (key.objectid > logic_end) {
2847 stop_loop = 1;
2848 break;
2849 }
2850
2851 while (key.objectid >= logic_start + map->stripe_len)
2852 logic_start += map->stripe_len;
2853
2854 extent = btrfs_item_ptr(l, slot,
2855 struct btrfs_extent_item);
2856 flags = btrfs_extent_flags(l, extent);
2857 generation = btrfs_extent_generation(l, extent);
2858
2859 if (key.objectid < logic_start &&
2860 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2861 btrfs_err(fs_info,
2862 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2863 key.objectid, logic_start);
2864 goto next;
2865 }
2866again:
2867 extent_logical = key.objectid;
2868 extent_len = bytes;
2869
2870 if (extent_logical < logic_start) {
2871 extent_len -= logic_start - extent_logical;
2872 extent_logical = logic_start;
2873 }
2874
2875 if (extent_logical + extent_len >
2876 logic_start + map->stripe_len)
2877 extent_len = logic_start + map->stripe_len -
2878 extent_logical;
2879
2880 scrub_parity_mark_sectors_data(sparity, extent_logical,
2881 extent_len);
2882
2883 scrub_remap_extent(fs_info, extent_logical,
2884 extent_len, &extent_physical,
2885 &extent_dev,
2886 &extent_mirror_num);
2887
2888 ret = btrfs_lookup_csums_range(csum_root,
2889 extent_logical,
2890 extent_logical + extent_len - 1,
2891 &sctx->csum_list, 1);
2892 if (ret)
2893 goto out;
2894
2895 ret = scrub_extent_for_parity(sparity, extent_logical,
2896 extent_len,
2897 extent_physical,
2898 extent_dev, flags,
2899 generation,
2900 extent_mirror_num);
2901 if (ret)
2902 goto out;
2903
2904 scrub_free_csums(sctx);
2905 if (extent_logical + extent_len <
2906 key.objectid + bytes) {
2907 logic_start += map->stripe_len;
2908
2909 if (logic_start >= logic_end) {
2910 stop_loop = 1;
2911 break;
2912 }
2913
2914 if (logic_start < key.objectid + bytes) {
2915 cond_resched();
2916 goto again;
2917 }
2918 }
2919next:
2920 path->slots[0]++;
2921 }
2922
2923 btrfs_release_path(path);
2924
2925 if (stop_loop)
2926 break;
2927
2928 logic_start += map->stripe_len;
2929 }
2930out:
2931 if (ret < 0)
2932 scrub_parity_mark_sectors_error(sparity, logic_start,
2933 logic_end - logic_start + 1);
2934 scrub_parity_put(sparity);
2935 scrub_submit(sctx);
2936 mutex_lock(&sctx->wr_ctx.wr_lock);
2937 scrub_wr_submit(sctx);
2938 mutex_unlock(&sctx->wr_ctx.wr_lock);
2939
2940 btrfs_release_path(path);
2941 return ret < 0 ? ret : 0;
2942}
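/*
 * Bitmap interplay in the function above: sectors covered by data extents
 * are set in dbitmap via scrub_parity_mark_sectors_data(), sectors that
 * hit errors end up in ebitmap, and scrub_parity_check_and_repair() only
 * rebuilds parity for the dbitmap & ~ebitmap sectors; whatever is left in
 * ebitmap is counted as uncorrectable when the parity context is freed.
 */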
2943
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002944static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002945 struct map_lookup *map,
2946 struct btrfs_device *scrub_dev,
Stefan Behrensff023aa2012-11-06 11:43:11 +01002947 int num, u64 base, u64 length,
2948 int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01002949{
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002950 struct btrfs_path *path, *ppath;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002951 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
Arne Jansena2de7332011-03-08 14:14:00 +01002952 struct btrfs_root *root = fs_info->extent_root;
2953 struct btrfs_root *csum_root = fs_info->csum_root;
2954 struct btrfs_extent_item *extent;
Arne Jansene7786c32011-05-28 20:58:38 +00002955 struct blk_plug plug;
Arne Jansena2de7332011-03-08 14:14:00 +01002956 u64 flags;
2957 int ret;
2958 int slot;
Arne Jansena2de7332011-03-08 14:14:00 +01002959 u64 nstripes;
Arne Jansena2de7332011-03-08 14:14:00 +01002960 struct extent_buffer *l;
2961 struct btrfs_key key;
2962 u64 physical;
2963 u64 logical;
Liu Bo625f1c8d2013-04-27 02:56:57 +00002964 u64 logic_end;
Wang Shilong3b080b22014-04-01 18:01:43 +08002965 u64 physical_end;
Arne Jansena2de7332011-03-08 14:14:00 +01002966 u64 generation;
Jan Schmidte12fa9c2011-06-17 15:55:21 +02002967 int mirror_num;
Arne Jansen7a262852011-06-10 12:39:23 +02002968 struct reada_control *reada1;
2969 struct reada_control *reada2;
2970 struct btrfs_key key_start;
2971 struct btrfs_key key_end;
Arne Jansena2de7332011-03-08 14:14:00 +01002972 u64 increment = map->stripe_len;
2973 u64 offset;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002974 u64 extent_logical;
2975 u64 extent_physical;
2976 u64 extent_len;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002977 u64 stripe_logical;
2978 u64 stripe_end;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002979 struct btrfs_device *extent_dev;
2980 int extent_mirror_num;
Wang Shilong3b080b22014-04-01 18:01:43 +08002981 int stop_loop = 0;
David Woodhouse53b381b2013-01-29 18:40:14 -05002982
Arne Jansena2de7332011-03-08 14:14:00 +01002983 nstripes = length;
Wang Shilong3b080b22014-04-01 18:01:43 +08002984 physical = map->stripes[num].physical;
Arne Jansena2de7332011-03-08 14:14:00 +01002985 offset = 0;
2986 do_div(nstripes, map->stripe_len);
2987 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2988 offset = map->stripe_len * num;
2989 increment = map->stripe_len * map->num_stripes;
Jan Schmidt193ea742011-06-13 19:56:54 +02002990 mirror_num = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002991 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2992 int factor = map->num_stripes / map->sub_stripes;
2993 offset = map->stripe_len * (num / map->sub_stripes);
2994 increment = map->stripe_len * factor;
Jan Schmidt193ea742011-06-13 19:56:54 +02002995 mirror_num = num % map->sub_stripes + 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002996 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2997 increment = map->stripe_len;
Jan Schmidt193ea742011-06-13 19:56:54 +02002998 mirror_num = num % map->num_stripes + 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002999 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3000 increment = map->stripe_len;
Jan Schmidt193ea742011-06-13 19:56:54 +02003001 mirror_num = num % map->num_stripes + 1;
Wang Shilong3b080b22014-04-01 18:01:43 +08003002 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
3003 BTRFS_BLOCK_GROUP_RAID6)) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003004 get_raid56_logic_offset(physical, num, map, &offset, NULL);
Wang Shilong3b080b22014-04-01 18:01:43 +08003005 increment = map->stripe_len * nr_data_stripes(map);
3006 mirror_num = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003007 } else {
3008 increment = map->stripe_len;
Jan Schmidt193ea742011-06-13 19:56:54 +02003009 mirror_num = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003010 }
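	/*
	 * Example for the RAID10 branch above (geometry assumed for
	 * illustration): with num_stripes = 4, sub_stripes = 2 and
	 * stripe_len = 64K, scrubbing stripe num = 3 gives factor = 2,
	 * offset = 64K * (3 / 2) = 64K, increment = 64K * 2 = 128K and
	 * mirror_num = 3 % 2 + 1 = 2, i.e. this device holds the second
	 * copy of every other 64K stripe.
	 */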
3011
3012 path = btrfs_alloc_path();
3013 if (!path)
3014 return -ENOMEM;
3015
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003016 ppath = btrfs_alloc_path();
3017 if (!ppath) {
3018 btrfs_free_path(path);
3019 return -ENOMEM;
3020 }
3021
Stefan Behrensb5d67f62012-03-27 14:21:27 -04003022 /*
3023 * work on commit root. The related disk blocks are static as
3024 * long as COW is applied. This means it is safe to rewrite
3025 * them to repair disk errors without any race conditions.
3026 */
Arne Jansena2de7332011-03-08 14:14:00 +01003027 path->search_commit_root = 1;
3028 path->skip_locking = 1;
3029
3030 /*
Arne Jansen7a262852011-06-10 12:39:23 +02003031 * trigger the readahead for the extent tree and csum tree and wait for
3032 * completion. During readahead, the scrub is officially paused
3033 * to not hold off transaction commits
Arne Jansena2de7332011-03-08 14:14:00 +01003034 */
3035 logical = base + offset;
Wang Shilong3b080b22014-04-01 18:01:43 +08003036 physical_end = physical + nstripes * map->stripe_len;
3037 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
3038 BTRFS_BLOCK_GROUP_RAID6)) {
3039 get_raid56_logic_offset(physical_end, num,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003040 map, &logic_end, NULL);
Wang Shilong3b080b22014-04-01 18:01:43 +08003041 logic_end += base;
3042 } else {
3043 logic_end = logical + increment * nstripes;
3044 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003045 wait_event(sctx->list_wait,
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01003046 atomic_read(&sctx->bios_in_flight) == 0);
Wang Shilongcb7ab022013-12-04 21:16:53 +08003047 scrub_blocked_if_needed(fs_info);
Arne Jansena2de7332011-03-08 14:14:00 +01003048
Arne Jansen7a262852011-06-10 12:39:23 +02003049 /* FIXME it might be better to start readahead at commit root */
3050 key_start.objectid = logical;
3051 key_start.type = BTRFS_EXTENT_ITEM_KEY;
3052 key_start.offset = (u64)0;
Wang Shilong3b080b22014-04-01 18:01:43 +08003053 key_end.objectid = logic_end;
Josef Bacik3173a182013-03-07 14:22:04 -05003054 key_end.type = BTRFS_METADATA_ITEM_KEY;
3055 key_end.offset = (u64)-1;
Arne Jansen7a262852011-06-10 12:39:23 +02003056 reada1 = btrfs_reada_add(root, &key_start, &key_end);
Arne Jansena2de7332011-03-08 14:14:00 +01003057
Arne Jansen7a262852011-06-10 12:39:23 +02003058 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3059 key_start.type = BTRFS_EXTENT_CSUM_KEY;
3060 key_start.offset = logical;
3061 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3062 key_end.type = BTRFS_EXTENT_CSUM_KEY;
Wang Shilong3b080b22014-04-01 18:01:43 +08003063 key_end.offset = logic_end;
Arne Jansen7a262852011-06-10 12:39:23 +02003064 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
Arne Jansena2de7332011-03-08 14:14:00 +01003065
Arne Jansen7a262852011-06-10 12:39:23 +02003066 if (!IS_ERR(reada1))
3067 btrfs_reada_wait(reada1);
3068 if (!IS_ERR(reada2))
3069 btrfs_reada_wait(reada2);
Arne Jansena2de7332011-03-08 14:14:00 +01003070
Arne Jansena2de7332011-03-08 14:14:00 +01003071
3072 /*
3073 * collect all data csums for the stripe to avoid seeking during
3074 * the scrub. This might currently (crc32) end up being about 1MB
3075 */
Arne Jansene7786c32011-05-28 20:58:38 +00003076 blk_start_plug(&plug);
Arne Jansena2de7332011-03-08 14:14:00 +01003077
Arne Jansena2de7332011-03-08 14:14:00 +01003078 /*
3079 * now find all extents for each stripe and scrub them
3080 */
Arne Jansena2de7332011-03-08 14:14:00 +01003081 ret = 0;
Wang Shilong3b080b22014-04-01 18:01:43 +08003082 while (physical < physical_end) {
3083 /* for raid56, we skip parity stripe */
3084 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
3085 BTRFS_BLOCK_GROUP_RAID6)) {
3086 ret = get_raid56_logic_offset(physical, num,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003087 map, &logical, &stripe_logical);
Wang Shilong3b080b22014-04-01 18:01:43 +08003088 logical += base;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003089 if (ret) {
3090 stripe_logical += base;
3091 stripe_end = stripe_logical + increment - 1;
3092 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3093 ppath, stripe_logical,
3094 stripe_end);
3095 if (ret)
3096 goto out;
Wang Shilong3b080b22014-04-01 18:01:43 +08003097 goto skip;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003098 }
Wang Shilong3b080b22014-04-01 18:01:43 +08003099 }
Arne Jansena2de7332011-03-08 14:14:00 +01003100 /*
3101 * canceled?
3102 */
3103 if (atomic_read(&fs_info->scrub_cancel_req) ||
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003104 atomic_read(&sctx->cancel_req)) {
Arne Jansena2de7332011-03-08 14:14:00 +01003105 ret = -ECANCELED;
3106 goto out;
3107 }
3108 /*
3109 * check to see if we have to pause
3110 */
3111 if (atomic_read(&fs_info->scrub_pause_req)) {
3112 /* push queued extents */
Stefan Behrensff023aa2012-11-06 11:43:11 +01003113 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003114 scrub_submit(sctx);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003115 mutex_lock(&sctx->wr_ctx.wr_lock);
3116 scrub_wr_submit(sctx);
3117 mutex_unlock(&sctx->wr_ctx.wr_lock);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003118 wait_event(sctx->list_wait,
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01003119 atomic_read(&sctx->bios_in_flight) == 0);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003120 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
Wang Shilong3cb09292013-12-04 21:15:19 +08003121 scrub_blocked_if_needed(fs_info);
Arne Jansena2de7332011-03-08 14:14:00 +01003122 }
3123
Wang Shilong7c76edb2014-01-12 21:38:32 +08003124 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3125 key.type = BTRFS_METADATA_ITEM_KEY;
3126 else
3127 key.type = BTRFS_EXTENT_ITEM_KEY;
Arne Jansena2de7332011-03-08 14:14:00 +01003128 key.objectid = logical;
Liu Bo625f1c8d2013-04-27 02:56:57 +00003129 key.offset = (u64)-1;
Arne Jansena2de7332011-03-08 14:14:00 +01003130
3131 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3132 if (ret < 0)
3133 goto out;
Josef Bacik3173a182013-03-07 14:22:04 -05003134
Arne Jansen8c510322011-06-03 10:09:26 +02003135 if (ret > 0) {
Wang Shilongade2e0b2014-01-12 21:38:33 +08003136 ret = btrfs_previous_extent_item(root, path, 0);
Arne Jansena2de7332011-03-08 14:14:00 +01003137 if (ret < 0)
3138 goto out;
Arne Jansen8c510322011-06-03 10:09:26 +02003139 if (ret > 0) {
3140 /* there's no smaller item, so stick with the
3141 * larger one */
3142 btrfs_release_path(path);
3143 ret = btrfs_search_slot(NULL, root, &key,
3144 path, 0, 0);
3145 if (ret < 0)
3146 goto out;
3147 }
Arne Jansena2de7332011-03-08 14:14:00 +01003148 }
3149
Liu Bo625f1c8d2013-04-27 02:56:57 +00003150 stop_loop = 0;
Arne Jansena2de7332011-03-08 14:14:00 +01003151 while (1) {
Josef Bacik3173a182013-03-07 14:22:04 -05003152 u64 bytes;
3153
Arne Jansena2de7332011-03-08 14:14:00 +01003154 l = path->nodes[0];
3155 slot = path->slots[0];
3156 if (slot >= btrfs_header_nritems(l)) {
3157 ret = btrfs_next_leaf(root, path);
3158 if (ret == 0)
3159 continue;
3160 if (ret < 0)
3161 goto out;
3162
Liu Bo625f1c8d2013-04-27 02:56:57 +00003163 stop_loop = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003164 break;
3165 }
3166 btrfs_item_key_to_cpu(l, &key, slot);
3167
Josef Bacik3173a182013-03-07 14:22:04 -05003168 if (key.type == BTRFS_METADATA_ITEM_KEY)
David Sterba707e8a02014-06-04 19:22:26 +02003169 bytes = root->nodesize;
Josef Bacik3173a182013-03-07 14:22:04 -05003170 else
3171 bytes = key.offset;
3172
3173 if (key.objectid + bytes <= logical)
Arne Jansena2de7332011-03-08 14:14:00 +01003174 goto next;
3175
Liu Bo625f1c8d2013-04-27 02:56:57 +00003176 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3177 key.type != BTRFS_METADATA_ITEM_KEY)
3178 goto next;
Arne Jansena2de7332011-03-08 14:14:00 +01003179
Liu Bo625f1c8d2013-04-27 02:56:57 +00003180 if (key.objectid >= logical + map->stripe_len) {
3181 /* out of this device extent */
3182 if (key.objectid >= logic_end)
3183 stop_loop = 1;
3184 break;
3185 }
Arne Jansena2de7332011-03-08 14:14:00 +01003186
3187 extent = btrfs_item_ptr(l, slot,
3188 struct btrfs_extent_item);
3189 flags = btrfs_extent_flags(l, extent);
3190 generation = btrfs_extent_generation(l, extent);
3191
3192 if (key.objectid < logical &&
3193 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
Frank Holtonefe120a2013-12-20 11:37:06 -05003194 btrfs_err(fs_info,
3195 "scrub: tree block %llu spanning "
3196 "stripes, ignored. logical=%llu",
Geert Uytterhoevenc1c9ff72013-08-20 13:20:07 +02003197 key.objectid, logical);
Arne Jansena2de7332011-03-08 14:14:00 +01003198 goto next;
3199 }
3200
Liu Bo625f1c8d2013-04-27 02:56:57 +00003201again:
3202 extent_logical = key.objectid;
3203 extent_len = bytes;
3204
Arne Jansena2de7332011-03-08 14:14:00 +01003205 /*
3206 * trim extent to this stripe
3207 */
Liu Bo625f1c8d2013-04-27 02:56:57 +00003208 if (extent_logical < logical) {
3209 extent_len -= logical - extent_logical;
3210 extent_logical = logical;
Arne Jansena2de7332011-03-08 14:14:00 +01003211 }
Liu Bo625f1c8d2013-04-27 02:56:57 +00003212 if (extent_logical + extent_len >
Arne Jansena2de7332011-03-08 14:14:00 +01003213 logical + map->stripe_len) {
Liu Bo625f1c8d2013-04-27 02:56:57 +00003214 extent_len = logical + map->stripe_len -
3215 extent_logical;
Arne Jansena2de7332011-03-08 14:14:00 +01003216 }
3217
Liu Bo625f1c8d2013-04-27 02:56:57 +00003218 extent_physical = extent_logical - logical + physical;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003219 extent_dev = scrub_dev;
3220 extent_mirror_num = mirror_num;
3221 if (is_dev_replace)
3222 scrub_remap_extent(fs_info, extent_logical,
3223 extent_len, &extent_physical,
3224 &extent_dev,
3225 &extent_mirror_num);
Liu Bo625f1c8d2013-04-27 02:56:57 +00003226
3227 ret = btrfs_lookup_csums_range(csum_root, logical,
3228 logical + map->stripe_len - 1,
3229 &sctx->csum_list, 1);
Arne Jansena2de7332011-03-08 14:14:00 +01003230 if (ret)
3231 goto out;
3232
Liu Bo625f1c8d2013-04-27 02:56:57 +00003233 ret = scrub_extent(sctx, extent_logical, extent_len,
3234 extent_physical, extent_dev, flags,
3235 generation, extent_mirror_num,
Stefan Behrens115930c2013-07-04 16:14:23 +02003236 extent_logical - logical + physical);
Liu Bo625f1c8d2013-04-27 02:56:57 +00003237 if (ret)
3238 goto out;
3239
Josef Bacikd88d46c2013-06-10 12:59:04 +00003240 scrub_free_csums(sctx);
Liu Bo625f1c8d2013-04-27 02:56:57 +00003241 if (extent_logical + extent_len <
3242 key.objectid + bytes) {
Wang Shilong3b080b22014-04-01 18:01:43 +08003243 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
3244 BTRFS_BLOCK_GROUP_RAID6)) {
3245 /*
3246 * loop until we find the next data stripe
3247 * or we have finished all stripes.
3248 */
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003249loop:
3250 physical += map->stripe_len;
3251 ret = get_raid56_logic_offset(physical,
3252 num, map, &logical,
3253 &stripe_logical);
3254 logical += base;
3255
3256 if (ret && physical < physical_end) {
3257 stripe_logical += base;
3258 stripe_end = stripe_logical +
3259 increment - 1;
3260 ret = scrub_raid56_parity(sctx,
3261 map, scrub_dev, ppath,
3262 stripe_logical,
3263 stripe_end);
3264 if (ret)
3265 goto out;
3266 goto loop;
3267 }
Wang Shilong3b080b22014-04-01 18:01:43 +08003268 } else {
3269 physical += map->stripe_len;
3270 logical += increment;
3271 }
Liu Bo625f1c8d2013-04-27 02:56:57 +00003272 if (logical < key.objectid + bytes) {
3273 cond_resched();
3274 goto again;
3275 }
3276
Wang Shilong3b080b22014-04-01 18:01:43 +08003277 if (physical >= physical_end) {
Liu Bo625f1c8d2013-04-27 02:56:57 +00003278 stop_loop = 1;
3279 break;
3280 }
3281 }
Arne Jansena2de7332011-03-08 14:14:00 +01003282next:
3283 path->slots[0]++;
3284 }
Chris Mason71267332011-05-23 06:30:52 -04003285 btrfs_release_path(path);
Wang Shilong3b080b22014-04-01 18:01:43 +08003286skip:
Arne Jansena2de7332011-03-08 14:14:00 +01003287 logical += increment;
3288 physical += map->stripe_len;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003289 spin_lock(&sctx->stat_lock);
Liu Bo625f1c8d2013-04-27 02:56:57 +00003290 if (stop_loop)
3291 sctx->stat.last_physical = map->stripes[num].physical +
3292 length;
3293 else
3294 sctx->stat.last_physical = physical;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003295 spin_unlock(&sctx->stat_lock);
Liu Bo625f1c8d2013-04-27 02:56:57 +00003296 if (stop_loop)
3297 break;
Arne Jansena2de7332011-03-08 14:14:00 +01003298 }
Stefan Behrensff023aa2012-11-06 11:43:11 +01003299out:
Arne Jansena2de7332011-03-08 14:14:00 +01003300 /* push queued extents */
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003301 scrub_submit(sctx);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003302 mutex_lock(&sctx->wr_ctx.wr_lock);
3303 scrub_wr_submit(sctx);
3304 mutex_unlock(&sctx->wr_ctx.wr_lock);
Arne Jansena2de7332011-03-08 14:14:00 +01003305
Arne Jansene7786c32011-05-28 20:58:38 +00003306 blk_finish_plug(&plug);
Arne Jansena2de7332011-03-08 14:14:00 +01003307 btrfs_free_path(path);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003308 btrfs_free_path(ppath);
Arne Jansena2de7332011-03-08 14:14:00 +01003309 return ret < 0 ? ret : 0;
3310}
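/*
 * Iteration summary for scrub_stripe() above: physical advances by
 * map->stripe_len per loop while logical advances by the RAID-dependent
 * increment chosen near the top; for RAID5/6, device stripes that turn
 * out to hold parity are handed to scrub_raid56_parity() instead of
 * being scrubbed as data.
 */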
3311
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003312static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003313 struct btrfs_device *scrub_dev,
3314 u64 chunk_tree, u64 chunk_objectid,
3315 u64 chunk_offset, u64 length,
Stefan Behrensff023aa2012-11-06 11:43:11 +01003316 u64 dev_offset, int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01003317{
3318 struct btrfs_mapping_tree *map_tree =
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003319 &sctx->dev_root->fs_info->mapping_tree;
Arne Jansena2de7332011-03-08 14:14:00 +01003320 struct map_lookup *map;
3321 struct extent_map *em;
3322 int i;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003323 int ret = 0;
Arne Jansena2de7332011-03-08 14:14:00 +01003324
3325 read_lock(&map_tree->map_tree.lock);
3326 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3327 read_unlock(&map_tree->map_tree.lock);
3328
3329 if (!em)
3330 return -EINVAL;
3331
3332 map = (struct map_lookup *)em->bdev;
3333 if (em->start != chunk_offset)
3334 goto out;
3335
3336 if (em->len < length)
3337 goto out;
3338
3339 for (i = 0; i < map->num_stripes; ++i) {
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003340 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
Arne Jansen859acaf2012-02-09 15:09:02 +01003341 map->stripes[i].physical == dev_offset) {
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003342 ret = scrub_stripe(sctx, map, scrub_dev, i,
Stefan Behrensff023aa2012-11-06 11:43:11 +01003343 chunk_offset, length,
3344 is_dev_replace);
Arne Jansena2de7332011-03-08 14:14:00 +01003345 if (ret)
3346 goto out;
3347 }
3348 }
3349out:
3350 free_extent_map(em);
3351
3352 return ret;
3353}
3354
3355static noinline_for_stack
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003356int scrub_enumerate_chunks(struct scrub_ctx *sctx,
Stefan Behrensff023aa2012-11-06 11:43:11 +01003357 struct btrfs_device *scrub_dev, u64 start, u64 end,
3358 int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01003359{
3360 struct btrfs_dev_extent *dev_extent = NULL;
3361 struct btrfs_path *path;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003362 struct btrfs_root *root = sctx->dev_root;
Arne Jansena2de7332011-03-08 14:14:00 +01003363 struct btrfs_fs_info *fs_info = root->fs_info;
3364 u64 length;
3365 u64 chunk_tree;
3366 u64 chunk_objectid;
3367 u64 chunk_offset;
3368 int ret;
3369 int slot;
3370 struct extent_buffer *l;
3371 struct btrfs_key key;
3372 struct btrfs_key found_key;
3373 struct btrfs_block_group_cache *cache;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003374 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
Arne Jansena2de7332011-03-08 14:14:00 +01003375
3376 path = btrfs_alloc_path();
3377 if (!path)
3378 return -ENOMEM;
3379
3380 path->reada = 2;
3381 path->search_commit_root = 1;
3382 path->skip_locking = 1;
3383
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003384 key.objectid = scrub_dev->devid;
Arne Jansena2de7332011-03-08 14:14:00 +01003385 key.offset = 0ull;
3386 key.type = BTRFS_DEV_EXTENT_KEY;
3387
Arne Jansena2de7332011-03-08 14:14:00 +01003388 while (1) {
3389 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3390 if (ret < 0)
Arne Jansen8c510322011-06-03 10:09:26 +02003391 break;
3392 if (ret > 0) {
3393 if (path->slots[0] >=
3394 btrfs_header_nritems(path->nodes[0])) {
3395 ret = btrfs_next_leaf(root, path);
3396 if (ret)
3397 break;
3398 }
3399 }
Arne Jansena2de7332011-03-08 14:14:00 +01003400
3401 l = path->nodes[0];
3402 slot = path->slots[0];
3403
3404 btrfs_item_key_to_cpu(l, &found_key, slot);
3405
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003406 if (found_key.objectid != scrub_dev->devid)
Arne Jansena2de7332011-03-08 14:14:00 +01003407 break;
3408
David Sterba962a2982014-06-04 18:41:45 +02003409 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
Arne Jansena2de7332011-03-08 14:14:00 +01003410 break;
3411
3412 if (found_key.offset >= end)
3413 break;
3414
3415 if (found_key.offset < key.offset)
3416 break;
3417
3418 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3419 length = btrfs_dev_extent_length(l, dev_extent);
3420
Qu Wenruoced96ed2014-06-19 10:42:51 +08003421 if (found_key.offset + length <= start)
3422 goto skip;
Arne Jansena2de7332011-03-08 14:14:00 +01003423
3424 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3425 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3426 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3427
3428 /*
3429 * get a reference on the corresponding block group to prevent
3430 * the chunk from going away while we scrub it
3431 */
3432 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
Qu Wenruoced96ed2014-06-19 10:42:51 +08003433
3434 /* some chunks are removed but not committed to disk yet;
3435 * skip them and continue scrubbing */
3436 if (!cache)
3437 goto skip;
3438
Stefan Behrensff023aa2012-11-06 11:43:11 +01003439 dev_replace->cursor_right = found_key.offset + length;
3440 dev_replace->cursor_left = found_key.offset;
3441 dev_replace->item_needs_writeback = 1;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003442 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
Stefan Behrensff023aa2012-11-06 11:43:11 +01003443 chunk_offset, length, found_key.offset,
3444 is_dev_replace);
3445
3446 /*
3447 * flush and submit all pending read and write bios, and
3448 * afterwards wait for them.
3449 * Note that in the dev replace case, a read request causes
3450 * write requests that are submitted in the read completion
3451 * worker. Therefore in the current situation, it is required
3452 * that all write requests are flushed, so that all read and
3453 * write requests are really completed when bios_in_flight
3454 * changes to 0.
3455 */
3456 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3457 scrub_submit(sctx);
3458 mutex_lock(&sctx->wr_ctx.wr_lock);
3459 scrub_wr_submit(sctx);
3460 mutex_unlock(&sctx->wr_ctx.wr_lock);
3461
3462 wait_event(sctx->list_wait,
3463 atomic_read(&sctx->bios_in_flight) == 0);
Wang Shilong12cf9372014-02-19 19:24:17 +08003464 atomic_inc(&fs_info->scrubs_paused);
3465 wake_up(&fs_info->scrub_pause_wait);
3466
3467 /*
3468 * must be called before we decrease @scrub_paused.
3469 * make sure we don't block transaction commit while
3470 * we are waiting for pending workers to finish.
3471 */
Stefan Behrensff023aa2012-11-06 11:43:11 +01003472 wait_event(sctx->list_wait,
3473 atomic_read(&sctx->workers_pending) == 0);
Wang Shilong12cf9372014-02-19 19:24:17 +08003474 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3475
3476 mutex_lock(&fs_info->scrub_lock);
3477 __scrub_blocked_if_needed(fs_info);
3478 atomic_dec(&fs_info->scrubs_paused);
3479 mutex_unlock(&fs_info->scrub_lock);
3480 wake_up(&fs_info->scrub_pause_wait);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003481
Arne Jansena2de7332011-03-08 14:14:00 +01003482 btrfs_put_block_group(cache);
3483 if (ret)
3484 break;
Stefan Behrensaf1be4f2012-11-27 17:39:51 +00003485 if (is_dev_replace &&
3486 atomic64_read(&dev_replace->num_write_errors) > 0) {
Stefan Behrensff023aa2012-11-06 11:43:11 +01003487 ret = -EIO;
3488 break;
3489 }
3490 if (sctx->stat.malloc_errors > 0) {
3491 ret = -ENOMEM;
3492 break;
3493 }
Arne Jansena2de7332011-03-08 14:14:00 +01003494
Ilya Dryomov539f3582013-10-07 13:42:57 +03003495 dev_replace->cursor_left = dev_replace->cursor_right;
3496 dev_replace->item_needs_writeback = 1;
Qu Wenruoced96ed2014-06-19 10:42:51 +08003497skip:
Arne Jansena2de7332011-03-08 14:14:00 +01003498 key.offset = found_key.offset + length;
Chris Mason71267332011-05-23 06:30:52 -04003499 btrfs_release_path(path);
Arne Jansena2de7332011-03-08 14:14:00 +01003500 }
3501
Arne Jansena2de7332011-03-08 14:14:00 +01003502 btrfs_free_path(path);
Arne Jansen8c510322011-06-03 10:09:26 +02003503
3504 /*
3505 * ret can still be 1 from search_slot or next_leaf,
3506 * that's not an error
3507 */
3508 return ret < 0 ? ret : 0;
Arne Jansena2de7332011-03-08 14:14:00 +01003509}
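/*
 * Flow of scrub_enumerate_chunks() above: walk the DEV_EXTENT items of
 * the scrubbed device between start and end, pin the matching block
 * group, scrub the chunk, then flush and drain all outstanding read and
 * write bios so the dev-replace cursor can be advanced past the chunk.
 */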
3510
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003511static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3512 struct btrfs_device *scrub_dev)
Arne Jansena2de7332011-03-08 14:14:00 +01003513{
3514 int i;
3515 u64 bytenr;
3516 u64 gen;
3517 int ret;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003518 struct btrfs_root *root = sctx->dev_root;
Arne Jansena2de7332011-03-08 14:14:00 +01003519
Miao Xie87533c42013-01-29 10:14:48 +00003520 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
Jeff Mahoney79787ea2012-03-12 16:03:00 +01003521 return -EIO;
3522
Miao Xie5f546062014-07-24 11:37:09 +08003523 /* Seed devices of a new filesystem have their own generation. */
3524 if (scrub_dev->fs_devices != root->fs_info->fs_devices)
3525 gen = scrub_dev->generation;
3526 else
3527 gen = root->fs_info->last_trans_committed;
Arne Jansena2de7332011-03-08 14:14:00 +01003528
3529 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3530 bytenr = btrfs_sb_offset(i);
Miao Xie935e5cc2014-09-03 21:35:33 +08003531 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3532 scrub_dev->commit_total_bytes)
Arne Jansena2de7332011-03-08 14:14:00 +01003533 break;
3534
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003535 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003536 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
Stefan Behrensff023aa2012-11-06 11:43:11 +01003537 NULL, 1, bytenr);
Arne Jansena2de7332011-03-08 14:14:00 +01003538 if (ret)
3539 return ret;
3540 }
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01003541 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
Arne Jansena2de7332011-03-08 14:14:00 +01003542
3543 return 0;
3544}
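/*
 * The loop above visits the fixed superblock mirror offsets returned by
 * btrfs_sb_offset() (64K, 64M and 256G in current layouts), skips any
 * copy that would fall beyond the device's committed size, and scrubs
 * each remaining copy as a BTRFS_EXTENT_FLAG_SUPER block.
 */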
3545
3546/*
3547 * get a reference count on fs_info->scrub_workers. start workers if necessary
3548 */
Stefan Behrensff023aa2012-11-06 11:43:11 +01003549static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3550 int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01003551{
Josef Bacik0dc3b842011-11-18 14:37:27 -05003552 int ret = 0;
Qu Wenruo0339ef22014-02-28 10:46:17 +08003553 int flags = WQ_FREEZABLE | WQ_UNBOUND;
3554 int max_active = fs_info->thread_pool_size;
Arne Jansena2de7332011-03-08 14:14:00 +01003555
Arne Jansen632dd772011-06-10 12:07:07 +02003556 if (fs_info->scrub_workers_refcnt == 0) {
Stefan Behrensff023aa2012-11-06 11:43:11 +01003557 if (is_dev_replace)
Qu Wenruo0339ef22014-02-28 10:46:17 +08003558 fs_info->scrub_workers =
3559 btrfs_alloc_workqueue("btrfs-scrub", flags,
3560 1, 4);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003561 else
Qu Wenruo0339ef22014-02-28 10:46:17 +08003562 fs_info->scrub_workers =
3563 btrfs_alloc_workqueue("btrfs-scrub", flags,
3564 max_active, 4);
3565 if (!fs_info->scrub_workers) {
3566 ret = -ENOMEM;
Josef Bacik0dc3b842011-11-18 14:37:27 -05003567 goto out;
Qu Wenruo0339ef22014-02-28 10:46:17 +08003568 }
3569 fs_info->scrub_wr_completion_workers =
3570 btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
3571 max_active, 2);
3572 if (!fs_info->scrub_wr_completion_workers) {
3573 ret = -ENOMEM;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003574 goto out;
Qu Wenruo0339ef22014-02-28 10:46:17 +08003575 }
3576 fs_info->scrub_nocow_workers =
3577 btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
3578 if (!fs_info->scrub_nocow_workers) {
3579 ret = -ENOMEM;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003580 goto out;
Qu Wenruo0339ef22014-02-28 10:46:17 +08003581 }
Arne Jansen632dd772011-06-10 12:07:07 +02003582 }
Arne Jansena2de7332011-03-08 14:14:00 +01003583 ++fs_info->scrub_workers_refcnt;
Josef Bacik0dc3b842011-11-18 14:37:27 -05003584out:
Josef Bacik0dc3b842011-11-18 14:37:27 -05003585 return ret;
Arne Jansena2de7332011-03-08 14:14:00 +01003586}
3587
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003588static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
Arne Jansena2de7332011-03-08 14:14:00 +01003589{
Stefan Behrensff023aa2012-11-06 11:43:11 +01003590 if (--fs_info->scrub_workers_refcnt == 0) {
Qu Wenruo0339ef22014-02-28 10:46:17 +08003591 btrfs_destroy_workqueue(fs_info->scrub_workers);
3592 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
3593 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003594 }
Arne Jansena2de7332011-03-08 14:14:00 +01003595 WARN_ON(fs_info->scrub_workers_refcnt < 0);
Arne Jansena2de7332011-03-08 14:14:00 +01003596}
3597
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003598int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3599 u64 end, struct btrfs_scrub_progress *progress,
Stefan Behrens63a212a2012-11-05 18:29:28 +01003600 int readonly, int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01003601{
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003602 struct scrub_ctx *sctx;
Arne Jansena2de7332011-03-08 14:14:00 +01003603 int ret;
3604 struct btrfs_device *dev;
Miao Xie5d68da32014-07-24 11:37:07 +08003605 struct rcu_string *name;
Arne Jansena2de7332011-03-08 14:14:00 +01003606
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003607 if (btrfs_fs_closing(fs_info))
Arne Jansena2de7332011-03-08 14:14:00 +01003608 return -EINVAL;
3609
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003610 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04003611 /*
3612 * in this case scrub is unable to calculate the checksum
3613 * with the way scrub is implemented. Do not handle this
3614 * situation at all because it won't ever happen.
3615 */
Frank Holtonefe120a2013-12-20 11:37:06 -05003616 btrfs_err(fs_info,
3617 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003618 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04003619 return -EINVAL;
3620 }
3621
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003622 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04003623 /* not supported for data w/o checksums */
Frank Holtonefe120a2013-12-20 11:37:06 -05003624 btrfs_err(fs_info,
3625 "scrub: size assumption sectorsize != PAGE_SIZE "
3626 "(%d != %lu) fails",
Geert Uytterhoeven27f9f022013-08-20 13:20:09 +02003627 fs_info->chunk_root->sectorsize, PAGE_SIZE);
Arne Jansena2de7332011-03-08 14:14:00 +01003628 return -EINVAL;
3629 }
3630
Stefan Behrens7a9e9982012-11-02 14:58:04 +01003631 if (fs_info->chunk_root->nodesize >
3632 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3633 fs_info->chunk_root->sectorsize >
3634 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3635 /*
3636 * would exhaust the array bounds of pagev member in
3637 * struct scrub_block
3638 */
Frank Holtonefe120a2013-12-20 11:37:06 -05003639 btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
3640 "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
Stefan Behrens7a9e9982012-11-02 14:58:04 +01003641 fs_info->chunk_root->nodesize,
3642 SCRUB_MAX_PAGES_PER_BLOCK,
3643 fs_info->chunk_root->sectorsize,
3644 SCRUB_MAX_PAGES_PER_BLOCK);
3645 return -EINVAL;
3646 }
3647
Arne Jansena2de7332011-03-08 14:14:00 +01003648
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003649 mutex_lock(&fs_info->fs_devices->device_list_mutex);
3650 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
Stefan Behrens63a212a2012-11-05 18:29:28 +01003651 if (!dev || (dev->missing && !is_dev_replace)) {
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003652 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Arne Jansena2de7332011-03-08 14:14:00 +01003653 return -ENODEV;
3654 }
Arne Jansena2de7332011-03-08 14:14:00 +01003655
Miao Xie5d68da32014-07-24 11:37:07 +08003656 if (!is_dev_replace && !readonly && !dev->writeable) {
3657 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3658 rcu_read_lock();
3659 name = rcu_dereference(dev->name);
3660 btrfs_err(fs_info, "scrub: device %s is not writable",
3661 name->str);
3662 rcu_read_unlock();
3663 return -EROFS;
3664 }
3665
Wang Shilong3b7a0162013-10-12 02:11:12 +08003666 mutex_lock(&fs_info->scrub_lock);
Stefan Behrens63a212a2012-11-05 18:29:28 +01003667 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
Arne Jansena2de7332011-03-08 14:14:00 +01003668 mutex_unlock(&fs_info->scrub_lock);
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003669 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003670 return -EIO;
Arne Jansena2de7332011-03-08 14:14:00 +01003671 }
3672
Stefan Behrens8dabb742012-11-06 13:15:27 +01003673 btrfs_dev_replace_lock(&fs_info->dev_replace);
3674 if (dev->scrub_device ||
3675 (!is_dev_replace &&
3676 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3677 btrfs_dev_replace_unlock(&fs_info->dev_replace);
Arne Jansena2de7332011-03-08 14:14:00 +01003678 mutex_unlock(&fs_info->scrub_lock);
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003679 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Arne Jansena2de7332011-03-08 14:14:00 +01003680 return -EINPROGRESS;
3681 }
Stefan Behrens8dabb742012-11-06 13:15:27 +01003682 btrfs_dev_replace_unlock(&fs_info->dev_replace);
Wang Shilong3b7a0162013-10-12 02:11:12 +08003683
3684 ret = scrub_workers_get(fs_info, is_dev_replace);
3685 if (ret) {
3686 mutex_unlock(&fs_info->scrub_lock);
3687 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3688 return ret;
3689 }
3690
Stefan Behrens63a212a2012-11-05 18:29:28 +01003691 sctx = scrub_setup_ctx(dev, is_dev_replace);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003692 if (IS_ERR(sctx)) {
Arne Jansena2de7332011-03-08 14:14:00 +01003693 mutex_unlock(&fs_info->scrub_lock);
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003694 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3695 scrub_workers_put(fs_info);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003696 return PTR_ERR(sctx);
Arne Jansena2de7332011-03-08 14:14:00 +01003697 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003698 sctx->readonly = readonly;
3699 dev->scrub_device = sctx;
Wang Shilong3cb09292013-12-04 21:15:19 +08003700 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Arne Jansena2de7332011-03-08 14:14:00 +01003701
Wang Shilong3cb09292013-12-04 21:15:19 +08003702 /*
3703 * by checking @scrub_pause_req here, we can avoid the
3704 * race between committing a transaction and scrubbing.
3705 */
Wang Shilongcb7ab022013-12-04 21:16:53 +08003706 __scrub_blocked_if_needed(fs_info);
Arne Jansena2de7332011-03-08 14:14:00 +01003707 atomic_inc(&fs_info->scrubs_running);
3708 mutex_unlock(&fs_info->scrub_lock);
Arne Jansena2de7332011-03-08 14:14:00 +01003709
Stefan Behrensff023aa2012-11-06 11:43:11 +01003710 if (!is_dev_replace) {
Wang Shilong9b011ad2013-10-25 19:12:02 +08003711 /*
3712 * by holding the device list mutex, we can
3713 * kick off writing the super in log tree sync.
3714 */
Wang Shilong3cb09292013-12-04 21:15:19 +08003715 mutex_lock(&fs_info->fs_devices->device_list_mutex);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003716 ret = scrub_supers(sctx, dev);
Wang Shilong3cb09292013-12-04 21:15:19 +08003717 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003718 }
Arne Jansena2de7332011-03-08 14:14:00 +01003719
3720 if (!ret)
Stefan Behrensff023aa2012-11-06 11:43:11 +01003721 ret = scrub_enumerate_chunks(sctx, dev, start, end,
3722 is_dev_replace);
Arne Jansena2de7332011-03-08 14:14:00 +01003723
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01003724 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
Arne Jansena2de7332011-03-08 14:14:00 +01003725 atomic_dec(&fs_info->scrubs_running);
3726 wake_up(&fs_info->scrub_pause_wait);
3727
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01003728 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
Jan Schmidt0ef8e452011-06-13 20:04:15 +02003729
Arne Jansena2de7332011-03-08 14:14:00 +01003730 if (progress)
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003731 memcpy(progress, &sctx->stat, sizeof(*progress));
Arne Jansena2de7332011-03-08 14:14:00 +01003732
3733 mutex_lock(&fs_info->scrub_lock);
3734 dev->scrub_device = NULL;
Wang Shilong3b7a0162013-10-12 02:11:12 +08003735 scrub_workers_put(fs_info);
Arne Jansena2de7332011-03-08 14:14:00 +01003736 mutex_unlock(&fs_info->scrub_lock);
3737
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003738 scrub_free_ctx(sctx);
Arne Jansena2de7332011-03-08 14:14:00 +01003739
3740 return ret;
3741}
3742
Jeff Mahoney143bede2012-03-01 14:56:26 +01003743void btrfs_scrub_pause(struct btrfs_root *root)
Arne Jansena2de7332011-03-08 14:14:00 +01003744{
3745 struct btrfs_fs_info *fs_info = root->fs_info;
3746
3747 mutex_lock(&fs_info->scrub_lock);
3748 atomic_inc(&fs_info->scrub_pause_req);
3749 while (atomic_read(&fs_info->scrubs_paused) !=
3750 atomic_read(&fs_info->scrubs_running)) {
3751 mutex_unlock(&fs_info->scrub_lock);
3752 wait_event(fs_info->scrub_pause_wait,
3753 atomic_read(&fs_info->scrubs_paused) ==
3754 atomic_read(&fs_info->scrubs_running));
3755 mutex_lock(&fs_info->scrub_lock);
3756 }
3757 mutex_unlock(&fs_info->scrub_lock);
Arne Jansena2de7332011-03-08 14:14:00 +01003758}
3759
Jeff Mahoney143bede2012-03-01 14:56:26 +01003760void btrfs_scrub_continue(struct btrfs_root *root)
Arne Jansena2de7332011-03-08 14:14:00 +01003761{
3762 struct btrfs_fs_info *fs_info = root->fs_info;
3763
3764 atomic_dec(&fs_info->scrub_pause_req);
3765 wake_up(&fs_info->scrub_pause_wait);
Arne Jansena2de7332011-03-08 14:14:00 +01003766}
3767
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003768int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
Arne Jansena2de7332011-03-08 14:14:00 +01003769{
Arne Jansena2de7332011-03-08 14:14:00 +01003770 mutex_lock(&fs_info->scrub_lock);
3771 if (!atomic_read(&fs_info->scrubs_running)) {
3772 mutex_unlock(&fs_info->scrub_lock);
3773 return -ENOTCONN;
3774 }
3775
3776 atomic_inc(&fs_info->scrub_cancel_req);
3777 while (atomic_read(&fs_info->scrubs_running)) {
3778 mutex_unlock(&fs_info->scrub_lock);
3779 wait_event(fs_info->scrub_pause_wait,
3780 atomic_read(&fs_info->scrubs_running) == 0);
3781 mutex_lock(&fs_info->scrub_lock);
3782 }
3783 atomic_dec(&fs_info->scrub_cancel_req);
3784 mutex_unlock(&fs_info->scrub_lock);
3785
3786 return 0;
3787}
3788
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003789int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
3790 struct btrfs_device *dev)
Jeff Mahoney49b25e02012-03-01 17:24:58 +01003791{
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003792 struct scrub_ctx *sctx;
Arne Jansena2de7332011-03-08 14:14:00 +01003793
3794 mutex_lock(&fs_info->scrub_lock);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003795 sctx = dev->scrub_device;
3796 if (!sctx) {
Arne Jansena2de7332011-03-08 14:14:00 +01003797 mutex_unlock(&fs_info->scrub_lock);
3798 return -ENOTCONN;
3799 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003800 atomic_inc(&sctx->cancel_req);
Arne Jansena2de7332011-03-08 14:14:00 +01003801 while (dev->scrub_device) {
3802 mutex_unlock(&fs_info->scrub_lock);
3803 wait_event(fs_info->scrub_pause_wait,
3804 dev->scrub_device == NULL);
3805 mutex_lock(&fs_info->scrub_lock);
3806 }
3807 mutex_unlock(&fs_info->scrub_lock);
3808
3809 return 0;
3810}
Stefan Behrens1623ede2012-03-27 14:21:26 -04003811
Arne Jansena2de7332011-03-08 14:14:00 +01003812int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3813 struct btrfs_scrub_progress *progress)
3814{
3815 struct btrfs_device *dev;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003816 struct scrub_ctx *sctx = NULL;
Arne Jansena2de7332011-03-08 14:14:00 +01003817
3818 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01003819 dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
Arne Jansena2de7332011-03-08 14:14:00 +01003820 if (dev)
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003821 sctx = dev->scrub_device;
3822 if (sctx)
3823 memcpy(progress, &sctx->stat, sizeof(*progress));
Arne Jansena2de7332011-03-08 14:14:00 +01003824 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3825
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003826 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
Arne Jansena2de7332011-03-08 14:14:00 +01003827}
Stefan Behrensff023aa2012-11-06 11:43:11 +01003828
3829static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3830 u64 extent_logical, u64 extent_len,
3831 u64 *extent_physical,
3832 struct btrfs_device **extent_dev,
3833 int *extent_mirror_num)
3834{
3835 u64 mapped_length;
3836 struct btrfs_bio *bbio = NULL;
3837 int ret;
3838
3839 mapped_length = extent_len;
3840 ret = btrfs_map_block(fs_info, READ, extent_logical,
3841 &mapped_length, &bbio, 0);
3842 if (ret || !bbio || mapped_length < extent_len ||
3843 !bbio->stripes[0].dev->bdev) {
Zhao Lei6e9606d2015-01-20 15:11:34 +08003844 btrfs_put_bbio(bbio);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003845 return;
3846 }
3847
3848 *extent_physical = bbio->stripes[0].physical;
3849 *extent_mirror_num = bbio->mirror_num;
3850 *extent_dev = bbio->stripes[0].dev;
Zhao Lei6e9606d2015-01-20 15:11:34 +08003851 btrfs_put_bbio(bbio);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003852}
3853
3854static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3855 struct scrub_wr_ctx *wr_ctx,
3856 struct btrfs_fs_info *fs_info,
3857 struct btrfs_device *dev,
3858 int is_dev_replace)
3859{
3860 WARN_ON(wr_ctx->wr_curr_bio != NULL);
3861
3862 mutex_init(&wr_ctx->wr_lock);
3863 wr_ctx->wr_curr_bio = NULL;
3864 if (!is_dev_replace)
3865 return 0;
3866
3867 WARN_ON(!dev->bdev);
3868 wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3869 bio_get_nr_vecs(dev->bdev));
3870 wr_ctx->tgtdev = dev;
3871 atomic_set(&wr_ctx->flush_all_writes, 0);
3872 return 0;
3873}
3874
3875static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3876{
3877 mutex_lock(&wr_ctx->wr_lock);
3878 kfree(wr_ctx->wr_curr_bio);
3879 wr_ctx->wr_curr_bio = NULL;
3880 mutex_unlock(&wr_ctx->wr_lock);
3881}
3882
3883static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3884 int mirror_num, u64 physical_for_dev_replace)
3885{
3886 struct scrub_copy_nocow_ctx *nocow_ctx;
3887 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3888
3889 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3890 if (!nocow_ctx) {
3891 spin_lock(&sctx->stat_lock);
3892 sctx->stat.malloc_errors++;
3893 spin_unlock(&sctx->stat_lock);
3894 return -ENOMEM;
3895 }
3896
3897 scrub_pending_trans_workers_inc(sctx);
3898
3899 nocow_ctx->sctx = sctx;
3900 nocow_ctx->logical = logical;
3901 nocow_ctx->len = len;
3902 nocow_ctx->mirror_num = mirror_num;
3903 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
Liu Bo9e0af232014-08-15 23:36:53 +08003904 btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
3905 copy_nocow_pages_worker, NULL, NULL);
Josef Bacik652f25a2013-09-12 16:58:28 -04003906 INIT_LIST_HEAD(&nocow_ctx->inodes);
Qu Wenruo0339ef22014-02-28 10:46:17 +08003907 btrfs_queue_work(fs_info->scrub_nocow_workers,
3908 &nocow_ctx->work);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003909
3910 return 0;
3911}
3912
Josef Bacik652f25a2013-09-12 16:58:28 -04003913static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
3914{
3915 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3916 struct scrub_nocow_inode *nocow_inode;
3917
3918 nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
3919 if (!nocow_inode)
3920 return -ENOMEM;
3921 nocow_inode->inum = inum;
3922 nocow_inode->offset = offset;
3923 nocow_inode->root = root;
3924 list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
3925 return 0;
3926}
3927
3928#define COPY_COMPLETE 1
3929
static void copy_nocow_pages_worker(struct btrfs_work *work)
{
        struct scrub_copy_nocow_ctx *nocow_ctx =
                container_of(work, struct scrub_copy_nocow_ctx, work);
        struct scrub_ctx *sctx = nocow_ctx->sctx;
        u64 logical = nocow_ctx->logical;
        u64 len = nocow_ctx->len;
        int mirror_num = nocow_ctx->mirror_num;
        u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
        int ret;
        struct btrfs_trans_handle *trans = NULL;
        struct btrfs_fs_info *fs_info;
        struct btrfs_path *path;
        struct btrfs_root *root;
        int not_written = 0;

        fs_info = sctx->dev_root->fs_info;
        root = fs_info->extent_root;

        path = btrfs_alloc_path();
        if (!path) {
                spin_lock(&sctx->stat_lock);
                sctx->stat.malloc_errors++;
                spin_unlock(&sctx->stat_lock);
                not_written = 1;
                goto out;
        }

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                not_written = 1;
                goto out;
        }

        ret = iterate_inodes_from_logical(logical, fs_info, path,
                                          record_inode_for_nocow, nocow_ctx);
        if (ret != 0 && ret != -ENOENT) {
                btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
                           "phys %llu, len %llu, mir %u, ret %d",
                           logical, physical_for_dev_replace, len, mirror_num,
                           ret);
                not_written = 1;
                goto out;
        }

        btrfs_end_transaction(trans, root);
        trans = NULL;
        while (!list_empty(&nocow_ctx->inodes)) {
                struct scrub_nocow_inode *entry;
                entry = list_first_entry(&nocow_ctx->inodes,
                                         struct scrub_nocow_inode,
                                         list);
                list_del_init(&entry->list);
                ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
                                                 entry->root, nocow_ctx);
                kfree(entry);
                if (ret == COPY_COMPLETE) {
                        ret = 0;
                        break;
                } else if (ret) {
                        break;
                }
        }
out:
        while (!list_empty(&nocow_ctx->inodes)) {
                struct scrub_nocow_inode *entry;
                entry = list_first_entry(&nocow_ctx->inodes,
                                         struct scrub_nocow_inode,
                                         list);
                list_del_init(&entry->list);
                kfree(entry);
        }
        if (trans && !IS_ERR(trans))
                btrfs_end_transaction(trans, root);
        if (not_written)
                btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
                                            num_uncorrectable_read_errors);

        btrfs_free_path(path);
        kfree(nocow_ctx);

        scrub_pending_trans_workers_dec(sctx);
}

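/*
 * Check that the file range [start, start + len) still maps to the logical
 * address being copied and has no ordered (in-flight) I/O against it.
 * Returns 0 if the range may be copied, 1 if the caller should skip it,
 * or a negative errno on failure.
 */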
static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
                                 u64 logical)
{
        struct extent_state *cached_state = NULL;
        struct btrfs_ordered_extent *ordered;
        struct extent_io_tree *io_tree;
        struct extent_map *em;
        u64 lockstart = start, lockend = start + len - 1;
        int ret = 0;

        io_tree = &BTRFS_I(inode)->io_tree;

        lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
        ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
        if (ordered) {
                btrfs_put_ordered_extent(ordered);
                ret = 1;
                goto out_unlock;
        }

        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
                goto out_unlock;
        }

        /*
         * This extent does not actually cover the logical extent anymore,
         * move on to the next inode.
         */
        if (em->block_start > logical ||
            em->block_start + em->block_len < logical + len) {
                free_extent_map(em);
                ret = 1;
                goto out_unlock;
        }
        free_extent_map(em);

out_unlock:
        unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
                             GFP_NOFS);
        return ret;
}

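/*
 * Copy one inode's view of the NOCOW extent to the dev-replace target:
 * read each page through the page cache at the requested mirror and write
 * it to the target device with write_page_nocow(). Returns COPY_COMPLETE
 * once the whole range has been handled, so the caller can stop iterating
 * over the remaining inodes.
 */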
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
                                      struct scrub_copy_nocow_ctx *nocow_ctx)
{
        struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
        struct btrfs_key key;
        struct inode *inode;
        struct page *page;
        struct btrfs_root *local_root;
        struct extent_io_tree *io_tree;
        u64 physical_for_dev_replace;
        u64 nocow_ctx_logical;
        u64 len = nocow_ctx->len;
        unsigned long index;
        int srcu_index;
        int ret = 0;
        int err = 0;

        key.objectid = root;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = (u64)-1;

        srcu_index = srcu_read_lock(&fs_info->subvol_srcu);

        local_root = btrfs_read_fs_root_no_name(fs_info, &key);
        if (IS_ERR(local_root)) {
                srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
                return PTR_ERR(local_root);
        }

        key.type = BTRFS_INODE_ITEM_KEY;
        key.objectid = inum;
        key.offset = 0;
        inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
        srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        /* Avoid truncate/dio/punch hole. */
        mutex_lock(&inode->i_mutex);
        inode_dio_wait(inode);

        physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
        io_tree = &BTRFS_I(inode)->io_tree;
        nocow_ctx_logical = nocow_ctx->logical;

        ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
        if (ret) {
                ret = ret > 0 ? 0 : ret;
                goto out;
        }

        while (len >= PAGE_CACHE_SIZE) {
                index = offset >> PAGE_CACHE_SHIFT;
again:
                page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
                if (!page) {
                        btrfs_err(fs_info, "find_or_create_page() failed");
                        ret = -ENOMEM;
                        goto out;
                }

                if (PageUptodate(page)) {
                        if (PageDirty(page))
                                goto next_page;
                } else {
                        ClearPageError(page);
                        err = extent_read_full_page(io_tree, page,
                                                    btrfs_get_extent,
                                                    nocow_ctx->mirror_num);
                        if (err) {
                                ret = err;
                                goto next_page;
                        }

                        lock_page(page);
                        /*
                         * If the page has been removed from the page cache,
                         * the data on it is meaningless, because it may be
                         * the old one; the new data may have been written
                         * into a new page in the page cache.
                         */
                        if (page->mapping != inode->i_mapping) {
                                unlock_page(page);
                                page_cache_release(page);
                                goto again;
                        }
                        if (!PageUptodate(page)) {
                                ret = -EIO;
                                goto next_page;
                        }
                }

                ret = check_extent_to_block(inode, offset, len,
                                            nocow_ctx_logical);
                if (ret) {
                        ret = ret > 0 ? 0 : ret;
                        goto next_page;
                }

                err = write_page_nocow(nocow_ctx->sctx,
                                       physical_for_dev_replace, page);
                if (err)
                        ret = err;
next_page:
                unlock_page(page);
                page_cache_release(page);

                if (ret)
                        break;

                offset += PAGE_CACHE_SIZE;
                physical_for_dev_replace += PAGE_CACHE_SIZE;
                nocow_ctx_logical += PAGE_CACHE_SIZE;
                len -= PAGE_CACHE_SIZE;
        }
        ret = COPY_COMPLETE;
out:
        mutex_unlock(&inode->i_mutex);
        iput(inode);
        return ret;
}

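/*
 * Write a single page synchronously to the dev-replace target device at
 * the given physical byte offset.
 */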
static int write_page_nocow(struct scrub_ctx *sctx,
                            u64 physical_for_dev_replace, struct page *page)
{
        struct bio *bio;
        struct btrfs_device *dev;
        int ret;

        dev = sctx->wr_ctx.tgtdev;
        if (!dev)
                return -EIO;
        if (!dev->bdev) {
                printk_ratelimited(KERN_WARNING
                        "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
                return -EIO;
        }
        bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
        if (!bio) {
                spin_lock(&sctx->stat_lock);
                sctx->stat.malloc_errors++;
                spin_unlock(&sctx->stat_lock);
                return -ENOMEM;
        }
        bio->bi_iter.bi_size = 0;
        bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
        bio->bi_bdev = dev->bdev;
        ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
        if (ret != PAGE_CACHE_SIZE) {
leave_with_eio:
                bio_put(bio);
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
                return -EIO;
        }

        if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
                goto leave_with_eio;

        bio_put(bio);
        return 0;
}