aboutsummaryrefslogtreecommitdiff
path: root/drivers/block/drbd/drbd_worker.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/drbd/drbd_worker.c')
-rw-r--r-- drivers/block/drbd/drbd_worker.c | 292
1 files changed, 229 insertions, 63 deletions
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index ca4a16cea2d..108d58015cd 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -39,8 +39,6 @@
#include "drbd_int.h"
#include "drbd_req.h"
-#define SLEEP_TIME (HZ/10)
-
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
@@ -217,10 +215,8 @@ void drbd_endio_sec(struct bio *bio, int error)
*/
void drbd_endio_pri(struct bio *bio, int error)
{
- unsigned long flags;
struct drbd_request *req = bio->bi_private;
struct drbd_conf *mdev = req->mdev;
- struct bio_and_error m;
enum drbd_req_event what;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
@@ -246,12 +242,7 @@ void drbd_endio_pri(struct bio *bio, int error)
bio_put(req->private_bio);
req->private_bio = ERR_PTR(error);
- spin_lock_irqsave(&mdev->req_lock, flags);
- __req_mod(req, what, &m);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- if (m.bio)
- complete_master_bio(mdev, &m);
+ req_mod(req, what);
}
int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
@@ -376,54 +367,145 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
struct drbd_epoch_entry *e;
if (!get_ldev(mdev))
- return 0;
+ return -EIO;
+
+ if (drbd_rs_should_slow_down(mdev))
+ goto defer;
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
if (!e)
- goto fail;
+ goto defer;
+ e->w.cb = w_e_send_csum;
spin_lock_irq(&mdev->req_lock);
list_add(&e->w.list, &mdev->read_ee);
spin_unlock_irq(&mdev->req_lock);
- e->w.cb = w_e_send_csum;
+ atomic_add(size >> 9, &mdev->rs_sect_ev);
if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
- return 1;
+ return 0;
+
+ /* drbd_submit_ee currently fails for one reason only:
+ * not being able to allocate enough bios.
+ * Is dropping the connection going to help? */
+ spin_lock_irq(&mdev->req_lock);
+ list_del(&e->w.list);
+ spin_unlock_irq(&mdev->req_lock);
drbd_free_ee(mdev, e);
-fail:
+defer:
put_ldev(mdev);
- return 2;
+ return -EAGAIN;
}
void resync_timer_fn(unsigned long data)
{
- unsigned long flags;
struct drbd_conf *mdev = (struct drbd_conf *) data;
int queue;
- spin_lock_irqsave(&mdev->req_lock, flags);
-
- if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
- queue = 1;
- if (mdev->state.conn == C_VERIFY_S)
- mdev->resync_work.cb = w_make_ov_request;
- else
- mdev->resync_work.cb = w_make_resync_request;
- } else {
+ queue = 1; /* cleared again in the default case below */
+ switch (mdev->state.conn) { /* pick the resync_work callback from the connection state */
+ case C_VERIFY_S:
+ mdev->resync_work.cb = w_make_ov_request;
+ break;
+ case C_SYNC_TARGET:
+ mdev->resync_work.cb = w_make_resync_request;
+ break;
+ default: /* any other state: install w_resync_inactive and queue nothing */
queue = 0;
mdev->resync_work.cb = w_resync_inactive;
}
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
/* harmless race: list_empty outside data.work.q_lock */
if (list_empty(&mdev->resync_work.list) && queue)
drbd_queue_work(&mdev->data.work, &mdev->resync_work);
}
+static void fifo_set(struct fifo_buffer *fb, int value)
+{
+ int i;
+ /* Overwrite all fb->size slots of fb->values with @value;
+ * fb->head_index is not modified. */
+ for (i = 0; i < fb->size; i++)
+ fb->values[i] = value;
+}
+
+static int fifo_push(struct fifo_buffer *fb, int value)
+{
+ int ov;
+ /* Replace the slot at fb->head_index with @value and return the
+ * value that slot held before (ov). */
+ ov = fb->values[fb->head_index];
+ fb->values[fb->head_index++] = value;
+ /* head_index was advanced by the store above; wrap it back to 0
+ * once it reaches fb->size. */
+ if (fb->head_index >= fb->size)
+ fb->head_index = 0;
+
+ return ov;
+}
+
+static void fifo_add_val(struct fifo_buffer *fb, int value)
+{
+ int i;
+ /* Add @value to every one of the fb->size slots of fb->values;
+ * fb->head_index is not modified. */
+ for (i = 0; i < fb->size; i++)
+ fb->values[i] += value;
+}
+
+int drbd_rs_controller(struct drbd_conf *mdev)
+{
+ unsigned int sect_in; /* Number of sectors that came in since the last turn */
+ unsigned int want; /* The number of sectors we want in the proxy */
+ int req_sect; /* Number of sectors to request in this turn */
+ int correction; /* Number of sectors more we need in the proxy */
+ int cps; /* correction per invocation of drbd_rs_controller() */
+ int steps; /* Number of time steps to plan ahead */
+ int curr_corr; /* planned correction taken from the head of rs_plan_s this turn */
+ int max_sect; /* upper bound for req_sect, derived from sync_conf.c_max_rate */
+ /* Decide how many sectors to request in this turn.
+ * Consumes the rs_sect_in counter and updates rs_in_flight, rs_planed
+ * and the rs_plan_s fifo. Returns req_sect clamped to [0, max_sect].
+ * "steps" (rs_plan_s.size) is used as a divisor below; the call from
+ * w_make_resync_request() in this patch is made only when
+ * rs_plan_s.size is non-zero. */
+ sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in; counter is reset to 0 */
+ mdev->rs_in_flight -= sect_in;
+
+ spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */
+
+ steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
+ /* rs_in_flight was already reduced by sect_in above, so the sum
+ * tested here is the in-flight count as it was on entry. */
+ if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
+ want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
+ } else { /* normal path */
+ want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
+ sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
+ }
+ /* Shortfall against "want" after subtracting what is in flight and
+ * what earlier turns have already planned. */
+ correction = want - mdev->rs_in_flight - mdev->rs_planed;
+
+ /* Plan ahead: spread the correction evenly over all steps. The integer
+ * division drops any remainder, and rs_planed is advanced by exactly
+ * the amount that was added to the fifo (cps * steps). */
+ cps = correction / steps;
+ fifo_add_val(&mdev->rs_plan_s, cps);
+ mdev->rs_planed += cps * steps;
+
+ /* What we do in this step: take the accumulated value of the head
+ * slot and put 0 in its place. */
+ curr_corr = fifo_push(&mdev->rs_plan_s, 0);
+ spin_unlock(&mdev->peer_seq_lock);
+ mdev->rs_planed -= curr_corr; /* done after the unlock; rs_planed is not covered by peer_seq_lock here */
+
+ req_sect = sect_in + curr_corr;
+ if (req_sect < 0)
+ req_sect = 0;
+ /* Cap at the limit derived from sync_conf.c_max_rate.
+ * NOTE(review): the "* 2" presumably converts KiB to 512-byte
+ * sectors - confirm the unit of c_max_rate. */
+ max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
+ if (req_sect > max_sect)
+ req_sect = max_sect;
+
+ /*
+ dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
+ sect_in, mdev->rs_in_flight, want, correction,
+ steps, cps, mdev->rs_planed, curr_corr, req_sect);
+ */
+
+ return req_sect;
+}
+
int w_make_resync_request(struct drbd_conf *mdev,
struct drbd_work *w, int cancel)
{
@@ -431,8 +513,9 @@ int w_make_resync_request(struct drbd_conf *mdev,
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
int max_segment_size;
- int number, i, size, pe, mx;
+ int number, rollback_i, size, pe, mx;
int align, queued, sndbuf;
+ int i = 0;
if (unlikely(cancel))
return 1;
@@ -446,6 +529,12 @@ int w_make_resync_request(struct drbd_conf *mdev,
dev_err(DEV, "%s in w_make_resync_request\n",
drbd_conn_str(mdev->state.conn));
+ if (mdev->rs_total == 0) {
+ /* empty resync? */
+ drbd_resync_finished(mdev);
+ return 1;
+ }
+
if (!get_ldev(mdev)) {
/* Since we only need to access mdev->rsync a
get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
@@ -458,11 +547,25 @@ int w_make_resync_request(struct drbd_conf *mdev,
/* starting with drbd 8.3.8, we can handle multi-bio EEs,
* if it should be necessary */
- max_segment_size = mdev->agreed_pro_version < 94 ?
- queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
+ max_segment_size =
+ mdev->agreed_pro_version < 94 ? queue_max_segment_size(mdev->rq_queue) :
+ mdev->agreed_pro_version < 95 ? DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE;
- number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE / 1024) * HZ);
- pe = atomic_read(&mdev->rs_pending_cnt);
+ if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
+ number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
+ mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
+ } else {
+ mdev->c_sync_rate = mdev->sync_conf.rate;
+ number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
+ }
+
+ /* Throttle resync on lower level disk activity, which may also be
+ * caused by application IO on Primary/SyncTarget.
+ * Keep this after the call to drbd_rs_controller, as that assumes
+ * to be called as precisely as possible every SLEEP_TIME,
+ * and would be confused otherwise. */
+ if (drbd_rs_should_slow_down(mdev))
+ goto requeue;
mutex_lock(&mdev->data.mutex);
if (mdev->data.socket)
@@ -476,6 +579,7 @@ int w_make_resync_request(struct drbd_conf *mdev,
mx = number;
/* Limit the number of pending RS requests to no more than the peer's receive buffer */
+ pe = atomic_read(&mdev->rs_pending_cnt);
if ((pe + number) > mx) {
number = mx - pe;
}
@@ -526,6 +630,7 @@ next_sector:
* be prepared for all stripe sizes of software RAIDs.
*/
align = 1;
+ rollback_i = i;
for (;;) {
if (size + BM_BLOCK_SIZE > max_segment_size)
break;
@@ -561,14 +666,19 @@ next_sector:
size = (capacity-sector)<<9;
if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
switch (read_for_csum(mdev, sector, size)) {
- case 0: /* Disk failure*/
+ case -EIO: /* Disk failure */
put_ldev(mdev);
return 0;
- case 2: /* Allocation failed */
+ case -EAGAIN: /* allocation failed, or ldev busy */
drbd_rs_complete_io(mdev, sector);
mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
+ i = rollback_i;
goto requeue;
- /* case 1: everything ok */
+ case 0:
+ /* everything ok */
+ break;
+ default:
+ BUG();
}
} else {
inc_rs_pending(mdev);
@@ -595,6 +705,7 @@ next_sector:
}
requeue:
+ mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
put_ldev(mdev);
return 1;
@@ -670,6 +781,14 @@ static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int ca
return 1;
}
+static void ping_peer(struct drbd_conf *mdev)
+{ /* Call request_ping() and sleep on mdev->misc_wait until GOT_PING_ACK
+ * is set in mdev->flags or mdev->state.conn is below C_CONNECTED.
+ * NOTE(review): the flag is presumably set by the receive path when
+ * the peer answers - not visible here, confirm. */
+ clear_bit(GOT_PING_ACK, &mdev->flags); /* drop any earlier ack before asking for a new one */
+ request_ping(mdev);
+ wait_event(mdev->misc_wait,
+ test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
+}
+
int drbd_resync_finished(struct drbd_conf *mdev)
{
unsigned long db, dt, dbdt;
@@ -709,6 +828,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
if (!get_ldev(mdev))
goto out;
+ ping_peer(mdev);
+
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
@@ -801,6 +922,8 @@ out:
mdev->rs_paused = 0;
mdev->ov_start_sector = 0;
+ drbd_md_sync(mdev);
+
if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
@@ -817,9 +940,13 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent
{
if (drbd_ee_has_active_page(e)) {
/* This might happen if sendpage() has not finished */
+ int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT;
+ atomic_add(i, &mdev->pp_in_use_by_net);
+ atomic_sub(i, &mdev->pp_in_use);
spin_lock_irq(&mdev->req_lock);
list_add_tail(&e->w.list, &mdev->net_ee);
spin_unlock_irq(&mdev->req_lock);
+ wake_up(&drbd_pp_wait);
} else
drbd_free_ee(mdev, e);
}
@@ -926,9 +1053,12 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
return 1;
}
- drbd_rs_complete_io(mdev, e->sector);
+ if (get_ldev(mdev)) {
+ drbd_rs_complete_io(mdev, e->sector);
+ put_ldev(mdev);
+ }
- di = (struct digest_info *)(unsigned long)e->block_id;
+ di = e->digest;
if (likely((e->flags & EE_WAS_ERROR) == 0)) {
/* quick hack to try to avoid a race against reconfiguration.
@@ -952,7 +1082,9 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
} else {
inc_rs_pending(mdev);
- e->block_id = ID_SYNCER;
+ e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
+ e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */
+ kfree(di);
ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
}
} else {
@@ -962,9 +1094,6 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
}
dec_unacked(mdev);
-
- kfree(di);
-
move_to_net_ee_or_free(mdev, e);
if (unlikely(!ok))
@@ -1034,9 +1163,12 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
* the resync lru has been cleaned up already */
- drbd_rs_complete_io(mdev, e->sector);
+ if (get_ldev(mdev)) {
+ drbd_rs_complete_io(mdev, e->sector);
+ put_ldev(mdev);
+ }
- di = (struct digest_info *)(unsigned long)e->block_id;
+ di = e->digest;
if (likely((e->flags & EE_WAS_ERROR) == 0)) {
digest_size = crypto_hash_digestsize(mdev->verify_tfm);
@@ -1055,9 +1187,6 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
}
dec_unacked(mdev);
-
- kfree(di);
-
if (!eq)
drbd_ov_oos_found(mdev, e->sector, e->size);
else
@@ -1108,7 +1237,7 @@ int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
* dec_ap_pending will be done in got_BarrierAck
* or (on connection loss) in w_clear_epoch. */
ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
- (struct p_header *)p, sizeof(*p), 0);
+ (struct p_header80 *)p, sizeof(*p), 0);
drbd_put_data_sock(mdev);
return ok;
@@ -1173,6 +1302,24 @@ int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
return ok;
}
+int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+ struct drbd_request *req = container_of(w, struct drbd_request, w);
+ /* Worker callback: resubmit the request embedding @w to the backing
+ * device. @cancel is not examined; the function always returns 1.
+ * For a WRITE whose rq_state has RQ_IN_ACT_LOG set, drbd_al_begin_io()
+ * is called for the request's sector first. */
+ if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
+ drbd_al_begin_io(mdev, req->sector);
+ /* Calling drbd_al_begin_io() out of the worker might deadlock
+ theoretically. Practically it can not deadlock, since this is
+ only used when unfreezing IOs. All the extents of the requests
+ that made it into the TL are already active */
+ /* Build a fresh private bio from the master bio, point it at the
+ * backing block device and submit it.
+ * NOTE(review): mdev->ldev is dereferenced without a get_ldev() in
+ * this function; presumably the caller guarantees it - confirm. */
+ drbd_req_make_private_bio(req, req->master_bio);
+ req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
+ generic_make_request(req->private_bio);
+
+ return 1;
+}
+
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
struct drbd_conf *odev = mdev;
@@ -1298,14 +1445,6 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na)
return retcode;
}
-static void ping_peer(struct drbd_conf *mdev)
-{
- clear_bit(GOT_PING_ACK, &mdev->flags);
- request_ping(mdev);
- wait_event(mdev->misc_wait,
- test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
-}
-
/**
* drbd_start_resync() - Start the resync process
* @mdev: DRBD device.
@@ -1379,13 +1518,21 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
r = SS_UNKNOWN_ERROR;
if (r == SS_SUCCESS) {
- mdev->rs_total =
- mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+ unsigned long tw = drbd_bm_total_weight(mdev);
+ unsigned long now = jiffies;
+ int i;
+
mdev->rs_failed = 0;
mdev->rs_paused = 0;
- mdev->rs_start =
- mdev->rs_mark_time = jiffies;
mdev->rs_same_csum = 0;
+ mdev->rs_last_events = 0;
+ mdev->rs_last_sect_ev = 0;
+ mdev->rs_total = tw;
+ mdev->rs_start = now;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ mdev->rs_mark_left[i] = tw;
+ mdev->rs_mark_time[i] = now;
+ }
_drbd_pause_after(mdev);
}
write_unlock_irq(&global_state_lock);
@@ -1397,12 +1544,31 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
(unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
(unsigned long) mdev->rs_total);
- if (mdev->rs_total == 0) {
- /* Peer still reachable? Beware of failing before-resync-target handlers! */
- ping_peer(mdev);
+ if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
+ /* This still has a race (about when exactly the peers
+ * detect connection loss) that can lead to a full sync
+ * on next handshake. In 8.3.9 we fixed this with explicit
+ * resync-finished notifications, but the fix
+ * introduces a protocol change. Sleeping for some
+ * time longer than the ping interval + timeout on the
+ * SyncSource, to give the SyncTarget the chance to
+ * detect connection loss, then waiting for a ping
+ * response (implicit in drbd_resync_finished) reduces
+ * the race considerably, but does not solve it. */
+ if (side == C_SYNC_SOURCE)
+ schedule_timeout_interruptible(
+ mdev->net_conf->ping_int * HZ +
+ mdev->net_conf->ping_timeo*HZ/9);
drbd_resync_finished(mdev);
}
+ atomic_set(&mdev->rs_sect_in, 0);
+ atomic_set(&mdev->rs_sect_ev, 0);
+ mdev->rs_in_flight = 0;
+ mdev->rs_planed = 0;
+ spin_lock(&mdev->peer_seq_lock);
+ fifo_set(&mdev->rs_plan_s, 0);
+ spin_unlock(&mdev->peer_seq_lock);
/* ns.conn may already be != mdev->state.conn,
* we may have been paused in between, or become paused until
* the timer triggers.