From eed1cc7866ae35bd28926d75447dd6076bd6bb96 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 27 Jun 2018 21:22:43 +0800 Subject: migration: delay postcopy paused state Before this patch we first set up the postcopy-paused state and then clean up the QEMUFile handles. That can be racy if there is a very fast "migrate-recover" command running in parallel. Fix that up. Reported-by: Peter Maydell Reviewed-by: Juan Quintela Signed-off-by: Peter Xu Message-Id: <20180627132246.5576-2-peterx@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- migration/savevm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/migration/savevm.c b/migration/savevm.c index c2f34ffc7c..851d74e8b6 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2194,9 +2194,6 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis) /* Clear the triggered bit to allow one recovery */ mis->postcopy_recover_triggered = false; - migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, - MIGRATION_STATUS_POSTCOPY_PAUSED); - assert(mis->from_src_file); qemu_file_shutdown(mis->from_src_file); qemu_fclose(mis->from_src_file); @@ -2209,6 +2206,9 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis) mis->to_src_file = NULL; qemu_mutex_unlock(&mis->rp_mutex); + migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, + MIGRATION_STATUS_POSTCOPY_PAUSED); + /* Notify the fault thread for the invalidated file handle */ postcopy_fault_thread_notify(mis); -- cgit v1.2.3 From 81e620531fa842f760086964ca1b8657ae6c07ba Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 27 Jun 2018 21:22:44 +0800 Subject: migration: move income process out of multifd Move the call to migration_incoming_process() out of multifd code. It's a bit strange that we make generic migration calls in multifd code. Instead, let multifd_recv_new_channel() return a boolean showing whether it's ready to continue the incoming migration.
Signed-off-by: Peter Xu Message-Id: <20180627132246.5576-3-peterx@redhat.com> Reviewed-by: Juan Quintela Signed-off-by: Dr. David Alan Gilbert --- migration/migration.c | 5 ++++- migration/ram.c | 11 +++++------ migration/ram.h | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 94d71f8b24..aea6fb7444 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -507,7 +507,10 @@ void migration_ioc_process_incoming(QIOChannel *ioc) migration_incoming_setup(f); return; } - multifd_recv_new_channel(ioc); + + if (multifd_recv_new_channel(ioc)) { + migration_incoming_process(); + } } /** diff --git a/migration/ram.c b/migration/ram.c index 1cd98d6398..23cea47090 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1311,7 +1311,8 @@ bool multifd_recv_all_channels_created(void) return thread_count == atomic_read(&multifd_recv_state->count); } -void multifd_recv_new_channel(QIOChannel *ioc) +/* Return true if multifd is ready for the migration, otherwise false */ +bool multifd_recv_new_channel(QIOChannel *ioc) { MultiFDRecvParams *p; Error *local_err = NULL; @@ -1320,7 +1321,7 @@ void multifd_recv_new_channel(QIOChannel *ioc) id = multifd_recv_initial_packet(ioc, &local_err); if (id < 0) { multifd_recv_terminate_threads(local_err); - return; + return false; } p = &multifd_recv_state->params[id]; @@ -1328,7 +1329,7 @@ void multifd_recv_new_channel(QIOChannel *ioc) error_setg(&local_err, "multifd: received id '%d' already setup'", id); multifd_recv_terminate_threads(local_err); - return; + return false; } p->c = ioc; object_ref(OBJECT(ioc)); @@ -1339,9 +1340,7 @@ void multifd_recv_new_channel(QIOChannel *ioc) qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, QEMU_THREAD_JOINABLE); atomic_inc(&multifd_recv_state->count); - if (multifd_recv_state->count == migrate_multifd_channels()) { - migration_incoming_process(); - } + return multifd_recv_state->count == migrate_multifd_channels(); } 
/** diff --git a/migration/ram.h b/migration/ram.h index d386f4d641..457bf54b8c 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -46,7 +46,7 @@ int multifd_save_cleanup(Error **errp); int multifd_load_setup(void); int multifd_load_cleanup(Error **errp); bool multifd_recv_all_channels_created(void); -void multifd_recv_new_channel(QIOChannel *ioc); +bool multifd_recv_new_channel(QIOChannel *ioc); uint64_t ram_pagesize_summary(void); int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len); -- cgit v1.2.3 From 884835fa1e38066e2596224375bb35ac6686be4d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 27 Jun 2018 21:22:45 +0800 Subject: migration: unbreak postcopy recovery The whole postcopy recovery logic was accidentally broken. We need to fix it in two steps. This is the first step that we should do the recovery when needed. It was bypassed before after commit 36c2f8be2c. Introduce postcopy_try_recovery() helper for the postcopy recovery logic. Call it both in migration_fd_process_incoming() and migration_ioc_process_incoming(). Fixes: 36c2f8be2c ("migration: Delay start of migration main routines") Signed-off-by: Peter Xu Message-Id: <20180627132246.5576-4-peterx@redhat.com> Reviewed-by: Juan Quintela Signed-off-by: Dr. David Alan Gilbert --- migration/migration.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index aea6fb7444..eb3e09e899 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -466,7 +466,8 @@ void migration_incoming_process(void) qemu_coroutine_enter(co); } -void migration_fd_process_incoming(QEMUFile *f) +/* Returns true if recovered from a paused migration, otherwise false */ +static bool postcopy_try_recover(QEMUFile *f) { MigrationIncomingState *mis = migration_incoming_get_current(); @@ -491,11 +492,20 @@ void migration_fd_process_incoming(QEMUFile *f) * that source is ready to reply to page requests. 
*/ qemu_sem_post(&mis->postcopy_pause_sem_dst); - } else { - /* New incoming migration */ - migration_incoming_setup(f); - migration_incoming_process(); + return true; + } + + return false; +} + +void migration_fd_process_incoming(QEMUFile *f) +{ + if (postcopy_try_recover(f)) { + return; } + + migration_incoming_setup(f); + migration_incoming_process(); } void migration_ioc_process_incoming(QIOChannel *ioc) @@ -504,6 +514,9 @@ void migration_ioc_process_incoming(QIOChannel *ioc) if (!mis->from_src_file) { QEMUFile *f = qemu_fopen_channel_input(ioc); + if (postcopy_try_recover(f)) { + return; + } migration_incoming_setup(f); return; } -- cgit v1.2.3 From a429e7f4887313370ed2d0d3607b7e6bdcfb0e1b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 27 Jun 2018 21:22:46 +0800 Subject: migration: unify incoming processing This is the 2nd patch to unbreak postcopy recovery. Let's unify the migration_incoming_process() call at a single place rather than calling it in connection setup codes. This fixes a problem that we will go into incoming migration procedure even if we are trying to recovery from a paused postcopy migration. Fixes: 36c2f8be2c ("migration: Delay start of migration main routines") Signed-off-by: Peter Xu Message-Id: <20180627132246.5576-5-peterx@redhat.com> Reviewed-by: Juan Quintela Signed-off-by: Dr. 
David Alan Gilbert --- migration/exec.c | 3 --- migration/fd.c | 3 --- migration/migration.c | 18 ++++++++++++++++-- migration/socket.c | 5 ----- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/migration/exec.c b/migration/exec.c index 0bbeb63c97..375d2e1b54 100644 --- a/migration/exec.c +++ b/migration/exec.c @@ -49,9 +49,6 @@ static gboolean exec_accept_incoming_migration(QIOChannel *ioc, { migration_channel_process_incoming(ioc); object_unref(OBJECT(ioc)); - if (!migrate_use_multifd()) { - migration_incoming_process(); - } return G_SOURCE_REMOVE; } diff --git a/migration/fd.c b/migration/fd.c index fee34ffdc0..a7c13df4ad 100644 --- a/migration/fd.c +++ b/migration/fd.c @@ -49,9 +49,6 @@ static gboolean fd_accept_incoming_migration(QIOChannel *ioc, { migration_channel_process_incoming(ioc); object_unref(OBJECT(ioc)); - if (!migrate_use_multifd()) { - migration_incoming_process(); - } return G_SOURCE_REMOVE; } diff --git a/migration/migration.c b/migration/migration.c index eb3e09e899..0404c53215 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -511,17 +511,31 @@ void migration_fd_process_incoming(QEMUFile *f) void migration_ioc_process_incoming(QIOChannel *ioc) { MigrationIncomingState *mis = migration_incoming_get_current(); + bool start_migration; if (!mis->from_src_file) { + /* The first connection (multifd may have multiple) */ QEMUFile *f = qemu_fopen_channel_input(ioc); + + /* If it's a recovery, we're done */ if (postcopy_try_recover(f)) { return; } + migration_incoming_setup(f); - return; + + /* + * Common migration only needs one channel, so we can start + * right now. Multifd needs more than one channel, we wait. 
+ */ + start_migration = !migrate_use_multifd(); + } else { + /* Multiple connections */ + assert(migrate_use_multifd()); + start_migration = multifd_recv_new_channel(ioc); } - if (multifd_recv_new_channel(ioc)) { + if (start_migration) { migration_incoming_process(); } } diff --git a/migration/socket.c b/migration/socket.c index 3456eb76e9..f4c8174400 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -168,12 +168,7 @@ static void socket_accept_incoming_migration(QIONetListener *listener, if (migration_has_all_channels()) { /* Close listening socket as its no longer needed */ qio_net_listener_disconnect(listener); - object_unref(OBJECT(listener)); - - if (!migrate_use_multifd()) { - migration_incoming_process(); - } } } -- cgit v1.2.3 From 1aa8367861645c3c0917f585fe14b1b6b23b0f66 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 10 Jul 2018 17:18:53 +0800 Subject: migration: simplify check to use qemu file buffer First, rename the old matching_page_sizes variable to matches_target_page_size, which better suits what it does (it only checks against the target page size rather than multiple page sizes). Meanwhile, simplify the check logic a bit, and enhance the comments. Should have no functional change. Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Signed-off-by: Peter Xu Message-Id: <20180710091902.28780-2-peterx@redhat.com> Signed-off-by: Dr.
David Alan Gilbert --- migration/ram.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 23cea47090..49068e86d3 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -3580,7 +3580,7 @@ static int ram_load_postcopy(QEMUFile *f) { int flags = 0, ret = 0; bool place_needed = false; - bool matching_page_sizes = false; + bool matches_target_page_size = false; MigrationIncomingState *mis = migration_incoming_get_current(); /* Temporary page that is later 'placed' */ void *postcopy_host_page = postcopy_get_tmp_page(mis); @@ -3620,7 +3620,7 @@ static int ram_load_postcopy(QEMUFile *f) ret = -EINVAL; break; } - matching_page_sizes = block->page_size == TARGET_PAGE_SIZE; + matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; /* * Postcopy requires that we place whole host pages atomically; * these may be huge pages for RAMBlocks that are backed by @@ -3668,12 +3668,17 @@ static int ram_load_postcopy(QEMUFile *f) case RAM_SAVE_FLAG_PAGE: all_zero = false; - if (!place_needed || !matching_page_sizes) { + if (!matches_target_page_size) { + /* For huge pages, we always use temporary buffer */ qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); } else { - /* Avoids the qemu_file copy during postcopy, which is - * going to do a copy later; can only do it when we - * do this read in one go (matching page sizes) + /* + * For small pages that matches target page size, we + * avoid the qemu_file copy. Instead we directly use + * the buffer of QEMUFile to place the page. Note: we + * cannot do any QEMUFile operation before using that + * buffer to make sure the buffer is valid when + * placing the page. 
*/ qemu_get_buffer_in_place(f, (uint8_t **)&place_source, TARGET_PAGE_SIZE); -- cgit v1.2.3 From fd037a656aca23dc3c47aa8c9de261bec6b8aad0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 10 Jul 2018 17:18:54 +0800 Subject: migration: loosen recovery check when load vm We were checking against -EIO, assuming that it will cover all IO failures. But actually it is not. One example is that in qemu_loadvm_section_start_full() we can have tons of places that will return -EINVAL even if the error is caused by IO failures on the network. Let's loosen the recovery check logic here to cover all the error cases happened by removing the explicit check against -EIO. After all we won't lose anything here if any other failure happened. Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Signed-off-by: Peter Xu Message-Id: <20180710091902.28780-3-peterx@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- migration/savevm.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/migration/savevm.c b/migration/savevm.c index 851d74e8b6..efcc795071 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2276,18 +2276,14 @@ out: qemu_file_set_error(f, ret); /* - * Detect whether it is: - * - * 1. postcopy running (after receiving all device data, which - * must be in POSTCOPY_INCOMING_RUNNING state. Note that - * POSTCOPY_INCOMING_LISTENING is still not enough, it's - * still receiving device states). - * 2. network failure (-EIO) - * - * If so, we try to wait for a recovery. + * If we are during an active postcopy, then we pause instead + * of bail out to at least keep the VM's dirty data. Note + * that POSTCOPY_INCOMING_LISTENING stage is still not enough, + * during which we're still receiving device states and we + * still haven't yet started the VM on destination. 
*/ if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING && - ret == -EIO && postcopy_pause_incoming(mis)) { + postcopy_pause_incoming(mis)) { /* Reset f to point to the newly created channel */ f = mis->from_src_file; goto retry; -- cgit v1.2.3 From a725ef9fe36424351faf51696c3fc441ded13f35 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 10 Jul 2018 17:18:55 +0800 Subject: migration: fix incorrect bitmap size calculation The calculation on size of received bitmap is incorrect for postcopy recovery. Here we wanted to let the size to cover all the valid bits in the bitmap, we should use DIV_ROUND_UP() instead of a division. For example, a RAMBlock with size=4K (which contains only one single 4K page) will have nbits=1, then nbits/8=0, then the real bitmap won't be sent to source at all. Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Signed-off-by: Peter Xu Message-Id: <20180710091902.28780-4-peterx@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- migration/ram.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 49068e86d3..52dd678092 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -235,7 +235,7 @@ int64_t ramblock_recv_bitmap_send(QEMUFile *file, bitmap_to_le(le_bitmap, block->receivedmap, nbits); /* Size of the bitmap, in bytes */ - size = nbits / 8; + size = DIV_ROUND_UP(nbits, 8); /* * size is always aligned to 8 bytes for 64bit machines, but it @@ -3944,7 +3944,7 @@ int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block) int ret = -EINVAL; QEMUFile *file = s->rp_state.from_dst_file; unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS; - uint64_t local_size = nbits / 8; + uint64_t local_size = DIV_ROUND_UP(nbits, 8); uint64_t size, end_mark; trace_ram_dirty_bitmap_reload_begin(block->idstr); -- cgit v1.2.3 From 3c9928d9f9b6b5717fa8e53e9441c6b041d6554a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 10 Jul 2018 17:18:56 +0800 Subject: migration: 
show pause/recover state on dst host These two states will be missing when doing "query-migrate" on destination VM. Add these states so that we can get the query results as expected. Signed-off-by: Peter Xu Message-Id: <20180710091902.28780-5-peterx@redhat.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Dr. David Alan Gilbert --- migration/migration.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index 0404c53215..8d56d56930 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -911,6 +911,8 @@ static void fill_destination_migration_info(MigrationInfo *info) case MIGRATION_STATUS_CANCELLED: case MIGRATION_STATUS_ACTIVE: case MIGRATION_STATUS_POSTCOPY_ACTIVE: + case MIGRATION_STATUS_POSTCOPY_PAUSED: + case MIGRATION_STATUS_POSTCOPY_RECOVER: case MIGRATION_STATUS_FAILED: case MIGRATION_STATUS_COLO: info->has_status = true; -- cgit v1.2.3 From d131662a1afd44ffd0ced3334fa0cf1f30f87196 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 10 Jul 2018 17:18:57 +0800 Subject: tests: introduce migrate_postcopy_* helpers Separate the old postcopy UNIX socket test into three steps, provide a helper for each step. With these helpers, we can do more complicated tests like postcopy recovery, while keeping the code shared. Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Reviewed-by: Balamuruhan S Signed-off-by: Peter Xu Message-Id: <20180710091902.28780-6-peterx@redhat.com> Signed-off-by: Dr.
David Alan Gilbert Fix up merge with 2e295789 / Skip tests for ppc tcg --- tests/migration-test.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/tests/migration-test.c b/tests/migration-test.c index 331efb0fe5..193c7df94e 100644 --- a/tests/migration-test.c +++ b/tests/migration-test.c @@ -351,13 +351,19 @@ static void migrate(QTestState *who, const char *uri) qobject_unref(rsp); } -static void migrate_start_postcopy(QTestState *who) +static void migrate_postcopy_start(QTestState *from, QTestState *to) { QDict *rsp; - rsp = wait_command(who, "{ 'execute': 'migrate-start-postcopy' }"); + rsp = wait_command(from, "{ 'execute': 'migrate-start-postcopy' }"); g_assert(qdict_haskey(rsp, "return")); qobject_unref(rsp); + + if (!got_stop) { + qtest_qmp_eventwait(from, "STOP"); + } + + qtest_qmp_eventwait(to, "RESUME"); } static int test_migrate_start(QTestState **from, QTestState **to, @@ -510,13 +516,14 @@ static void test_deprecated(void) qtest_quit(from); } -static void test_postcopy(void) +static int migrate_postcopy_prepare(QTestState **from_ptr, + QTestState **to_ptr) { char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs); QTestState *from, *to; if (test_migrate_start(&from, &to, uri, false)) { - return; + return -1; } migrate_set_capability(from, "postcopy-ram", "true"); @@ -534,28 +541,41 @@ static void test_postcopy(void) wait_for_serial("src_serial"); migrate(from, uri); + g_free(uri); wait_for_migration_pass(from); - migrate_start_postcopy(from); + *from_ptr = from; + *to_ptr = to; - if (!got_stop) { - qtest_qmp_eventwait(from, "STOP"); - } + return 0; +} - qtest_qmp_eventwait(to, "RESUME"); +static void migrate_postcopy_complete(QTestState *from, QTestState *to) +{ + wait_for_migration_complete(from); + /* Make sure we get at least one "B" on destination */ wait_for_serial("dest_serial"); - wait_for_migration_complete(from); if (uffd_feature_thread_id) { read_blocktime(to); } - g_free(uri); 
test_migrate_end(from, to, true); } +static void test_postcopy(void) +{ + QTestState *from, *to; + + if (migrate_postcopy_prepare(&from, &to)) { + return; + } + migrate_postcopy_start(from, to); + migrate_postcopy_complete(from, to); +} + static void test_baddest(void) { QTestState *from, *to; -- cgit v1.2.3 From 7e1d74271c36d12a90608db73ea72aab352b5bb0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 10 Jul 2018 17:18:58 +0800 Subject: tests: allow migrate() to take extra flags For example, we can pass in '"resume": true' to resume a migration. Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Reviewed-by: Balamuruhan S Signed-off-by: Peter Xu Message-Id: <20180710091902.28780-7-peterx@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- tests/migration-test.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/migration-test.c b/tests/migration-test.c index 193c7df94e..0a1e2f0a09 100644 --- a/tests/migration-test.c +++ b/tests/migration-test.c @@ -337,14 +337,14 @@ static void migrate_set_capability(QTestState *who, const char *capability, qobject_unref(rsp); } -static void migrate(QTestState *who, const char *uri) +static void migrate(QTestState *who, const char *uri, const char *extra) { QDict *rsp; gchar *cmd; cmd = g_strdup_printf("{ 'execute': 'migrate'," - "'arguments': { 'uri': '%s' } }", - uri); + " 'arguments': { 'uri': '%s' %s } }", + uri, extra ? 
extra : ""); rsp = qtest_qmp(who, cmd); g_free(cmd); g_assert(qdict_haskey(rsp, "return")); @@ -540,7 +540,7 @@ static int migrate_postcopy_prepare(QTestState **from_ptr, /* Wait for the first serial output from the source */ wait_for_serial("src_serial"); - migrate(from, uri); + migrate(from, uri, NULL); g_free(uri); wait_for_migration_pass(from); @@ -586,7 +586,7 @@ static void test_baddest(void) if (test_migrate_start(&from, &to, "tcp:0:0", true)) { return; } - migrate(from, "tcp:0:0"); + migrate(from, "tcp:0:0", NULL); do { rsp = wait_command(from, "{ 'execute': 'query-migrate' }"); rsp_return = qdict_get_qdict(rsp, "return"); @@ -630,7 +630,7 @@ static void test_precopy_unix(void) /* Wait for the first serial output from the source */ wait_for_serial("src_serial"); - migrate(from, uri); + migrate(from, uri, NULL); wait_for_migration_pass(from); -- cgit v1.2.3 From 2f7074c6fdec5015086f5328f8427097bee86e13 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 10 Jul 2018 17:18:59 +0800 Subject: tests: introduce migrate_query*() helpers Introduce helpers to query migration states and use it. Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Reviewed-by: Balamuruhan S Signed-off-by: Peter Xu Message-Id: <20180710091902.28780-8-peterx@redhat.com> Signed-off-by: Dr. 
David Alan Gilbert --- tests/migration-test.c | 64 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 21 deletions(-) diff --git a/tests/migration-test.c b/tests/migration-test.c index 0a1e2f0a09..e2697efd01 100644 --- a/tests/migration-test.c +++ b/tests/migration-test.c @@ -168,6 +168,37 @@ static QDict *wait_command(QTestState *who, const char *command) return response; } +/* + * Note: caller is responsible to free the returned object via + * qobject_unref() after use + */ +static QDict *migrate_query(QTestState *who) +{ + QDict *rsp, *rsp_return; + + rsp = wait_command(who, "{ 'execute': 'query-migrate' }"); + rsp_return = qdict_get_qdict(rsp, "return"); + g_assert(rsp_return); + qobject_ref(rsp_return); + qobject_unref(rsp); + + return rsp_return; +} + +/* + * Note: caller is responsible to free the returned object via + * g_free() after use + */ +static gchar *migrate_query_status(QTestState *who) +{ + QDict *rsp_return = migrate_query(who); + gchar *status = g_strdup(qdict_get_str(rsp_return, "status")); + + g_assert(status); + qobject_unref(rsp_return); + + return status; +} /* * It's tricky to use qemu's migration event capability with qtest, @@ -176,11 +207,10 @@ static QDict *wait_command(QTestState *who, const char *command) static uint64_t get_migration_pass(QTestState *who) { - QDict *rsp, *rsp_return, *rsp_ram; + QDict *rsp_return, *rsp_ram; uint64_t result; - rsp = wait_command(who, "{ 'execute': 'query-migrate' }"); - rsp_return = qdict_get_qdict(rsp, "return"); + rsp_return = migrate_query(who); if (!qdict_haskey(rsp_return, "ram")) { /* Still in setup */ result = 0; @@ -188,33 +218,29 @@ static uint64_t get_migration_pass(QTestState *who) rsp_ram = qdict_get_qdict(rsp_return, "ram"); result = qdict_get_try_int(rsp_ram, "dirty-sync-count", 0); } - qobject_unref(rsp); + qobject_unref(rsp_return); return result; } static void read_blocktime(QTestState *who) { - QDict *rsp, *rsp_return; + QDict *rsp_return; - rsp = 
wait_command(who, "{ 'execute': 'query-migrate' }"); - rsp_return = qdict_get_qdict(rsp, "return"); + rsp_return = migrate_query(who); g_assert(qdict_haskey(rsp_return, "postcopy-blocktime")); - qobject_unref(rsp); + qobject_unref(rsp_return); } static void wait_for_migration_complete(QTestState *who) { while (true) { - QDict *rsp, *rsp_return; bool completed; - const char *status; + char *status; - rsp = wait_command(who, "{ 'execute': 'query-migrate' }"); - rsp_return = qdict_get_qdict(rsp, "return"); - status = qdict_get_str(rsp_return, "status"); + status = migrate_query_status(who); completed = strcmp(status, "completed") == 0; g_assert_cmpstr(status, !=, "failed"); - qobject_unref(rsp); + g_free(status); if (completed) { return; } @@ -580,7 +606,7 @@ static void test_baddest(void) { QTestState *from, *to; QDict *rsp, *rsp_return; - const char *status; + char *status; bool failed; if (test_migrate_start(&from, &to, "tcp:0:0", true)) { @@ -588,14 +614,10 @@ static void test_baddest(void) } migrate(from, "tcp:0:0", NULL); do { - rsp = wait_command(from, "{ 'execute': 'query-migrate' }"); - rsp_return = qdict_get_qdict(rsp, "return"); - - status = qdict_get_str(rsp_return, "status"); - + status = migrate_query_status(from); g_assert(!strcmp(status, "setup") || !(strcmp(status, "failed"))); failed = !strcmp(status, "failed"); - qobject_unref(rsp); + g_free(status); } while (!failed); /* Is the machine currently running? */ -- cgit v1.2.3 From 2f6d313836591659375db97a2c1de4f28413f12c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 10 Jul 2018 17:19:00 +0800 Subject: tests: introduce wait_for_migration_status() It's generalized from wait_for_migration_complete() to allow us to wait for any migration status besides failure. Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Reviewed-by: Balamuruhan S Signed-off-by: Peter Xu Message-Id: <20180710091902.28780-9-peterx@redhat.com> Signed-off-by: Dr. 
David Alan Gilbert --- tests/migration-test.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/migration-test.c b/tests/migration-test.c index e2697efd01..4c15071893 100644 --- a/tests/migration-test.c +++ b/tests/migration-test.c @@ -231,14 +231,15 @@ static void read_blocktime(QTestState *who) qobject_unref(rsp_return); } -static void wait_for_migration_complete(QTestState *who) +static void wait_for_migration_status(QTestState *who, + const char *goal) { while (true) { bool completed; char *status; status = migrate_query_status(who); - completed = strcmp(status, "completed") == 0; + completed = strcmp(status, goal) == 0; g_assert_cmpstr(status, !=, "failed"); g_free(status); if (completed) { @@ -248,6 +249,11 @@ static void wait_for_migration_complete(QTestState *who) } } +static void wait_for_migration_complete(QTestState *who) +{ + wait_for_migration_status(who, "completed"); +} + static void wait_for_migration_pass(QTestState *who) { uint64_t initial_pass = get_migration_pass(who); -- cgit v1.2.3 From d5f496407db4444758e3374b95109a5d18eba70b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 10 Jul 2018 17:19:01 +0800 Subject: tests: add postcopy recovery test Test the postcopy recovery procedure by emulating a network failure using migrate-pause command. Tested-by: Balamuruhan S Signed-off-by: Peter Xu Message-Id: <20180710091902.28780-10-peterx@redhat.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Dr. 
David Alan Gilbert --- tests/migration-test.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tests/migration-test.c b/tests/migration-test.c index 4c15071893..d8b2633fce 100644 --- a/tests/migration-test.c +++ b/tests/migration-test.c @@ -352,6 +352,29 @@ static void migrate_set_parameter(QTestState *who, const char *parameter, migrate_check_parameter(who, parameter, value); } +static void migrate_pause(QTestState *who) +{ + QDict *rsp; + + rsp = wait_command(who, "{ 'execute': 'migrate-pause' }"); + g_assert(qdict_haskey(rsp, "return")); + qobject_unref(rsp); +} + +static void migrate_recover(QTestState *who, const char *uri) +{ + QDict *rsp; + gchar *cmd = g_strdup_printf( + "{ 'execute': 'migrate-recover', " + " 'id': 'recover-cmd', " + " 'arguments': { 'uri': '%s' } }", uri); + + rsp = wait_command(who, cmd); + g_assert(qdict_haskey(rsp, "return")); + g_free(cmd); + qobject_unref(rsp); +} + static void migrate_set_capability(QTestState *who, const char *capability, const char *value) { @@ -608,6 +631,62 @@ static void test_postcopy(void) migrate_postcopy_complete(from, to); } +static void test_postcopy_recovery(void) +{ + QTestState *from, *to; + char *uri; + + if (migrate_postcopy_prepare(&from, &to)) { + return; + } + + /* Turn postcopy speed down, 4K/s is slow enough on any machines */ + migrate_set_parameter(from, "max-postcopy-bandwidth", "4096"); + + /* Now we start the postcopy */ + migrate_postcopy_start(from, to); + + /* + * Wait until postcopy is really started; we can only run the + * migrate-pause command during a postcopy + */ + wait_for_migration_status(from, "postcopy-active"); + + /* + * Manually stop the postcopy migration. This emulates a network + * failure with the migration socket + */ + migrate_pause(from); + + /* + * Wait for destination side to reach postcopy-paused state. 
The + * migrate-recover command can only succeed if destination machine + * is in the paused state + */ + wait_for_migration_status(to, "postcopy-paused"); + + /* + * Create a new socket to emulate a new channel that is different + * from the broken migration channel; tell the destination to + * listen to the new port + */ + uri = g_strdup_printf("unix:%s/migsocket-recover", tmpfs); + migrate_recover(to, uri); + + /* + * Try to rebuild the migration channel using the resume flag and + * the newly created channel + */ + wait_for_migration_status(from, "postcopy-paused"); + migrate(from, uri, ", 'resume': true"); + g_free(uri); + + /* Restore the postcopy bandwidth to unlimited */ + migrate_set_parameter(from, "max-postcopy-bandwidth", "0"); + + migrate_postcopy_complete(from, to); +} + static void test_baddest(void) { QTestState *from, *to; @@ -698,6 +777,7 @@ int main(int argc, char **argv) module_call_init(MODULE_INIT_QOM); qtest_add_func("/migration/postcopy/unix", test_postcopy); + qtest_add_func("/migration/postcopy/recovery", test_postcopy_recovery); qtest_add_func("/migration/deprecated", test_deprecated); qtest_add_func("/migration/bad_dest", test_baddest); qtest_add_func("/migration/precopy/unix", test_precopy_unix); -- cgit v1.2.3 From 3e81f73c7a1286e251180c19f62829fe5c045e39 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 10 Jul 2018 17:19:02 +0800 Subject: tests: hide stderr for postcopy recovery test We dumped something when network failure happens. We should avoid those messages to be dumped when running the tests: $ ./tests/migration-test -p /x86_64/migration/postcopy/recovery /x86_64/migration/postcopy/recovery: qemu-system-x86_64: check_section_footer: Read section footer failed: -5 qemu-system-x86_64: Detected IO failure for postcopy. Migration paused. qemu-system-x86_64: Detected IO failure for postcopy. Migration paused. 
OK After the patch: $ ./tests/migration-test -p /x86_64/migration/postcopy/recovery /x86_64/migration/postcopy/recovery: OK Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Signed-off-by: Peter Xu Message-Id: <20180710091902.28780-11-peterx@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- tests/migration-test.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/migration-test.c b/tests/migration-test.c index d8b2633fce..086f727b34 100644 --- a/tests/migration-test.c +++ b/tests/migration-test.c @@ -572,12 +572,13 @@ static void test_deprecated(void) } static int migrate_postcopy_prepare(QTestState **from_ptr, - QTestState **to_ptr) + QTestState **to_ptr, + bool hide_error) { char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs); QTestState *from, *to; - if (test_migrate_start(&from, &to, uri, false)) { + if (test_migrate_start(&from, &to, uri, hide_error)) { return -1; } @@ -624,7 +625,7 @@ static void test_postcopy(void) { QTestState *from, *to; - if (migrate_postcopy_prepare(&from, &to)) { + if (migrate_postcopy_prepare(&from, &to, false)) { return; } migrate_postcopy_start(from, to); @@ -636,7 +637,7 @@ static void test_postcopy_recovery(void) QTestState *from, *to; char *uri; - if (migrate_postcopy_prepare(&from, &to)) { + if (migrate_postcopy_prepare(&from, &to, true)) { return; } -- cgit v1.2.3 From 858b6d62249a9a9510fae6c808a3d2de80e689b5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 10 Jul 2018 17:44:24 +0800 Subject: migration: reorder MIG_CMD_POSTCOPY_RESUME MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It was accidentally added before MIG_CMD_PACKAGED so it might break command compatibility when we run postcopy migration between old/new QEMUs. Fix that up quickly before the QEMU 3.0 release. Reported-by: Lukáš Doktor Suggested-by: Dr. David Alan Gilbert Signed-off-by: Peter Xu Message-Id: <20180710094424.30754-1-peterx@redhat.com> Reviewed-by: Dr.
David Alan Gilbert Reviewed-by: Juan Quintela Signed-off-by: Dr. David Alan Gilbert --- migration/savevm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/savevm.c b/migration/savevm.c index efcc795071..7f92567a10 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -81,8 +81,8 @@ enum qemu_vm_cmd { MIG_CMD_POSTCOPY_RAM_DISCARD, /* A list of pages to discard that were previously sent during precopy but are dirty. */ - MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */ MIG_CMD_PACKAGED, /* Send a wrapped stream within this stream */ + MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */ MIG_CMD_RECV_BITMAP, /* Request for recved bitmap on dst */ MIG_CMD_MAX }; -- cgit v1.2.3