Diffstat (limited to 'migration/ram.c')
 migration/ram.c | 307
 1 file changed, 297 insertions(+), 10 deletions(-)
diff --git a/migration/ram.c b/migration/ram.c
index 7811cde643..72143da0ac 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -56,6 +56,11 @@
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
+#include "sysemu/runstate.h"
+
+#if defined(__linux__)
+#include "qemu/userfaultfd.h"
+#endif /* defined(__linux__) */
/***********************************************************/
/* ram save/restore */
@@ -126,7 +131,7 @@ static void XBZRLE_cache_unlock(void)
* @new_size: new cache size
* @errp: set *errp if the check failed, with reason
*/
-int xbzrle_cache_resize(int64_t new_size, Error **errp)
+int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
PageCache *new_cache;
int64_t ret = 0;
@@ -298,6 +303,8 @@ struct RAMSrcPageRequest {
struct RAMState {
/* QEMUFile used for this migration */
QEMUFile *f;
+ /* UFFD file descriptor, used in 'write-tracking' migration */
+ int uffdio_fd;
/* Last block that we have visited searching for dirty pages */
RAMBlock *last_seen_block;
/* Last block from where we have sent data */
@@ -1434,6 +1441,269 @@ static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
return block;
}
+#if defined(__linux__)
+/**
+ * poll_fault_page: try to get the next UFFD write-fault page and, if a pending
+ * fault is found, return the RAM block pointer and page offset
+ *
+ * Returns a pointer to the RAMBlock containing the faulting page,
+ * NULL if no write faults are pending
+ *
+ * @rs: current RAM state
+ * @offset: page offset from the beginning of the block
+ */
+static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
+{
+ struct uffd_msg uffd_msg;
+ void *page_address;
+ RAMBlock *bs;
+ int res;
+
+ if (!migrate_background_snapshot()) {
+ return NULL;
+ }
+
+ res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
+ if (res <= 0) {
+ return NULL;
+ }
+
+ page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
+ bs = qemu_ram_block_from_host(page_address, false, offset);
+ assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0);
+ return bs;
+}
+
+/**
+ * ram_save_release_protection: release UFFD write protection after
+ * a range of pages has been saved
+ *
+ * @rs: current RAM state
+ * @pss: page-search-status structure
+ * @start_page: index of the first page in the range relative to pss->block
+ *
+ * Returns 0 on success, negative value in case of an error
+ */
+static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
+ unsigned long start_page)
+{
+ int res = 0;
+
+ /* Check if page is from UFFD-managed region. */
+ if (pss->block->flags & RAM_UF_WRITEPROTECT) {
+ void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
+ uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
+
+ /* Flush async buffers before un-protect. */
+ qemu_fflush(rs->f);
+ /* Un-protect memory range. */
+ res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
+ false, false);
+ }
+
+ return res;
+}
+
+/* ram_write_tracking_available: check if the kernel supports the required UFFD features
+ *
+ * Returns true if the required features are supported, false otherwise
+ */
+bool ram_write_tracking_available(void)
+{
+ uint64_t uffd_features;
+ int res;
+
+ res = uffd_query_features(&uffd_features);
+ return (res == 0 &&
+ (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
+}
+
+/* ram_write_tracking_compatible: check if guest configuration is
+ * compatible with 'write-tracking'
+ *
+ * Returns true if compatible, false otherwise
+ */
+bool ram_write_tracking_compatible(void)
+{
+ const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
+ int uffd_fd;
+ RAMBlock *bs;
+ bool ret = false;
+
+ /* Open UFFD file descriptor */
+ uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
+ if (uffd_fd < 0) {
+ return false;
+ }
+
+ RCU_READ_LOCK_GUARD();
+
+ RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
+ uint64_t uffd_ioctls;
+
+ /* Nothing to do with read-only and MMIO-writable regions */
+ if (bs->mr->readonly || bs->mr->rom_device) {
+ continue;
+ }
+ /* Try to register block memory via UFFD-IO to track writes */
+ if (uffd_register_memory(uffd_fd, bs->host, bs->max_length,
+ UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
+ goto out;
+ }
+ if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
+ goto out;
+ }
+ }
+ ret = true;
+
+out:
+ uffd_close_fd(uffd_fd);
+ return ret;
+}
+
+/*
+ * ram_write_tracking_start: start UFFD-WP memory tracking
+ *
+ * Returns 0 for success or negative value in case of error
+ */
+int ram_write_tracking_start(void)
+{
+ int uffd_fd;
+ RAMState *rs = ram_state;
+ RAMBlock *bs;
+
+ /* Open UFFD file descriptor */
+ uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
+ if (uffd_fd < 0) {
+ return uffd_fd;
+ }
+ rs->uffdio_fd = uffd_fd;
+
+ RCU_READ_LOCK_GUARD();
+
+ RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
+ /* Nothing to do with read-only and MMIO-writable regions */
+ if (bs->mr->readonly || bs->mr->rom_device) {
+ continue;
+ }
+
+ /* Register block memory with UFFD to track writes */
+ if (uffd_register_memory(rs->uffdio_fd, bs->host,
+ bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
+ goto fail;
+ }
+ /* Apply UFFD write protection to the block memory range */
+ if (uffd_change_protection(rs->uffdio_fd, bs->host,
+ bs->max_length, true, false)) {
+ goto fail;
+ }
+ bs->flags |= RAM_UF_WRITEPROTECT;
+ memory_region_ref(bs->mr);
+
+ trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size,
+ bs->host, bs->max_length);
+ }
+
+ return 0;
+
+fail:
+ error_report("ram_write_tracking_start() failed: restoring initial memory state");
+
+ RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
+ if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
+ continue;
+ }
+ /*
+ * If write-protecting some memory block failed, remove protection from
+ * and unregister all RAM blocks that have already been set up
+ */
+ uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
+ uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
+ /* Cleanup flags and remove reference */
+ bs->flags &= ~RAM_UF_WRITEPROTECT;
+ memory_region_unref(bs->mr);
+ }
+
+ uffd_close_fd(uffd_fd);
+ rs->uffdio_fd = -1;
+ return -1;
+}
+
+/**
+ * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
+ */
+void ram_write_tracking_stop(void)
+{
+ RAMState *rs = ram_state;
+ RAMBlock *bs;
+
+ RCU_READ_LOCK_GUARD();
+
+ RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
+ if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
+ continue;
+ }
+ /* Remove protection and unregister all affected RAM blocks */
+ uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
+ uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
+
+ trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size,
+ bs->host, bs->max_length);
+
+ /* Cleanup flags and remove reference */
+ bs->flags &= ~RAM_UF_WRITEPROTECT;
+ memory_region_unref(bs->mr);
+ }
+
+ /* Finally close UFFD file descriptor */
+ uffd_close_fd(rs->uffdio_fd);
+ rs->uffdio_fd = -1;
+}
+
+#else
+/* No target OS support, stubs just fail or ignore */
+
+static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
+{
+ (void) rs;
+ (void) offset;
+
+ return NULL;
+}
+
+static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
+ unsigned long start_page)
+{
+ (void) rs;
+ (void) pss;
+ (void) start_page;
+
+ return 0;
+}
+
+bool ram_write_tracking_available(void)
+{
+ return false;
+}
+
+bool ram_write_tracking_compatible(void)
+{
+ assert(0);
+ return false;
+}
+
+int ram_write_tracking_start(void)
+{
+ assert(0);
+ return -1;
+}
+
+void ram_write_tracking_stop(void)
+{
+ assert(0);
+}
+#endif /* defined(__linux__) */
+
/**
* get_queued_page: unqueue a page from the postcopy requests
*
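
The helpers used above (uffd_create_fd(), uffd_register_memory(), uffd_change_protection(), uffd_close_fd()) wrap the Linux userfaultfd write-protect interface, as the UFFD_FEATURE_PAGEFAULT_FLAG_WP and UFFDIO_REGISTER_MODE_WP constants suggest. As a minimal standalone sketch (not part of the patch, and not QEMU's actual wrapper code), the underlying kernel sequence is: create the descriptor, negotiate the WP feature via UFFDIO_API, register a range with UFFDIO_REGISTER_MODE_WP, and toggle protection with UFFDIO_WRITEPROTECT. It assumes a kernel with UFFD-WP support for anonymous memory (Linux 5.7 or later); everything except the kernel constants and ioctls (mapping, size, error handling) is made up for illustration.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
    size_t len = 16 * 4096;
    void *area;
    int uffd;

    /* 1. Create the userfaultfd and negotiate the write-protect feature */
    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd < 0) {
        perror("userfaultfd");
        exit(1);
    }
    struct uffdio_api api = {
        .api = UFFD_API,
        .features = UFFD_FEATURE_PAGEFAULT_FLAG_WP,
    };
    if (ioctl(uffd, UFFDIO_API, &api) < 0) {
        perror("UFFDIO_API");
        exit(1);
    }

    /* 2. Register a demo anonymous mapping in write-protect mode */
    area = mmap(NULL, len, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (area == MAP_FAILED) {
        perror("mmap");
        exit(1);
    }
    struct uffdio_register reg = {
        .range = { .start = (uintptr_t)area, .len = len },
        .mode = UFFDIO_REGISTER_MODE_WP,
    };
    if (ioctl(uffd, UFFDIO_REGISTER, &reg) < 0) {
        perror("UFFDIO_REGISTER");
        exit(1);
    }

    /* 3. Enable write protection on the whole range */
    struct uffdio_writeprotect wp = {
        .range = { .start = (uintptr_t)area, .len = len },
        .mode = UFFDIO_WRITEPROTECT_MODE_WP,
    };
    if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp) < 0) {
        perror("UFFDIO_WRITEPROTECT (protect)");
        exit(1);
    }

    /*
     * 4. A writer thread touching 'area' would now block until protection
     *    is dropped again (mode = 0), mirroring what
     *    ram_save_release_protection() does after the pages are saved.
     */
    wp.mode = 0;
    if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp) < 0) {
        perror("UFFDIO_WRITEPROTECT (unprotect)");
        exit(1);
    }

    close(uffd);
    return 0;
}
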
@@ -1473,6 +1743,14 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
} while (block && !dirty);
+ if (!block) {
+ /*
+ * Poll write faults too if background snapshot is enabled; that's
+ * when vCPUs can get blocked by write-protected pages.
+ */
+ block = poll_fault_page(rs, &offset);
+ }
+
if (block) {
/*
* As soon as we start servicing pages out of order, then we have
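
poll_fault_page() above relies on uffd_read_events() to pull one pending fault off rs->uffdio_fd. The wrapper's exact behaviour is not shown in this patch, but on the kernel side such a poll boils down to a plain read() of a struct uffd_msg from the non-blocking descriptor. The helper below is an illustrative sketch (the name read_one_write_fault is made up), not code from QEMU.

#include <stdint.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

/* Return the faulting host address, or NULL if no write fault is pending */
static void *read_one_write_fault(int uffd)
{
    struct uffd_msg msg;
    ssize_t len = read(uffd, &msg, sizeof(msg));

    if (len <= 0) {
        /* EAGAIN on a non-blocking descriptor just means nothing is queued */
        return NULL;
    }
    if (msg.event != UFFD_EVENT_PAGEFAULT ||
        !(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP)) {
        return NULL;
    }
    return (void *)(uintptr_t)msg.arg.pagefault.address;
}

The returned host address can then be mapped back to a RAMBlock with qemu_ram_block_from_host(), which is exactly what poll_fault_page() does before handing the block to get_queued_page().
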
@@ -1715,6 +1993,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
int tmppages, pages = 0;
size_t pagesize_bits =
qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
+ unsigned long start_page = pss->page;
+ int res;
if (ramblock_is_ignored(pss->block)) {
error_report("block %s should not be migrated !", pss->block->idstr);
@@ -1740,10 +2020,11 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
} while ((pss->page & (pagesize_bits - 1)) &&
offset_in_ramblock(pss->block,
((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
-
/* The offset we leave with is the last one we looked at */
pss->page--;
- return pages;
+
+ res = ram_save_release_protection(rs, pss, start_page);
+ return (res < 0 ? res : pages);
}
/**
@@ -1880,10 +2161,13 @@ static void ram_save_cleanup(void *opaque)
RAMState **rsp = opaque;
RAMBlock *block;
- /* caller have hold iothread lock or is in a bh, so there is
- * no writing race against the migration bitmap
- */
- memory_global_dirty_log_stop();
+ /* We don't use dirty log with background snapshots */
+ if (!migrate_background_snapshot()) {
+ /* caller have hold iothread lock or is in a bh, so there is
+ * no writing race against the migration bitmap
+ */
+ memory_global_dirty_log_stop();
+ }
RAMBLOCK_FOREACH_NOT_IGNORED(block) {
g_free(block->clear_bmap);
@@ -2343,8 +2627,11 @@ static void ram_init_bitmaps(RAMState *rs)
WITH_RCU_READ_LOCK_GUARD() {
ram_list_init_bitmaps();
- memory_global_dirty_log_start();
- migration_bitmap_sync_precopy(rs);
+ /* We don't use dirty log with background snapshots */
+ if (!migrate_background_snapshot()) {
+ memory_global_dirty_log_start();
+ migration_bitmap_sync_precopy(rs);
+ }
}
qemu_mutex_unlock_ramlist();
qemu_mutex_unlock_iothread();
@@ -3521,7 +3808,7 @@ static int ram_load_precopy(QEMUFile *f)
}
}
/* For postcopy we need to check hugepage sizes match */
- if (postcopy_advised &&
+ if (postcopy_advised && migrate_postcopy_ram() &&
block->page_size != qemu_host_page_size) {
uint64_t remote_page_size = qemu_get_be64(f);
if (remote_page_size != block->page_size) {