12 files changed, 647 insertions, 53 deletions
diff --git a/arch_init.c b/arch_init.c
index a8b91eed7a..ea9ddad697 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -457,15 +457,10 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
             bytes_sent = -1;
             if (is_zero_page(p)) {
                 acct_info.dup_pages++;
-                if (!ram_bulk_stage) {
-                    bytes_sent = save_block_hdr(f, block, offset, cont,
-                                                RAM_SAVE_FLAG_COMPRESS);
-                    qemu_put_byte(f, 0);
-                    bytes_sent++;
-                } else {
-                    acct_info.skipped_pages++;
-                    bytes_sent = 0;
-                }
+                bytes_sent = save_block_hdr(f, block, offset, cont,
+                                            RAM_SAVE_FLAG_COMPRESS);
+                qemu_put_byte(f, 0);
+                bytes_sent++;
             } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
                 current_addr = block->offset + offset;
                 bytes_sent = save_xbzrle_page(f, p, current_addr, block,
@@ -498,6 +493,18 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 
 static uint64_t bytes_transferred;
 
+void acct_update_position(QEMUFile *f, size_t size, bool zero)
+{
+    uint64_t pages = size / TARGET_PAGE_SIZE;
+    if (zero) {
+        acct_info.dup_pages += pages;
+    } else {
+        acct_info.norm_pages += pages;
+        bytes_transferred += size;
+        qemu_update_position(f, size);
+    }
+}
+
 static ram_addr_t ram_save_remaining(void)
 {
     return migration_dirty_pages;
@@ -808,6 +815,9 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
                     QTAILQ_FOREACH(block, &ram_list.blocks, next) {
                         if (!strncmp(id, block->idstr, sizeof(id))) {
                             if (block->length != length) {
+                                fprintf(stderr, "Length mismatch: %s: %ld "
+                                        "in != " RAM_ADDR_FMT "\n", id, length,
+                                        block->length);
                                 ret =  -EINVAL;
                                 goto done;
                             }
@@ -837,14 +847,16 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
             }
 
             ch = qemu_get_byte(f);
-            memset(host, ch, TARGET_PAGE_SIZE);
+            if (ch != 0 || !is_zero_page(host)) {
+                memset(host, ch, TARGET_PAGE_SIZE);
 #ifndef _WIN32
-            if (ch == 0 &&
-                (!kvm_enabled() || kvm_has_sync_mmu()) &&
-                getpagesize() <= TARGET_PAGE_SIZE) {
-                qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
-            }
+                if (ch == 0 &&
+                    (!kvm_enabled() || kvm_has_sync_mmu()) &&
+                    getpagesize() <= TARGET_PAGE_SIZE) {
+                    qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
+                }
 #endif
+            }
         } else if (flags & RAM_SAVE_FLAG_PAGE) {
             void *host;
 
diff --git a/docs/rdma.txt b/docs/rdma.txt
new file mode 100644
index 0000000000..45a4b1d50d
--- /dev/null
+++ b/docs/rdma.txt
@@ -0,0 +1,415 @@
+(RDMA: Remote Direct Memory Access)
+RDMA Live Migration Specification, Version # 1
+==============================================
+Wiki: http://wiki.qemu.org/Features/RDMALiveMigration
+Github: git@github.com:hinesmr/qemu.git, 'rdma' branch
+
+Copyright (C) 2013 Michael R. Hines <mrhines@us.ibm.com>
+
+An *exhaustive* paper (2010) shows additional performance details
+linked on the QEMU wiki above.
+
+Contents:
+=========
+* Introduction
+* Before running
+* Running
+* Performance
+* RDMA Migration Protocol Description
+* Versioning and Capabilities
+* QEMUFileRDMA Interface
+* Migration of pc.ram
+* Error handling
+* TODO
+
+Introduction:
+=============
+
+RDMA helps make your migration more deterministic under heavy load because
+of the significantly lower latency and higher throughput over TCP/IP. This is
+because the RDMA I/O architecture reduces the number of interrupts and
+data copies by bypassing the host networking stack. In particular, a TCP-based
+migration, under certain types of memory-bound workloads, may take a more
+unpredicatable amount of time to complete the migration if the amount of
+memory tracked during each live migration iteration round cannot keep pace
+with the rate of dirty memory produced by the workload.
+
+RDMA currently comes in two flavors: both Ethernet based (RoCE, or RDMA
+over Convered Ethernet) as well as Infiniband-based. This implementation of
+migration using RDMA is capable of using both technologies because of
+the use of the OpenFabrics OFED software stack that abstracts out the
+programming model irrespective of the underlying hardware.
+
+Refer to openfabrics.org or your respective RDMA hardware vendor for
+an understanding on how to verify that you have the OFED software stack
+installed in your environment. You should be able to successfully link
+against the "librdmacm" and "libibverbs" libraries and development headers
+for a working build of QEMU to run successfully using RDMA Migration.
+
+BEFORE RUNNING:
+===============
+
+Use of RDMA during migration requires pinning and registering memory
+with the hardware. This means that memory must be physically resident
+before the hardware can transmit that memory to another machine.
+If this is not acceptable for your application or product, then the use
+of RDMA migration may in fact be harmful to co-located VMs or other
+software on the machine if there is not sufficient memory available to
+relocate the entire footprint of the virtual machine. If so, then the
+use of RDMA is discouraged and it is recommended to use standard TCP migration.
+
+Experimental: Next, decide if you want dynamic page registration.
+For example, if you have an 8GB RAM virtual machine, but only 1GB
+is in active use, then enabling this feature will cause all 8GB to
+be pinned and resident in memory. This feature mostly affects the
+bulk-phase round of the migration and can be enabled for extremely
+high-performance RDMA hardware using the following command:
+
+QEMU Monitor Command:
+$ migrate_set_capability x-rdma-pin-all on # disabled by default
+
+Performing this action will cause all 8GB to be pinned, so if that's
+not what you want, then please ignore this step altogether.
+
+On the other hand, this will also significantly speed up the bulk round
+of the migration, which can greatly reduce the "total" time of your migration.
+Example performance of this using an idle VM in the previous example
+can be found in the "Performance" section.
+
+Note: for very large virtual machines (hundreds of GBs), pinning all
+*all* of the memory of your virtual machine in the kernel is very expensive
+may extend the initial bulk iteration time by many seconds,
+and thus extending the total migration time. However, this will not
+affect the determinism or predictability of your migration you will
+still gain from the benefits of advanced pinning with RDMA.
+
+RUNNING:
+========
+
+First, set the migration speed to match your hardware's capabilities:
+
+QEMU Monitor Command:
+$ migrate_set_speed 40g # or whatever is the MAX of your RDMA device
+
+Next, on the destination machine, add the following to the QEMU command line:
+
+qemu ..... -incoming x-rdma:host:port
+
+Finally, perform the actual migration on the source machine:
+
+QEMU Monitor Command:
+$ migrate -d x-rdma:host:port
+
+PERFORMANCE
+===========
+
+Here is a brief summary of total migration time and downtime using RDMA:
+Using a 40gbps infiniband link performing a worst-case stress test,
+using an 8GB RAM virtual machine:
+
+Using the following command:
+$ apt-get install stress
+$ stress --vm-bytes 7500M --vm 1 --vm-keep
+
+1. Migration throughput: 26 gigabits/second.
+2. Downtime (stop time) varies between 15 and 100 milliseconds.
+
+EFFECTS of memory registration on bulk phase round:
+
+For example, in the same 8GB RAM example with all 8GB of memory in
+active use and the VM itself is completely idle using the same 40 gbps
+infiniband link:
+
+1. x-rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps
+2. x-rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps
+
+These numbers would of course scale up to whatever size virtual machine
+you have to migrate using RDMA.
+
+Enabling this feature does *not* have any measurable affect on
+migration *downtime*. This is because, without this feature, all of the
+memory will have already been registered already in advance during
+the bulk round and does not need to be re-registered during the successive
+iteration rounds.
+
+RDMA Protocol Description:
+==========================
+
+Migration with RDMA is separated into two parts:
+
+1. The transmission of the pages using RDMA
+2. Everything else (a control channel is introduced)
+
+"Everything else" is transmitted using a formal
+protocol now, consisting of infiniband SEND messages.
+
+An infiniband SEND message is the standard ibverbs
+message used by applications of infiniband hardware.
+The only difference between a SEND message and an RDMA
+message is that SEND messages cause notifications
+to be posted to the completion queue (CQ) on the
+infiniband receiver side, whereas RDMA messages (used
+for pc.ram) do not (to behave like an actual DMA).
+
+Messages in infiniband require two things:
+
+1. registration of the memory that will be transmitted
+2. (SEND only) work requests to be posted on both
+   sides of the network before the actual transmission
+   can occur.
+
+RDMA messages are much easier to deal with. Once the memory
+on the receiver side is registered and pinned, we're
+basically done. All that is required is for the sender
+side to start dumping bytes onto the link.
+
+(Memory is not released from pinning until the migration
+completes, given that RDMA migrations are very fast.)
+
+SEND messages require more coordination because the
+receiver must have reserved space (using a receive
+work request) on the receive queue (RQ) before QEMUFileRDMA
+can start using them to carry all the bytes as
+a control transport for migration of device state.
+
+To begin the migration, the initial connection setup is
+as follows (migration-rdma.c):
+
+1. Receiver and Sender are started (command line or libvirt):
+2. Both sides post two RQ work requests
+3. Receiver does listen()
+4. Sender does connect()
+5. Receiver accept()
+6. Check versioning and capabilities (described later)
+
+At this point, we define a control channel on top of SEND messages
+which is described by a formal protocol. Each SEND message has a
+header portion and a data portion (but together are transmitted
+as a single SEND message).
+
+Header:
+    * Length  (of the data portion, uint32, network byte order)
+    * Type    (what command to perform, uint32, network byte order)
+    * Repeat  (Number of commands in data portion, same type only)
+
+The 'Repeat' field is here to support future multiple page registrations
+in a single message without any need to change the protocol itself
+so that the protocol is compatible against multiple versions of QEMU.
+Version #1 requires that all server implementations of the protocol must
+check this field and register all requests found in the array of commands located
+in the data portion and return an equal number of results in the response.
+The maximum number of repeats is hard-coded to 4096. This is a conservative
+limit based on the maximum size of a SEND message along with emperical
+observations on the maximum future benefit of simultaneous page registrations.
+
+The 'type' field has 10 different command values:
+    1. Unused
+    2. Error              (sent to the source during bad things)
+    3. Ready              (control-channel is available)
+    4. QEMU File          (for sending non-live device state)
+    5. RAM Blocks request (used right after connection setup)
+    6. RAM Blocks result  (used right after connection setup)
+    7. Compress page      (zap zero page and skip registration)
+    8. Register request   (dynamic chunk registration)
+    9. Register result    ('rkey' to be used by sender)
+    10. Register finished  (registration for current iteration finished)
+
+A single control message, as hinted above, can contain within the data
+portion an array of many commands of the same type. If there is more than
+one command, then the 'repeat' field will be greater than 1.
+
+After connection setup, message 5 & 6 are used to exchange ram block
+information and optionally pin all the memory if requested by the user.
+
+After ram block exchange is completed, we have two protocol-level
+functions, responsible for communicating control-channel commands
+using the above list of values:
+
+Logically:
+
+qemu_rdma_exchange_recv(header, expected command type)
+
+1. We transmit a READY command to let the sender know that
+   we are *ready* to receive some data bytes on the control channel.
+2. Before attempting to receive the expected command, we post another
+   RQ work request to replace the one we just used up.
+3. Block on a CQ event channel and wait for the SEND to arrive.
+4. When the send arrives, librdmacm will unblock us.
+5. Verify that the command-type and version received matches the one we expected.
+
+qemu_rdma_exchange_send(header, data, optional response header & data):
+
+1. Block on the CQ event channel waiting for a READY command
+   from the receiver to tell us that the receiver
+   is *ready* for us to transmit some new bytes.
+2. Optionally: if we are expecting a response from the command
+   (that we have no yet transmitted), let's post an RQ
+   work request to receive that data a few moments later.
+3. When the READY arrives, librdmacm will
+   unblock us and we immediately post a RQ work request
+   to replace the one we just used up.
+4. Now, we can actually post the work request to SEND
+   the requested command type of the header we were asked for.
+5. Optionally, if we are expecting a response (as before),
+   we block again and wait for that response using the additional
+   work request we previously posted. (This is used to carry
+   'Register result' commands #6 back to the sender which
+   hold the rkey need to perform RDMA. Note that the virtual address
+   corresponding to this rkey was already exchanged at the beginning
+   of the connection (described below).
+
+All of the remaining command types (not including 'ready')
+described above all use the aformentioned two functions to do the hard work:
+
+1. After connection setup, RAMBlock information is exchanged using
+   this protocol before the actual migration begins. This information includes
+   a description of each RAMBlock on the server side as well as the virtual addresses
+   and lengths of each RAMBlock. This is used by the client to determine the
+   start and stop locations of chunks and how to register them dynamically
+   before performing the RDMA operations.
+2. During runtime, once a 'chunk' becomes full of pages ready to
+   be sent with RDMA, the registration commands are used to ask the
+   other side to register the memory for this chunk and respond
+   with the result (rkey) of the registration.
+3. Also, the QEMUFile interfaces also call these functions (described below)
+   when transmitting non-live state, such as devices or to send
+   its own protocol information during the migration process.
+4. Finally, zero pages are only checked if a page has not yet been registered
+   using chunk registration (or not checked at all and unconditionally
+   written if chunk registration is disabled. This is accomplished using
+   the "Compress" command listed above. If the page *has* been registered
+   then we check the entire chunk for zero. Only if the entire chunk is
+   zero, then we send a compress command to zap the page on the other side.
+
+Versioning and Capabilities
+===========================
+Current version of the protocol is version #1.
+
+The same version applies to both for protocol traffic and capabilities
+negotiation. (i.e. There is only one version number that is referred to
+by all communication).
+
+librdmacm provides the user with a 'private data' area to be exchanged
+at connection-setup time before any infiniband traffic is generated.
+
+Header:
+    * Version (protocol version validated before send/recv occurs), uint32, network byte order
+    * Flags   (bitwise OR of each capability), uint32, network byte order
+
+There is no data portion of this header right now, so there is
+no length field. The maximum size of the 'private data' section
+is only 192 bytes per the Infiniband specification, so it's not
+very useful for data anyway. This structure needs to remain small.
+
+This private data area is a convenient place to check for protocol
+versioning because the user does not need to register memory to
+transmit a few bytes of version information.
+
+This is also a convenient place to negotiate capabilities
+(like dynamic page registration).
+
+If the version is invalid, we throw an error.
+
+If the version is new, we only negotiate the capabilities that the
+requested version is able to perform and ignore the rest.
+
+Currently there is only *one* capability in Version #1: dynamic page registration
+
+Finally: Negotiation happens with the Flags field: If the primary-VM
+sets a flag, but the destination does not support this capability, it
+will return a zero-bit for that flag and the primary-VM will understand
+that as not being an available capability and will thus disable that
+capability on the primary-VM side.
+
+QEMUFileRDMA Interface:
+=======================
+
+QEMUFileRDMA introduces a couple of new functions:
+
+1. qemu_rdma_get_buffer()  (QEMUFileOps rdma_read_ops)
+2. qemu_rdma_put_buffer()  (QEMUFileOps rdma_write_ops)
+
+These two functions are very short and simply use the protocol
+describe above to deliver bytes without changing the upper-level
+users of QEMUFile that depend on a bytestream abstraction.
+
+Finally, how do we handoff the actual bytes to get_buffer()?
+
+Again, because we're trying to "fake" a bytestream abstraction
+using an analogy not unlike individual UDP frames, we have
+to hold on to the bytes received from control-channel's SEND
+messages in memory.
+
+Each time we receive a complete "QEMU File" control-channel
+message, the bytes from SEND are copied into a small local holding area.
+
+Then, we return the number of bytes requested by get_buffer()
+and leave the remaining bytes in the holding area until get_buffer()
+comes around for another pass.
+
+If the buffer is empty, then we follow the same steps
+listed above and issue another "QEMU File" protocol command,
+asking for a new SEND message to re-fill the buffer.
+
+Migration of pc.ram:
+====================
+
+At the beginning of the migration, (migration-rdma.c),
+the sender and the receiver populate the list of RAMBlocks
+to be registered with each other into a structure.
+Then, using the aforementioned protocol, they exchange a
+description of these blocks with each other, to be used later
+during the iteration of main memory. This description includes
+a list of all the RAMBlocks, their offsets and lengths, virtual
+addresses and possibly includes pre-registered RDMA keys in case dynamic
+page registration was disabled on the server-side, otherwise not.
+
+Main memory is not migrated with the aforementioned protocol,
+but is instead migrated with normal RDMA Write operations.
+
+Pages are migrated in "chunks" (hard-coded to 1 Megabyte right now).
+Chunk size is not dynamic, but it could be in a future implementation.
+There's nothing to indicate that this is useful right now.
+
+When a chunk is full (or a flush() occurs), the memory backed by
+the chunk is registered with librdmacm is pinned in memory on
+both sides using the aforementioned protocol.
+After pinning, an RDMA Write is generated and transmitted
+for the entire chunk.
+
+Chunks are also transmitted in batches: This means that we
+do not request that the hardware signal the completion queue
+for the completion of *every* chunk. The current batch size
+is about 64 chunks (corresponding to 64 MB of memory).
+Only the last chunk in a batch must be signaled.
+This helps keep everything as asynchronous as possible
+and helps keep the hardware busy performing RDMA operations.
+
+Error-handling:
+===============
+
+Infiniband has what is called a "Reliable, Connected"
+link (one of 4 choices). This is the mode in which
+we use for RDMA migration.
+
+If a *single* message fails,
+the decision is to abort the migration entirely and
+cleanup all the RDMA descriptors and unregister all
+the memory.
+
+After cleanup, the Virtual Machine is returned to normal
+operation the same way that would happen if the TCP
+socket is broken during a non-RDMA based migration.
+
+TODO:
+=====
+1. 'migrate x-rdma:host:port' and '-incoming x-rdma' options will be
+   renamed to 'rdma' after the experimental phase of this work has
+   completed upstream.
+2. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits
+   are not compatible with infinband memory pinning and will result in
+   an aborted migration (but with the source VM left unaffected).
+3. Use of the recent /proc/<pid>/pagemap would likely speed up
+   the use of KSM and ballooning while using RDMA.
+4. Also, some form of balloon-device usage tracking would also
+   help alleviate some issues.
diff --git a/exec.c b/exec.c
index 0b0118bd41..2e6fc0087f 100644
--- a/exec.c
+++ b/exec.c
@@ -2630,3 +2630,12 @@ bool cpu_physical_memory_is_io(hwaddr phys_addr)
              memory_region_is_romd(mr));
 }
 #endif
+
+void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
+{
+    RAMBlock *block;
+
+    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+        func(block->host, block->offset, block->length, opaque);
+    }
+}
diff --git a/hmp.c b/hmp.c
index 494a9aa459..148a3fbbd6 100644
--- a/hmp.c
+++ b/hmp.c
@@ -169,6 +169,8 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
     if (info->has_ram) {
         monitor_printf(mon, "transferred ram: %" PRIu64 " kbytes\n",
                        info->ram->transferred >> 10);
+        monitor_printf(mon, "throughput: %0.2f mbps\n",
+                       info->ram->mbps);
         monitor_printf(mon, "remaining ram: %" PRIu64 " kbytes\n",
                        info->ram->remaining >> 10);
         monitor_printf(mon, "total ram: %" PRIu64 " kbytes\n",
diff --git a/include/block/coroutine.h b/include/block/coroutine.h
index a978162a3f..377805a3b0 100644
--- a/include/block/coroutine.h
+++ b/include/block/coroutine.h
@@ -209,4 +209,10 @@ void qemu_co_rwlock_unlock(CoRwlock *lock);
  */
 void coroutine_fn co_sleep_ns(QEMUClock *clock, int64_t ns);
 
+/**
+ * Yield until a file descriptor becomes readable
+ *
+ * Note that this function clobbers the handlers for the file descriptor.
+ */
+void coroutine_fn yield_until_fd_readable(int fd);
 #endif /* QEMU_COROUTINE_H */
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index e061e21093..92a422313f 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -113,6 +113,11 @@ void cpu_physical_memory_write_rom(hwaddr addr,
 extern struct MemoryRegion io_mem_rom;
 extern struct MemoryRegion io_mem_notdirty;
 
+typedef void (RAMBlockIterFunc)(void *host_addr,
+    ram_addr_t offset, ram_addr_t length, void *opaque);
+
+void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque);
+
 #endif
 
 #endif /* !CPU_COMMON_H */
diff --git a/include/migration/migration.h b/include/migration/migration.h
index e2acec64c0..f0640e0eec 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -21,6 +21,7 @@
 #include "qapi/error.h"
 #include "migration/vmstate.h"
 #include "qapi-types.h"
+#include "exec/cpu-common.h"
 
 struct MigrationParams {
     bool blk;
@@ -40,6 +41,7 @@ struct MigrationState
 
     int state;
     MigrationParams params;
+    double mbps;
     int64_t total_time;
     int64_t downtime;
     int64_t expected_downtime;
@@ -92,6 +94,8 @@ uint64_t ram_bytes_remaining(void);
 uint64_t ram_bytes_transferred(void);
 uint64_t ram_bytes_total(void);
 
+void acct_update_position(QEMUFile *f, size_t size, bool zero);
+
 extern SaveVMHandlers savevm_ram_handlers;
 
 uint64_t dup_mig_bytes_transferred(void);
@@ -119,6 +123,8 @@ void migrate_add_blocker(Error *reason);
  */
 void migrate_del_blocker(Error *reason);
 
+bool migrate_rdma_pin_all(void);
+
 int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
                          uint8_t *dst, int dlen);
 int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
@@ -127,4 +133,23 @@ int migrate_use_xbzrle(void);
 int64_t migrate_xbzrle_cache_size(void);
 
 int64_t xbzrle_cache_resize(int64_t new_size);
+
+void ram_control_before_iterate(QEMUFile *f, uint64_t flags);
+void ram_control_after_iterate(QEMUFile *f, uint64_t flags);
+void ram_control_load_hook(QEMUFile *f, uint64_t flags);
+
+/* Whenever this is found in the data stream, the flags
+ * will be passed to ram_control_load_hook in the incoming-migration
+ * side. This lets before_ram_iterate/after_ram_iterate add
+ * transport-specific sections to the RAM migration data.
+ */
+#define RAM_SAVE_FLAG_HOOK     0x80
+
+#define RAM_SAVE_CONTROL_NOT_SUPP -1000
+#define RAM_SAVE_CONTROL_DELAYED  -2000
+
+size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
+                             ram_addr_t offset, size_t size,
+                             int *bytes_sent);
+
 #endif
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 7519464192..0f757fbeb6 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -23,6 +23,7 @@
  */
 #ifndef QEMU_FILE_H
 #define QEMU_FILE_H 1
+#include "exec/cpu-common.h"
 
 /* This function writes a chunk of data to a file at the given position.
  * The pos argument can be ignored if the file is only being used for
@@ -57,12 +58,40 @@ typedef int (QEMUFileGetFD)(void *opaque);
 typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov,
                                            int iovcnt, int64_t pos);
 
+/*
+ * This function provides hooks around different
+ * stages of RAM migration.
+ */
+typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags);
+
+/*
+ * Constants used by ram_control_* hooks
+ */
+#define RAM_CONTROL_SETUP    0
+#define RAM_CONTROL_ROUND    1
+#define RAM_CONTROL_HOOK     2
+#define RAM_CONTROL_FINISH   3
+
+/*
+ * This function allows override of where the RAM page
+ * is saved (such as RDMA, for example.)
+ */
+typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque,
+                               ram_addr_t block_offset,
+                               ram_addr_t offset,
+                               size_t size,
+                               int *bytes_sent);
+
 typedef struct QEMUFileOps {
     QEMUFilePutBufferFunc *put_buffer;
     QEMUFileGetBufferFunc *get_buffer;
     QEMUFileCloseFunc *close;
     QEMUFileGetFD *get_fd;
     QEMUFileWritevBufferFunc *writev_buffer;
+    QEMURamHookFunc *before_ram_iterate;
+    QEMURamHookFunc *after_ram_iterate;
+    QEMURamHookFunc *hook_ram_load;
+    QEMURamSaveFunc *save_page;
 } QEMUFileOps;
 
 QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops);
@@ -80,6 +109,7 @@ void qemu_put_byte(QEMUFile *f, int v);
  * The buffer should be available till it is sent asynchronously.
  */
 void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, int size);
+bool qemu_file_mode_is_not_valid(const char *mode);
 
 static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v)
 {
@@ -93,6 +123,7 @@ void qemu_put_be32(QEMUFile *f, unsigned int v);
 void qemu_put_be64(QEMUFile *f, uint64_t v);
 int qemu_get_buffer(QEMUFile *f, uint8_t *buf, int size);
 int qemu_get_byte(QEMUFile *f);
+void qemu_update_position(QEMUFile *f, size_t size);
 
 static inline unsigned int qemu_get_ubyte(QEMUFile *f)
 {
@@ -110,6 +141,7 @@ void qemu_file_reset_rate_limit(QEMUFile *f);
 void qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate);
 int64_t qemu_file_get_rate_limit(QEMUFile *f);
 int qemu_file_get_error(QEMUFile *f);
+void qemu_fflush(QEMUFile *f);
 
 static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
 {
diff --git a/migration.c b/migration.c
index 058f9e69f4..a704d48669 100644
--- a/migration.c
+++ b/migration.c
@@ -66,6 +66,7 @@ MigrationState *migrate_get_current(void)
         .state = MIG_STATE_SETUP,
         .bandwidth_limit = MAX_THROTTLE,
         .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE,
+        .mbps = -1,
     };
 
     return &current_migration;
@@ -201,6 +202,7 @@ MigrationInfo *qmp_query_migrate(Error **errp)
         info->ram->normal = norm_mig_pages_transferred();
         info->ram->normal_bytes = norm_mig_bytes_transferred();
         info->ram->dirty_pages_rate = s->dirty_pages_rate;
+        info->ram->mbps = s->mbps;
 
         if (blk_mig_active()) {
             info->has_disk = true;
@@ -230,6 +232,7 @@ MigrationInfo *qmp_query_migrate(Error **errp)
         info->ram->skipped = skipped_mig_pages_transferred();
         info->ram->normal = norm_mig_pages_transferred();
         info->ram->normal_bytes = norm_mig_bytes_transferred();
+        info->ram->mbps = s->mbps;
         break;
     case MIG_STATE_ERROR:
         info->has_status = true;
@@ -473,6 +476,15 @@ void qmp_migrate_set_downtime(double value, Error **errp)
     max_downtime = (uint64_t)value;
 }
 
+bool migrate_rdma_pin_all(void)
+{
+    MigrationState *s;
+
+    s = migrate_get_current();
+
+    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL];
+}
+
 int migrate_use_xbzrle(void)
 {
     MigrationState *s;
@@ -543,6 +555,9 @@ static void *migration_thread(void *opaque)
             double bandwidth = transferred_bytes / time_spent;
             max_size = bandwidth * migrate_max_downtime() / 1000000;
 
+            s->mbps = time_spent ? (((double) transferred_bytes * 8.0) /
+                    ((double) time_spent / 1000.0)) / 1000.0 / 1000.0 : -1;
+
             DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64
                     " bandwidth %g max_size %" PRId64 "\n",
                     transferred_bytes, time_spent, bandwidth, max_size);
diff --git a/qapi-schema.json b/qapi-schema.json
index 6cc07c20ce..a30a728dde 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -513,12 +513,15 @@
 # @dirty-pages-rate: number of pages dirtied by second by the
 #        guest (since 1.3)
 #
+# @mbps: throughput in megabits/sec. (since 1.6)
+#
 # Since: 0.14.0
 ##
 { 'type': 'MigrationStats',
   'data': {'transferred': 'int', 'remaining': 'int', 'total': 'int' ,
            'duplicate': 'int', 'skipped': 'int', 'normal': 'int',
-           'normal-bytes': 'int', 'dirty-pages-rate' : 'int' } }
+           'normal-bytes': 'int', 'dirty-pages-rate' : 'int',
+           'mbps' : 'number' } }
 
 ##
 # @XBZRLECacheStats
@@ -605,10 +608,15 @@
 #          This feature allows us to minimize migration traffic for certain work
 #          loads, by sending compressed difference of the pages
 #
+# @x-rdma-pin-all: Controls whether or not the entire VM memory footprint is
+#          mlock()'d on demand or all at once. Refer to docs/rdma.txt for usage.
+#          Disabled by default. Experimental: may (or may not) be renamed after
+#          further testing is complete. (since 1.6)
+#
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
-  'data': ['xbzrle'] }
+  'data': ['xbzrle', 'x-rdma-pin-all'] }
 
 ##
 # @MigrationCapabilityStatus
diff --git a/qemu-coroutine-io.c b/qemu-coroutine-io.c
index e8ad1a4011..c4df35a640 100644
--- a/qemu-coroutine-io.c
+++ b/qemu-coroutine-io.c
@@ -63,3 +63,26 @@ qemu_co_send_recv(int sockfd, void *buf, size_t bytes, bool do_send)
     struct iovec iov = { .iov_base = buf, .iov_len = bytes };
     return qemu_co_sendv_recvv(sockfd, &iov, 1, 0, bytes, do_send);
 }
+
+typedef struct {
+    Coroutine *co;
+    int fd;
+} FDYieldUntilData;
+
+static void fd_coroutine_enter(void *opaque)
+{
+    FDYieldUntilData *data = opaque;
+    qemu_set_fd_handler(data->fd, NULL, NULL, NULL);
+    qemu_coroutine_enter(data->co, NULL);
+}
+
+void coroutine_fn yield_until_fd_readable(int fd)
+{
+    FDYieldUntilData data;
+
+    assert(qemu_in_coroutine());
+    data.co = qemu_coroutine_self();
+    data.fd = fd;
+    qemu_set_fd_handler(fd, fd_coroutine_enter, NULL, &data);
+    qemu_coroutine_yield();
+}
diff --git a/savevm.c b/savevm.c
index 48cc2a995f..e0491e7580 100644
--- a/savevm.c
+++ b/savevm.c
@@ -149,34 +149,6 @@ typedef struct QEMUFileSocket
     QEMUFile *file;
 } QEMUFileSocket;
 
-typedef struct {
-    Coroutine *co;
-    int fd;
-} FDYieldUntilData;
-
-static void fd_coroutine_enter(void *opaque)
-{
-    FDYieldUntilData *data = opaque;
-    qemu_set_fd_handler(data->fd, NULL, NULL, NULL);
-    qemu_coroutine_enter(data->co, NULL);
-}
-
-/**
- * Yield until a file descriptor becomes readable
- *
- * Note that this function clobbers the handlers for the file descriptor.
- */
-static void coroutine_fn yield_until_fd_readable(int fd)
-{
-    FDYieldUntilData data;
-
-    assert(qemu_in_coroutine());
-    data.co = qemu_coroutine_self();
-    data.fd = fd;
-    qemu_set_fd_handler(fd, fd_coroutine_enter, NULL, &data);
-    qemu_coroutine_yield();
-}
-
 static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
                                     int64_t pos)
 {
@@ -477,14 +449,23 @@ static const QEMUFileOps socket_write_ops = {
     .close =      socket_close
 };
 
-QEMUFile *qemu_fopen_socket(int fd, const char *mode)
+bool qemu_file_mode_is_not_valid(const char *mode)
 {
-    QEMUFileSocket *s;
-
     if (mode == NULL ||
         (mode[0] != 'r' && mode[0] != 'w') ||
         mode[1] != 'b' || mode[2] != 0) {
         fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
+        return true;
+    }
+
+    return false;
+}
+
+QEMUFile *qemu_fopen_socket(int fd, const char *mode)
+{
+    QEMUFileSocket *s;
+
+    if (qemu_file_mode_is_not_valid(mode)) {
         return NULL;
     }
 
@@ -503,10 +484,7 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode)
 {
     QEMUFileStdio *s;
 
-    if (mode == NULL ||
-	(mode[0] != 'r' && mode[0] != 'w') ||
-	mode[1] != 'b' || mode[2] != 0) {
-        fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
+    if (qemu_file_mode_is_not_valid(mode)) {
         return NULL;
     }
 
@@ -611,7 +589,7 @@ static inline bool qemu_file_is_writable(QEMUFile *f)
  * If there is writev_buffer QEMUFileOps it uses it otherwise uses
  * put_buffer ops.
  */
-static void qemu_fflush(QEMUFile *f)
+void qemu_fflush(QEMUFile *f)
 {
     ssize_t ret = 0;
 
@@ -638,6 +616,65 @@ static void qemu_fflush(QEMUFile *f)
     }
 }
 
+void ram_control_before_iterate(QEMUFile *f, uint64_t flags)
+{
+    int ret = 0;
+
+    if (f->ops->before_ram_iterate) {
+        ret = f->ops->before_ram_iterate(f, f->opaque, flags);
+        if (ret < 0) {
+            qemu_file_set_error(f, ret);
+        }
+    }
+}
+
+void ram_control_after_iterate(QEMUFile *f, uint64_t flags)
+{
+    int ret = 0;
+
+    if (f->ops->after_ram_iterate) {
+        ret = f->ops->after_ram_iterate(f, f->opaque, flags);
+        if (ret < 0) {
+            qemu_file_set_error(f, ret);
+        }
+    }
+}
+
+void ram_control_load_hook(QEMUFile *f, uint64_t flags)
+{
+    int ret = 0;
+
+    if (f->ops->hook_ram_load) {
+        ret = f->ops->hook_ram_load(f, f->opaque, flags);
+        if (ret < 0) {
+            qemu_file_set_error(f, ret);
+        }
+    } else {
+        qemu_file_set_error(f, ret);
+    }
+}
+
+size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
+                         ram_addr_t offset, size_t size, int *bytes_sent)
+{
+    if (f->ops->save_page) {
+        int ret = f->ops->save_page(f, f->opaque, block_offset,
+                                    offset, size, bytes_sent);
+
+        if (ret != RAM_SAVE_CONTROL_DELAYED) {
+            if (*bytes_sent > 0) {
+                qemu_update_position(f, *bytes_sent);
+            } else if (ret < 0) {
+                qemu_file_set_error(f, ret);
+            }
+        }
+
+        return ret;
+    }
+
+    return RAM_SAVE_CONTROL_NOT_SUPP;
+}
+
 static void qemu_fill_buffer(QEMUFile *f)
 {
     int len;
@@ -671,6 +708,11 @@ int qemu_get_fd(QEMUFile *f)
     return -1;
 }
 
+void qemu_update_position(QEMUFile *f, size_t size)
+{
+    f->pos += size;
+}
+
 /** Closes the file
  *
  * Returns negative error value if any error happened on previous operations or