From 87d2f8254347879f80b3a834834a6b934dea5ecc Mon Sep 17 00:00:00 2001 From: Alon Levy Date: Sun, 12 May 2013 14:16:28 +0300 Subject: arch_init/ram_load: add error message for block length mismatch Makes it easier to debug situations where the source and target have different ram blocks in a device and migration fails due to that, for instance a BAR size change on a PCI device. Signed-off-by: Alon Levy Signed-off-by: Juan Quintela --- arch_init.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch_init.c b/arch_init.c index a8b91eed7a..64421e1619 100644 --- a/arch_init.c +++ b/arch_init.c @@ -808,6 +808,9 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) QTAILQ_FOREACH(block, &ram_list.blocks, next) { if (!strncmp(id, block->idstr, sizeof(id))) { if (block->length != length) { + fprintf(stderr, "Length mismatch: %s: %ld " + "in != " RAM_ADDR_FMT "\n", id, length, + block->length); ret = -EINVAL; goto done; } -- cgit v1.2.3 From 9ef051e5536b6368a1076046ec6c4ec4ac12b5c6 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 10 Jun 2013 12:14:19 +0200 Subject: Revert "migration: do not sent zero pages in bulk stage" Not sending zero pages breaks migration if a page is zero at the source but not at the destination. This can e.g. happen if different BIOS versions are used at source and destination. It has also been reported that migration on pseries is completely broken with this patch. This effectively reverts commit f1c72795af573b24a7da5eb52375c9aba8a37972. Conflicts: arch_init.c Signed-off-by: Peter Lieven Signed-off-by: Juan Quintela --- arch_init.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch_init.c b/arch_init.c index 64421e1619..8bb933fddb 100644 --- a/arch_init.c +++ b/arch_init.c @@ -457,15 +457,10 @@ static int ram_save_block(QEMUFile *f, bool last_stage) bytes_sent = -1; if (is_zero_page(p)) { acct_info.dup_pages++; - if (!ram_bulk_stage) { - bytes_sent = save_block_hdr(f, block, offset, cont, - RAM_SAVE_FLAG_COMPRESS); - qemu_put_byte(f, 0); - bytes_sent++; - } else { - acct_info.skipped_pages++; - bytes_sent = 0; - } + bytes_sent = save_block_hdr(f, block, offset, cont, + RAM_SAVE_FLAG_COMPRESS); + qemu_put_byte(f, 0); + bytes_sent++; } else if (!ram_bulk_stage && migrate_use_xbzrle()) { current_addr = block->offset + offset; bytes_sent = save_xbzrle_page(f, p, current_addr, block, -- cgit v1.2.3 From 211ea74022f51164a7729030b28eec90b6c99a08 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 10 Jun 2013 12:14:20 +0200 Subject: migration: do not overwrite zero pages on incoming migration do not memset pages to zero if they already read as zero. this will allocate a new zero page and consume memory unnecessarily. even if we madvise a MADV_DONTNEED later this will only deallocate the memory asynchronously. Signed-off-by: Peter Lieven Signed-off-by: Juan Quintela --- arch_init.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch_init.c b/arch_init.c index 8bb933fddb..4dd17f4ff3 100644 --- a/arch_init.c +++ b/arch_init.c @@ -835,14 +835,16 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) } ch = qemu_get_byte(f); - memset(host, ch, TARGET_PAGE_SIZE); + if (ch != 0 || !is_zero_page(host)) { + memset(host, ch, TARGET_PAGE_SIZE); #ifndef _WIN32 - if (ch == 0 && - (!kvm_enabled() || kvm_has_sync_mmu()) && - getpagesize() <= TARGET_PAGE_SIZE) { - qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED); - } + if (ch == 0 && + (!kvm_enabled() || kvm_has_sync_mmu()) && + getpagesize() <= TARGET_PAGE_SIZE) { + qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED); + } #endif + } } else if (flags & RAM_SAVE_FLAG_PAGE) { void *host; -- cgit v1.2.3 From f4abc9d621823b14a6cd508c66c1ecb21f96349e Mon Sep 17 00:00:00 2001 From: "Michael R. Hines" Date: Tue, 25 Jun 2013 21:35:27 -0400 Subject: rdma: add documentation docs/rdma.txt contains full documentation, wiki links, github url and contact information. Reviewed-by: Juan Quintela Reviewed-by: Paolo Bonzini Reviewed-by: Chegu Vinod Tested-by: Chegu Vinod Tested-by: Michael R. Hines Signed-off-by: Michael R. Hines Signed-off-by: Juan Quintela --- docs/rdma.txt | 415 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 415 insertions(+) create mode 100644 docs/rdma.txt diff --git a/docs/rdma.txt b/docs/rdma.txt new file mode 100644 index 0000000000..45a4b1d50d --- /dev/null +++ b/docs/rdma.txt @@ -0,0 +1,415 @@ +(RDMA: Remote Direct Memory Access) +RDMA Live Migration Specification, Version # 1 +============================================== +Wiki: http://wiki.qemu.org/Features/RDMALiveMigration +Github: git@github.com:hinesmr/qemu.git, 'rdma' branch + +Copyright (C) 2013 Michael R. Hines + +An *exhaustive* paper (2010) shows additional performance details +linked on the QEMU wiki above. + +Contents: +========= +* Introduction +* Before running +* Running +* Performance +* RDMA Migration Protocol Description +* Versioning and Capabilities +* QEMUFileRDMA Interface +* Migration of pc.ram +* Error handling +* TODO + +Introduction: +============= + +RDMA helps make your migration more deterministic under heavy load because +of the significantly lower latency and higher throughput over TCP/IP. This is +because the RDMA I/O architecture reduces the number of interrupts and +data copies by bypassing the host networking stack. In particular, a TCP-based +migration, under certain types of memory-bound workloads, may take a more +unpredicatable amount of time to complete the migration if the amount of +memory tracked during each live migration iteration round cannot keep pace +with the rate of dirty memory produced by the workload. + +RDMA currently comes in two flavors: both Ethernet based (RoCE, or RDMA +over Convered Ethernet) as well as Infiniband-based. This implementation of +migration using RDMA is capable of using both technologies because of +the use of the OpenFabrics OFED software stack that abstracts out the +programming model irrespective of the underlying hardware. + +Refer to openfabrics.org or your respective RDMA hardware vendor for +an understanding on how to verify that you have the OFED software stack +installed in your environment. You should be able to successfully link +against the "librdmacm" and "libibverbs" libraries and development headers +for a working build of QEMU to run successfully using RDMA Migration. + +BEFORE RUNNING: +=============== + +Use of RDMA during migration requires pinning and registering memory +with the hardware. This means that memory must be physically resident +before the hardware can transmit that memory to another machine. +If this is not acceptable for your application or product, then the use +of RDMA migration may in fact be harmful to co-located VMs or other +software on the machine if there is not sufficient memory available to +relocate the entire footprint of the virtual machine. If so, then the +use of RDMA is discouraged and it is recommended to use standard TCP migration. + +Experimental: Next, decide if you want dynamic page registration. +For example, if you have an 8GB RAM virtual machine, but only 1GB +is in active use, then enabling this feature will cause all 8GB to +be pinned and resident in memory. This feature mostly affects the +bulk-phase round of the migration and can be enabled for extremely +high-performance RDMA hardware using the following command: + +QEMU Monitor Command: +$ migrate_set_capability x-rdma-pin-all on # disabled by default + +Performing this action will cause all 8GB to be pinned, so if that's +not what you want, then please ignore this step altogether. + +On the other hand, this will also significantly speed up the bulk round +of the migration, which can greatly reduce the "total" time of your migration. +Example performance of this using an idle VM in the previous example +can be found in the "Performance" section. + +Note: for very large virtual machines (hundreds of GBs), pinning all +*all* of the memory of your virtual machine in the kernel is very expensive +may extend the initial bulk iteration time by many seconds, +and thus extending the total migration time. However, this will not +affect the determinism or predictability of your migration you will +still gain from the benefits of advanced pinning with RDMA. + +RUNNING: +======== + +First, set the migration speed to match your hardware's capabilities: + +QEMU Monitor Command: +$ migrate_set_speed 40g # or whatever is the MAX of your RDMA device + +Next, on the destination machine, add the following to the QEMU command line: + +qemu ..... -incoming x-rdma:host:port + +Finally, perform the actual migration on the source machine: + +QEMU Monitor Command: +$ migrate -d x-rdma:host:port + +PERFORMANCE +=========== + +Here is a brief summary of total migration time and downtime using RDMA: +Using a 40gbps infiniband link performing a worst-case stress test, +using an 8GB RAM virtual machine: + +Using the following command: +$ apt-get install stress +$ stress --vm-bytes 7500M --vm 1 --vm-keep + +1. Migration throughput: 26 gigabits/second. +2. Downtime (stop time) varies between 15 and 100 milliseconds. + +EFFECTS of memory registration on bulk phase round: + +For example, in the same 8GB RAM example with all 8GB of memory in +active use and the VM itself is completely idle using the same 40 gbps +infiniband link: + +1. x-rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps +2. x-rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps + +These numbers would of course scale up to whatever size virtual machine +you have to migrate using RDMA. + +Enabling this feature does *not* have any measurable affect on +migration *downtime*. This is because, without this feature, all of the +memory will have already been registered already in advance during +the bulk round and does not need to be re-registered during the successive +iteration rounds. + +RDMA Protocol Description: +========================== + +Migration with RDMA is separated into two parts: + +1. The transmission of the pages using RDMA +2. Everything else (a control channel is introduced) + +"Everything else" is transmitted using a formal +protocol now, consisting of infiniband SEND messages. + +An infiniband SEND message is the standard ibverbs +message used by applications of infiniband hardware. +The only difference between a SEND message and an RDMA +message is that SEND messages cause notifications +to be posted to the completion queue (CQ) on the +infiniband receiver side, whereas RDMA messages (used +for pc.ram) do not (to behave like an actual DMA). + +Messages in infiniband require two things: + +1. registration of the memory that will be transmitted +2. (SEND only) work requests to be posted on both + sides of the network before the actual transmission + can occur. + +RDMA messages are much easier to deal with. Once the memory +on the receiver side is registered and pinned, we're +basically done. All that is required is for the sender +side to start dumping bytes onto the link. + +(Memory is not released from pinning until the migration +completes, given that RDMA migrations are very fast.) + +SEND messages require more coordination because the +receiver must have reserved space (using a receive +work request) on the receive queue (RQ) before QEMUFileRDMA +can start using them to carry all the bytes as +a control transport for migration of device state. + +To begin the migration, the initial connection setup is +as follows (migration-rdma.c): + +1. Receiver and Sender are started (command line or libvirt): +2. Both sides post two RQ work requests +3. Receiver does listen() +4. Sender does connect() +5. Receiver accept() +6. Check versioning and capabilities (described later) + +At this point, we define a control channel on top of SEND messages +which is described by a formal protocol. Each SEND message has a +header portion and a data portion (but together are transmitted +as a single SEND message). + +Header: + * Length (of the data portion, uint32, network byte order) + * Type (what command to perform, uint32, network byte order) + * Repeat (Number of commands in data portion, same type only) + +The 'Repeat' field is here to support future multiple page registrations +in a single message without any need to change the protocol itself +so that the protocol is compatible against multiple versions of QEMU. +Version #1 requires that all server implementations of the protocol must +check this field and register all requests found in the array of commands located +in the data portion and return an equal number of results in the response. +The maximum number of repeats is hard-coded to 4096. This is a conservative +limit based on the maximum size of a SEND message along with emperical +observations on the maximum future benefit of simultaneous page registrations. + +The 'type' field has 10 different command values: + 1. Unused + 2. Error (sent to the source during bad things) + 3. Ready (control-channel is available) + 4. QEMU File (for sending non-live device state) + 5. RAM Blocks request (used right after connection setup) + 6. RAM Blocks result (used right after connection setup) + 7. Compress page (zap zero page and skip registration) + 8. Register request (dynamic chunk registration) + 9. Register result ('rkey' to be used by sender) + 10. Register finished (registration for current iteration finished) + +A single control message, as hinted above, can contain within the data +portion an array of many commands of the same type. If there is more than +one command, then the 'repeat' field will be greater than 1. + +After connection setup, message 5 & 6 are used to exchange ram block +information and optionally pin all the memory if requested by the user. + +After ram block exchange is completed, we have two protocol-level +functions, responsible for communicating control-channel commands +using the above list of values: + +Logically: + +qemu_rdma_exchange_recv(header, expected command type) + +1. We transmit a READY command to let the sender know that + we are *ready* to receive some data bytes on the control channel. +2. Before attempting to receive the expected command, we post another + RQ work request to replace the one we just used up. +3. Block on a CQ event channel and wait for the SEND to arrive. +4. When the send arrives, librdmacm will unblock us. +5. Verify that the command-type and version received matches the one we expected. + +qemu_rdma_exchange_send(header, data, optional response header & data): + +1. Block on the CQ event channel waiting for a READY command + from the receiver to tell us that the receiver + is *ready* for us to transmit some new bytes. +2. Optionally: if we are expecting a response from the command + (that we have no yet transmitted), let's post an RQ + work request to receive that data a few moments later. +3. When the READY arrives, librdmacm will + unblock us and we immediately post a RQ work request + to replace the one we just used up. +4. Now, we can actually post the work request to SEND + the requested command type of the header we were asked for. +5. Optionally, if we are expecting a response (as before), + we block again and wait for that response using the additional + work request we previously posted. (This is used to carry + 'Register result' commands #6 back to the sender which + hold the rkey need to perform RDMA. Note that the virtual address + corresponding to this rkey was already exchanged at the beginning + of the connection (described below). + +All of the remaining command types (not including 'ready') +described above all use the aformentioned two functions to do the hard work: + +1. After connection setup, RAMBlock information is exchanged using + this protocol before the actual migration begins. This information includes + a description of each RAMBlock on the server side as well as the virtual addresses + and lengths of each RAMBlock. This is used by the client to determine the + start and stop locations of chunks and how to register them dynamically + before performing the RDMA operations. +2. During runtime, once a 'chunk' becomes full of pages ready to + be sent with RDMA, the registration commands are used to ask the + other side to register the memory for this chunk and respond + with the result (rkey) of the registration. +3. Also, the QEMUFile interfaces also call these functions (described below) + when transmitting non-live state, such as devices or to send + its own protocol information during the migration process. +4. Finally, zero pages are only checked if a page has not yet been registered + using chunk registration (or not checked at all and unconditionally + written if chunk registration is disabled. This is accomplished using + the "Compress" command listed above. If the page *has* been registered + then we check the entire chunk for zero. Only if the entire chunk is + zero, then we send a compress command to zap the page on the other side. + +Versioning and Capabilities +=========================== +Current version of the protocol is version #1. + +The same version applies to both for protocol traffic and capabilities +negotiation. (i.e. There is only one version number that is referred to +by all communication). + +librdmacm provides the user with a 'private data' area to be exchanged +at connection-setup time before any infiniband traffic is generated. + +Header: + * Version (protocol version validated before send/recv occurs), uint32, network byte order + * Flags (bitwise OR of each capability), uint32, network byte order + +There is no data portion of this header right now, so there is +no length field. The maximum size of the 'private data' section +is only 192 bytes per the Infiniband specification, so it's not +very useful for data anyway. This structure needs to remain small. + +This private data area is a convenient place to check for protocol +versioning because the user does not need to register memory to +transmit a few bytes of version information. + +This is also a convenient place to negotiate capabilities +(like dynamic page registration). + +If the version is invalid, we throw an error. + +If the version is new, we only negotiate the capabilities that the +requested version is able to perform and ignore the rest. + +Currently there is only *one* capability in Version #1: dynamic page registration + +Finally: Negotiation happens with the Flags field: If the primary-VM +sets a flag, but the destination does not support this capability, it +will return a zero-bit for that flag and the primary-VM will understand +that as not being an available capability and will thus disable that +capability on the primary-VM side. + +QEMUFileRDMA Interface: +======================= + +QEMUFileRDMA introduces a couple of new functions: + +1. qemu_rdma_get_buffer() (QEMUFileOps rdma_read_ops) +2. qemu_rdma_put_buffer() (QEMUFileOps rdma_write_ops) + +These two functions are very short and simply use the protocol +describe above to deliver bytes without changing the upper-level +users of QEMUFile that depend on a bytestream abstraction. + +Finally, how do we handoff the actual bytes to get_buffer()? + +Again, because we're trying to "fake" a bytestream abstraction +using an analogy not unlike individual UDP frames, we have +to hold on to the bytes received from control-channel's SEND +messages in memory. + +Each time we receive a complete "QEMU File" control-channel +message, the bytes from SEND are copied into a small local holding area. + +Then, we return the number of bytes requested by get_buffer() +and leave the remaining bytes in the holding area until get_buffer() +comes around for another pass. + +If the buffer is empty, then we follow the same steps +listed above and issue another "QEMU File" protocol command, +asking for a new SEND message to re-fill the buffer. + +Migration of pc.ram: +==================== + +At the beginning of the migration, (migration-rdma.c), +the sender and the receiver populate the list of RAMBlocks +to be registered with each other into a structure. +Then, using the aforementioned protocol, they exchange a +description of these blocks with each other, to be used later +during the iteration of main memory. This description includes +a list of all the RAMBlocks, their offsets and lengths, virtual +addresses and possibly includes pre-registered RDMA keys in case dynamic +page registration was disabled on the server-side, otherwise not. + +Main memory is not migrated with the aforementioned protocol, +but is instead migrated with normal RDMA Write operations. + +Pages are migrated in "chunks" (hard-coded to 1 Megabyte right now). +Chunk size is not dynamic, but it could be in a future implementation. +There's nothing to indicate that this is useful right now. + +When a chunk is full (or a flush() occurs), the memory backed by +the chunk is registered with librdmacm is pinned in memory on +both sides using the aforementioned protocol. +After pinning, an RDMA Write is generated and transmitted +for the entire chunk. + +Chunks are also transmitted in batches: This means that we +do not request that the hardware signal the completion queue +for the completion of *every* chunk. The current batch size +is about 64 chunks (corresponding to 64 MB of memory). +Only the last chunk in a batch must be signaled. +This helps keep everything as asynchronous as possible +and helps keep the hardware busy performing RDMA operations. + +Error-handling: +=============== + +Infiniband has what is called a "Reliable, Connected" +link (one of 4 choices). This is the mode in which +we use for RDMA migration. + +If a *single* message fails, +the decision is to abort the migration entirely and +cleanup all the RDMA descriptors and unregister all +the memory. + +After cleanup, the Virtual Machine is returned to normal +operation the same way that would happen if the TCP +socket is broken during a non-RDMA based migration. + +TODO: +===== +1. 'migrate x-rdma:host:port' and '-incoming x-rdma' options will be + renamed to 'rdma' after the experimental phase of this work has + completed upstream. +2. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits + are not compatible with infinband memory pinning and will result in + an aborted migration (but with the source VM left unaffected). +3. Use of the recent /proc//pagemap would likely speed up + the use of KSM and ballooning while using RDMA. +4. Also, some form of balloon-device usage tracking would also + help alleviate some issues. -- cgit v1.2.3 From 2b0ce0797d6bfb13ebefe010da86abced0b7a9b3 Mon Sep 17 00:00:00 2001 From: "Michael R. Hines" Date: Tue, 25 Jun 2013 21:35:28 -0400 Subject: rdma: introduce qemu_update_position() RDMA writes happen asynchronously, and thus the performance accounting also needs to be able to occur asynchronously. This allows anybody to call into savevm.c to update both f->pos as well as into arch_init.c to update the acct_info structure with up-to-date values when the RDMA transfer actually completes. Reviewed-by: Juan Quintela Tested-by: Chegu Vinod Tested-by: Michael R. Hines Signed-off-by: Michael R. Hines Signed-off-by: Juan Quintela --- arch_init.c | 12 ++++++++++++ include/migration/migration.h | 2 ++ include/migration/qemu-file.h | 1 + savevm.c | 5 +++++ 4 files changed, 20 insertions(+) diff --git a/arch_init.c b/arch_init.c index 4dd17f4ff3..ea9ddad697 100644 --- a/arch_init.c +++ b/arch_init.c @@ -493,6 +493,18 @@ static int ram_save_block(QEMUFile *f, bool last_stage) static uint64_t bytes_transferred; +void acct_update_position(QEMUFile *f, size_t size, bool zero) +{ + uint64_t pages = size / TARGET_PAGE_SIZE; + if (zero) { + acct_info.dup_pages += pages; + } else { + acct_info.norm_pages += pages; + bytes_transferred += size; + qemu_update_position(f, size); + } +} + static ram_addr_t ram_save_remaining(void) { return migration_dirty_pages; diff --git a/include/migration/migration.h b/include/migration/migration.h index e2acec64c0..0be28a288a 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -92,6 +92,8 @@ uint64_t ram_bytes_remaining(void); uint64_t ram_bytes_transferred(void); uint64_t ram_bytes_total(void); +void acct_update_position(QEMUFile *f, size_t size, bool zero); + extern SaveVMHandlers savevm_ram_handlers; uint64_t dup_mig_bytes_transferred(void); diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 7519464192..8fab0dd752 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -93,6 +93,7 @@ void qemu_put_be32(QEMUFile *f, unsigned int v); void qemu_put_be64(QEMUFile *f, uint64_t v); int qemu_get_buffer(QEMUFile *f, uint8_t *buf, int size); int qemu_get_byte(QEMUFile *f); +void qemu_update_position(QEMUFile *f, size_t size); static inline unsigned int qemu_get_ubyte(QEMUFile *f) { diff --git a/savevm.c b/savevm.c index 48cc2a995f..9b5577ef6c 100644 --- a/savevm.c +++ b/savevm.c @@ -671,6 +671,11 @@ int qemu_get_fd(QEMUFile *f) return -1; } +void qemu_update_position(QEMUFile *f, size_t size) +{ + f->pos += size; +} + /** Closes the file * * Returns negative error value if any error happened on previous operations or -- cgit v1.2.3 From 9f05d0c3a4f9e8fcb13ed09cc350af45a627809a Mon Sep 17 00:00:00 2001 From: "Michael R. Hines" Date: Tue, 25 Jun 2013 21:35:29 -0400 Subject: rdma: export yield_until_fd_readable() The RDMA event channel can be made non-blocking just like a TCP socket. Exporting this function allows us to yield so that the QEMU monitor remains available. Reviewed-by: Juan Quintela Reviewed-by: Paolo Bonzini Reviewed-by: Chegu Vinod Tested-by: Chegu Vinod Tested-by: Michael R. Hines Signed-off-by: Michael R. Hines Signed-off-by: Juan Quintela --- include/block/coroutine.h | 6 ++++++ qemu-coroutine-io.c | 23 +++++++++++++++++++++++ savevm.c | 28 ---------------------------- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/include/block/coroutine.h b/include/block/coroutine.h index a978162a3f..377805a3b0 100644 --- a/include/block/coroutine.h +++ b/include/block/coroutine.h @@ -209,4 +209,10 @@ void qemu_co_rwlock_unlock(CoRwlock *lock); */ void coroutine_fn co_sleep_ns(QEMUClock *clock, int64_t ns); +/** + * Yield until a file descriptor becomes readable + * + * Note that this function clobbers the handlers for the file descriptor. + */ +void coroutine_fn yield_until_fd_readable(int fd); #endif /* QEMU_COROUTINE_H */ diff --git a/qemu-coroutine-io.c b/qemu-coroutine-io.c index e8ad1a4011..c4df35a640 100644 --- a/qemu-coroutine-io.c +++ b/qemu-coroutine-io.c @@ -63,3 +63,26 @@ qemu_co_send_recv(int sockfd, void *buf, size_t bytes, bool do_send) struct iovec iov = { .iov_base = buf, .iov_len = bytes }; return qemu_co_sendv_recvv(sockfd, &iov, 1, 0, bytes, do_send); } + +typedef struct { + Coroutine *co; + int fd; +} FDYieldUntilData; + +static void fd_coroutine_enter(void *opaque) +{ + FDYieldUntilData *data = opaque; + qemu_set_fd_handler(data->fd, NULL, NULL, NULL); + qemu_coroutine_enter(data->co, NULL); +} + +void coroutine_fn yield_until_fd_readable(int fd) +{ + FDYieldUntilData data; + + assert(qemu_in_coroutine()); + data.co = qemu_coroutine_self(); + data.fd = fd; + qemu_set_fd_handler(fd, fd_coroutine_enter, NULL, &data); + qemu_coroutine_yield(); +} diff --git a/savevm.c b/savevm.c index 9b5577ef6c..e35c7a4084 100644 --- a/savevm.c +++ b/savevm.c @@ -149,34 +149,6 @@ typedef struct QEMUFileSocket QEMUFile *file; } QEMUFileSocket; -typedef struct { - Coroutine *co; - int fd; -} FDYieldUntilData; - -static void fd_coroutine_enter(void *opaque) -{ - FDYieldUntilData *data = opaque; - qemu_set_fd_handler(data->fd, NULL, NULL, NULL); - qemu_coroutine_enter(data->co, NULL); -} - -/** - * Yield until a file descriptor becomes readable - * - * Note that this function clobbers the handlers for the file descriptor. - */ -static void coroutine_fn yield_until_fd_readable(int fd) -{ - FDYieldUntilData data; - - assert(qemu_in_coroutine()); - data.co = qemu_coroutine_self(); - data.fd = fd; - qemu_set_fd_handler(fd, fd_coroutine_enter, NULL, &data); - qemu_coroutine_yield(); -} - static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, int64_t pos) { -- cgit v1.2.3 From 7e114f8cf24a01893226e3a4d22a288125515cfd Mon Sep 17 00:00:00 2001 From: "Michael R. Hines" Date: Tue, 25 Jun 2013 21:35:30 -0400 Subject: rdma: export throughput w/ MigrationStats QMP This exposes throughput (in megabits/sec) through QMP. Reviewed-by: Juan Quintela Reviewed-by: Paolo Bonzini Reviewed-by: Chegu Vinod Tested-by: Chegu Vinod Tested-by: Michael R. Hines Signed-off-by: Michael R. Hines Signed-off-by: Juan Quintela --- hmp.c | 2 ++ include/migration/migration.h | 1 + migration.c | 6 ++++++ qapi-schema.json | 5 ++++- 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/hmp.c b/hmp.c index 494a9aa459..148a3fbbd6 100644 --- a/hmp.c +++ b/hmp.c @@ -169,6 +169,8 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) if (info->has_ram) { monitor_printf(mon, "transferred ram: %" PRIu64 " kbytes\n", info->ram->transferred >> 10); + monitor_printf(mon, "throughput: %0.2f mbps\n", + info->ram->mbps); monitor_printf(mon, "remaining ram: %" PRIu64 " kbytes\n", info->ram->remaining >> 10); monitor_printf(mon, "total ram: %" PRIu64 " kbytes\n", diff --git a/include/migration/migration.h b/include/migration/migration.h index 0be28a288a..535e844880 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -40,6 +40,7 @@ struct MigrationState int state; MigrationParams params; + double mbps; int64_t total_time; int64_t downtime; int64_t expected_downtime; diff --git a/migration.c b/migration.c index 058f9e69f4..441a3b23f9 100644 --- a/migration.c +++ b/migration.c @@ -66,6 +66,7 @@ MigrationState *migrate_get_current(void) .state = MIG_STATE_SETUP, .bandwidth_limit = MAX_THROTTLE, .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE, + .mbps = -1, }; return ¤t_migration; @@ -201,6 +202,7 @@ MigrationInfo *qmp_query_migrate(Error **errp) info->ram->normal = norm_mig_pages_transferred(); info->ram->normal_bytes = norm_mig_bytes_transferred(); info->ram->dirty_pages_rate = s->dirty_pages_rate; + info->ram->mbps = s->mbps; if (blk_mig_active()) { info->has_disk = true; @@ -230,6 +232,7 @@ MigrationInfo *qmp_query_migrate(Error **errp) info->ram->skipped = skipped_mig_pages_transferred(); info->ram->normal = norm_mig_pages_transferred(); info->ram->normal_bytes = norm_mig_bytes_transferred(); + info->ram->mbps = s->mbps; break; case MIG_STATE_ERROR: info->has_status = true; @@ -543,6 +546,9 @@ static void *migration_thread(void *opaque) double bandwidth = transferred_bytes / time_spent; max_size = bandwidth * migrate_max_downtime() / 1000000; + s->mbps = time_spent ? (((double) transferred_bytes * 8.0) / + ((double) time_spent / 1000.0)) / 1000.0 / 1000.0 : -1; + DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64 "\n", transferred_bytes, time_spent, bandwidth, max_size); diff --git a/qapi-schema.json b/qapi-schema.json index 6cc07c20ce..78da5577af 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -513,12 +513,15 @@ # @dirty-pages-rate: number of pages dirtied by second by the # guest (since 1.3) # +# @mbps: throughput in megabits/sec. (since 1.6) +# # Since: 0.14.0 ## { 'type': 'MigrationStats', 'data': {'transferred': 'int', 'remaining': 'int', 'total': 'int' , 'duplicate': 'int', 'skipped': 'int', 'normal': 'int', - 'normal-bytes': 'int', 'dirty-pages-rate' : 'int' } } + 'normal-bytes': 'int', 'dirty-pages-rate' : 'int', + 'mbps' : 'number' } } ## # @XBZRLECacheStats -- cgit v1.2.3 From bc1256f7f187cc7d491bfe3861249a60873adbbc Mon Sep 17 00:00:00 2001 From: "Michael R. Hines" Date: Tue, 25 Jun 2013 21:35:31 -0400 Subject: rdma: introduce qemu_file_mode_is_not_valid() QEMUFileRDMA also has read and write modes. This function is now shared to reduce code duplication. Reviewed-by: Juan Quintela Reviewed-by: Paolo Bonzini Reviewed-by: Chegu Vinod Tested-by: Chegu Vinod Tested-by: Michael R. Hines Signed-off-by: Michael R. Hines Signed-off-by: Juan Quintela --- include/migration/qemu-file.h | 1 + savevm.c | 20 +++++++++++++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 8fab0dd752..dd3fd5155e 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -80,6 +80,7 @@ void qemu_put_byte(QEMUFile *f, int v); * The buffer should be available till it is sent asynchronously. */ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, int size); +bool qemu_file_mode_is_not_valid(const char *mode); static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v) { diff --git a/savevm.c b/savevm.c index e35c7a4084..67ddf06af0 100644 --- a/savevm.c +++ b/savevm.c @@ -449,14 +449,23 @@ static const QEMUFileOps socket_write_ops = { .close = socket_close }; -QEMUFile *qemu_fopen_socket(int fd, const char *mode) +bool qemu_file_mode_is_not_valid(const char *mode) { - QEMUFileSocket *s; - if (mode == NULL || (mode[0] != 'r' && mode[0] != 'w') || mode[1] != 'b' || mode[2] != 0) { fprintf(stderr, "qemu_fopen: Argument validity check failed\n"); + return true; + } + + return false; +} + +QEMUFile *qemu_fopen_socket(int fd, const char *mode) +{ + QEMUFileSocket *s; + + if (qemu_file_mode_is_not_valid(mode)) { return NULL; } @@ -475,10 +484,7 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode) { QEMUFileStdio *s; - if (mode == NULL || - (mode[0] != 'r' && mode[0] != 'w') || - mode[1] != 'b' || mode[2] != 0) { - fprintf(stderr, "qemu_fopen: Argument validity check failed\n"); + if (qemu_file_mode_is_not_valid(mode)) { return NULL; } -- cgit v1.2.3 From be903b2ae7ca750bde2549432c5536087436cf49 Mon Sep 17 00:00:00 2001 From: "Michael R. Hines" Date: Tue, 25 Jun 2013 21:35:32 -0400 Subject: rdma: export qemu_fflush() RDMA uses this to flush the control channel before sending its own message to handle page registrations. Reviewed-by: Juan Quintela Reviewed-by: Paolo Bonzini Reviewed-by: Chegu Vinod Tested-by: Chegu Vinod Tested-by: Michael R. Hines Signed-off-by: Michael R. Hines Signed-off-by: Juan Quintela --- include/migration/qemu-file.h | 1 + savevm.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index dd3fd5155e..37d1604065 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -112,6 +112,7 @@ void qemu_file_reset_rate_limit(QEMUFile *f); void qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate); int64_t qemu_file_get_rate_limit(QEMUFile *f); int qemu_file_get_error(QEMUFile *f); +void qemu_fflush(QEMUFile *f); static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv) { diff --git a/savevm.c b/savevm.c index 67ddf06af0..26d56071ea 100644 --- a/savevm.c +++ b/savevm.c @@ -589,7 +589,7 @@ static inline bool qemu_file_is_writable(QEMUFile *f) * If there is writev_buffer QEMUFileOps it uses it otherwise uses * put_buffer ops. */ -static void qemu_fflush(QEMUFile *f) +void qemu_fflush(QEMUFile *f) { ssize_t ret = 0; -- cgit v1.2.3 From bd2fa51fcdba3408f308df1b08fae04053ecdee5 Mon Sep 17 00:00:00 2001 From: "Michael R. Hines" Date: Tue, 25 Jun 2013 21:35:34 -0400 Subject: rdma: introduce qemu_ram_foreach_block() This is used during RDMA initialization in order to transmit a description of all the RAM blocks to the peer for later dynamic chunk registration purposes. Reviewed-by: Juan Quintela Reviewed-by: Paolo Bonzini Reviewed-by: Chegu Vinod Tested-by: Chegu Vinod Tested-by: Michael R. Hines Signed-off-by: Michael R. Hines Signed-off-by: Juan Quintela --- exec.c | 9 +++++++++ include/exec/cpu-common.h | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/exec.c b/exec.c index 0b0118bd41..2e6fc0087f 100644 --- a/exec.c +++ b/exec.c @@ -2630,3 +2630,12 @@ bool cpu_physical_memory_is_io(hwaddr phys_addr) memory_region_is_romd(mr)); } #endif + +void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque) +{ + RAMBlock *block; + + QTAILQ_FOREACH(block, &ram_list.blocks, next) { + func(block->host, block->offset, block->length, opaque); + } +} diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index e061e21093..92a422313f 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -113,6 +113,11 @@ void cpu_physical_memory_write_rom(hwaddr addr, extern struct MemoryRegion io_mem_rom; extern struct MemoryRegion io_mem_notdirty; +typedef void (RAMBlockIterFunc)(void *host_addr, + ram_addr_t offset, ram_addr_t length, void *opaque); + +void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque); + #endif #endif /* !CPU_COMMON_H */ -- cgit v1.2.3 From 43487c678d6e4e7182bfa70d2bc75422578782aa Mon Sep 17 00:00:00 2001 From: "Michael R. Hines" Date: Tue, 25 Jun 2013 21:35:35 -0400 Subject: rdma: new QEMUFileOps hooks These are the prototypes and implementation of new hooks that RDMA takes advantage of to perform dynamic page registration. An optional hook is also introduced for a custom function to be able to override the default save_page function. Also included are the prototypes and accessor methods used by arch_init.c which invoke funtions inside savevm.c to call out to the hooks that may or may not have been overridden inside of QEMUFileOps. Reviewed-by: Juan Quintela Reviewed-by: Paolo Bonzini Reviewed-by: Chegu Vinod Tested-by: Chegu Vinod Tested-by: Michael R. Hines Signed-off-by: Michael R. Hines Signed-off-by: Juan Quintela --- include/migration/migration.h | 20 +++++++++++++++ include/migration/qemu-file.h | 29 +++++++++++++++++++++ savevm.c | 59 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+) diff --git a/include/migration/migration.h b/include/migration/migration.h index 535e844880..3ddd64ffcb 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -21,6 +21,7 @@ #include "qapi/error.h" #include "migration/vmstate.h" #include "qapi-types.h" +#include "exec/cpu-common.h" struct MigrationParams { bool blk; @@ -130,4 +131,23 @@ int migrate_use_xbzrle(void); int64_t migrate_xbzrle_cache_size(void); int64_t xbzrle_cache_resize(int64_t new_size); + +void ram_control_before_iterate(QEMUFile *f, uint64_t flags); +void ram_control_after_iterate(QEMUFile *f, uint64_t flags); +void ram_control_load_hook(QEMUFile *f, uint64_t flags); + +/* Whenever this is found in the data stream, the flags + * will be passed to ram_control_load_hook in the incoming-migration + * side. This lets before_ram_iterate/after_ram_iterate add + * transport-specific sections to the RAM migration data. + */ +#define RAM_SAVE_FLAG_HOOK 0x80 + +#define RAM_SAVE_CONTROL_NOT_SUPP -1000 +#define RAM_SAVE_CONTROL_DELAYED -2000 + +size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, + ram_addr_t offset, size_t size, + int *bytes_sent); + #endif diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 37d1604065..0f757fbeb6 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -23,6 +23,7 @@ */ #ifndef QEMU_FILE_H #define QEMU_FILE_H 1 +#include "exec/cpu-common.h" /* This function writes a chunk of data to a file at the given position. * The pos argument can be ignored if the file is only being used for @@ -57,12 +58,40 @@ typedef int (QEMUFileGetFD)(void *opaque); typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov, int iovcnt, int64_t pos); +/* + * This function provides hooks around different + * stages of RAM migration. + */ +typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags); + +/* + * Constants used by ram_control_* hooks + */ +#define RAM_CONTROL_SETUP 0 +#define RAM_CONTROL_ROUND 1 +#define RAM_CONTROL_HOOK 2 +#define RAM_CONTROL_FINISH 3 + +/* + * This function allows override of where the RAM page + * is saved (such as RDMA, for example.) + */ +typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque, + ram_addr_t block_offset, + ram_addr_t offset, + size_t size, + int *bytes_sent); + typedef struct QEMUFileOps { QEMUFilePutBufferFunc *put_buffer; QEMUFileGetBufferFunc *get_buffer; QEMUFileCloseFunc *close; QEMUFileGetFD *get_fd; QEMUFileWritevBufferFunc *writev_buffer; + QEMURamHookFunc *before_ram_iterate; + QEMURamHookFunc *after_ram_iterate; + QEMURamHookFunc *hook_ram_load; + QEMURamSaveFunc *save_page; } QEMUFileOps; QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops); diff --git a/savevm.c b/savevm.c index 26d56071ea..e0491e7580 100644 --- a/savevm.c +++ b/savevm.c @@ -616,6 +616,65 @@ void qemu_fflush(QEMUFile *f) } } +void ram_control_before_iterate(QEMUFile *f, uint64_t flags) +{ + int ret = 0; + + if (f->ops->before_ram_iterate) { + ret = f->ops->before_ram_iterate(f, f->opaque, flags); + if (ret < 0) { + qemu_file_set_error(f, ret); + } + } +} + +void ram_control_after_iterate(QEMUFile *f, uint64_t flags) +{ + int ret = 0; + + if (f->ops->after_ram_iterate) { + ret = f->ops->after_ram_iterate(f, f->opaque, flags); + if (ret < 0) { + qemu_file_set_error(f, ret); + } + } +} + +void ram_control_load_hook(QEMUFile *f, uint64_t flags) +{ + int ret = 0; + + if (f->ops->hook_ram_load) { + ret = f->ops->hook_ram_load(f, f->opaque, flags); + if (ret < 0) { + qemu_file_set_error(f, ret); + } + } else { + qemu_file_set_error(f, ret); + } +} + +size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, + ram_addr_t offset, size_t size, int *bytes_sent) +{ + if (f->ops->save_page) { + int ret = f->ops->save_page(f, f->opaque, block_offset, + offset, size, bytes_sent); + + if (ret != RAM_SAVE_CONTROL_DELAYED) { + if (*bytes_sent > 0) { + qemu_update_position(f, *bytes_sent); + } else if (ret < 0) { + qemu_file_set_error(f, ret); + } + } + + return ret; + } + + return RAM_SAVE_CONTROL_NOT_SUPP; +} + static void qemu_fill_buffer(QEMUFile *f) { int len; -- cgit v1.2.3 From 60d9222c8f50c3e5dd3df9ee84ddd1d1c4b35389 Mon Sep 17 00:00:00 2001 From: "Michael R. Hines" Date: Tue, 25 Jun 2013 21:35:36 -0400 Subject: rdma: introduce capability x-rdma-pin-all This capability allows you to disable dynamic chunk registration for better throughput on high-performance links. For example, using an 8GB RAM virtual machine with all 8GB of memory in active use and the VM itself is completely idle using a 40 gbps infiniband link: 1. x-rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps 2. x-rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps These numbers would of course scale up to whatever size virtual machine you have to migrate using RDMA. Enabling this feature does *not* have any measurable affect on migration *downtime*. This is because, without this feature, all of the memory will have already been registered already in advance during the bulk round and does not need to be re-registered during the successive iteration rounds. Reviewed-by: Juan Quintela Reviewed-by: Paolo Bonzini Reviewed-by: Chegu Vinod Reviewed-by: Eric Blake Tested-by: Chegu Vinod Tested-by: Michael R. Hines Signed-off-by: Michael R. Hines Signed-off-by: Juan Quintela --- include/migration/migration.h | 2 ++ migration.c | 9 +++++++++ qapi-schema.json | 7 ++++++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/migration/migration.h b/include/migration/migration.h index 3ddd64ffcb..f0640e0eec 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -123,6 +123,8 @@ void migrate_add_blocker(Error *reason); */ void migrate_del_blocker(Error *reason); +bool migrate_rdma_pin_all(void); + int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen, uint8_t *dst, int dlen); int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen); diff --git a/migration.c b/migration.c index 441a3b23f9..a704d48669 100644 --- a/migration.c +++ b/migration.c @@ -476,6 +476,15 @@ void qmp_migrate_set_downtime(double value, Error **errp) max_downtime = (uint64_t)value; } +bool migrate_rdma_pin_all(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]; +} + int migrate_use_xbzrle(void) { MigrationState *s; diff --git a/qapi-schema.json b/qapi-schema.json index 78da5577af..a30a728dde 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -608,10 +608,15 @@ # This feature allows us to minimize migration traffic for certain work # loads, by sending compressed difference of the pages # +# @x-rdma-pin-all: Controls whether or not the entire VM memory footprint is +# mlock()'d on demand or all at once. Refer to docs/rdma.txt for usage. +# Disabled by default. Experimental: may (or may not) be renamed after +# further testing is complete. (since 1.6) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', - 'data': ['xbzrle'] } + 'data': ['xbzrle', 'x-rdma-pin-all'] } ## # @MigrationCapabilityStatus -- cgit v1.2.3