cpu-exec: Add sleeping algorithm

The goal is to put QEMU to sleep whenever the guest clock is ahead of the host clock (we use the monotonic clocks). The amount of time to sleep is calculated in the execution loop in cpu_exec. At first, we tried to approximate, at each iteration of the for loop, the real time elapsed while searching for a TB (generating or retrieving from cache) and executing it. We would then approximate the virtual time corresponding to the number of virtual instructions executed. The difference between these 2 values would allow us to know if the guest is in advance or delayed. However, the function used for measuring the real time (qemu_clock_get_ns(QEMU_CLOCK_REALTIME)) proved to be very expensive. We had an added overhead of 13% of the total run time. Therefore, we modified the algorithm and only take into account the difference between the 2 clocks at the beginning of the cpu_exec function. During the for loop we try to reduce the advance of the guest only by computing the virtual time elapsed and sleeping if necessary. The overhead is thus reduced to 3%. Even though this method still has a noticeable overhead, it is no longer a bottleneck in trying to achieve a better guest frequency for which the guest clock is faster than the host one. As for the alignment of the 2 clocks, with the first algorithm the guest clock was oscillating between -1 and 1ms compared to the host clock. Using the second algorithm we notice that the guest is 5ms behind the host, which is still acceptable for our use case. The tests were conducted using fio and stress. The host machine is an i5 CPU at 3.10GHz running Debian Jessie (kernel 3.12). The guest machine is an arm versatile-pb built with buildroot. Currently, on our test machine, the lowest icount we can achieve that is suitable for aligning the 2 clocks is 6. However, we observe that the IO tests (using fio) are slower than the cpu tests (using stress). 
Signed-off-by: Sebastian Tanase <sebastian.tanase@openwide.fr> Tested-by: Camille Bégué <camille.begue@openwide.fr> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
author: Sebastian Tanase <sebastian.tanase@openwide.fr> 2014-07-25 11:56:31 +0200
committer: Paolo Bonzini <pbonzini@redhat.com> 2014-08-06 17:53:07 +0200
commit: c2aa5f819900660f936faadfe92fe5d60a562482 (patch)
tree: 09c74582ec42dfebee60217a01665e784eb20747
parent: a8bfac37085c3372366d722f131a7e18d664ee4d (diff)
3 files changed, 97 insertions, 0 deletions
diff --git a/cpu-exec.c b/cpu-exec.c
index 38e5f02a30..68f82b631b 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -22,6 +22,72 @@
 #include "tcg.h"
 #include "qemu/atomic.h"
 #include "sysemu/qtest.h"
+#include "qemu/timer.h"
+
+/* -icount align implementation. */
+
+typedef struct SyncClocks {
+    int64_t diff_clk;
+    int64_t last_cpu_icount;
+} SyncClocks;
+
+#if !defined(CONFIG_USER_ONLY)
+/* Allow the guest clock to run at most 3 ms ahead of the host clock.
+ * The difference between the 2 clocks can therefore
+ * oscillate around 0.
+ */
+#define VM_CLOCK_ADVANCE 3000000
+
+static void align_clocks(SyncClocks *sc, const CPUState *cpu)
+{
+    int64_t cpu_icount;
+
+    if (!icount_align_option) {
+        return;
+    }
+
+    cpu_icount = cpu->icount_extra + cpu->icount_decr.u16.low;
+    sc->diff_clk += cpu_icount_to_ns(sc->last_cpu_icount - cpu_icount);
+    sc->last_cpu_icount = cpu_icount;
+
+    if (sc->diff_clk > VM_CLOCK_ADVANCE) {
+#ifndef _WIN32
+        struct timespec sleep_delay, rem_delay;
+        sleep_delay.tv_sec = sc->diff_clk / 1000000000LL;
+        sleep_delay.tv_nsec = sc->diff_clk % 1000000000LL;
+        if (nanosleep(&sleep_delay, &rem_delay) < 0) {
+            sc->diff_clk -= (sleep_delay.tv_sec - rem_delay.tv_sec) * 1000000000LL;
+            sc->diff_clk -= sleep_delay.tv_nsec - rem_delay.tv_nsec;
+        } else {
+            sc->diff_clk = 0;
+        }
+#else
+        Sleep(sc->diff_clk / SCALE_MS);
+        sc->diff_clk = 0;
+#endif
+    }
+}
+
+static void init_delay_params(SyncClocks *sc,
+                              const CPUState *cpu)
+{
+    if (!icount_align_option) {
+        return;
+    }
+    sc->diff_clk = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) -
+                   qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
+                   cpu_get_clock_offset();
+    sc->last_cpu_icount = cpu->icount_extra + cpu->icount_decr.u16.low;
+}
+#else
+static void align_clocks(SyncClocks *sc, const CPUState *cpu)
+{
+}
+
+static void init_delay_params(SyncClocks *sc, const CPUState *cpu)
+{
+}
+#endif /* CONFIG USER ONLY */
 
 void cpu_loop_exit(CPUState *cpu)
 {
@@ -227,6 +293,8 @@ int cpu_exec(CPUArchState *env)
     TranslationBlock *tb;
     uint8_t *tc_ptr;
     uintptr_t next_tb;
+    SyncClocks sc;
+
     /* This must be volatile so it is not trashed by longjmp() */
     volatile bool have_tb_lock = false;
 
@@ -283,6 +351,13 @@ int cpu_exec(CPUArchState *env)
 #endif
     cpu->exception_index = -1;
 
+    /* Calculate difference between guest clock and host clock.
+     * This delay includes the delay of the last cycle, so
+     * what we have to do is sleep until it is 0. As for the
+     * advance/delay we gain here, we try to fix it next time.
+     */
+    init_delay_params(&sc, cpu);
+
     /* prepare setjmp context for exception handling */
     for(;;) {
         if (sigsetjmp(cpu->jmp_env, 0) == 0) {
@@ -672,6 +747,7 @@ int cpu_exec(CPUArchState *env)
                             if (insns_left > 0) {
                                 /* Execute remaining instructions.  */
                                 cpu_exec_nocache(env, insns_left, tb);
+                                align_clocks(&sc, cpu);
                             }
                             cpu->exception_index = EXCP_INTERRUPT;
                             next_tb = 0;
@@ -684,6 +760,9 @@ int cpu_exec(CPUArchState *env)
                     }
                 }
                 cpu->current_tb = NULL;
+                /* Try to align the host and virtual clocks
+                   if the guest is in advance */
+                align_clocks(&sc, cpu);
                 /* reset soft MMU for next block (it can currently
                    only be set by a memory fault) */
             } /* for(;;) */
diff --git a/cpus.c b/cpus.c
index 7e09538799..19245e99b9 100644
--- a/cpus.c
+++ b/cpus.c
@@ -219,6 +219,23 @@ int64_t cpu_get_clock(void)
     return ti;
 }
 
+/* return the offset between the host clock and virtual CPU clock */
+int64_t cpu_get_clock_offset(void)
+{
+    int64_t ti;
+    unsigned start;
+
+    do {
+        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
+        ti = timers_state.cpu_clock_offset;
+        if (!timers_state.cpu_ticks_enabled) {
+            ti -= get_clock();
+        }
+    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
+
+    return -ti;
+}
+
 /* enable cpu_get_ticks()
  * Caller must hold BQL which server as mutex for vm_clock_seqlock.
  */
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index e12c7149e1..5f5210d543 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -745,6 +745,7 @@ static inline int64_t get_clock(void)
 /* icount */
 int64_t cpu_get_icount(void);
 int64_t cpu_get_clock(void);
+int64_t cpu_get_clock_offset(void);
 int64_t cpu_icount_to_ns(int64_t icount);
 
 /*******************************************/
author	Sebastian Tanase <sebastian.tanase@openwide.fr>	2014-07-25 11:56:31 +0200
committer	Paolo Bonzini <pbonzini@redhat.com>	2014-08-06 17:53:07 +0200
commit	c2aa5f819900660f936faadfe92fe5d60a562482 (patch)
tree	09c74582ec42dfebee60217a01665e784eb20747
parent	a8bfac37085c3372366d722f131a7e18d664ee4d (diff)