From 45f1882a9e1a8421404ea47de6aadf3d945618d0 Mon Sep 17 00:00:00 2001
From: John Snow <jsnow@redhat.com>
Date: Tue, 21 Nov 2017 19:16:56 -0500
Subject: iotests: fix 075 and 078
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both of these tests are for formats which now stipulate that they are
read-only. Adjust the tests to match.

Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Lukáš Doktor <ldoktor@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/075 | 18 +++++++++---------
 tests/qemu-iotests/078 | 14 +++++++-------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/tests/qemu-iotests/075 b/tests/qemu-iotests/075
index 770d51c6cb..caa30d4743 100755
--- a/tests/qemu-iotests/075
+++ b/tests/qemu-iotests/075
@@ -48,56 +48,56 @@ offsets_offset=136
 echo
 echo "== check that the first sector can be read =="
 _use_sample_img simple-pattern.cloop.bz2
-$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
 
 echo
 echo "== check that the last sector can be read =="
 _use_sample_img simple-pattern.cloop.bz2
-$QEMU_IO -c "read $((1024 * 1024 - 512)) 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -r -c "read $((1024 * 1024 - 512)) 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
 
 echo
 echo "== block_size must be a multiple of 512 =="
 _use_sample_img simple-pattern.cloop.bz2
 poke_file "$TEST_IMG" "$block_size_offset" "\x00\x00\x02\x01"
-$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
 
 echo
 echo "== block_size cannot be zero =="
 _use_sample_img simple-pattern.cloop.bz2
 poke_file "$TEST_IMG" "$block_size_offset" "\x00\x00\x00\x00"
-$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
 
 echo
 echo "== huge block_size ==="
 _use_sample_img simple-pattern.cloop.bz2
 poke_file "$TEST_IMG" "$block_size_offset" "\xff\xff\xfe\x00"
-$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
 
 echo
 echo "== offsets_size overflow ==="
 _use_sample_img simple-pattern.cloop.bz2
 poke_file "$TEST_IMG" "$n_blocks_offset" "\xff\xff\xff\xff"
-$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
 
 echo
 echo "== refuse images that require too many offsets ==="
 _use_sample_img simple-pattern.cloop.bz2
 poke_file "$TEST_IMG" "$n_blocks_offset" "\x04\x00\x00\x01"
-$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
 
 echo
 echo "== refuse images with non-monotonically increasing offsets =="
 _use_sample_img simple-pattern.cloop.bz2
 poke_file "$TEST_IMG" "$offsets_offset" "\x00\x00\x00\x00\xff\xff\xff\xff"
 poke_file "$TEST_IMG" $((offsets_offset + 8)) "\x00\x00\x00\x00\xff\xfe\x00\x00"
-$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
 
 echo
 echo "== refuse images with invalid compressed block size =="
 _use_sample_img simple-pattern.cloop.bz2
 poke_file "$TEST_IMG" "$offsets_offset" "\x00\x00\x00\x00\x00\x00\x00\x00"
 poke_file "$TEST_IMG" $((offsets_offset + 8)) "\xff\xff\xff\xff\xff\xff\xff\xff"
-$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir
 
 # success, all done
 echo "*** done"
diff --git a/tests/qemu-iotests/078 b/tests/qemu-iotests/078
index f333e9ac84..a106c26f6b 100755
--- a/tests/qemu-iotests/078
+++ b/tests/qemu-iotests/078
@@ -48,41 +48,41 @@ disk_size_offset=$((0x58))
 echo
 echo "== Read from a valid image =="
 _use_sample_img empty.bochs.bz2
-{ $QEMU_IO -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir
+{ $QEMU_IO -r -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir
 
 echo
 echo "== Negative catalog size =="
 _use_sample_img empty.bochs.bz2
 poke_file "$TEST_IMG" "$catalog_size_offset" "\xff\xff\xff\xff"
-{ $QEMU_IO -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir
+{ $QEMU_IO -r -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir
 
 echo
 echo "== Overflow for catalog size * sizeof(uint32_t) =="
 _use_sample_img empty.bochs.bz2
 poke_file "$TEST_IMG" "$catalog_size_offset" "\x00\x00\x00\x40"
-{ $QEMU_IO -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir
+{ $QEMU_IO -r -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir
 
 echo
 echo "== Too small catalog bitmap for image size =="
 _use_sample_img empty.bochs.bz2
 poke_file "$TEST_IMG" "$disk_size_offset" "\x00\xc0\x0f\x00\x00\x00\x00\x7f"
-{ $QEMU_IO -c "read 2T 4k" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir
+{ $QEMU_IO -r -c "read 2T 4k" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir
 
 _use_sample_img empty.bochs.bz2
 poke_file "$TEST_IMG" "$catalog_size_offset" "\x10\x00\x00\x00"
-{ $QEMU_IO -c "read 0xfbe00 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir
+{ $QEMU_IO -r -c "read 0xfbe00 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir
 
 echo
 echo "== Negative extent size =="
 _use_sample_img empty.bochs.bz2
 poke_file "$TEST_IMG" "$extent_size_offset" "\x00\x00\x00\x80"
-{ $QEMU_IO -c "read 768k 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir
+{ $QEMU_IO -r -c "read 768k 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir
 
 echo
 echo "== Zero extent size =="
 _use_sample_img empty.bochs.bz2
 poke_file "$TEST_IMG" "$extent_size_offset" "\x00\x00\x00\x00"
-{ $QEMU_IO -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir
+{ $QEMU_IO -r -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir
 
 # success, all done
 echo "*** done"
-- 
cgit v1.2.3


From b1d1cb272882dd6868740155120f6aeb260a204c Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Fri, 24 Nov 2017 16:53:50 +0800
Subject: docs: Add image locking subsection

This documents the image locking feature and explains when and how
related options can be used.

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 docs/qemu-block-drivers.texi | 36 ++++++++++++++++++++++++++++++++++++
 qemu-doc.texi                |  1 +
 2 files changed, 37 insertions(+)

diff --git a/docs/qemu-block-drivers.texi b/docs/qemu-block-drivers.texi
index 1cb1e55686..503c1847aa 100644
--- a/docs/qemu-block-drivers.texi
+++ b/docs/qemu-block-drivers.texi
@@ -785,6 +785,42 @@ warning: ssh server @code{ssh.example.com:22} does not support fsync
 With sufficiently new versions of libssh2 and OpenSSH, @code{fsync} is
 supported.
 
+@node disk_image_locking
+@subsection Disk image file locking
+
+By default, QEMU tries to protect image files from unexpected concurrent
+access, as long as it's supported by the block protocol driver and host
+operating system. If multiple QEMU processes (including QEMU emulators and
+utilities) try to open the same image with conflicting accessing modes, all but
+the first one will get an error.
+
+This feature is currently supported by the file protocol on Linux with the Open
+File Descriptor (OFD) locking API, and can be configured to fall back to POSIX
+locking if the POSIX host doesn't support Linux OFD locking.
+
+To explicitly enable image locking, specify "locking=on" in the file protocol
+driver options. If OFD locking is not possible, a warning will be printed and
+the POSIX locking API will be used. In this case there is a risk that the lock
+will get silently lost when doing hot plugging and block jobs, due to the
+shortcomings of the POSIX locking API.
+
+QEMU transparently handles lock handover during shared storage migration.  For
+shared virtual disk images between multiple VMs, the "share-rw" device option
+should be used.
+
+Alternatively, locking can be fully disabled by "locking=off" block device
+option. In the command line, the option is usually in the form of
+"file.locking=off" as the protocol driver is normally placed as a "file" child
+under a format driver. For example:
+
+@code{-blockdev driver=qcow2,file.filename=/path/to/image,file.locking=off,file.driver=file}
+
+To check if image locking is active, check the output of the "lslocks" command
+on host and see if there are locks held by the QEMU process on the image file.
+More than one byte could be locked by the QEMU instance, each byte of which
+reflects a particular permission that is acquired or protected by the running
+block driver.
+
 @c man end
 
 @ignore
diff --git a/qemu-doc.texi b/qemu-doc.texi
index 617254917d..db2351c746 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -405,6 +405,7 @@ encrypted disk images.
 * disk_images_iscsi::         iSCSI LUNs
 * disk_images_gluster::       GlusterFS disk images
 * disk_images_ssh::           Secure Shell (ssh) disk images
+* disk_image_locking::        Disk image file locking
 @end menu
 
 @node disk_images_quickstart
-- 
cgit v1.2.3


From 1878eaff9bdeece4546d4f9587e6c75ab0b79b8b Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Fri, 24 Nov 2017 16:53:51 +0800
Subject: qemu-options: Mention locking option of file driver

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-options.hx | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/qemu-options.hx b/qemu-options.hx
index 3728e9b4dd..f11c4ac960 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -693,6 +693,10 @@ This is the protocol-level block driver for accessing regular files.
 The path to the image file in the local filesystem
 @item aio
 Specifies the AIO backend (threads/native, default: threads)
+@item locking
+Specifies whether the image file is protected with Linux OFD / POSIX locks. The
+default is to use the Linux Open File Descriptor API if available, otherwise no
+lock is applied.  (auto/on/off, default: auto)
 @end table
 Example:
 @example
-- 
cgit v1.2.3


From c117bb14ff633848cc6b0ff77f081f583dfa8c5e Mon Sep 17 00:00:00 2001
From: Kashyap Chamarthy <kchamart@redhat.com>
Date: Tue, 21 Nov 2017 12:52:53 +0100
Subject: QAPI & interop: Clarify events emitted by 'block-job-cancel'

When you cancel an in-progress 'mirror' job (or "active `block-commit`")
with QMP `block-job-cancel`, it emits the event: BLOCK_JOB_CANCELLED.
However, when `block-job-cancel` is issued *after* `drive-mirror` has
indicated (via the event BLOCK_JOB_READY) that the source and
destination have reached synchronization:

    [...] # Snip `drive-mirror` invocation & outputs
    {
      "execute":"block-job-cancel",
      "arguments":{
        "device":"virtio0"
      }
    }

    {"return": {}}

It (`block-job-cancel`) will counterintuitively emit the event
'BLOCK_JOB_COMPLETED':

    {
      "timestamp":{
        "seconds":1510678024,
        "microseconds":526240
      },
      "event":"BLOCK_JOB_COMPLETED",
      "data":{
        "device":"virtio0",
        "len":41126400,
        "offset":41126400,
        "speed":0,
        "type":"mirror"
      }
    }

But this is expected behaviour, where the _COMPLETED event indicates
that synchronization has successfully ended (and the destination now has
a point-in-time copy, which is at the time of cancel).

So add a small note to this effect in 'block-core.json'.  While at it,
also update the "Live disk synchronization -- drive-mirror and
blockdev-mirror" section in 'live-block-operations.rst'.

(Thanks: Max Reitz for reminding me of this caveat on IRC.)

Signed-off-by: Kashyap Chamarthy <kchamart@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 docs/interop/live-block-operations.rst | 50 ++++++++++++++++++++++------------
 qapi/block-core.json                   |  6 ++++
 2 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/docs/interop/live-block-operations.rst b/docs/interop/live-block-operations.rst
index 5f0179749f..734252bc80 100644
--- a/docs/interop/live-block-operations.rst
+++ b/docs/interop/live-block-operations.rst
@@ -506,26 +506,40 @@ Again, given our familiar disk image chain::
 
     [A] <-- [B] <-- [C] <-- [D]
 
-The ``drive-mirror`` (and its newer equivalent ``blockdev-mirror``) allows
-you to copy data from the entire chain into a single target image (which
-can be located on a different host).
-
-Once a 'mirror' job has started, there are two possible actions while a
-``drive-mirror`` job is active:
-
-(1) Issuing the command ``block-job-cancel`` after it emits the event
-    ``BLOCK_JOB_CANCELLED``: will (after completing synchronization of
-    the content from the disk image chain to the target image, [E])
-    create a point-in-time (which is at the time of *triggering* the
-    cancel command) copy, contained in image [E], of the the entire disk
+The ``drive-mirror`` (and its newer equivalent ``blockdev-mirror``)
+allows you to copy data from the entire chain into a single target image
+(which can be located on a different host), [E].
+
+.. note::
+
+    When you cancel an in-progress 'mirror' job *before* the source and
+    target are synchronized, ``block-job-cancel`` will emit the event
+    ``BLOCK_JOB_CANCELLED``.  However, note that if you cancel a
+    'mirror' job *after* it has indicated (via the event
+    ``BLOCK_JOB_READY``) that the source and target have reached
+    synchronization, then the event emitted by ``block-job-cancel``
+    changes to ``BLOCK_JOB_COMPLETED``.
+
+    Besides the 'mirror' job, the "active ``block-commit``" is the only
+    other block device job that emits the event ``BLOCK_JOB_READY``.
+    The rest of the block device jobs ('stream', "non-active
+    ``block-commit``", and 'backup') end automatically.
+
+So there are two possible actions to take, after a 'mirror' job has
+emitted the event ``BLOCK_JOB_READY``, indicating that the source and
+target have reached synchronization:
+
+(1) Issuing the command ``block-job-cancel`` (after it emits the event
+    ``BLOCK_JOB_COMPLETED``) will create a point-in-time (which is at
+    the time of *triggering* the cancel command) copy of the entire disk
     image chain (or only the top-most image, depending on the ``sync``
-    mode).
+    mode), contained in the target image [E]. One use case for this is
+    live VM migration with non-shared storage.
 
-(2) Issuing the command ``block-job-complete`` after it emits the event
-    ``BLOCK_JOB_COMPLETED``: will, after completing synchronization of
-    the content, adjust the guest device (i.e. live QEMU) to point to
-    the target image, and, causing all the new writes from this point on
-    to happen there.  One use case for this is live storage migration.
+(2) Issuing the command ``block-job-complete`` (after it emits the event
+    ``BLOCK_JOB_COMPLETED``) will adjust the guest device (i.e. live
+    QEMU) to point to the target image, [E], causing all the new writes
+    from this point on to happen there.
 
 About synchronization modes: The synchronization mode determines
 *which* part of the disk image chain will be copied to the target.
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 76bf50f813..dd763dcf87 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2065,6 +2065,12 @@
 # BLOCK_JOB_CANCELLED event.  Before that happens the job is still visible when
 # enumerated using query-block-jobs.
 #
+# Note that if you issue 'block-job-cancel' after 'drive-mirror' has indicated
+# (via the event BLOCK_JOB_READY) that the source and destination are
+# synchronized, then the event triggered by this command changes to
+# BLOCK_JOB_COMPLETED, to indicate that the mirroring has ended and the
+# destination now has a point-in-time copy tied to the time of the cancellation.
+#
 # For streaming, the image file retains its backing file unless the streaming
 # operation happens to complete just as it is being cancelled.  A new streaming
 # operation can be started at a later time to finish copying all data from the
-- 
cgit v1.2.3


From 0a3e155f3f5ec9b6f12d00894c7701b3cbb66590 Mon Sep 17 00:00:00 2001
From: Alberto Garcia <berto@igalia.com>
Date: Tue, 28 Nov 2017 16:53:27 +0200
Subject: blockjob: Remove the job from the list earlier in block_job_unref()

When destroying a block job in block_job_unref() we should remove it
from the job list before calling block_job_remove_all_bdrv().

This is because removing the BDSs can trigger an aio_poll() and wake
up other jobs that might attempt to use the block job list. If that
happens the job we're currently destroying should not be in that list
anymore.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockjob.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/blockjob.c b/blockjob.c
index ff9a614531..2f0cc1528b 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -152,6 +152,7 @@ void block_job_unref(BlockJob *job)
 {
     if (--job->refcnt == 0) {
         BlockDriverState *bs = blk_bs(job->blk);
+        QLIST_REMOVE(job, job_list);
         bs->job = NULL;
         block_job_remove_all_bdrv(job);
         blk_remove_aio_context_notifier(job->blk,
@@ -160,7 +161,6 @@ void block_job_unref(BlockJob *job)
         blk_unref(job->blk);
         error_free(job->blocker);
         g_free(job->id);
-        QLIST_REMOVE(job, job_list);
         g_free(job);
     }
 }
-- 
cgit v1.2.3


From 02d213009d571bcd7171e3ff9234722a11d30d1b Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 29 Nov 2017 11:25:10 +0100
Subject: block: Expect graph changes in bdrv_parent_drained_begin/end

The .drained_begin/end callbacks can (directly or indirectly via
aio_poll()) cause block nodes to be removed or the current BdrvChild to
point to a different child node.

Use QLIST_FOREACH_SAFE() to make sure we don't access invalid
BlockDriverStates or accidentally continue iterating the parents of the
new child node instead of the node we actually came from.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Tested-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/io.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/io.c b/block/io.c
index 4fdf93a014..6773926fc1 100644
--- a/block/io.c
+++ b/block/io.c
@@ -42,9 +42,9 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
 
 void bdrv_parent_drained_begin(BlockDriverState *bs)
 {
-    BdrvChild *c;
+    BdrvChild *c, *next;
 
-    QLIST_FOREACH(c, &bs->parents, next_parent) {
+    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
         if (c->role->drained_begin) {
             c->role->drained_begin(c);
         }
@@ -53,9 +53,9 @@ void bdrv_parent_drained_begin(BlockDriverState *bs)
 
 void bdrv_parent_drained_end(BlockDriverState *bs)
 {
-    BdrvChild *c;
+    BdrvChild *c, *next;
 
-    QLIST_FOREACH(c, &bs->parents, next_parent) {
+    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
         if (c->role->drained_end) {
             c->role->drained_end(c);
         }
-- 
cgit v1.2.3


From 5bf1d5a73a4a6d0e2d692bd02b6d7f3eedeed3b7 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 29 Nov 2017 11:25:11 +0100
Subject: blockjob: remove clock argument from block_job_sleep_ns

All callers are using QEMU_CLOCK_REALTIME, and it will not be possible to
support more than one clock when block_job_sleep_ns switches to a single
timer stored in the BlockJob struct.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Tested-By: Jeff Cody <jcody@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/backup.c               | 4 ++--
 block/commit.c               | 2 +-
 block/mirror.c               | 6 +++---
 block/stream.c               | 2 +-
 blockjob.c                   | 5 +++--
 include/block/blockjob_int.h | 7 +++----
 tests/test-blockjob-txn.c    | 2 +-
 7 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 06ddbfd03d..99e6bcc748 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -346,9 +346,9 @@ static bool coroutine_fn yield_and_check(BackupBlockJob *job)
         uint64_t delay_ns = ratelimit_calculate_delay(&job->limit,
                                                       job->bytes_read);
         job->bytes_read = 0;
-        block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
+        block_job_sleep_ns(&job->common, delay_ns);
     } else {
-        block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
+        block_job_sleep_ns(&job->common, 0);
     }
 
     if (block_job_is_cancelled(&job->common)) {
diff --git a/block/commit.c b/block/commit.c
index 5036eec434..c5327551ce 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -174,7 +174,7 @@ static void coroutine_fn commit_run(void *opaque)
         /* Note that even when no rate limit is applied we need to yield
          * with no pending I/O here so that bdrv_drain_all() returns.
          */
-        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
+        block_job_sleep_ns(&s->common, delay_ns);
         if (block_job_is_cancelled(&s->common)) {
             break;
         }
diff --git a/block/mirror.c b/block/mirror.c
index 307b6391a8..c9badc1203 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -598,7 +598,7 @@ static void mirror_throttle(MirrorBlockJob *s)
 
     if (now - s->last_pause_ns > SLICE_TIME) {
         s->last_pause_ns = now;
-        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
+        block_job_sleep_ns(&s->common, 0);
     } else {
         block_job_pause_point(&s->common);
     }
@@ -870,13 +870,13 @@ static void coroutine_fn mirror_run(void *opaque)
         ret = 0;
         trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
         if (!s->synced) {
-            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
+            block_job_sleep_ns(&s->common, delay_ns);
             if (block_job_is_cancelled(&s->common)) {
                 break;
             }
         } else if (!should_complete) {
             delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
-            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
+            block_job_sleep_ns(&s->common, delay_ns);
         }
         s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     }
diff --git a/block/stream.c b/block/stream.c
index e6f72346e5..499cdacdb0 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -141,7 +141,7 @@ static void coroutine_fn stream_run(void *opaque)
         /* Note that even when no rate limit is applied we need to yield
          * with no pending I/O here so that bdrv_drain_all() returns.
          */
-        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
+        block_job_sleep_ns(&s->common, delay_ns);
         if (block_job_is_cancelled(&s->common)) {
             break;
         }
diff --git a/blockjob.c b/blockjob.c
index 2f0cc1528b..db9e4fc89a 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -788,7 +788,7 @@ bool block_job_is_cancelled(BlockJob *job)
     return job->cancelled;
 }
 
-void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
+void block_job_sleep_ns(BlockJob *job, int64_t ns)
 {
     assert(job->busy);
 
@@ -803,7 +803,8 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
      * it wakes and runs, otherwise we risk double-entry or entry after
      * completion. */
     if (!block_job_should_pause(job)) {
-        co_aio_sleep_ns(blk_get_aio_context(job->blk), type, ns);
+        co_aio_sleep_ns(blk_get_aio_context(job->blk),
+                        QEMU_CLOCK_REALTIME, ns);
     }
 
     block_job_pause_point(job);
diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
index 43f3be2965..f7ab183a39 100644
--- a/include/block/blockjob_int.h
+++ b/include/block/blockjob_int.h
@@ -139,14 +139,13 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
 /**
  * block_job_sleep_ns:
  * @job: The job that calls the function.
- * @clock: The clock to sleep on.
  * @ns: How many nanoseconds to stop for.
  *
  * Put the job to sleep (assuming that it wasn't canceled) for @ns
- * nanoseconds.  Canceling the job will not interrupt the wait, so the
- * cancel will not process until the coroutine wakes up.
+ * %QEMU_CLOCK_REALTIME nanoseconds.  Canceling the job will not interrupt
+ * the wait, so the cancel will not process until the coroutine wakes up.
  */
-void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns);
+void block_job_sleep_ns(BlockJob *job, int64_t ns);
 
 /**
  * block_job_yield:
diff --git a/tests/test-blockjob-txn.c b/tests/test-blockjob-txn.c
index c77343fc04..3591c9617f 100644
--- a/tests/test-blockjob-txn.c
+++ b/tests/test-blockjob-txn.c
@@ -44,7 +44,7 @@ static void coroutine_fn test_block_job_run(void *opaque)
 
     while (s->iterations--) {
         if (s->use_timer) {
-            block_job_sleep_ns(job, QEMU_CLOCK_REALTIME, 0);
+            block_job_sleep_ns(job, 0);
         } else {
             block_job_yield(job);
         }
-- 
cgit v1.2.3


From 356f59b8757f47c0aca3e2e4e51d6010f64cade1 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 29 Nov 2017 11:25:12 +0100
Subject: blockjob: introduce block_job_do_yield

Hide the clearing of job->busy in a single function, and set it
in block_job_enter.  This lets block_job_do_yield verify that
qemu_coroutine_enter is not used while job->busy = false.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Tested-By: Jeff Cody <jcody@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockjob.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index db9e4fc89a..4d22b7d2fb 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -729,6 +729,15 @@ static bool block_job_should_pause(BlockJob *job)
     return job->pause_count > 0;
 }
 
+static void block_job_do_yield(BlockJob *job)
+{
+    job->busy = false;
+    qemu_coroutine_yield();
+
+    /* Set by block_job_enter before re-entering the coroutine.  */
+    assert(job->busy);
+}
+
 void coroutine_fn block_job_pause_point(BlockJob *job)
 {
     assert(job && block_job_started(job));
@@ -746,9 +755,7 @@ void coroutine_fn block_job_pause_point(BlockJob *job)
 
     if (block_job_should_pause(job) && !block_job_is_cancelled(job)) {
         job->paused = true;
-        job->busy = false;
-        qemu_coroutine_yield(); /* wait for block_job_resume() */
-        job->busy = true;
+        block_job_do_yield(job);
         job->paused = false;
     }
 
@@ -778,9 +785,12 @@ void block_job_enter(BlockJob *job)
         return;
     }
 
-    if (!job->busy) {
-        bdrv_coroutine_enter(blk_bs(job->blk), job->co);
+    if (job->busy) {
+        return;
     }
+
+    job->busy = true;
+    aio_co_wake(job->co);
 }
 
 bool block_job_is_cancelled(BlockJob *job)
@@ -819,11 +829,9 @@ void block_job_yield(BlockJob *job)
         return;
     }
 
-    job->busy = false;
     if (!block_job_should_pause(job)) {
-        qemu_coroutine_yield();
+        block_job_do_yield(job);
     }
-    job->busy = true;
 
     block_job_pause_point(job);
 }
-- 
cgit v1.2.3


From fc24908e7d232b79c2a6f0363f52bda18dedb57b Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 29 Nov 2017 11:25:13 +0100
Subject: blockjob: reimplement block_job_sleep_ns to allow cancellation

This reverts the effects of commit 4afeffc857 ("blockjob: do not allow
coroutine double entry or entry-after-completion", 2017-11-21)

This fixed the symptom of a bug rather than the root cause. Canceling the
wait on a sleeping blockjob coroutine is generally fine, we just need to
make it work correctly across AioContexts.  To do so, use a QEMUTimer
that calls block_job_enter.  Use a mutex to ensure that block_job_enter
synchronizes correctly with block_job_sleep_ns.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Tested-By: Jeff Cody <jcody@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockjob.c                   | 63 ++++++++++++++++++++++++++++++++++++--------
 include/block/blockjob.h     |  8 +++++-
 include/block/blockjob_int.h |  4 +--
 3 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index 4d22b7d2fb..0ed50b953b 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -37,6 +37,26 @@
 #include "qemu/timer.h"
 #include "qapi-event.h"
 
+/* Right now, this mutex is only needed to synchronize accesses to job->busy
+ * and job->sleep_timer, such as concurrent calls to block_job_do_yield and
+ * block_job_enter. */
+static QemuMutex block_job_mutex;
+
+static void block_job_lock(void)
+{
+    qemu_mutex_lock(&block_job_mutex);
+}
+
+static void block_job_unlock(void)
+{
+    qemu_mutex_unlock(&block_job_mutex);
+}
+
+static void __attribute__((__constructor__)) block_job_init(void)
+{
+    qemu_mutex_init(&block_job_mutex);
+}
+
 static void block_job_event_cancelled(BlockJob *job);
 static void block_job_event_completed(BlockJob *job, const char *msg);
 
@@ -161,6 +181,7 @@ void block_job_unref(BlockJob *job)
         blk_unref(job->blk);
         error_free(job->blocker);
         g_free(job->id);
+        assert(!timer_pending(&job->sleep_timer));
         g_free(job);
     }
 }
@@ -287,6 +308,13 @@ static void coroutine_fn block_job_co_entry(void *opaque)
     job->driver->start(job);
 }
 
+static void block_job_sleep_timer_cb(void *opaque)
+{
+    BlockJob *job = opaque;
+
+    block_job_enter(job);
+}
+
 void block_job_start(BlockJob *job)
 {
     assert(job && !block_job_started(job) && job->paused &&
@@ -556,7 +584,7 @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
     info->type      = g_strdup(BlockJobType_str(job->driver->job_type));
     info->device    = g_strdup(job->id);
     info->len       = job->len;
-    info->busy      = job->busy;
+    info->busy      = atomic_read(&job->busy);
     info->paused    = job->pause_count > 0;
     info->offset    = job->offset;
     info->speed     = job->speed;
@@ -664,6 +692,9 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
     job->paused        = true;
     job->pause_count   = 1;
     job->refcnt        = 1;
+    aio_timer_init(qemu_get_aio_context(), &job->sleep_timer,
+                   QEMU_CLOCK_REALTIME, SCALE_NS,
+                   block_job_sleep_timer_cb, job);
 
     error_setg(&job->blocker, "block device is in use by block job: %s",
                BlockJobType_str(driver->job_type));
@@ -729,9 +760,20 @@ static bool block_job_should_pause(BlockJob *job)
     return job->pause_count > 0;
 }
 
-static void block_job_do_yield(BlockJob *job)
+/* Yield, and schedule a timer to reenter the coroutine after @ns nanoseconds.
+ * Reentering the job coroutine with block_job_enter() before the timer has
+ * expired is allowed and cancels the timer.
+ *
+ * If @ns is (uint64_t) -1, no timer is scheduled and block_job_enter() must be
+ * called explicitly. */
+static void block_job_do_yield(BlockJob *job, uint64_t ns)
 {
+    block_job_lock();
+    if (ns != -1) {
+        timer_mod(&job->sleep_timer, ns);
+    }
     job->busy = false;
+    block_job_unlock();
     qemu_coroutine_yield();
 
     /* Set by block_job_enter before re-entering the coroutine.  */
@@ -755,7 +797,7 @@ void coroutine_fn block_job_pause_point(BlockJob *job)
 
     if (block_job_should_pause(job) && !block_job_is_cancelled(job)) {
         job->paused = true;
-        block_job_do_yield(job);
+        block_job_do_yield(job, -1);
         job->paused = false;
     }
 
@@ -785,11 +827,16 @@ void block_job_enter(BlockJob *job)
         return;
     }
 
+    block_job_lock();
     if (job->busy) {
+        block_job_unlock();
         return;
     }
 
+    assert(!job->deferred_to_main_loop);
+    timer_del(&job->sleep_timer);
     job->busy = true;
+    block_job_unlock();
     aio_co_wake(job->co);
 }
 
@@ -807,14 +854,8 @@ void block_job_sleep_ns(BlockJob *job, int64_t ns)
         return;
     }
 
-    /* We need to leave job->busy set here, because when we have
-     * put a coroutine to 'sleep', we have scheduled it to run in
-     * the future.  We cannot enter that same coroutine again before
-     * it wakes and runs, otherwise we risk double-entry or entry after
-     * completion. */
     if (!block_job_should_pause(job)) {
-        co_aio_sleep_ns(blk_get_aio_context(job->blk),
-                        QEMU_CLOCK_REALTIME, ns);
+        block_job_do_yield(job, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + ns);
     }
 
     block_job_pause_point(job);
@@ -830,7 +871,7 @@ void block_job_yield(BlockJob *job)
     }
 
     if (!block_job_should_pause(job)) {
-        block_job_do_yield(job);
+        block_job_do_yield(job, -1);
     }
 
     block_job_pause_point(job);
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 67c0968fa5..00403d9482 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -77,7 +77,7 @@ typedef struct BlockJob {
     /**
      * Set to false by the job while the coroutine has yielded and may be
      * re-entered by block_job_enter().  There may still be I/O or event loop
-     * activity pending.
+     * activity pending.  Accessed under block_job_mutex (in blockjob.c).
      */
     bool busy;
 
@@ -135,6 +135,12 @@ typedef struct BlockJob {
      */
     int ret;
 
+    /**
+     * Timer that is used by @block_job_sleep_ns. Accessed under
+     * block_job_mutex (in blockjob.c).
+     */
+    QEMUTimer sleep_timer;
+
     /** Non-NULL if this job is part of a transaction */
     BlockJobTxn *txn;
     QLIST_ENTRY(BlockJob) txn_list;
diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
index f7ab183a39..c9b23b0cc9 100644
--- a/include/block/blockjob_int.h
+++ b/include/block/blockjob_int.h
@@ -142,8 +142,8 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
  * @ns: How many nanoseconds to stop for.
  *
  * Put the job to sleep (assuming that it wasn't canceled) for @ns
- * %QEMU_CLOCK_REALTIME nanoseconds.  Canceling the job will not interrupt
- * the wait, so the cancel will not process until the coroutine wakes up.
+ * %QEMU_CLOCK_REALTIME nanoseconds.  Canceling the job will immediately
+ * interrupt the wait.
  */
 void block_job_sleep_ns(BlockJob *job, int64_t ns);
 
-- 
cgit v1.2.3


From f1a7ff770f7d71ee7833ff019aac9d6cc3d13f71 Mon Sep 17 00:00:00 2001
From: Peter Lieven <pl@kamp.de>
Date: Mon, 27 Nov 2017 17:00:07 +0100
Subject: block/nfs: fix nfs_client_open for filesize greater than 1TB

DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE) was overflowing ret (int) if
st.st_size is greater than 1TB.

Cc: qemu-stable@nongnu.org
Signed-off-by: Peter Lieven <pl@kamp.de>
Message-id: 1511798407-31129-1-git-send-email-pl@kamp.de
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/nfs.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/block/nfs.c b/block/nfs.c
index 337fcd9c84..effc8719b5 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -1,7 +1,7 @@
 /*
  * QEMU Block driver for native access to files on NFS shares
  *
- * Copyright (c) 2014-2016 Peter Lieven <pl@kamp.de>
+ * Copyright (c) 2014-2017 Peter Lieven <pl@kamp.de>
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -496,7 +496,7 @@ out:
 static int64_t nfs_client_open(NFSClient *client, QDict *options,
                                int flags, int open_flags, Error **errp)
 {
-    int ret = -EINVAL;
+    int64_t ret = -EINVAL;
     QemuOpts *opts = NULL;
     Error *local_err = NULL;
     struct stat st;
@@ -686,8 +686,7 @@ static QemuOptsList nfs_create_opts = {
 
 static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp)
 {
-    int ret = 0;
-    int64_t total_size = 0;
+    int64_t ret, total_size;
     NFSClient *client = g_new0(NFSClient, 1);
     QDict *options = NULL;
 
-- 
cgit v1.2.3