aboutsummaryrefslogtreecommitdiff
path: root/block/file-posix.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/file-posix.c')
-rw-r--r--block/file-posix.c2742
1 files changed, 1910 insertions, 832 deletions
diff --git a/block/file-posix.c b/block/file-posix.c
index 2da3a76355..35684f7e21 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -26,9 +26,12 @@
#include "qapi/error.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
+#include "block/block-io.h"
#include "block/block_int.h"
#include "qemu/module.h"
#include "qemu/option.h"
+#include "qemu/units.h"
+#include "qemu/memalign.h"
#include "trace.h"
#include "block/thread-pool.h"
#include "qemu/iov.h"
@@ -40,8 +43,11 @@
#include "scsi/constants.h"
#if defined(__APPLE__) && (__MACH__)
+#include <sys/ioctl.h>
+#if defined(HAVE_HOST_BLOCK_DEVICE)
#include <paths.h>
#include <sys/param.h>
+#include <sys/mount.h>
#include <IOKit/IOKitLib.h>
#include <IOKit/IOBSD.h>
#include <IOKit/storage/IOMediaBSDClient.h>
@@ -50,6 +56,7 @@
//#include <IOKit/storage/IOCDTypes.h>
#include <IOKit/storage/IODVDMedia.h>
#include <CoreFoundation/CoreFoundation.h>
+#endif /* defined(HAVE_HOST_BLOCK_DEVICE) */
#endif
#ifdef __sun__
@@ -60,10 +67,15 @@
#include <sys/ioctl.h>
#include <sys/param.h>
#include <sys/syscall.h>
+#include <sys/vfs.h>
+#if defined(CONFIG_BLKZONED)
+#include <linux/blkzoned.h>
+#endif
#include <linux/cdrom.h>
#include <linux/fd.h>
#include <linux/fs.h>
#include <linux/hdreg.h>
+#include <linux/magic.h>
#include <scsi/sg.h>
#ifdef __s390__
#include <asm/dasd.h>
@@ -98,24 +110,6 @@
#include <sys/diskslice.h>
#endif
-#ifdef CONFIG_XFS
-#include <xfs/xfs.h>
-#endif
-
-//#define DEBUG_BLOCK
-
-#ifdef DEBUG_BLOCK
-# define DEBUG_BLOCK_PRINT 1
-#else
-# define DEBUG_BLOCK_PRINT 0
-#endif
-#define DPRINTF(fmt, ...) \
-do { \
- if (DEBUG_BLOCK_PRINT) { \
- printf(fmt, ## __VA_ARGS__); \
- } \
-} while (0)
-
/* OS X does not have O_DSYNC */
#ifndef O_DSYNC
#ifdef O_SYNC
@@ -142,7 +136,6 @@ do { \
typedef struct BDRVRawState {
int fd;
- int lock_fd;
bool use_lock;
int type;
int open_flags;
@@ -152,51 +145,87 @@ typedef struct BDRVRawState {
uint64_t perm;
uint64_t shared_perm;
-#ifdef CONFIG_XFS
- bool is_xfs:1;
-#endif
+ /* The perms bits whose corresponding bytes are already locked in
+ * s->fd. */
+ uint64_t locked_perm;
+ uint64_t locked_shared_perm;
+
+ uint64_t aio_max_batch;
+
+ int perm_change_fd;
+ int perm_change_flags;
+ BDRVReopenState *reopen_state;
+
bool has_discard:1;
bool has_write_zeroes:1;
- bool discard_zeroes:1;
bool use_linux_aio:1;
- bool page_cache_inconsistent:1;
+ bool use_linux_io_uring:1;
+ int page_cache_inconsistent; /* errno from fdatasync failure */
bool has_fallocate;
bool needs_alignment;
+ bool force_alignment;
+ bool drop_cache;
bool check_cache_dropped;
+ struct {
+ uint64_t discard_nb_ok;
+ uint64_t discard_nb_failed;
+ uint64_t discard_bytes_ok;
+ } stats;
PRManager *pr_mgr;
} BDRVRawState;
typedef struct BDRVRawReopenState {
- int fd;
int open_flags;
+ bool drop_cache;
bool check_cache_dropped;
} BDRVRawReopenState;
-static int fd_open(BlockDriverState *bs);
+static int fd_open(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+
+ /* this is just to ensure s->fd is sane (its called by io ops) */
+ if (s->fd >= 0) {
+ return 0;
+ }
+ return -EIO;
+}
+
static int64_t raw_getlength(BlockDriverState *bs);
typedef struct RawPosixAIOData {
BlockDriverState *bs;
+ int aio_type;
int aio_fildes;
- union {
- struct iovec *aio_iov;
- void *aio_ioctl_buf;
- };
- int aio_niov;
- uint64_t aio_nbytes;
-#define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */
+
off_t aio_offset;
- int aio_type;
+ uint64_t aio_nbytes;
+
union {
struct {
+ struct iovec *iov;
+ int niov;
+ } io;
+ struct {
+ uint64_t cmd;
+ void *buf;
+ } ioctl;
+ struct {
int aio_fd2;
off_t aio_offset2;
- };
+ } copy_range;
struct {
PreallocMode prealloc;
Error **errp;
- };
+ } truncate;
+ struct {
+ unsigned int *nr_zones;
+ BlockZoneDescriptor *zones;
+ } zone_report;
+ struct {
+ unsigned long op;
+ } zone_mgmt;
};
} RawPosixAIOData;
@@ -204,8 +233,22 @@ typedef struct RawPosixAIOData {
static int cdrom_reopen(BlockDriverState *bs);
#endif
+/*
+ * Elide EAGAIN and EACCES details when failing to lock, as this
+ * indicates that the specified file region is already locked by
+ * another process, which is considered a common scenario.
+ */
+#define raw_lock_error_setg_errno(errp, err, fmt, ...) \
+ do { \
+ if ((err) == EAGAIN || (err) == EACCES) { \
+ error_setg((errp), (fmt), ## __VA_ARGS__); \
+ } else { \
+ error_setg_errno((errp), (err), (fmt), ## __VA_ARGS__); \
+ } \
+ } while (0)
+
#if defined(__NetBSD__)
-static int raw_normalize_devicepath(const char **filename)
+static int raw_normalize_devicepath(const char **filename, Error **errp)
{
static char namebuf[PATH_MAX];
const char *dp, *fname;
@@ -214,8 +257,7 @@ static int raw_normalize_devicepath(const char **filename)
fname = *filename;
dp = strrchr(fname, '/');
if (lstat(fname, &sb) < 0) {
- fprintf(stderr, "%s: stat failed: %s\n",
- fname, strerror(errno));
+ error_setg_file_open(errp, errno, fname);
return -errno;
}
@@ -229,14 +271,13 @@ static int raw_normalize_devicepath(const char **filename)
snprintf(namebuf, PATH_MAX, "%.*s/r%s",
(int)(dp - fname), fname, dp + 1);
}
- fprintf(stderr, "%s is a block device", fname);
*filename = namebuf;
- fprintf(stderr, ", using %s\n", *filename);
+ warn_report("%s is a block device, using %s", fname, *filename);
return 0;
}
#else
-static int raw_normalize_devicepath(const char **filename)
+static int raw_normalize_devicepath(const char **filename, Error **errp)
{
return 0;
}
@@ -292,6 +333,39 @@ static int probe_physical_blocksize(int fd, unsigned int *blk_size)
#endif
}
+/*
+ * Returns true if no alignment restrictions are necessary even for files
+ * opened with O_DIRECT.
+ *
+ * raw_probe_alignment() probes the required alignment and assume that 1 means
+ * the probing failed, so it falls back to a safe default of 4k. This can be
+ * avoided if we know that byte alignment is okay for the file.
+ */
+static bool dio_byte_aligned(int fd)
+{
+#ifdef __linux__
+ struct statfs buf;
+ int ret;
+
+ ret = fstatfs(fd, &buf);
+ if (ret == 0 && buf.f_type == NFS_SUPER_MAGIC) {
+ return true;
+ }
+#endif
+ return false;
+}
+
+static bool raw_needs_alignment(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+
+ if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) {
+ return true;
+ }
+
+ return s->force_alignment;
+}
+
/* Check if read is allowed with given memory buffer and length.
*
* This function is used to check O_DIRECT memory buffer and request alignment.
@@ -321,7 +395,8 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
{
BDRVRawState *s = bs->opaque;
char *buf;
- size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
+ size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size());
+ size_t alignments[] = {1, 512, 1024, 2048, 4096};
/* For SCSI generic devices the alignment is not really used.
With buffered I/O, we don't have any restrictions. */
@@ -337,36 +412,57 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
bs->bl.request_alignment = 0;
}
-#ifdef CONFIG_XFS
- if (s->is_xfs) {
- struct dioattr da;
- if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
- bs->bl.request_alignment = da.d_miniosz;
- /* The kernel returns wrong information for d_mem */
- /* s->buf_align = da.d_mem; */
- }
+
+#ifdef __linux__
+ /*
+ * The XFS ioctl definitions are shipped in extra packages that might
+ * not always be available. Since we just need the XFS_IOC_DIOINFO ioctl
+ * here, we simply use our own definition instead:
+ */
+ struct xfs_dioattr {
+ uint32_t d_mem;
+ uint32_t d_miniosz;
+ uint32_t d_maxiosz;
+ } da;
+ if (ioctl(fd, _IOR('X', 30, struct xfs_dioattr), &da) >= 0) {
+ bs->bl.request_alignment = da.d_miniosz;
+ /* The kernel returns wrong information for d_mem */
+ /* s->buf_align = da.d_mem; */
}
#endif
- /* If we could not get the sizes so far, we can only guess them */
- if (!s->buf_align) {
+ /*
+ * If we could not get the sizes so far, we can only guess them. First try
+ * to detect request alignment, since it is more likely to succeed. Then
+ * try to detect buf_align, which cannot be detected in some cases (e.g.
+ * Gluster). If buf_align cannot be detected, we fallback to the value of
+ * request_alignment.
+ */
+
+ if (!bs->bl.request_alignment) {
+ int i;
size_t align;
- buf = qemu_memalign(max_align, 2 * max_align);
- for (align = 512; align <= max_align; align <<= 1) {
- if (raw_is_io_aligned(fd, buf + align, max_align)) {
- s->buf_align = align;
+ buf = qemu_memalign(max_align, max_align);
+ for (i = 0; i < ARRAY_SIZE(alignments); i++) {
+ align = alignments[i];
+ if (raw_is_io_aligned(fd, buf, align)) {
+ /* Fallback to safe value. */
+ bs->bl.request_alignment = (align != 1) ? align : max_align;
break;
}
}
qemu_vfree(buf);
}
- if (!bs->bl.request_alignment) {
+ if (!s->buf_align) {
+ int i;
size_t align;
- buf = qemu_memalign(s->buf_align, max_align);
- for (align = 512; align <= max_align; align <<= 1) {
- if (raw_is_io_aligned(fd, buf, align)) {
- bs->bl.request_alignment = align;
+ buf = qemu_memalign(max_align, 2 * max_align);
+ for (i = 0; i < ARRAY_SIZE(alignments); i++) {
+ align = alignments[i];
+ if (raw_is_io_aligned(fd, buf + align, max_align)) {
+ /* Fallback to request_alignment. */
+ s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
break;
}
}
@@ -379,13 +475,54 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
}
}
-static void raw_parse_flags(int bdrv_flags, int *open_flags)
+static int check_hdev_writable(int fd)
{
+#if defined(BLKROGET)
+ /* Linux block devices can be configured "read-only" using blockdev(8).
+ * This is independent of device node permissions and therefore open(2)
+ * with O_RDWR succeeds. Actual writes fail with EPERM.
+ *
+ * bdrv_open() is supposed to fail if the disk is read-only. Explicitly
+ * check for read-only block devices so that Linux block devices behave
+ * properly.
+ */
+ struct stat st;
+ int readonly = 0;
+
+ if (fstat(fd, &st)) {
+ return -errno;
+ }
+
+ if (!S_ISBLK(st.st_mode)) {
+ return 0;
+ }
+
+ if (ioctl(fd, BLKROGET, &readonly) < 0) {
+ return -errno;
+ }
+
+ if (readonly) {
+ return -EACCES;
+ }
+#endif /* defined(BLKROGET) */
+ return 0;
+}
+
+static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers)
+{
+ bool read_write = false;
assert(open_flags != NULL);
*open_flags |= O_BINARY;
*open_flags &= ~O_ACCMODE;
- if (bdrv_flags & BDRV_O_RDWR) {
+
+ if (bdrv_flags & BDRV_O_AUTO_RDONLY) {
+ read_write = has_writers;
+ } else if (bdrv_flags & BDRV_O_RDWR) {
+ read_write = true;
+ }
+
+ if (read_write) {
*open_flags |= O_RDWR;
} else {
*open_flags |= O_RDONLY;
@@ -416,7 +553,12 @@ static QemuOptsList raw_runtime_opts = {
{
.name = "aio",
.type = QEMU_OPT_STRING,
- .help = "host AIO implementation (threads, native)",
+ .help = "host AIO implementation (threads, native, io_uring)",
+ },
+ {
+ .name = "aio-max-batch",
+ .type = QEMU_OPT_NUMBER,
+ .help = "AIO max batch size (0 = auto handled by AIO backend, default: 0)",
},
{
.name = "locking",
@@ -428,6 +570,13 @@ static QemuOptsList raw_runtime_opts = {
.type = QEMU_OPT_STRING,
.help = "id of persistent reservation manager object (default: none)",
},
+#if defined(__linux__)
+ {
+ .name = "drop-cache",
+ .type = QEMU_OPT_BOOL,
+ .help = "invalidate page cache during live migration (default: on)",
+ },
+#endif
{
.name = "x-check-cache-dropped",
.type = QEMU_OPT_BOOL,
@@ -437,6 +586,8 @@ static QemuOptsList raw_runtime_opts = {
},
};
+static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL };
+
static int raw_open_common(BlockDriverState *bs, QDict *options,
int bdrv_flags, int open_flags,
bool device, Error **errp)
@@ -452,24 +603,28 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
OnOffAuto locking;
opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
- qemu_opts_absorb_qdict(opts, options, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
+ if (!qemu_opts_absorb_qdict(opts, options, errp)) {
ret = -EINVAL;
goto fail;
}
filename = qemu_opt_get(opts, "filename");
- ret = raw_normalize_devicepath(&filename);
+ ret = raw_normalize_devicepath(&filename, errp);
if (ret != 0) {
- error_setg_errno(errp, -ret, "Could not normalize device path");
goto fail;
}
- aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO)
- ? BLOCKDEV_AIO_OPTIONS_NATIVE
- : BLOCKDEV_AIO_OPTIONS_THREADS;
+ if (bdrv_flags & BDRV_O_NATIVE_AIO) {
+ aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE;
+#ifdef CONFIG_LINUX_IO_URING
+ } else if (bdrv_flags & BDRV_O_IO_URING) {
+ aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING;
+#endif
+ } else {
+ aio_default = BLOCKDEV_AIO_OPTIONS_THREADS;
+ }
+
aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
qemu_opt_get(opts, "aio"),
aio_default, &local_err);
@@ -478,7 +633,13 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
ret = -EINVAL;
goto fail;
}
+
s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
+#ifdef CONFIG_LINUX_IO_URING
+ s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
+#endif
+
+ s->aio_max_batch = qemu_opt_get_number(opts, "aio-max-batch", 0);
locking = qapi_enum_parse(&OnOffAuto_lookup,
qemu_opt_get(opts, "locking"),
@@ -492,11 +653,10 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
case ON_OFF_AUTO_ON:
s->use_lock = true;
if (!qemu_has_ofd_lock()) {
- fprintf(stderr,
- "File lock requested but OFD locking syscall is "
- "unavailable, falling back to POSIX file locks.\n"
- "Due to the implementation, locks can be lost "
- "unexpectedly.\n");
+ warn_report("File lock requested but OFD locking syscall is "
+ "unavailable, falling back to POSIX file locks");
+ error_printf("Due to the implementation, locks can be lost "
+ "unexpectedly.\n");
}
break;
case ON_OFF_AUTO_OFF:
@@ -519,17 +679,18 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
}
}
+ s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
false);
s->open_flags = open_flags;
- raw_parse_flags(bdrv_flags, &s->open_flags);
+ raw_parse_flags(bdrv_flags, &s->open_flags, false);
s->fd = -1;
- fd = qemu_open(filename, s->open_flags, 0644);
- if (fd < 0) {
- ret = -errno;
- error_setg_errno(errp, errno, "Could not open '%s'", filename);
+ fd = qemu_open(filename, s->open_flags, errp);
+ ret = fd < 0 ? -errno : 0;
+
+ if (ret < 0) {
if (ret == -EROFS) {
ret = -EACCES;
}
@@ -537,34 +698,25 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
}
s->fd = fd;
- s->lock_fd = -1;
- if (s->use_lock) {
- fd = qemu_open(filename, s->open_flags);
- if (fd < 0) {
- ret = -errno;
- error_setg_errno(errp, errno, "Could not open '%s' for locking",
- filename);
- qemu_close(s->fd);
+ /* Check s->open_flags rather than bdrv_flags due to auto-read-only */
+ if (s->open_flags & O_RDWR) {
+ ret = check_hdev_writable(s->fd);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "The device is not writable");
goto fail;
}
- s->lock_fd = fd;
}
+
s->perm = 0;
s->shared_perm = BLK_PERM_ALL;
#ifdef CONFIG_LINUX_AIO
/* Currently Linux does AIO only for files opened with O_DIRECT */
- if (s->use_linux_aio) {
- if (!(s->open_flags & O_DIRECT)) {
- error_setg(errp, "aio=native was specified, but it requires "
- "cache.direct=on, which was not specified.");
- ret = -EINVAL;
- goto fail;
- }
- if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
- error_prepend(errp, "Unable to use native AIO: ");
- goto fail;
- }
+ if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) {
+ error_setg(errp, "aio=native was specified, but it requires "
+ "cache.direct=on, which was not specified.");
+ ret = -EINVAL;
+ goto fail;
}
#else
if (s->use_linux_aio) {
@@ -575,11 +727,17 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
}
#endif /* !defined(CONFIG_LINUX_AIO) */
+#ifndef CONFIG_LINUX_IO_URING
+ if (s->use_linux_io_uring) {
+ error_setg(errp, "aio=io_uring was specified, but is not supported "
+ "in this build.");
+ ret = -EINVAL;
+ goto fail;
+ }
+#endif /* !defined(CONFIG_LINUX_IO_URING) */
+
s->has_discard = true;
s->has_write_zeroes = true;
- if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
- s->needs_alignment = true;
- }
if (fstat(s->fd, &st) < 0) {
ret = -errno;
@@ -588,44 +746,43 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
}
if (!device) {
- if (S_ISBLK(st.st_mode)) {
- warn_report("Opening a block device as a file using the '%s' "
- "driver is deprecated", bs->drv->format_name);
- } else if (S_ISCHR(st.st_mode)) {
- warn_report("Opening a character device as a file using the '%s' "
- "driver is deprecated", bs->drv->format_name);
- } else if (!S_ISREG(st.st_mode)) {
- error_setg(errp, "A regular file was expected by the '%s' driver, "
- "but something else was given", bs->drv->format_name);
+ if (!S_ISREG(st.st_mode)) {
+ error_setg(errp, "'%s' driver requires '%s' to be a regular file",
+ bs->drv->format_name, bs->filename);
ret = -EINVAL;
goto fail;
} else {
- s->discard_zeroes = true;
s->has_fallocate = true;
}
} else {
if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
- error_setg(errp, "'%s' driver expects either "
- "a character or block device", bs->drv->format_name);
+ error_setg(errp, "'%s' driver requires '%s' to be either "
+ "a character or block device",
+ bs->drv->format_name, bs->filename);
ret = -EINVAL;
goto fail;
}
}
+#ifdef CONFIG_BLKZONED
+ /*
+ * The kernel page cache does not reliably work for writes to SWR zones
+ * of zoned block device because it can not guarantee the order of writes.
+ */
+ if ((bs->bl.zoned != BLK_Z_NONE) &&
+ (!(s->open_flags & O_DIRECT))) {
+ error_setg(errp, "The driver supports zoned devices, and it requires "
+ "cache.direct=on, which was not specified.");
+ return -EINVAL; /* No host kernel page cache */
+ }
+#endif
if (S_ISBLK(st.st_mode)) {
-#ifdef BLKDISCARDZEROES
- unsigned int arg;
- if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
- s->discard_zeroes = true;
- }
-#endif
#ifdef __linux__
/* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do
* not rely on the contents of discarded blocks unless using O_DIRECT.
* Same for BLKZEROOUT.
*/
if (!(bs->open_flags & BDRV_O_NOCACHE)) {
- s->discard_zeroes = false;
s->has_write_zeroes = false;
}
#endif
@@ -638,19 +795,21 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
* so QEMU makes sure all IO operations on the device are aligned
* to sector size, or else FreeBSD will reject them with EINVAL.
*/
- s->needs_alignment = true;
+ s->force_alignment = true;
}
#endif
+ s->needs_alignment = raw_needs_alignment(bs);
-#ifdef CONFIG_XFS
- if (platform_test_xfs_fd(s->fd)) {
- s->is_xfs = true;
+ bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
+ if (S_ISREG(st.st_mode)) {
+ /* When extending regular files, we get zeros from the OS */
+ bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
}
-#endif
-
- bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
ret = 0;
fail:
+ if (ret < 0 && s->fd != -1) {
+ qemu_close(s->fd);
+ }
if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
unlink(filename);
}
@@ -680,43 +839,74 @@ typedef enum {
* file; if @unlock == true, also unlock the unneeded bytes.
* @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
*/
-static int raw_apply_lock_bytes(int fd,
+static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
uint64_t perm_lock_bits,
uint64_t shared_perm_lock_bits,
bool unlock, Error **errp)
{
int ret;
int i;
+ uint64_t locked_perm, locked_shared_perm;
+
+ if (s) {
+ locked_perm = s->locked_perm;
+ locked_shared_perm = s->locked_shared_perm;
+ } else {
+ /*
+ * We don't have the previous bits, just lock/unlock for each of the
+ * requested bits.
+ */
+ if (unlock) {
+ locked_perm = BLK_PERM_ALL;
+ locked_shared_perm = BLK_PERM_ALL;
+ } else {
+ locked_perm = 0;
+ locked_shared_perm = 0;
+ }
+ }
PERM_FOREACH(i) {
int off = RAW_LOCK_PERM_BASE + i;
- if (perm_lock_bits & (1ULL << i)) {
+ uint64_t bit = (1ULL << i);
+ if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
ret = qemu_lock_fd(fd, off, 1, false);
if (ret) {
- error_setg(errp, "Failed to lock byte %d", off);
+ raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
+ off);
return ret;
+ } else if (s) {
+ s->locked_perm |= bit;
}
- } else if (unlock) {
+ } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
ret = qemu_unlock_fd(fd, off, 1);
if (ret) {
- error_setg(errp, "Failed to unlock byte %d", off);
+ error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
return ret;
+ } else if (s) {
+ s->locked_perm &= ~bit;
}
}
}
PERM_FOREACH(i) {
int off = RAW_LOCK_SHARED_BASE + i;
- if (shared_perm_lock_bits & (1ULL << i)) {
+ uint64_t bit = (1ULL << i);
+ if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
ret = qemu_lock_fd(fd, off, 1, false);
if (ret) {
- error_setg(errp, "Failed to lock byte %d", off);
+ raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
+ off);
return ret;
+ } else if (s) {
+ s->locked_shared_perm |= bit;
}
- } else if (unlock) {
+ } else if (unlock && (locked_shared_perm & bit) &&
+ !(shared_perm_lock_bits & bit)) {
ret = qemu_unlock_fd(fd, off, 1);
if (ret) {
- error_setg(errp, "Failed to unlock byte %d", off);
+ error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
return ret;
+ } else if (s) {
+ s->locked_shared_perm &= ~bit;
}
}
}
@@ -737,9 +927,10 @@ static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
ret = qemu_lock_fd_test(fd, off, 1, true);
if (ret) {
char *perm_name = bdrv_perm_names(p);
- error_setg(errp,
- "Failed to get \"%s\" lock",
- perm_name);
+
+ raw_lock_error_setg_errno(errp, -ret,
+ "Failed to get \"%s\" lock",
+ perm_name);
g_free(perm_name);
return ret;
}
@@ -752,9 +943,10 @@ static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
ret = qemu_lock_fd_test(fd, off, 1, true);
if (ret) {
char *perm_name = bdrv_perm_names(p);
- error_setg(errp,
- "Failed to get shared \"%s\" lock",
- perm_name);
+
+ raw_lock_error_setg_errno(errp, -ret,
+ "Failed to get shared \"%s\" lock",
+ perm_name);
g_free(perm_name);
return ret;
}
@@ -780,15 +972,25 @@ static int raw_handle_perm_lock(BlockDriverState *bs,
return 0;
}
- assert(s->lock_fd > 0);
-
switch (op) {
case RAW_PL_PREPARE:
- ret = raw_apply_lock_bytes(s->lock_fd, s->perm | new_perm,
+ if ((s->perm | new_perm) == s->perm &&
+ (s->shared_perm & new_shared) == s->shared_perm)
+ {
+ /*
+ * We are going to unlock bytes, it should not fail. If it fail due
+ * to some fs-dependent permission-unrelated reasons (which occurs
+ * sometimes on NFS and leads to abort in bdrv_replace_child) we
+ * can't prevent such errors by any check here. And we ignore them
+ * anyway in ABORT and COMMIT.
+ */
+ return 0;
+ }
+ ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
~s->shared_perm | ~new_shared,
false, errp);
if (!ret) {
- ret = raw_check_lock_bytes(s->lock_fd, new_perm, new_shared, errp);
+ ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
if (!ret) {
return 0;
}
@@ -796,78 +998,67 @@ static int raw_handle_perm_lock(BlockDriverState *bs,
"Is another process using the image [%s]?\n",
bs->filename);
}
- op = RAW_PL_ABORT;
/* fall through to unlock bytes. */
case RAW_PL_ABORT:
- raw_apply_lock_bytes(s->lock_fd, s->perm, ~s->shared_perm,
+ raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
true, &local_err);
if (local_err) {
/* Theoretically the above call only unlocks bytes and it cannot
* fail. Something weird happened, report it.
*/
- error_report_err(local_err);
+ warn_report_err(local_err);
}
break;
case RAW_PL_COMMIT:
- raw_apply_lock_bytes(s->lock_fd, new_perm, ~new_shared,
+ raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
true, &local_err);
if (local_err) {
/* Theoretically the above call only unlocks bytes and it cannot
* fail. Something weird happened, report it.
*/
- error_report_err(local_err);
+ warn_report_err(local_err);
}
break;
}
return ret;
}
-static int raw_reopen_prepare(BDRVReopenState *state,
- BlockReopenQueue *queue, Error **errp)
+/* Sets a specific flag */
+static int fcntl_setfl(int fd, int flag)
{
- BDRVRawState *s;
- BDRVRawReopenState *rs;
- QemuOpts *opts;
- int ret = 0;
- Error *local_err = NULL;
-
- assert(state != NULL);
- assert(state->bs != NULL);
-
- s = state->bs->opaque;
-
- state->opaque = g_new0(BDRVRawReopenState, 1);
- rs = state->opaque;
- rs->fd = -1;
+ int flags;
- /* Handle options changes */
- opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
- qemu_opts_absorb_qdict(opts, state->options, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
- ret = -EINVAL;
- goto out;
+ flags = fcntl(fd, F_GETFL);
+ if (flags == -1) {
+ return -errno;
}
-
- rs->check_cache_dropped =
- qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
-
- /* This driver's reopen function doesn't currently allow changing
- * other options, so let's put them back in the original QDict and
- * bdrv_reopen_prepare() will detect changes and complain. */
- qemu_opts_to_qdict(opts, state->options);
-
- if (s->type == FTYPE_CD) {
- rs->open_flags |= O_NONBLOCK;
+ if (fcntl(fd, F_SETFL, flags | flag) == -1) {
+ return -errno;
}
+ return 0;
+}
- raw_parse_flags(state->flags, &rs->open_flags);
-
+static int raw_reconfigure_getfd(BlockDriverState *bs, int flags,
+ int *open_flags, uint64_t perm, bool force_dup,
+ Error **errp)
+{
+ BDRVRawState *s = bs->opaque;
+ int fd = -1;
+ int ret;
+ bool has_writers = perm &
+ (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE);
int fcntl_flags = O_APPEND | O_NONBLOCK;
#ifdef O_NOATIME
fcntl_flags |= O_NOATIME;
#endif
+ *open_flags = 0;
+ if (s->type == FTYPE_CD) {
+ *open_flags |= O_NONBLOCK;
+ }
+
+ raw_parse_flags(flags, open_flags, has_writers);
+
#ifdef O_ASYNC
/* Not all operating systems have O_ASYNC, and those that don't
* will not let us track the state into rs->open_flags (typically
@@ -877,46 +1068,91 @@ static int raw_reopen_prepare(BDRVReopenState *state,
assert((s->open_flags & O_ASYNC) == 0);
#endif
- if ((rs->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
+ if (!force_dup && *open_flags == s->open_flags) {
+ /* We're lucky, the existing fd is fine */
+ return s->fd;
+ }
+
+ if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
/* dup the original fd */
- rs->fd = qemu_dup(s->fd);
- if (rs->fd >= 0) {
- ret = fcntl_setfl(rs->fd, rs->open_flags);
+ fd = qemu_dup(s->fd);
+ if (fd >= 0) {
+ ret = fcntl_setfl(fd, *open_flags);
if (ret) {
- qemu_close(rs->fd);
- rs->fd = -1;
+ qemu_close(fd);
+ fd = -1;
}
}
}
/* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
- if (rs->fd == -1) {
- const char *normalized_filename = state->bs->filename;
- ret = raw_normalize_devicepath(&normalized_filename);
- if (ret < 0) {
- error_setg_errno(errp, -ret, "Could not normalize device path");
- } else {
- assert(!(rs->open_flags & O_CREAT));
- rs->fd = qemu_open(normalized_filename, rs->open_flags);
- if (rs->fd == -1) {
- error_setg_errno(errp, errno, "Could not reopen file");
- ret = -1;
+ if (fd == -1) {
+ const char *normalized_filename = bs->filename;
+ ret = raw_normalize_devicepath(&normalized_filename, errp);
+ if (ret >= 0) {
+ fd = qemu_open(normalized_filename, *open_flags, errp);
+ if (fd == -1) {
+ return -1;
}
}
}
- /* Fail already reopen_prepare() if we can't get a working O_DIRECT
- * alignment with the new fd. */
- if (rs->fd != -1) {
- raw_probe_alignment(state->bs, rs->fd, &local_err);
- if (local_err) {
- qemu_close(rs->fd);
- rs->fd = -1;
- error_propagate(errp, local_err);
- ret = -EINVAL;
+ if (fd != -1 && (*open_flags & O_RDWR)) {
+ ret = check_hdev_writable(fd);
+ if (ret < 0) {
+ qemu_close(fd);
+ error_setg_errno(errp, -ret, "The device is not writable");
+ return -1;
}
}
+ return fd;
+}
+
+static int raw_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ BDRVRawState *s;
+ BDRVRawReopenState *rs;
+ QemuOpts *opts;
+ int ret;
+
+ assert(state != NULL);
+ assert(state->bs != NULL);
+
+ s = state->bs->opaque;
+
+ state->opaque = g_new0(BDRVRawReopenState, 1);
+ rs = state->opaque;
+
+ /* Handle options changes */
+ opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
+ if (!qemu_opts_absorb_qdict(opts, state->options, errp)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
+ rs->check_cache_dropped =
+ qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
+
+ /* This driver's reopen function doesn't currently allow changing
+ * other options, so let's put them back in the original QDict and
+ * bdrv_reopen_prepare() will detect changes and complain. */
+ qemu_opts_to_qdict(opts, state->options);
+
+ /*
+ * As part of reopen prepare we also want to create new fd by
+ * raw_reconfigure_getfd(). But it wants updated "perm", when in
+ * bdrv_reopen_multiple() .bdrv_reopen_prepare() callback called prior to
+ * permission update. Happily, permission update is always a part
+ * (a separate stage) of bdrv_reopen_multiple() so we can rely on this
+ * fact and reconfigure fd in raw_check_perm().
+ */
+
+ s->reopen_state = state;
+ ret = 0;
+
out:
qemu_opts_del(opts);
return ret;
@@ -927,118 +1163,360 @@ static void raw_reopen_commit(BDRVReopenState *state)
BDRVRawReopenState *rs = state->opaque;
BDRVRawState *s = state->bs->opaque;
+ s->drop_cache = rs->drop_cache;
s->check_cache_dropped = rs->check_cache_dropped;
s->open_flags = rs->open_flags;
-
- qemu_close(s->fd);
- s->fd = rs->fd;
-
g_free(state->opaque);
state->opaque = NULL;
+
+ assert(s->reopen_state == state);
+ s->reopen_state = NULL;
}
static void raw_reopen_abort(BDRVReopenState *state)
{
BDRVRawReopenState *rs = state->opaque;
+ BDRVRawState *s = state->bs->opaque;
/* nothing to do if NULL, we didn't get far enough */
if (rs == NULL) {
return;
}
- if (rs->fd >= 0) {
- qemu_close(rs->fd);
- rs->fd = -1;
- }
g_free(state->opaque);
state->opaque = NULL;
+
+ assert(s->reopen_state == state);
+ s->reopen_state = NULL;
}
-static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
+static int hdev_get_max_hw_transfer(int fd, struct stat *st)
{
#ifdef BLKSECTGET
- int max_bytes = 0;
- short max_sectors = 0;
- if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
- return max_bytes;
- } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
- return max_sectors << BDRV_SECTOR_BITS;
+ if (S_ISBLK(st->st_mode)) {
+ unsigned short max_sectors = 0;
+ if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
+ return max_sectors * 512;
+ }
} else {
- return -errno;
+ int max_bytes = 0;
+ if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
+ return max_bytes;
+ }
}
+ return -errno;
#else
return -ENOSYS;
#endif
}
-static int hdev_get_max_segments(const struct stat *st)
+/*
+ * Get a sysfs attribute value as character string.
+ */
+#ifdef CONFIG_LINUX
+static int get_sysfs_str_val(struct stat *st, const char *attribute,
+ char **val) {
+ g_autofree char *sysfspath = NULL;
+ size_t len;
+
+ if (!S_ISBLK(st->st_mode)) {
+ return -ENOTSUP;
+ }
+
+ sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
+ major(st->st_rdev), minor(st->st_rdev),
+ attribute);
+ if (!g_file_get_contents(sysfspath, val, &len, NULL)) {
+ return -ENOENT;
+ }
+
+ /* The file is ended with '\n' */
+ char *p;
+ p = *val;
+ if (*(p + len - 1) == '\n') {
+ *(p + len - 1) = '\0';
+ }
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
{
+ g_autofree char *val = NULL;
+ int ret;
+
+ ret = get_sysfs_str_val(st, "zoned", &val);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (strcmp(val, "host-managed") == 0) {
+ *zoned = BLK_Z_HM;
+ } else if (strcmp(val, "host-aware") == 0) {
+ *zoned = BLK_Z_HA;
+ } else if (strcmp(val, "none") == 0) {
+ *zoned = BLK_Z_NONE;
+ } else {
+ return -ENOTSUP;
+ }
+ return 0;
+}
+#endif /* defined(CONFIG_BLKZONED) */
+
+/*
+ * Get a sysfs attribute value as a long integer.
+ */
#ifdef CONFIG_LINUX
- char buf[32];
+static long get_sysfs_long_val(struct stat *st, const char *attribute)
+{
+ g_autofree char *str = NULL;
const char *end;
- char *sysfspath;
+ long val;
int ret;
- int fd = -1;
- long max_segments;
- sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
- major(st->st_rdev), minor(st->st_rdev));
- fd = open(sysfspath, O_RDONLY);
- if (fd == -1) {
- ret = -errno;
- goto out;
- }
- do {
- ret = read(fd, buf, sizeof(buf) - 1);
- } while (ret == -1 && errno == EINTR);
+ ret = get_sysfs_str_val(st, attribute, &str);
if (ret < 0) {
- ret = -errno;
- goto out;
- } else if (ret == 0) {
- ret = -EIO;
- goto out;
+ return ret;
}
- buf[ret] = 0;
+
/* The file is ended with '\n', pass 'end' to accept that. */
- ret = qemu_strtol(buf, &end, 10, &max_segments);
- if (ret == 0 && end && *end == '\n') {
- ret = max_segments;
+ ret = qemu_strtol(str, &end, 10, &val);
+ if (ret == 0 && end && *end == '\0') {
+ ret = val;
}
+ return ret;
+}
+#endif
-out:
- if (fd != -1) {
- close(fd);
+static int hdev_get_max_segments(int fd, struct stat *st)
+{
+#ifdef CONFIG_LINUX
+ int ret;
+
+ if (S_ISCHR(st->st_mode)) {
+ if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
+ return ret;
+ }
+ return -ENOTSUP;
}
- g_free(sysfspath);
- return ret;
+ return get_sysfs_long_val(st, "max_segments");
#else
return -ENOTSUP;
#endif
}
-static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
+#if defined(CONFIG_BLKZONED)
+/*
+ * If the reset_all flag is true, then the wps of zone whose state is
+ * not readonly or offline should be all reset to the start sector.
+ * Else, take the real wp of the device.
+ */
+static int get_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
+ unsigned int nrz, bool reset_all)
{
- BDRVRawState *s = bs->opaque;
- struct stat st;
+ struct blk_zone *blkz;
+ size_t rep_size;
+ uint64_t sector = offset >> BDRV_SECTOR_BITS;
+ BlockZoneWps *wps = bs->wps;
+ unsigned int j = offset / bs->bl.zone_size;
+ unsigned int n = 0, i = 0;
+ int ret;
+ rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+ g_autofree struct blk_zone_report *rep = NULL;
- if (!fstat(s->fd, &st)) {
- if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
- int ret = hdev_get_max_transfer_length(bs, s->fd);
- if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
- bs->bl.max_transfer = pow2floor(ret);
- }
- ret = hdev_get_max_segments(&st);
- if (ret > 0) {
- bs->bl.max_transfer = MIN(bs->bl.max_transfer,
- ret * getpagesize());
+ rep = g_malloc(rep_size);
+ blkz = (struct blk_zone *)(rep + 1);
+ while (n < nrz) {
+ memset(rep, 0, rep_size);
+ rep->sector = sector;
+ rep->nr_zones = nrz - n;
+
+ do {
+ ret = ioctl(fd, BLKREPORTZONE, rep);
+ } while (ret != 0 && errno == EINTR);
+ if (ret != 0) {
+ error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+ fd, offset, errno);
+ return -errno;
+ }
+
+ if (!rep->nr_zones) {
+ break;
+ }
+
+ for (i = 0; i < rep->nr_zones; ++i, ++n, ++j) {
+ /*
+ * The wp tracking cares only about sequential writes required and
+ * sequential write preferred zones so that the wp can advance to
+ * the right location.
+ * Use the most significant bit of the wp location to indicate the
+ * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
+ */
+ if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ wps->wp[j] |= 1ULL << 63;
+ } else {
+ switch(blkz[i].cond) {
+ case BLK_ZONE_COND_FULL:
+ case BLK_ZONE_COND_READONLY:
+ /* Zone not writable */
+ wps->wp[j] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS;
+ break;
+ case BLK_ZONE_COND_OFFLINE:
+ /* Zone not writable nor readable */
+ wps->wp[j] = (blkz[i].start) << BDRV_SECTOR_BITS;
+ break;
+ default:
+ if (reset_all) {
+ wps->wp[j] = blkz[i].start << BDRV_SECTOR_BITS;
+ } else {
+ wps->wp[j] = blkz[i].wp << BDRV_SECTOR_BITS;
+ }
+ break;
+ }
}
}
+ sector = blkz[i - 1].start + blkz[i - 1].len;
+ }
+
+ return 0;
+}
+
+static void update_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
+ unsigned int nrz)
+{
+ if (get_zones_wp(bs, fd, offset, nrz, 0) < 0) {
+ error_report("update zone wp failed");
+ }
+}
+
+static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
+ Error **errp)
+{
+ BDRVRawState *s = bs->opaque;
+ BlockZoneModel zoned;
+ int ret;
+
+ ret = get_sysfs_zoned_model(st, &zoned);
+ if (ret < 0 || zoned == BLK_Z_NONE) {
+ goto no_zoned;
+ }
+ bs->bl.zoned = zoned;
+
+ ret = get_sysfs_long_val(st, "max_open_zones");
+ if (ret >= 0) {
+ bs->bl.max_open_zones = ret;
+ }
+
+ ret = get_sysfs_long_val(st, "max_active_zones");
+ if (ret >= 0) {
+ bs->bl.max_active_zones = ret;
+ }
+
+ /*
+ * The zoned device must at least have zone size and nr_zones fields.
+ */
+ ret = get_sysfs_long_val(st, "chunk_sectors");
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "Unable to read chunk_sectors "
+ "sysfs attribute");
+ goto no_zoned;
+ } else if (!ret) {
+ error_setg(errp, "Read 0 from chunk_sectors sysfs attribute");
+ goto no_zoned;
+ }
+ bs->bl.zone_size = ret << BDRV_SECTOR_BITS;
+
+ ret = get_sysfs_long_val(st, "nr_zones");
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "Unable to read nr_zones "
+ "sysfs attribute");
+ goto no_zoned;
+ } else if (!ret) {
+ error_setg(errp, "Read 0 from nr_zones sysfs attribute");
+ goto no_zoned;
+ }
+ bs->bl.nr_zones = ret;
+
+ ret = get_sysfs_long_val(st, "zone_append_max_bytes");
+ if (ret > 0) {
+ bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS;
}
+ ret = get_sysfs_long_val(st, "physical_block_size");
+ if (ret >= 0) {
+ bs->bl.write_granularity = ret;
+ }
+
+ /* The refresh_limits() function can be called multiple times. */
+ g_free(bs->wps);
+ bs->wps = g_malloc(sizeof(BlockZoneWps) +
+ sizeof(int64_t) * bs->bl.nr_zones);
+ ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 0);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "report wps failed");
+ goto no_zoned;
+ }
+ qemu_co_mutex_init(&bs->wps->colock);
+ return;
+
+no_zoned:
+ bs->bl.zoned = BLK_Z_NONE;
+ g_free(bs->wps);
+ bs->wps = NULL;
+}
+#else /* !defined(CONFIG_BLKZONED) */
+static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
+ Error **errp)
+{
+ bs->bl.zoned = BLK_Z_NONE;
+}
+#endif /* !defined(CONFIG_BLKZONED) */
+
+static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+ BDRVRawState *s = bs->opaque;
+ struct stat st;
+
+ s->needs_alignment = raw_needs_alignment(bs);
raw_probe_alignment(bs, s->fd, errp);
+
bs->bl.min_mem_alignment = s->buf_align;
- bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize());
+ bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size());
+
+ /*
+ * Maximum transfers are best effort, so it is okay to ignore any
+ * errors. That said, based on the man page errors in fstat would be
+ * very much unexpected; the only possible case seems to be ENOMEM.
+ */
+ if (fstat(s->fd, &st)) {
+ return;
+ }
+
+#if defined(__APPLE__) && (__MACH__)
+ struct statfs buf;
+
+ if (!fstatfs(s->fd, &buf)) {
+ bs->bl.opt_transfer = buf.f_iosize;
+ bs->bl.pdiscard_alignment = buf.f_bsize;
+ }
+#endif
+
+ if (bdrv_is_sg(bs) || S_ISBLK(st.st_mode)) {
+ int ret = hdev_get_max_hw_transfer(s->fd, &st);
+
+ if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
+ bs->bl.max_hw_transfer = ret;
+ }
+
+ ret = hdev_get_max_segments(s->fd, &st);
+ if (ret > 0) {
+ bs->bl.max_hw_iov = ret;
+ }
+ }
+
+ raw_refresh_zoned_limits(bs, &st, errp);
}
static int check_for_dasd(int fd)
@@ -1062,9 +1540,12 @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
BDRVRawState *s = bs->opaque;
int ret;
- /* If DASD, get blocksizes */
+ /* If DASD or zoned devices, get blocksizes */
if (check_for_dasd(s->fd) < 0) {
- return -ENOTSUP;
+ /* zoned devices are not DASD */
+ if (bs->bl.zoned == BLK_Z_NONE) {
+ return -ENOTSUP;
+ }
}
ret = probe_logical_blocksize(s->fd, &bsz->log);
if (ret < 0) {
@@ -1114,29 +1595,37 @@ static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
}
#endif
-static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
+#if defined(__linux__)
+static int handle_aiocb_ioctl(void *opaque)
{
+ RawPosixAIOData *aiocb = opaque;
int ret;
- ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
+ ret = RETRY_ON_EINTR(
+ ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf)
+ );
if (ret == -1) {
return -errno;
}
return 0;
}
+#endif /* linux */
-static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
+static int handle_aiocb_flush(void *opaque)
{
+ RawPosixAIOData *aiocb = opaque;
BDRVRawState *s = aiocb->bs->opaque;
int ret;
if (s->page_cache_inconsistent) {
- return -EIO;
+ return -s->page_cache_inconsistent;
}
ret = qemu_fdatasync(aiocb->aio_fildes);
if (ret == -1) {
+ trace_file_flush_fdatasync_failed(errno);
+
/* There is no clear definition of the semantics of a failing fsync(),
* so we may have to assume the worst. The sad truth is that this
* assumption is correct for Linux. Some pages are now probably marked
@@ -1151,7 +1640,7 @@ static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
* Obviously, this doesn't affect O_DIRECT, which bypasses the page
* cache. */
if ((s->open_flags & O_DIRECT) == 0) {
- s->page_cache_inconsistent = true;
+ s->page_cache_inconsistent = errno;
}
return -errno;
}
@@ -1196,18 +1685,17 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
{
ssize_t len;
- do {
- if (aiocb->aio_type & QEMU_AIO_WRITE)
- len = qemu_pwritev(aiocb->aio_fildes,
- aiocb->aio_iov,
- aiocb->aio_niov,
- aiocb->aio_offset);
- else
- len = qemu_preadv(aiocb->aio_fildes,
- aiocb->aio_iov,
- aiocb->aio_niov,
- aiocb->aio_offset);
- } while (len == -1 && errno == EINTR);
+ len = RETRY_ON_EINTR(
+ (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
+ qemu_pwritev(aiocb->aio_fildes,
+ aiocb->io.iov,
+ aiocb->io.niov,
+ aiocb->aio_offset) :
+ qemu_preadv(aiocb->aio_fildes,
+ aiocb->io.iov,
+ aiocb->io.niov,
+ aiocb->aio_offset)
+ );
if (len == -1) {
return -errno;
@@ -1227,7 +1715,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
ssize_t len;
while (offset < aiocb->aio_nbytes) {
- if (aiocb->aio_type & QEMU_AIO_WRITE) {
+ if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
len = pwrite(aiocb->aio_fildes,
(const char *)buf + offset,
aiocb->aio_nbytes - offset,
@@ -1261,8 +1749,9 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
return offset;
}
-static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
+static int handle_aiocb_rw(void *opaque)
{
+ RawPosixAIOData *aiocb = opaque;
ssize_t nbytes;
char *buf;
@@ -1271,8 +1760,9 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
* If there is just a single buffer, and it is properly aligned
* we can just use plain pread/pwrite without any problems.
*/
- if (aiocb->aio_niov == 1) {
- return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
+ if (aiocb->io.niov == 1) {
+ nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base);
+ goto out;
}
/*
* We have more than one iovec, and all are properly aligned.
@@ -1284,7 +1774,7 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
nbytes = handle_aiocb_rw_vector(aiocb);
if (nbytes == aiocb->aio_nbytes ||
(nbytes < 0 && nbytes != -ENOSYS)) {
- return nbytes;
+ goto out;
}
preadv_present = false;
}
@@ -1302,32 +1792,33 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
*/
buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
if (buf == NULL) {
- return -ENOMEM;
+ nbytes = -ENOMEM;
+ goto out;
}
if (aiocb->aio_type & QEMU_AIO_WRITE) {
char *p = buf;
int i;
- for (i = 0; i < aiocb->aio_niov; ++i) {
- memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
- p += aiocb->aio_iov[i].iov_len;
+ for (i = 0; i < aiocb->io.niov; ++i) {
+ memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len);
+ p += aiocb->io.iov[i].iov_len;
}
assert(p - buf == aiocb->aio_nbytes);
}
nbytes = handle_aiocb_rw_linear(aiocb, buf);
- if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
+ if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
char *p = buf;
size_t count = aiocb->aio_nbytes, copy;
int i;
- for (i = 0; i < aiocb->aio_niov && count; ++i) {
+ for (i = 0; i < aiocb->io.niov && count; ++i) {
copy = count;
- if (copy > aiocb->aio_iov[i].iov_len) {
- copy = aiocb->aio_iov[i].iov_len;
+ if (copy > aiocb->io.iov[i].iov_len) {
+ copy = aiocb->io.iov[i].iov_len;
}
- memcpy(aiocb->aio_iov[i].iov_base, p, copy);
+ memcpy(aiocb->io.iov[i].iov_base, p, copy);
assert(count >= copy);
p += copy;
count -= copy;
@@ -1336,49 +1827,24 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
}
qemu_vfree(buf);
- return nbytes;
-}
-
-#ifdef CONFIG_XFS
-static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
-{
- struct xfs_flock64 fl;
- int err;
-
- memset(&fl, 0, sizeof(fl));
- fl.l_whence = SEEK_SET;
- fl.l_start = offset;
- fl.l_len = bytes;
-
- if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) {
- err = errno;
- DPRINTF("cannot write zero range (%s)\n", strerror(errno));
- return -err;
- }
-
- return 0;
-}
-
-static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
-{
- struct xfs_flock64 fl;
- int err;
-
- memset(&fl, 0, sizeof(fl));
- fl.l_whence = SEEK_SET;
- fl.l_start = offset;
- fl.l_len = bytes;
-
- if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
- err = errno;
- DPRINTF("cannot punch hole (%s)\n", strerror(errno));
- return -err;
+out:
+ if (nbytes == aiocb->aio_nbytes) {
+ return 0;
+ } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
+ if (aiocb->aio_type & QEMU_AIO_WRITE) {
+ return -EINVAL;
+ } else {
+ iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes,
+ 0, aiocb->aio_nbytes - nbytes);
+ return 0;
+ }
+ } else {
+ assert(nbytes < 0);
+ return nbytes;
}
-
- return 0;
}
-#endif
+#if defined(CONFIG_FALLOCATE) || defined(BLKZEROOUT) || defined(BLKDISCARD)
static int translate_err(int err)
{
if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
@@ -1387,6 +1853,7 @@ static int translate_err(int err)
}
return err;
}
+#endif
#ifdef CONFIG_FALLOCATE
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
@@ -1410,28 +1877,32 @@ static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
}
#ifdef BLKZEROOUT
- do {
- uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
- if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
- return 0;
- }
- } while (errno == EINTR);
+ /* The BLKZEROOUT implementation in the kernel doesn't set
+ * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
+ * fallbacks. */
+ if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
+ do {
+ uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
+ if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
+ return 0;
+ }
+ } while (errno == EINTR);
- ret = translate_err(-errno);
+ ret = translate_err(-errno);
+ if (ret == -ENOTSUP) {
+ s->has_write_zeroes = false;
+ }
+ }
#endif
- if (ret == -ENOTSUP) {
- s->has_write_zeroes = false;
- }
return ret;
}
-static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
+static int handle_aiocb_write_zeroes(void *opaque)
{
-#if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS)
- BDRVRawState *s = aiocb->bs->opaque;
-#endif
+ RawPosixAIOData *aiocb = opaque;
#ifdef CONFIG_FALLOCATE
+ BDRVRawState *s = aiocb->bs->opaque;
int64_t len;
#endif
@@ -1439,20 +1910,21 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
return handle_aiocb_write_zeroes_block(aiocb);
}
-#ifdef CONFIG_XFS
- if (s->is_xfs) {
- return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
- }
-#endif
-
#ifdef CONFIG_FALLOCATE_ZERO_RANGE
if (s->has_write_zeroes) {
int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
aiocb->aio_offset, aiocb->aio_nbytes);
- if (ret == 0 || ret != -ENOTSUP) {
+ if (ret == -ENOTSUP) {
+ s->has_write_zeroes = false;
+ } else if (ret == 0 || ret != -EINVAL) {
return ret;
}
- s->has_write_zeroes = false;
+ /*
+ * Note: Some file systems do not like unaligned byte ranges, and
+ * return EINVAL in such a case, though they should not do it according
+ * to the man-page of fallocate(). Thus we simply ignore this return
+ * value and try the other fallbacks instead.
+ */
}
#endif
@@ -1467,6 +1939,17 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
return ret;
}
s->has_fallocate = false;
+ } else if (ret == -EINVAL) {
+ /*
+ * Some file systems like older versions of GPFS do not like un-
+ * aligned byte ranges, and return EINVAL in such a case, though
+ * they should not do it according to the man-page of fallocate().
+ * Warn about the bad filesystem and try the final fallback instead.
+ */
+ warn_report_once("Your file system is misbehaving: "
+ "fallocate(FALLOC_FL_PUNCH_HOLE) returned EINVAL. "
+ "Please report this bug to your file system "
+ "vendor.");
} else if (ret != -ENOTSUP) {
return ret;
} else {
@@ -1478,7 +1961,7 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
#ifdef CONFIG_FALLOCATE
/* Last resort: we are trying to extend the file with zeroed data. This
* can be done via fallocate(fd, 0) */
- len = bdrv_getlength(aiocb->bs);
+ len = raw_getlength(aiocb->bs);
if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
if (ret == 0 || ret != -ENOTSUP) {
@@ -1491,33 +1974,29 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
return -ENOTSUP;
}
-static ssize_t handle_aiocb_write_zeroes_unmap(RawPosixAIOData *aiocb)
+static int handle_aiocb_write_zeroes_unmap(void *opaque)
{
+ RawPosixAIOData *aiocb = opaque;
BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
- int ret;
/* First try to write zeros and unmap at the same time */
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
- ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
- aiocb->aio_offset, aiocb->aio_nbytes);
- if (ret != -ENOTSUP) {
+ int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ aiocb->aio_offset, aiocb->aio_nbytes);
+ switch (ret) {
+ case -ENOTSUP:
+ case -EINVAL:
+ case -EBUSY:
+ break;
+ default:
return ret;
}
#endif
-#ifdef CONFIG_XFS
- if (s->is_xfs) {
- /* xfs_discard() guarantees that the discarded area reads as all-zero
- * afterwards, so we can use it here. */
- return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
- }
-#endif
-
/* If we couldn't manage to unmap while guaranteed that the area reads as
* all-zero afterwards, just write zeroes without unmapping */
- ret = handle_aiocb_write_zeroes(aiocb);
- return ret;
+ return handle_aiocb_write_zeroes(aiocb);
}
#ifndef HAVE_COPY_FILE_RANGE
@@ -1534,18 +2013,161 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
}
#endif
-static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb)
+/*
+ * parse_zone - Fill a zone descriptor
+ */
+#if defined(CONFIG_BLKZONED)
+static inline int parse_zone(struct BlockZoneDescriptor *zone,
+ const struct blk_zone *blkz) {
+ zone->start = blkz->start << BDRV_SECTOR_BITS;
+ zone->length = blkz->len << BDRV_SECTOR_BITS;
+ zone->wp = blkz->wp << BDRV_SECTOR_BITS;
+
+#ifdef HAVE_BLK_ZONE_REP_CAPACITY
+ zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
+#else
+ zone->cap = blkz->len << BDRV_SECTOR_BITS;
+#endif
+
+ switch (blkz->type) {
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ zone->type = BLK_ZT_SWR;
+ break;
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
+ zone->type = BLK_ZT_SWP;
+ break;
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ zone->type = BLK_ZT_CONV;
+ break;
+ default:
+ error_report("Unsupported zone type: 0x%x", blkz->type);
+ return -ENOTSUP;
+ }
+
+ switch (blkz->cond) {
+ case BLK_ZONE_COND_NOT_WP:
+ zone->state = BLK_ZS_NOT_WP;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ zone->state = BLK_ZS_EMPTY;
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ zone->state = BLK_ZS_IOPEN;
+ break;
+ case BLK_ZONE_COND_EXP_OPEN:
+ zone->state = BLK_ZS_EOPEN;
+ break;
+ case BLK_ZONE_COND_CLOSED:
+ zone->state = BLK_ZS_CLOSED;
+ break;
+ case BLK_ZONE_COND_READONLY:
+ zone->state = BLK_ZS_RDONLY;
+ break;
+ case BLK_ZONE_COND_FULL:
+ zone->state = BLK_ZS_FULL;
+ break;
+ case BLK_ZONE_COND_OFFLINE:
+ zone->state = BLK_ZS_OFFLINE;
+ break;
+ default:
+ error_report("Unsupported zone state: 0x%x", blkz->cond);
+ return -ENOTSUP;
+ }
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int handle_aiocb_zone_report(void *opaque)
+{
+ RawPosixAIOData *aiocb = opaque;
+ int fd = aiocb->aio_fildes;
+ unsigned int *nr_zones = aiocb->zone_report.nr_zones;
+ BlockZoneDescriptor *zones = aiocb->zone_report.zones;
+ /* zoned block devices use 512-byte sectors */
+ uint64_t sector = aiocb->aio_offset / 512;
+
+ struct blk_zone *blkz;
+ size_t rep_size;
+ unsigned int nrz;
+ int ret;
+ unsigned int n = 0, i = 0;
+
+ nrz = *nr_zones;
+ rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+ g_autofree struct blk_zone_report *rep = NULL;
+ rep = g_malloc(rep_size);
+
+ blkz = (struct blk_zone *)(rep + 1);
+ while (n < nrz) {
+ memset(rep, 0, rep_size);
+ rep->sector = sector;
+ rep->nr_zones = nrz - n;
+
+ do {
+ ret = ioctl(fd, BLKREPORTZONE, rep);
+ } while (ret != 0 && errno == EINTR);
+ if (ret != 0) {
+ error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+ fd, sector, errno);
+ return -errno;
+ }
+
+ if (!rep->nr_zones) {
+ break;
+ }
+
+ for (i = 0; i < rep->nr_zones; i++, n++) {
+ ret = parse_zone(&zones[n], &blkz[i]);
+ if (ret != 0) {
+ return ret;
+ }
+
+ /* The next report should start after the last zone reported */
+ sector = blkz[i].start + blkz[i].len;
+ }
+ }
+
+ *nr_zones = n;
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int handle_aiocb_zone_mgmt(void *opaque)
{
+ RawPosixAIOData *aiocb = opaque;
+ int fd = aiocb->aio_fildes;
+ uint64_t sector = aiocb->aio_offset / 512;
+ int64_t nr_sectors = aiocb->aio_nbytes / 512;
+ struct blk_zone_range range;
+ int ret;
+
+ /* Execute the operation */
+ range.sector = sector;
+ range.nr_sectors = nr_sectors;
+ do {
+ ret = ioctl(fd, aiocb->zone_mgmt.op, &range);
+ } while (ret != 0 && errno == EINTR);
+
+ return ret < 0 ? -errno : ret;
+}
+#endif
+
+static int handle_aiocb_copy_range(void *opaque)
+{
+ RawPosixAIOData *aiocb = opaque;
uint64_t bytes = aiocb->aio_nbytes;
off_t in_off = aiocb->aio_offset;
- off_t out_off = aiocb->aio_offset2;
+ off_t out_off = aiocb->copy_range.aio_offset2;
while (bytes) {
ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
- aiocb->aio_fd2, &out_off,
+ aiocb->copy_range.aio_fd2, &out_off,
bytes, 0);
trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
- aiocb->aio_fd2, out_off, bytes, 0, ret);
+ aiocb->copy_range.aio_fd2, out_off, bytes,
+ 0, ret);
if (ret == 0) {
/* No progress (e.g. when beyond EOF), let the caller fall back to
* buffer I/O. */
@@ -1566,9 +2188,10 @@ static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb)
return 0;
}
-static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
+static int handle_aiocb_discard(void *opaque)
{
- int ret = -EOPNOTSUPP;
+ RawPosixAIOData *aiocb = opaque;
+ int ret = -ENOTSUP;
BDRVRawState *s = aiocb->bs->opaque;
if (!s->has_discard) {
@@ -1584,37 +2207,79 @@ static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
}
} while (errno == EINTR);
- ret = -errno;
+ ret = translate_err(-errno);
#endif
} else {
-#ifdef CONFIG_XFS
- if (s->is_xfs) {
- return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
- }
-#endif
-
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
aiocb->aio_offset, aiocb->aio_nbytes);
+ ret = translate_err(ret);
+#elif defined(__APPLE__) && (__MACH__)
+ fpunchhole_t fpunchhole;
+ fpunchhole.fp_flags = 0;
+ fpunchhole.reserved = 0;
+ fpunchhole.fp_offset = aiocb->aio_offset;
+ fpunchhole.fp_length = aiocb->aio_nbytes;
+ if (fcntl(s->fd, F_PUNCHHOLE, &fpunchhole) == -1) {
+ ret = errno == ENODEV ? -ENOTSUP : -errno;
+ } else {
+ ret = 0;
+ }
#endif
}
- ret = translate_err(ret);
if (ret == -ENOTSUP) {
s->has_discard = false;
}
return ret;
}
-static int handle_aiocb_truncate(RawPosixAIOData *aiocb)
+/*
+ * Help alignment probing by allocating the first block.
+ *
+ * When reading with direct I/O from unallocated area on Gluster backed by XFS,
+ * reading succeeds regardless of request length. In this case we fallback to
+ * safe alignment which is not optimal. Allocating the first block avoids this
+ * fallback.
+ *
+ * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
+ * request alignment, so we use safe values.
+ *
+ * Returns: 0 on success, -errno on failure. Since this is an optimization,
+ * caller may ignore failures.
+ */
+static int allocate_first_block(int fd, size_t max_size)
{
+ size_t write_size = (max_size < MAX_BLOCKSIZE)
+ ? BDRV_SECTOR_SIZE
+ : MAX_BLOCKSIZE;
+ size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size());
+ void *buf;
+ ssize_t n;
+ int ret;
+
+ buf = qemu_memalign(max_align, write_size);
+ memset(buf, 0, write_size);
+
+ n = RETRY_ON_EINTR(pwrite(fd, buf, write_size, 0));
+
+ ret = (n == -1) ? -errno : 0;
+
+ qemu_vfree(buf);
+ return ret;
+}
+
+static int handle_aiocb_truncate(void *opaque)
+{
+ RawPosixAIOData *aiocb = opaque;
int result = 0;
int64_t current_length = 0;
char *buf = NULL;
struct stat st;
int fd = aiocb->aio_fildes;
int64_t offset = aiocb->aio_offset;
- Error **errp = aiocb->errp;
+ PreallocMode prealloc = aiocb->truncate.prealloc;
+ Error **errp = aiocb->truncate.errp;
if (fstat(fd, &st) < 0) {
result = -errno;
@@ -1623,12 +2288,12 @@ static int handle_aiocb_truncate(RawPosixAIOData *aiocb)
}
current_length = st.st_size;
- if (current_length > offset && aiocb->prealloc != PREALLOC_MODE_OFF) {
+ if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
error_setg(errp, "Cannot use preallocation for shrinking files");
return -ENOTSUP;
}
- switch (aiocb->prealloc) {
+ switch (prealloc) {
#ifdef CONFIG_POSIX_FALLOCATE
case PREALLOC_MODE_FALLOC:
/*
@@ -1643,6 +2308,17 @@ static int handle_aiocb_truncate(RawPosixAIOData *aiocb)
/* posix_fallocate() doesn't set errno. */
error_setg_errno(errp, -result,
"Could not preallocate new data");
+ } else if (current_length == 0) {
+ /*
+ * posix_fallocate() uses fallocate() if the filesystem
+ * supports it, or fallback to manually writing zeroes. If
+ * fallocate() was used, unaligned reads from the fallocated
+ * area in raw_probe_alignment() will succeed, hence we need to
+ * allocate the first block.
+ *
+ * Optimize future alignment probing; ignore failures.
+ */
+ allocate_first_block(fd, offset);
}
} else {
result = 0;
@@ -1704,12 +2380,15 @@ static int handle_aiocb_truncate(RawPosixAIOData *aiocb)
if (ftruncate(fd, offset) != 0) {
result = -errno;
error_setg_errno(errp, -result, "Could not resize file");
+ } else if (current_length == 0 && offset > current_length) {
+ /* Optimize future alignment probing; ignore failures. */
+ allocate_first_block(fd, offset);
}
return result;
default:
result = -ENOTSUP;
error_setg(errp, "Unsupported preallocation mode: %s",
- PreallocMode_str(aiocb->prealloc));
+ PreallocMode_str(prealloc));
return result;
}
@@ -1725,169 +2404,184 @@ out:
return result;
}
-static int aio_worker(void *arg)
+static int coroutine_fn raw_thread_pool_submit(ThreadPoolFunc func, void *arg)
{
- RawPosixAIOData *aiocb = arg;
- ssize_t ret = 0;
+ return thread_pool_submit_co(func, arg);
+}
- switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
- case QEMU_AIO_READ:
- ret = handle_aiocb_rw(aiocb);
- if (ret >= 0 && ret < aiocb->aio_nbytes) {
- iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
- 0, aiocb->aio_nbytes - ret);
+/*
+ * Check if all memory in this vector is sector aligned.
+ */
+static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
+{
+ int i;
+ size_t alignment = bdrv_min_mem_align(bs);
+ size_t len = bs->bl.request_alignment;
+ IO_CODE();
- ret = aiocb->aio_nbytes;
- }
- if (ret == aiocb->aio_nbytes) {
- ret = 0;
- } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
- ret = -EINVAL;
+ for (i = 0; i < qiov->niov; i++) {
+ if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
+ return false;
}
- break;
- case QEMU_AIO_WRITE:
- ret = handle_aiocb_rw(aiocb);
- if (ret == aiocb->aio_nbytes) {
- ret = 0;
- } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
- ret = -EINVAL;
+ if (qiov->iov[i].iov_len % len) {
+ return false;
}
- break;
- case QEMU_AIO_FLUSH:
- ret = handle_aiocb_flush(aiocb);
- break;
- case QEMU_AIO_IOCTL:
- ret = handle_aiocb_ioctl(aiocb);
- break;
- case QEMU_AIO_DISCARD:
- ret = handle_aiocb_discard(aiocb);
- break;
- case QEMU_AIO_WRITE_ZEROES:
- ret = handle_aiocb_write_zeroes(aiocb);
- break;
- case QEMU_AIO_WRITE_ZEROES | QEMU_AIO_DISCARD:
- ret = handle_aiocb_write_zeroes_unmap(aiocb);
- break;
- case QEMU_AIO_COPY_RANGE:
- ret = handle_aiocb_copy_range(aiocb);
- break;
- case QEMU_AIO_TRUNCATE:
- ret = handle_aiocb_truncate(aiocb);
- break;
- default:
- fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
- ret = -EINVAL;
- break;
}
- g_free(aiocb);
- return ret;
+ return true;
}
-static int paio_submit_co_full(BlockDriverState *bs, int fd,
- int64_t offset, int fd2, int64_t offset2,
- QEMUIOVector *qiov,
- int bytes, int type)
+#ifdef CONFIG_LINUX_IO_URING
+static inline bool raw_check_linux_io_uring(BDRVRawState *s)
{
- RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
- ThreadPool *pool;
-
- acb->bs = bs;
- acb->aio_type = type;
- acb->aio_fildes = fd;
- acb->aio_fd2 = fd2;
- acb->aio_offset2 = offset2;
-
- acb->aio_nbytes = bytes;
- acb->aio_offset = offset;
+ Error *local_err = NULL;
+ AioContext *ctx;
- if (qiov) {
- acb->aio_iov = qiov->iov;
- acb->aio_niov = qiov->niov;
- assert(qiov->size == bytes);
+ if (!s->use_linux_io_uring) {
+ return false;
}
- trace_file_paio_submit_co(offset, bytes, type);
- pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
- return thread_pool_submit_co(pool, aio_worker, acb);
+ ctx = qemu_get_current_aio_context();
+ if (unlikely(!aio_setup_linux_io_uring(ctx, &local_err))) {
+ error_reportf_err(local_err, "Unable to use linux io_uring, "
+ "falling back to thread pool: ");
+ s->use_linux_io_uring = false;
+ return false;
+ }
+ return true;
}
+#endif
-static inline int paio_submit_co(BlockDriverState *bs, int fd,
- int64_t offset, QEMUIOVector *qiov,
- int bytes, int type)
+#ifdef CONFIG_LINUX_AIO
+static inline bool raw_check_linux_aio(BDRVRawState *s)
{
- return paio_submit_co_full(bs, fd, offset, -1, 0, qiov, bytes, type);
+ Error *local_err = NULL;
+ AioContext *ctx;
+
+ if (!s->use_linux_aio) {
+ return false;
+ }
+
+ ctx = qemu_get_current_aio_context();
+ if (unlikely(!aio_setup_linux_aio(ctx, &local_err))) {
+ error_reportf_err(local_err, "Unable to use Linux AIO, "
+ "falling back to thread pool: ");
+ s->use_linux_aio = false;
+ return false;
+ }
+ return true;
}
+#endif
-static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
+static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
uint64_t bytes, QEMUIOVector *qiov, int type)
{
BDRVRawState *s = bs->opaque;
+ RawPosixAIOData acb;
+ int ret;
+ uint64_t offset = *offset_ptr;
if (fd_open(bs) < 0)
return -EIO;
+#if defined(CONFIG_BLKZONED)
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) &&
+ bs->bl.zoned != BLK_Z_NONE) {
+ qemu_co_mutex_lock(&bs->wps->colock);
+ if (type & QEMU_AIO_ZONE_APPEND) {
+ int index = offset / bs->bl.zone_size;
+ offset = bs->wps->wp[index];
+ }
+ }
+#endif
/*
- * Check if the underlying device requires requests to be aligned,
- * and if the request we are trying to submit is aligned or not.
- * If this is the case tell the low-level driver that it needs
- * to copy the buffer.
+ * When using O_DIRECT, the request must be aligned to be able to use
+ * either libaio or io_uring interface. If not fail back to regular thread
+ * pool read/write code which emulates this for us if we
+ * set QEMU_AIO_MISALIGNED.
*/
- if (s->needs_alignment) {
- if (!bdrv_qiov_is_aligned(bs, qiov)) {
- type |= QEMU_AIO_MISALIGNED;
+ if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
+ type |= QEMU_AIO_MISALIGNED;
+#ifdef CONFIG_LINUX_IO_URING
+ } else if (raw_check_linux_io_uring(s)) {
+ assert(qiov->size == bytes);
+ ret = luring_co_submit(bs, s->fd, offset, qiov, type);
+ goto out;
+#endif
#ifdef CONFIG_LINUX_AIO
- } else if (s->use_linux_aio) {
- LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
- assert(qiov->size == bytes);
- return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
+ } else if (raw_check_linux_aio(s)) {
+ assert(qiov->size == bytes);
+ ret = laio_co_submit(s->fd, offset, qiov, type,
+ s->aio_max_batch);
+ goto out;
#endif
- }
}
- return paio_submit_co(bs, s->fd, offset, qiov, bytes, type);
-}
+ acb = (RawPosixAIOData) {
+ .bs = bs,
+ .aio_fildes = s->fd,
+ .aio_type = type,
+ .aio_offset = offset,
+ .aio_nbytes = bytes,
+ .io = {
+ .iov = qiov->iov,
+ .niov = qiov->niov,
+ },
+ };
-static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
- uint64_t bytes, QEMUIOVector *qiov,
- int flags)
-{
- return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
-}
+ assert(qiov->size == bytes);
+ ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
+ goto out; /* Avoid the compiler err of unused label */
-static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
- uint64_t bytes, QEMUIOVector *qiov,
- int flags)
-{
- assert(flags == 0);
- return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
-}
+out:
+#if defined(CONFIG_BLKZONED)
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) &&
+ bs->bl.zoned != BLK_Z_NONE) {
+ BlockZoneWps *wps = bs->wps;
+ if (ret == 0) {
+ uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
+ if (!BDRV_ZT_IS_CONV(*wp)) {
+ if (type & QEMU_AIO_ZONE_APPEND) {
+ *offset_ptr = *wp;
+ trace_zbd_zone_append_complete(bs, *offset_ptr
+ >> BDRV_SECTOR_BITS);
+ }
+ /* Advance the wp if needed */
+ if (offset + bytes > *wp) {
+ *wp = offset + bytes;
+ }
+ }
+ } else {
+ /*
+ * write and append write are not allowed to cross zone boundaries
+ */
+ update_zones_wp(bs, s->fd, offset, 1);
+ }
-static void raw_aio_plug(BlockDriverState *bs)
-{
-#ifdef CONFIG_LINUX_AIO
- BDRVRawState *s = bs->opaque;
- if (s->use_linux_aio) {
- LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
- laio_io_plug(bs, aio);
+ qemu_co_mutex_unlock(&wps->colock);
}
#endif
+ return ret;
}
-static void raw_aio_unplug(BlockDriverState *bs)
+static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
+ int64_t bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
{
-#ifdef CONFIG_LINUX_AIO
- BDRVRawState *s = bs->opaque;
- if (s->use_linux_aio) {
- LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
- laio_io_unplug(bs, aio);
- }
-#endif
+ return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ);
}
-static int raw_co_flush_to_disk(BlockDriverState *bs)
+static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
+ int64_t bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE);
+}
+
+static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
{
BDRVRawState *s = bs->opaque;
+ RawPosixAIOData acb;
int ret;
ret = fd_open(bs);
@@ -1895,23 +2589,18 @@ static int raw_co_flush_to_disk(BlockDriverState *bs)
return ret;
}
- return paio_submit_co(bs, s->fd, 0, NULL, 0, QEMU_AIO_FLUSH);
-}
+ acb = (RawPosixAIOData) {
+ .bs = bs,
+ .aio_fildes = s->fd,
+ .aio_type = QEMU_AIO_FLUSH,
+ };
-static void raw_aio_attach_aio_context(BlockDriverState *bs,
- AioContext *new_context)
-{
-#ifdef CONFIG_LINUX_AIO
- BDRVRawState *s = bs->opaque;
- if (s->use_linux_aio) {
- Error *local_err;
- if (!aio_setup_linux_aio(new_context, &local_err)) {
- error_reportf_err(local_err, "Unable to use native AIO, "
- "falling back to thread pool: ");
- s->use_linux_aio = false;
- }
+#ifdef CONFIG_LINUX_IO_URING
+ if (raw_check_linux_io_uring(s)) {
+ return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH);
}
#endif
+ return raw_thread_pool_submit(handle_aiocb_flush, &acb);
}
static void raw_close(BlockDriverState *bs)
@@ -1919,13 +2608,12 @@ static void raw_close(BlockDriverState *bs)
BDRVRawState *s = bs->opaque;
if (s->fd >= 0) {
+#if defined(CONFIG_BLKZONED)
+ g_free(bs->wps);
+#endif
qemu_close(s->fd);
s->fd = -1;
}
- if (s->lock_fd >= 0) {
- qemu_close(s->lock_fd);
- s->lock_fd = -1;
- }
}
/**
@@ -1938,25 +2626,25 @@ static int coroutine_fn
raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
PreallocMode prealloc, Error **errp)
{
- RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
- ThreadPool *pool;
+ RawPosixAIOData acb;
- *acb = (RawPosixAIOData) {
+ acb = (RawPosixAIOData) {
.bs = bs,
.aio_fildes = fd,
.aio_type = QEMU_AIO_TRUNCATE,
.aio_offset = offset,
- .prealloc = prealloc,
- .errp = errp,
+ .truncate = {
+ .prealloc = prealloc,
+ .errp = errp,
+ },
};
- /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */
- pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
- return thread_pool_submit_co(pool, aio_worker, acb);
+ return raw_thread_pool_submit(handle_aiocb_truncate, &acb);
}
static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
- PreallocMode prealloc, Error **errp)
+ bool exact, PreallocMode prealloc,
+ BdrvRequestFlags flags, Error **errp)
{
BDRVRawState *s = bs->opaque;
struct stat st;
@@ -1969,6 +2657,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
}
if (S_ISREG(st.st_mode)) {
+ /* Always resizes to the exact @offset */
return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
}
@@ -1979,7 +2668,12 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
}
if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
- if (offset > raw_getlength(bs)) {
+ int64_t cur_length = raw_getlength(bs);
+
+ if (offset != cur_length && exact) {
+ error_setg(errp, "Cannot resize device files");
+ return -ENOTSUP;
+ } else if (offset > cur_length) {
error_setg(errp, "Cannot grow device files");
return -EINVAL;
}
@@ -2086,39 +2780,37 @@ static int64_t raw_getlength(BlockDriverState *bs)
again:
#endif
if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
+ size = 0;
#ifdef DIOCGMEDIASIZE
- if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
-#elif defined(DIOCGPART)
- {
- struct partinfo pi;
- if (ioctl(fd, DIOCGPART, &pi) == 0)
- size = pi.media_size;
- else
- size = 0;
+ if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) {
+ size = 0;
}
- if (size == 0)
#endif
-#if defined(__APPLE__) && defined(__MACH__)
- {
+#ifdef DIOCGPART
+ if (size == 0) {
+ struct partinfo pi;
+ if (ioctl(fd, DIOCGPART, &pi) == 0) {
+ size = pi.media_size;
+ }
+ }
+#endif
+#if defined(DKIOCGETBLOCKCOUNT) && defined(DKIOCGETBLOCKSIZE)
+ if (size == 0) {
uint64_t sectors = 0;
uint32_t sector_size = 0;
if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
&& ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
size = sectors * sector_size;
- } else {
- size = lseek(fd, 0LL, SEEK_END);
- if (size < 0) {
- return -errno;
- }
}
}
-#else
- size = lseek(fd, 0LL, SEEK_END);
+#endif
+ if (size == 0) {
+ size = lseek(fd, 0LL, SEEK_END);
+ }
if (size < 0) {
return -errno;
}
-#endif
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
switch(s->type) {
case FTYPE_CD:
@@ -2160,7 +2852,12 @@ static int64_t raw_getlength(BlockDriverState *bs)
}
#endif
-static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
+static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs)
+{
+ return raw_getlength(bs);
+}
+
+static int64_t coroutine_fn raw_co_get_allocated_file_size(BlockDriverState *bs)
{
struct stat st;
BDRVRawState *s = bs->opaque;
@@ -2190,12 +2887,19 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
if (!file_opts->has_preallocation) {
file_opts->preallocation = PREALLOC_MODE_OFF;
}
+ if (!file_opts->has_extent_size_hint) {
+ file_opts->extent_size_hint = 1 * MiB;
+ }
+ if (file_opts->extent_size_hint > UINT32_MAX) {
+ result = -EINVAL;
+ error_setg(errp, "Extent size hint is too large");
+ goto out;
+ }
/* Create file */
- fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
+ fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp);
if (fd < 0) {
result = -errno;
- error_setg_errno(errp, -result, "Could not create file");
goto out;
}
@@ -2213,7 +2917,7 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
/* Step one: Take locks */
- result = raw_apply_lock_bytes(fd, perm, ~shared, false, errp);
+ result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
if (result < 0) {
goto out_close;
}
@@ -2247,6 +2951,27 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
}
#endif
}
+#ifdef FS_IOC_FSSETXATTR
+ /*
+ * Try to set the extent size hint. Failure is not fatal, and a warning is
+ * only printed if the option was explicitly specified.
+ */
+ {
+ struct fsxattr attr;
+ result = ioctl(fd, FS_IOC_FSGETXATTR, &attr);
+ if (result == 0) {
+ attr.fsx_xflags |= FS_XFLAG_EXTSIZE;
+ attr.fsx_extsize = file_opts->extent_size_hint;
+ result = ioctl(fd, FS_IOC_FSSETXATTR, &attr);
+ }
+ if (result < 0 && file_opts->has_extent_size_hint &&
+ file_opts->extent_size_hint)
+ {
+ warn_report("Failed to set extent size hint: %s",
+ strerror(errno));
+ }
+ }
+#endif
/* Resize and potentially preallocate the file to the desired
* final size */
@@ -2257,13 +2982,13 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
}
out_unlock:
- raw_apply_lock_bytes(fd, 0, 0, true, &local_err);
+ raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
if (local_err) {
/* The above call should not fail, and if it does, that does
* not mean the whole creation operation has failed. So
* report it the user for their convenience, but do not report
* it to the caller. */
- error_report_err(local_err);
+ warn_report_err(local_err);
}
out_close:
@@ -2275,11 +3000,14 @@ out:
return result;
}
-static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts,
- Error **errp)
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_create_opts(BlockDriver *drv, const char *filename,
+ QemuOpts *opts, Error **errp)
{
BlockdevCreateOptions options;
int64_t total_size = 0;
+ int64_t extent_size_hint = 0;
+ bool has_extent_size_hint = false;
bool nocow = false;
PreallocMode prealloc;
char *buf = NULL;
@@ -2291,6 +3019,11 @@ static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts,
/* Read out options */
total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
BDRV_SECTOR_SIZE);
+ if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) {
+ has_extent_size_hint = true;
+ extent_size_hint =
+ qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1);
+ }
nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
@@ -2310,11 +3043,35 @@ static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts,
.preallocation = prealloc,
.has_nocow = true,
.nocow = nocow,
+ .has_extent_size_hint = has_extent_size_hint,
+ .extent_size_hint = extent_size_hint,
},
};
return raw_co_create(&options, errp);
}
+static int coroutine_fn raw_co_delete_file(BlockDriverState *bs,
+ Error **errp)
+{
+ struct stat st;
+ int ret;
+
+ if (!(stat(bs->filename, &st) == 0) || !S_ISREG(st.st_mode)) {
+ error_setg_errno(errp, ENOENT, "%s is not a regular file",
+ bs->filename);
+ return -ENOENT;
+ }
+
+ ret = unlink(bs->filename);
+ if (ret < 0) {
+ ret = -errno;
+ error_setg_errno(errp, -ret, "Error when deleting file %s",
+ bs->filename);
+ }
+
+ return ret;
+}
+
/*
* Find allocation range in @bs around offset @start.
* May change underlying file descriptor's file offset.
@@ -2419,7 +3176,8 @@ static int find_allocation(BlockDriverState *bs, off_t start,
* the specified offset) that are known to be in the same
* allocated/unallocated state.
*
- * 'bytes' is the max value 'pnum' should be set to.
+ * 'bytes' is a soft cap for 'pnum'. If the information is free, 'pnum' may
+ * well exceed it.
*/
static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
bool want_zero,
@@ -2431,6 +3189,8 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
off_t data = 0, hole = 0;
int ret;
+ assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
+
ret = fd_open(bs);
if (ret < 0) {
return ret;
@@ -2455,12 +3215,26 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
} else if (data == offset) {
/* On a data extent, compute bytes to the end of the extent,
* possibly including a partial sector at EOF. */
- *pnum = MIN(bytes, hole - offset);
+ *pnum = hole - offset;
+
+ /*
+ * We are not allowed to return partial sectors, though, so
+ * round up if necessary.
+ */
+ if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
+ int64_t file_length = raw_getlength(bs);
+ if (file_length > 0) {
+ /* Ignore errors, this is just a safeguard */
+ assert(hole == file_length);
+ }
+ *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
+ }
+
ret = BDRV_BLOCK_DATA;
} else {
/* On a hole, compute bytes to the beginning of the next extent. */
assert(hole == offset);
- *pnum = MIN(bytes, data - offset);
+ *pnum = data - offset;
ret = BDRV_BLOCK_ZERO;
}
*map = offset;
@@ -2521,10 +3295,13 @@ static void check_cache_dropped(BlockDriverState *bs, Error **errp)
vec_end = DIV_ROUND_UP(length, page_size);
for (i = 0; i < vec_end; i++) {
if (vec[i] & 0x1) {
- error_setg(errp, "page cache still in use!");
break;
}
}
+ if (i < vec_end) {
+ error_setg(errp, "page cache still in use!");
+ break;
+ }
}
if (window) {
@@ -2535,8 +3312,8 @@ static void check_cache_dropped(BlockDriverState *bs, Error **errp)
}
#endif /* __linux__ */
-static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
- Error **errp)
+static void coroutine_fn GRAPH_RDLOCK
+raw_co_invalidate_cache(BlockDriverState *bs, Error **errp)
{
BDRVRawState *s = bs->opaque;
int ret;
@@ -2547,6 +3324,10 @@ static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
return;
}
+ if (!s->drop_cache) {
+ return;
+ }
+
if (s->open_flags & O_DIRECT) {
return; /* No host kernel page cache */
}
@@ -2582,36 +3363,350 @@ static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
#endif /* !__linux__ */
}
+static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
+{
+ if (ret) {
+ s->stats.discard_nb_failed++;
+ } else {
+ s->stats.discard_nb_ok++;
+ s->stats.discard_bytes_ok += nbytes;
+ }
+}
+
+/*
+ * zone report - Get a zone block device's information in the form
+ * of an array of zone descriptors.
+ * zones is an array of zone descriptors to hold zone information on reply;
+ * offset can be any byte within the entire size of the device;
+ * nr_zones is the maximum number of sectors the command should operate on.
+ */
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
+ unsigned int *nr_zones,
+ BlockZoneDescriptor *zones) {
+ BDRVRawState *s = bs->opaque;
+ RawPosixAIOData acb = (RawPosixAIOData) {
+ .bs = bs,
+ .aio_fildes = s->fd,
+ .aio_type = QEMU_AIO_ZONE_REPORT,
+ .aio_offset = offset,
+ .zone_report = {
+ .nr_zones = nr_zones,
+ .zones = zones,
+ },
+ };
+
+ trace_zbd_zone_report(bs, *nr_zones, offset >> BDRV_SECTOR_BITS);
+ return raw_thread_pool_submit(handle_aiocb_zone_report, &acb);
+}
+#endif
+
+/*
+ * zone management operations - Execute an operation on a zone
+ */
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+ int64_t offset, int64_t len) {
+ BDRVRawState *s = bs->opaque;
+ RawPosixAIOData acb;
+ int64_t zone_size, zone_size_mask;
+ const char *op_name;
+ unsigned long zo;
+ int ret;
+ BlockZoneWps *wps = bs->wps;
+ int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
+
+ zone_size = bs->bl.zone_size;
+ zone_size_mask = zone_size - 1;
+ if (offset & zone_size_mask) {
+ error_report("sector offset %" PRId64 " is not aligned to zone size "
+ "%" PRId64 "", offset / 512, zone_size / 512);
+ return -EINVAL;
+ }
+
+ if (((offset + len) < capacity && len & zone_size_mask) ||
+ offset + len > capacity) {
+ error_report("number of sectors %" PRId64 " is not aligned to zone size"
+ " %" PRId64 "", len / 512, zone_size / 512);
+ return -EINVAL;
+ }
+
+ uint32_t i = offset / bs->bl.zone_size;
+ uint32_t nrz = len / bs->bl.zone_size;
+ uint64_t *wp = &wps->wp[i];
+ if (BDRV_ZT_IS_CONV(*wp) && len != capacity) {
+ error_report("zone mgmt operations are not allowed for conventional zones");
+ return -EIO;
+ }
+
+ switch (op) {
+ case BLK_ZO_OPEN:
+ op_name = "BLKOPENZONE";
+ zo = BLKOPENZONE;
+ break;
+ case BLK_ZO_CLOSE:
+ op_name = "BLKCLOSEZONE";
+ zo = BLKCLOSEZONE;
+ break;
+ case BLK_ZO_FINISH:
+ op_name = "BLKFINISHZONE";
+ zo = BLKFINISHZONE;
+ break;
+ case BLK_ZO_RESET:
+ op_name = "BLKRESETZONE";
+ zo = BLKRESETZONE;
+ break;
+ default:
+ error_report("Unsupported zone op: 0x%x", op);
+ return -ENOTSUP;
+ }
+
+ acb = (RawPosixAIOData) {
+ .bs = bs,
+ .aio_fildes = s->fd,
+ .aio_type = QEMU_AIO_ZONE_MGMT,
+ .aio_offset = offset,
+ .aio_nbytes = len,
+ .zone_mgmt = {
+ .op = zo,
+ },
+ };
+
+ trace_zbd_zone_mgmt(bs, op_name, offset >> BDRV_SECTOR_BITS,
+ len >> BDRV_SECTOR_BITS);
+ ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
+ if (ret != 0) {
+ update_zones_wp(bs, s->fd, offset, nrz);
+ error_report("ioctl %s failed %d", op_name, ret);
+ return ret;
+ }
+
+ if (zo == BLKRESETZONE && len == capacity) {
+ ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 1);
+ if (ret < 0) {
+ error_report("reporting single wp failed");
+ return ret;
+ }
+ } else if (zo == BLKRESETZONE) {
+ for (unsigned int j = 0; j < nrz; ++j) {
+ wp[j] = offset + j * zone_size;
+ }
+ } else if (zo == BLKFINISHZONE) {
+ for (unsigned int j = 0; j < nrz; ++j) {
+ /* The zoned device allows the last zone smaller that the
+ * zone size. */
+ wp[j] = MIN(offset + (j + 1) * zone_size, offset + len);
+ }
+ }
+
+ return ret;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
+ int64_t *offset,
+ QEMUIOVector *qiov,
+ BdrvRequestFlags flags) {
+ assert(flags == 0);
+ int64_t zone_size_mask = bs->bl.zone_size - 1;
+ int64_t iov_len = 0;
+ int64_t len = 0;
+
+ if (*offset & zone_size_mask) {
+ error_report("sector offset %" PRId64 " is not aligned to zone size "
+ "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
+ return -EINVAL;
+ }
+
+ int64_t wg = bs->bl.write_granularity;
+ int64_t wg_mask = wg - 1;
+ for (int i = 0; i < qiov->niov; i++) {
+ iov_len = qiov->iov[i].iov_len;
+ if (iov_len & wg_mask) {
+ error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
+ "block size %" PRId64 "", i, iov_len, wg);
+ return -EINVAL;
+ }
+ len += iov_len;
+ }
+
+ trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
+ return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND);
+}
+#endif
+
static coroutine_fn int
-raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
+raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
+ bool blkdev)
{
BDRVRawState *s = bs->opaque;
+ RawPosixAIOData acb;
+ int ret;
+
+ acb = (RawPosixAIOData) {
+ .bs = bs,
+ .aio_fildes = s->fd,
+ .aio_type = QEMU_AIO_DISCARD,
+ .aio_offset = offset,
+ .aio_nbytes = bytes,
+ };
+
+ if (blkdev) {
+ acb.aio_type |= QEMU_AIO_BLKDEV;
+ }
- return paio_submit_co(bs, s->fd, offset, NULL, bytes, QEMU_AIO_DISCARD);
+ ret = raw_thread_pool_submit(handle_aiocb_discard, &acb);
+ raw_account_discard(s, bytes, ret);
+ return ret;
}
-static int coroutine_fn raw_co_pwrite_zeroes(
- BlockDriverState *bs, int64_t offset,
- int bytes, BdrvRequestFlags flags)
+static coroutine_fn int
+raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
+{
+ return raw_do_pdiscard(bs, offset, bytes, false);
+}
+
+static int coroutine_fn
+raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
+ BdrvRequestFlags flags, bool blkdev)
{
BDRVRawState *s = bs->opaque;
- int operation = QEMU_AIO_WRITE_ZEROES;
+ RawPosixAIOData acb;
+ ThreadPoolFunc *handler;
+
+#ifdef CONFIG_FALLOCATE
+ if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
+ BdrvTrackedRequest *req;
+
+ /*
+ * This is a workaround for a bug in the Linux XFS driver,
+ * where writes submitted through the AIO interface will be
+ * discarded if they happen beyond a concurrently running
+ * fallocate() that increases the file length (i.e., both the
+ * write and the fallocate() happen beyond the EOF).
+ *
+ * To work around it, we extend the tracked request for this
+ * zero write until INT64_MAX (effectively infinity), and mark
+ * it as serializing.
+ *
+ * We have to enable this workaround for all filesystems and
+ * AIO modes (not just XFS with aio=native), because for
+ * remote filesystems we do not know the host configuration.
+ */
+
+ req = bdrv_co_get_self_request(bs);
+ assert(req);
+ assert(req->type == BDRV_TRACKED_WRITE);
+ assert(req->offset <= offset);
+ assert(req->offset + req->bytes >= offset + bytes);
+
+ req->bytes = BDRV_MAX_LENGTH - req->offset;
+
+ bdrv_check_request(req->offset, req->bytes, &error_abort);
+
+ bdrv_make_request_serialising(req, bs->bl.request_alignment);
+ }
+#endif
+
+ acb = (RawPosixAIOData) {
+ .bs = bs,
+ .aio_fildes = s->fd,
+ .aio_type = QEMU_AIO_WRITE_ZEROES,
+ .aio_offset = offset,
+ .aio_nbytes = bytes,
+ };
+
+ if (blkdev) {
+ acb.aio_type |= QEMU_AIO_BLKDEV;
+ }
+ if (flags & BDRV_REQ_NO_FALLBACK) {
+ acb.aio_type |= QEMU_AIO_NO_FALLBACK;
+ }
if (flags & BDRV_REQ_MAY_UNMAP) {
- operation |= QEMU_AIO_DISCARD;
+ acb.aio_type |= QEMU_AIO_DISCARD;
+ handler = handle_aiocb_write_zeroes_unmap;
+ } else {
+ handler = handle_aiocb_write_zeroes;
}
- return paio_submit_co(bs, s->fd, offset, NULL, bytes, operation);
+ return raw_thread_pool_submit(handler, &acb);
}
-static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+static int coroutine_fn raw_co_pwrite_zeroes(
+ BlockDriverState *bs, int64_t offset,
+ int64_t bytes, BdrvRequestFlags flags)
{
- BDRVRawState *s = bs->opaque;
+ return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false);
+}
- bdi->unallocated_blocks_are_zero = s->discard_zeroes;
+static int coroutine_fn
+raw_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
return 0;
}
+static ImageInfoSpecific *raw_get_specific_info(BlockDriverState *bs,
+ Error **errp)
+{
+ ImageInfoSpecificFile *file_info = g_new0(ImageInfoSpecificFile, 1);
+ ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1);
+
+ *spec_info = (ImageInfoSpecific){
+ .type = IMAGE_INFO_SPECIFIC_KIND_FILE,
+ .u.file.data = file_info,
+ };
+
+#ifdef FS_IOC_FSGETXATTR
+ {
+ BDRVRawState *s = bs->opaque;
+ struct fsxattr attr;
+ int ret;
+
+ ret = ioctl(s->fd, FS_IOC_FSGETXATTR, &attr);
+ if (!ret && attr.fsx_extsize != 0) {
+ file_info->has_extent_size_hint = true;
+ file_info->extent_size_hint = attr.fsx_extsize;
+ }
+ }
+#endif
+
+ return spec_info;
+}
+
+static BlockStatsSpecificFile get_blockstats_specific_file(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ return (BlockStatsSpecificFile) {
+ .discard_nb_ok = s->stats.discard_nb_ok,
+ .discard_nb_failed = s->stats.discard_nb_failed,
+ .discard_bytes_ok = s->stats.discard_bytes_ok,
+ };
+}
+
+static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs)
+{
+ BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
+
+ stats->driver = BLOCKDEV_DRIVER_FILE;
+ stats->u.file = get_blockstats_specific_file(bs);
+
+ return stats;
+}
+
+#if defined(HAVE_HOST_BLOCK_DEVICE)
+static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs)
+{
+ BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
+
+ stats->driver = BLOCKDEV_DRIVER_HOST_DEVICE;
+ stats->u.host_device = get_blockstats_specific_file(bs);
+
+ return stats;
+}
+#endif /* HAVE_HOST_BLOCK_DEVICE */
+
static QemuOptsList raw_create_opts = {
.name = "raw-create-opts",
.head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
@@ -2629,7 +3724,16 @@ static QemuOptsList raw_create_opts = {
{
.name = BLOCK_OPT_PREALLOC,
.type = QEMU_OPT_STRING,
- .help = "Preallocation mode (allowed values: off, falloc, full)"
+ .help = "Preallocation mode (allowed values: off"
+#ifdef CONFIG_POSIX_FALLOCATE
+ ", falloc"
+#endif
+ ", full)"
+ },
+ {
+ .name = BLOCK_OPT_EXTENT_SIZE_HINT,
+ .type = QEMU_OPT_SIZE,
+ .help = "Extent size hint for the image file, 0 to disable"
},
{ /* end of list */ }
}
@@ -2638,12 +3742,72 @@ static QemuOptsList raw_create_opts = {
static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
Error **errp)
{
- return raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
+ BDRVRawState *s = bs->opaque;
+ int input_flags = s->reopen_state ? s->reopen_state->flags : bs->open_flags;
+ int open_flags;
+ int ret;
+
+ /* We may need a new fd if auto-read-only switches the mode */
+ ret = raw_reconfigure_getfd(bs, input_flags, &open_flags, perm,
+ false, errp);
+ if (ret < 0) {
+ return ret;
+ } else if (ret != s->fd) {
+ Error *local_err = NULL;
+
+ /*
+ * Fail already check_perm() if we can't get a working O_DIRECT
+ * alignment with the new fd.
+ */
+ raw_probe_alignment(bs, ret, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return -EINVAL;
+ }
+
+ s->perm_change_fd = ret;
+ s->perm_change_flags = open_flags;
+ }
+
+ /* Prepare permissions on old fd to avoid conflicts between old and new,
+ * but keep everything locked that new will need. */
+ ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Copy locks to the new fd */
+ if (s->perm_change_fd && s->use_lock) {
+ ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared,
+ false, errp);
+ if (ret < 0) {
+ raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
+ goto fail;
+ }
+ }
+ return 0;
+
+fail:
+ if (s->perm_change_fd) {
+ qemu_close(s->perm_change_fd);
+ }
+ s->perm_change_fd = 0;
+ return ret;
}
static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
{
BDRVRawState *s = bs->opaque;
+
+ /* For reopen, we have already switched to the new fd (.bdrv_set_perm is
+ * called after .bdrv_reopen_commit) */
+ if (s->perm_change_fd && s->fd != s->perm_change_fd) {
+ qemu_close(s->fd);
+ s->fd = s->perm_change_fd;
+ s->open_flags = s->perm_change_flags;
+ }
+ s->perm_change_fd = 0;
+
raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
s->perm = perm;
s->shared_perm = shared;
@@ -2651,27 +3815,35 @@ static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
static void raw_abort_perm_update(BlockDriverState *bs)
{
+ BDRVRawState *s = bs->opaque;
+
+ /* For reopen, .bdrv_reopen_abort is called afterwards and will close
+ * the file descriptor. */
+ if (s->perm_change_fd) {
+ qemu_close(s->perm_change_fd);
+ }
+ s->perm_change_fd = 0;
+
raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
}
-static int coroutine_fn raw_co_copy_range_from(
- BlockDriverState *bs, BdrvChild *src, uint64_t src_offset,
- BdrvChild *dst, uint64_t dst_offset, uint64_t bytes,
+static int coroutine_fn GRAPH_RDLOCK raw_co_copy_range_from(
+ BlockDriverState *bs, BdrvChild *src, int64_t src_offset,
+ BdrvChild *dst, int64_t dst_offset, int64_t bytes,
BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
{
return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
read_flags, write_flags);
}
-static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
- BdrvChild *src,
- uint64_t src_offset,
- BdrvChild *dst,
- uint64_t dst_offset,
- uint64_t bytes,
- BdrvRequestFlags read_flags,
- BdrvRequestFlags write_flags)
+static int coroutine_fn GRAPH_RDLOCK
+raw_co_copy_range_to(BlockDriverState *bs,
+ BdrvChild *src, int64_t src_offset,
+ BdrvChild *dst, int64_t dst_offset,
+ int64_t bytes, BdrvRequestFlags read_flags,
+ BdrvRequestFlags write_flags)
{
+ RawPosixAIOData acb;
BDRVRawState *s = bs->opaque;
BDRVRawState *src_s;
@@ -2684,8 +3856,20 @@ static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
return -EIO;
}
- return paio_submit_co_full(bs, src_s->fd, src_offset, s->fd, dst_offset,
- NULL, bytes, QEMU_AIO_COPY_RANGE);
+
+ acb = (RawPosixAIOData) {
+ .bs = bs,
+ .aio_type = QEMU_AIO_COPY_RANGE,
+ .aio_fildes = src_s->fd,
+ .aio_offset = src_offset,
+ .aio_nbytes = bytes,
+ .copy_range = {
+ .aio_fd2 = s->fd,
+ .aio_offset2 = dst_offset,
+ },
+ };
+
+ return raw_thread_pool_submit(handle_aiocb_copy_range, &acb);
}
BlockDriver bdrv_file = {
@@ -2706,6 +3890,7 @@ BlockDriver bdrv_file = {
.bdrv_co_block_status = raw_co_block_status,
.bdrv_co_invalidate_cache = raw_co_invalidate_cache,
.bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
+ .bdrv_co_delete_file = raw_co_delete_file,
.bdrv_co_preadv = raw_co_preadv,
.bdrv_co_pwritev = raw_co_pwritev,
@@ -2714,38 +3899,45 @@ BlockDriver bdrv_file = {
.bdrv_co_copy_range_from = raw_co_copy_range_from,
.bdrv_co_copy_range_to = raw_co_copy_range_to,
.bdrv_refresh_limits = raw_refresh_limits,
- .bdrv_io_plug = raw_aio_plug,
- .bdrv_io_unplug = raw_aio_unplug,
- .bdrv_attach_aio_context = raw_aio_attach_aio_context,
-
- .bdrv_co_truncate = raw_co_truncate,
- .bdrv_getlength = raw_getlength,
- .bdrv_get_info = raw_get_info,
- .bdrv_get_allocated_file_size
- = raw_get_allocated_file_size,
+
+ .bdrv_co_truncate = raw_co_truncate,
+ .bdrv_co_getlength = raw_co_getlength,
+ .bdrv_co_get_info = raw_co_get_info,
+ .bdrv_get_specific_info = raw_get_specific_info,
+ .bdrv_co_get_allocated_file_size = raw_co_get_allocated_file_size,
+ .bdrv_get_specific_stats = raw_get_specific_stats,
.bdrv_check_perm = raw_check_perm,
.bdrv_set_perm = raw_set_perm,
.bdrv_abort_perm_update = raw_abort_perm_update,
.create_opts = &raw_create_opts,
+ .mutable_opts = mutable_opts,
};
/***********************************************/
/* host device */
+#if defined(HAVE_HOST_BLOCK_DEVICE)
+
#if defined(__APPLE__) && defined(__MACH__)
static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
CFIndex maxPathSize, int flags);
+
+#if !defined(MAC_OS_VERSION_12_0) \
+ || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_VERSION_12_0)
+#define IOMainPort IOMasterPort
+#endif
+
static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
{
kern_return_t kernResult = KERN_FAILURE;
- mach_port_t masterPort;
+ mach_port_t mainPort;
CFMutableDictionaryRef classesToMatch;
const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
char *mediaType = NULL;
- kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
+ kernResult = IOMainPort(MACH_PORT_NULL, &mainPort);
if ( KERN_SUCCESS != kernResult ) {
- printf( "IOMasterPort returned %d\n", kernResult );
+ printf("IOMainPort returned %d\n", kernResult);
}
int index;
@@ -2758,7 +3950,7 @@ static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
}
CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
kCFBooleanTrue);
- kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
+ kernResult = IOServiceGetMatchingServices(mainPort, classesToMatch,
mediaIterator);
if (kernResult != KERN_SUCCESS) {
error_report("Note: IOServiceGetMatchingServices returned %d",
@@ -2768,7 +3960,7 @@ static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
/* If a match was found, leave the loop */
if (*mediaIterator != 0) {
- DPRINTF("Matching using %s\n", matching_array[index]);
+ trace_file_FindEjectableOpticalMedia(matching_array[index]);
mediaType = g_strdup(matching_array[index]);
break;
}
@@ -2816,7 +4008,7 @@ static bool setup_cdrom(char *bsd_path, Error **errp)
for (index = 0; index < num_of_test_partitions; index++) {
snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
index);
- fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE);
+ fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE, NULL);
if (fd >= 0) {
partition_found = true;
qemu_close(fd);
@@ -2828,7 +4020,7 @@ static bool setup_cdrom(char *bsd_path, Error **errp)
if (partition_found == false) {
error_setg(errp, "Failed to find a working partition on disc");
} else {
- DPRINTF("Using %s as optical disc\n", test_partition);
+ trace_file_setup_cdrom(test_partition);
pstrcpy(bsd_path, MAXPATHLEN, test_partition);
}
return partition_found;
@@ -2862,39 +4054,6 @@ static int hdev_probe_device(const char *filename)
return 0;
}
-static int check_hdev_writable(BDRVRawState *s)
-{
-#if defined(BLKROGET)
- /* Linux block devices can be configured "read-only" using blockdev(8).
- * This is independent of device node permissions and therefore open(2)
- * with O_RDWR succeeds. Actual writes fail with EPERM.
- *
- * bdrv_open() is supposed to fail if the disk is read-only. Explicitly
- * check for read-only block devices so that Linux block devices behave
- * properly.
- */
- struct stat st;
- int readonly = 0;
-
- if (fstat(s->fd, &st)) {
- return -errno;
- }
-
- if (!S_ISBLK(st.st_mode)) {
- return 0;
- }
-
- if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
- return -errno;
- }
-
- if (readonly) {
- return -EACCES;
- }
-#endif /* defined(BLKROGET) */
- return 0;
-}
-
static void hdev_parse_filename(const char *filename, QDict *options,
Error **errp)
{
@@ -2923,8 +4082,7 @@ static bool hdev_is_sg(BlockDriverState *bs)
ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
if (ret >= 0) {
- DPRINTF("SG device found: type=%d, version=%d\n",
- scsiid.scsi_type, sg_version);
+ trace_file_hdev_is_sg(scsiid.scsi_type, sg_version);
return true;
}
@@ -2937,7 +4095,6 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
Error **errp)
{
BDRVRawState *s = bs->opaque;
- Error *local_err = NULL;
int ret;
#if defined(__APPLE__) && defined(__MACH__)
@@ -3002,9 +4159,8 @@ hdev_open_Mac_error:
s->type = FTYPE_FILE;
- ret = raw_open_common(bs, options, flags, 0, true, &local_err);
+ ret = raw_open_common(bs, options, flags, 0, true, errp);
if (ret < 0) {
- error_propagate(errp, local_err);
#if defined(__APPLE__) && defined(__MACH__)
if (*bsd_path) {
filename = bsd_path;
@@ -3020,81 +4176,63 @@ hdev_open_Mac_error:
/* Since this does ioctl the device must be already opened */
bs->sg = hdev_is_sg(bs);
- if (flags & BDRV_O_RDWR) {
- ret = check_hdev_writable(s);
- if (ret < 0) {
- raw_close(bs);
- error_setg_errno(errp, -ret, "The device is not writable");
- return ret;
- }
- }
-
return ret;
}
#if defined(__linux__)
-
-static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
- unsigned long int req, void *buf,
- BlockCompletionFunc *cb, void *opaque)
+static int coroutine_fn
+hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
BDRVRawState *s = bs->opaque;
- RawPosixAIOData *acb;
- ThreadPool *pool;
+ RawPosixAIOData acb;
+ int ret;
- if (fd_open(bs) < 0)
- return NULL;
+ ret = fd_open(bs);
+ if (ret < 0) {
+ return ret;
+ }
if (req == SG_IO && s->pr_mgr) {
struct sg_io_hdr *io_hdr = buf;
if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
- return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
- s->fd, io_hdr, cb, opaque);
+ return pr_manager_execute(s->pr_mgr, qemu_get_current_aio_context(),
+ s->fd, io_hdr);
}
}
- acb = g_new(RawPosixAIOData, 1);
- acb->bs = bs;
- acb->aio_type = QEMU_AIO_IOCTL;
- acb->aio_fildes = s->fd;
- acb->aio_offset = 0;
- acb->aio_ioctl_buf = buf;
- acb->aio_ioctl_cmd = req;
- pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
- return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
-}
-#endif /* linux */
-
-static int fd_open(BlockDriverState *bs)
-{
- BDRVRawState *s = bs->opaque;
+ acb = (RawPosixAIOData) {
+ .bs = bs,
+ .aio_type = QEMU_AIO_IOCTL,
+ .aio_fildes = s->fd,
+ .aio_offset = 0,
+ .ioctl = {
+ .buf = buf,
+ .cmd = req,
+ },
+ };
- /* this is just to ensure s->fd is sane (its called by io ops) */
- if (s->fd >= 0)
- return 0;
- return -EIO;
+ return raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
}
+#endif /* linux */
static coroutine_fn int
-hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
+hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
BDRVRawState *s = bs->opaque;
int ret;
ret = fd_open(bs);
if (ret < 0) {
+ raw_account_discard(s, bytes, ret);
return ret;
}
- return paio_submit_co(bs, s->fd, offset, NULL, bytes,
- QEMU_AIO_DISCARD | QEMU_AIO_BLKDEV);
+ return raw_do_pdiscard(bs, offset, bytes, true);
}
static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
- int64_t offset, int bytes, BdrvRequestFlags flags)
+ int64_t offset, int64_t bytes, BdrvRequestFlags flags)
{
- BDRVRawState *s = bs->opaque;
- int operation = QEMU_AIO_WRITE_ZEROES | QEMU_AIO_BLKDEV;
int rc;
rc = fd_open(bs);
@@ -3102,73 +4240,7 @@ static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
return rc;
}
- if (flags & BDRV_REQ_MAY_UNMAP) {
- operation |= QEMU_AIO_DISCARD;
- }
-
- return paio_submit_co(bs, s->fd, offset, NULL, bytes, operation);
-}
-
-static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts,
- Error **errp)
-{
- int fd;
- int ret = 0;
- struct stat stat_buf;
- int64_t total_size = 0;
- bool has_prefix;
-
- /* This function is used by both protocol block drivers and therefore either
- * of these prefixes may be given.
- * The return value has to be stored somewhere, otherwise this is an error
- * due to -Werror=unused-value. */
- has_prefix =
- strstart(filename, "host_device:", &filename) ||
- strstart(filename, "host_cdrom:" , &filename);
-
- (void)has_prefix;
-
- ret = raw_normalize_devicepath(&filename);
- if (ret < 0) {
- error_setg_errno(errp, -ret, "Could not normalize device path");
- return ret;
- }
-
- /* Read out options */
- total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
- BDRV_SECTOR_SIZE);
-
- fd = qemu_open(filename, O_WRONLY | O_BINARY);
- if (fd < 0) {
- ret = -errno;
- error_setg_errno(errp, -ret, "Could not open device");
- return ret;
- }
-
- if (fstat(fd, &stat_buf) < 0) {
- ret = -errno;
- error_setg_errno(errp, -ret, "Could not stat device");
- } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) {
- error_setg(errp,
- "The given file is neither a block nor a character device");
- ret = -ENODEV;
- } else if (lseek(fd, 0, SEEK_END) < total_size) {
- error_setg(errp, "Device is too small");
- ret = -ENOSPC;
- }
-
- if (!ret && total_size) {
- uint8_t buf[BDRV_SECTOR_SIZE] = { 0 };
- int64_t zero_size = MIN(BDRV_SECTOR_SIZE, total_size);
- if (lseek(fd, 0, SEEK_SET) == -1) {
- ret = -errno;
- } else {
- ret = qemu_write_full(fd, buf, zero_size);
- ret = ret == zero_size ? 0 : -errno;
- }
- }
- qemu_close(fd);
- return ret;
+ return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
}
static BlockDriver bdrv_host_device = {
@@ -3183,8 +4255,9 @@ static BlockDriver bdrv_host_device = {
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
- .bdrv_co_create_opts = hdev_co_create_opts,
- .create_opts = &raw_create_opts,
+ .bdrv_co_create_opts = bdrv_co_create_opts_simple,
+ .create_opts = &bdrv_create_opts_simple,
+ .mutable_opts = mutable_opts,
.bdrv_co_invalidate_cache = raw_co_invalidate_cache,
.bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
@@ -3195,15 +4268,13 @@ static BlockDriver bdrv_host_device = {
.bdrv_co_copy_range_from = raw_co_copy_range_from,
.bdrv_co_copy_range_to = raw_co_copy_range_to,
.bdrv_refresh_limits = raw_refresh_limits,
- .bdrv_io_plug = raw_aio_plug,
- .bdrv_io_unplug = raw_aio_unplug,
- .bdrv_attach_aio_context = raw_aio_attach_aio_context,
-
- .bdrv_co_truncate = raw_co_truncate,
- .bdrv_getlength = raw_getlength,
- .bdrv_get_info = raw_get_info,
- .bdrv_get_allocated_file_size
- = raw_get_allocated_file_size,
+
+ .bdrv_co_truncate = raw_co_truncate,
+ .bdrv_co_getlength = raw_co_getlength,
+ .bdrv_co_get_info = raw_co_get_info,
+ .bdrv_get_specific_info = raw_get_specific_info,
+ .bdrv_co_get_allocated_file_size = raw_co_get_allocated_file_size,
+ .bdrv_get_specific_stats = hdev_get_specific_stats,
.bdrv_check_perm = raw_check_perm,
.bdrv_set_perm = raw_set_perm,
.bdrv_abort_perm_update = raw_abort_perm_update,
@@ -3212,7 +4283,15 @@ static BlockDriver bdrv_host_device = {
/* generic scsi device */
#ifdef __linux__
- .bdrv_aio_ioctl = hdev_aio_ioctl,
+ .bdrv_co_ioctl = hdev_co_ioctl,
+#endif
+
+ /* zoned device */
+#if defined(CONFIG_BLKZONED)
+ /* zone management operations */
+ .bdrv_co_zone_report = raw_co_zone_report,
+ .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+ .bdrv_co_zone_append = raw_co_zone_append,
#endif
};
@@ -3222,6 +4301,12 @@ static void cdrom_parse_filename(const char *filename, QDict *options,
{
bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
}
+
+static void cdrom_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+ bs->bl.has_variable_length = true;
+ raw_refresh_limits(bs, errp);
+}
#endif
#ifdef __linux__
@@ -3242,7 +4327,7 @@ static int cdrom_probe_device(const char *filename)
int prio = 0;
struct stat st;
- fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
+ fd = qemu_open(filename, O_RDONLY | O_NONBLOCK, NULL);
if (fd < 0) {
goto out;
}
@@ -3262,7 +4347,7 @@ out:
return prio;
}
-static bool cdrom_is_inserted(BlockDriverState *bs)
+static bool coroutine_fn cdrom_co_is_inserted(BlockDriverState *bs)
{
BDRVRawState *s = bs->opaque;
int ret;
@@ -3271,7 +4356,7 @@ static bool cdrom_is_inserted(BlockDriverState *bs)
return ret == CDS_DISC_OK;
}
-static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
+static void coroutine_fn cdrom_co_eject(BlockDriverState *bs, bool eject_flag)
{
BDRVRawState *s = bs->opaque;
@@ -3284,7 +4369,7 @@ static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
}
}
-static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
+static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked)
{
BDRVRawState *s = bs->opaque;
@@ -3309,32 +4394,27 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
- .bdrv_co_create_opts = hdev_co_create_opts,
- .create_opts = &raw_create_opts,
+ .bdrv_co_create_opts = bdrv_co_create_opts_simple,
+ .create_opts = &bdrv_create_opts_simple,
+ .mutable_opts = mutable_opts,
.bdrv_co_invalidate_cache = raw_co_invalidate_cache,
-
.bdrv_co_preadv = raw_co_preadv,
.bdrv_co_pwritev = raw_co_pwritev,
.bdrv_co_flush_to_disk = raw_co_flush_to_disk,
- .bdrv_refresh_limits = raw_refresh_limits,
- .bdrv_io_plug = raw_aio_plug,
- .bdrv_io_unplug = raw_aio_unplug,
- .bdrv_attach_aio_context = raw_aio_attach_aio_context,
+ .bdrv_refresh_limits = cdrom_refresh_limits,
- .bdrv_co_truncate = raw_co_truncate,
- .bdrv_getlength = raw_getlength,
- .has_variable_length = true,
- .bdrv_get_allocated_file_size
- = raw_get_allocated_file_size,
+ .bdrv_co_truncate = raw_co_truncate,
+ .bdrv_co_getlength = raw_co_getlength,
+ .bdrv_co_get_allocated_file_size = raw_co_get_allocated_file_size,
/* removable device support */
- .bdrv_is_inserted = cdrom_is_inserted,
- .bdrv_eject = cdrom_eject,
- .bdrv_lock_medium = cdrom_lock_medium,
+ .bdrv_co_is_inserted = cdrom_co_is_inserted,
+ .bdrv_co_eject = cdrom_co_eject,
+ .bdrv_co_lock_medium = cdrom_co_lock_medium,
/* generic scsi device */
- .bdrv_aio_ioctl = hdev_aio_ioctl,
+ .bdrv_co_ioctl = hdev_co_ioctl,
};
#endif /* __linux__ */
@@ -3343,14 +4423,12 @@ static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
Error **errp)
{
BDRVRawState *s = bs->opaque;
- Error *local_err = NULL;
int ret;
s->type = FTYPE_CD;
- ret = raw_open_common(bs, options, flags, 0, true, &local_err);
+ ret = raw_open_common(bs, options, flags, 0, true, errp);
if (ret) {
- error_propagate(errp, local_err);
return ret;
}
@@ -3378,7 +4456,7 @@ static int cdrom_reopen(BlockDriverState *bs)
*/
if (s->fd >= 0)
qemu_close(s->fd);
- fd = qemu_open(bs->filename, s->open_flags, 0644);
+ fd = qemu_open(bs->filename, s->open_flags, NULL);
if (fd < 0) {
s->fd = -1;
return -EIO;
@@ -3390,12 +4468,12 @@ static int cdrom_reopen(BlockDriverState *bs)
return 0;
}
-static bool cdrom_is_inserted(BlockDriverState *bs)
+static bool coroutine_fn cdrom_co_is_inserted(BlockDriverState *bs)
{
return raw_getlength(bs) > 0;
}
-static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
+static void coroutine_fn cdrom_co_eject(BlockDriverState *bs, bool eject_flag)
{
BDRVRawState *s = bs->opaque;
@@ -3415,7 +4493,7 @@ static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
cdrom_reopen(bs);
}
-static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
+static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked)
{
BDRVRawState *s = bs->opaque;
@@ -3442,30 +4520,28 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
- .bdrv_co_create_opts = hdev_co_create_opts,
- .create_opts = &raw_create_opts,
+ .bdrv_co_create_opts = bdrv_co_create_opts_simple,
+ .create_opts = &bdrv_create_opts_simple,
+ .mutable_opts = mutable_opts,
.bdrv_co_preadv = raw_co_preadv,
.bdrv_co_pwritev = raw_co_pwritev,
.bdrv_co_flush_to_disk = raw_co_flush_to_disk,
- .bdrv_refresh_limits = raw_refresh_limits,
- .bdrv_io_plug = raw_aio_plug,
- .bdrv_io_unplug = raw_aio_unplug,
- .bdrv_attach_aio_context = raw_aio_attach_aio_context,
+ .bdrv_refresh_limits = cdrom_refresh_limits,
- .bdrv_co_truncate = raw_co_truncate,
- .bdrv_getlength = raw_getlength,
- .has_variable_length = true,
- .bdrv_get_allocated_file_size
- = raw_get_allocated_file_size,
+ .bdrv_co_truncate = raw_co_truncate,
+ .bdrv_co_getlength = raw_co_getlength,
+ .bdrv_co_get_allocated_file_size = raw_co_get_allocated_file_size,
/* removable device support */
- .bdrv_is_inserted = cdrom_is_inserted,
- .bdrv_eject = cdrom_eject,
- .bdrv_lock_medium = cdrom_lock_medium,
+ .bdrv_co_is_inserted = cdrom_co_is_inserted,
+ .bdrv_co_eject = cdrom_co_eject,
+ .bdrv_co_lock_medium = cdrom_co_lock_medium,
};
#endif /* __FreeBSD__ */
+#endif /* HAVE_HOST_BLOCK_DEVICE */
+
static void bdrv_file_init(void)
{
/*
@@ -3473,6 +4549,7 @@ static void bdrv_file_init(void)
* registered last will get probed first.
*/
bdrv_register(&bdrv_file);
+#if defined(HAVE_HOST_BLOCK_DEVICE)
bdrv_register(&bdrv_host_device);
#ifdef __linux__
bdrv_register(&bdrv_host_cdrom);
@@ -3480,6 +4557,7 @@ static void bdrv_file_init(void)
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
bdrv_register(&bdrv_host_cdrom);
#endif
+#endif /* HAVE_HOST_BLOCK_DEVICE */
}
block_init(bdrv_file_init);