diff options
Diffstat (limited to 'block/file-posix.c')
-rw-r--r-- | block/file-posix.c | 2742 |
1 files changed, 1910 insertions, 832 deletions
diff --git a/block/file-posix.c b/block/file-posix.c index 2da3a76355..35684f7e21 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -26,9 +26,12 @@ #include "qapi/error.h" #include "qemu/cutils.h" #include "qemu/error-report.h" +#include "block/block-io.h" #include "block/block_int.h" #include "qemu/module.h" #include "qemu/option.h" +#include "qemu/units.h" +#include "qemu/memalign.h" #include "trace.h" #include "block/thread-pool.h" #include "qemu/iov.h" @@ -40,8 +43,11 @@ #include "scsi/constants.h" #if defined(__APPLE__) && (__MACH__) +#include <sys/ioctl.h> +#if defined(HAVE_HOST_BLOCK_DEVICE) #include <paths.h> #include <sys/param.h> +#include <sys/mount.h> #include <IOKit/IOKitLib.h> #include <IOKit/IOBSD.h> #include <IOKit/storage/IOMediaBSDClient.h> @@ -50,6 +56,7 @@ //#include <IOKit/storage/IOCDTypes.h> #include <IOKit/storage/IODVDMedia.h> #include <CoreFoundation/CoreFoundation.h> +#endif /* defined(HAVE_HOST_BLOCK_DEVICE) */ #endif #ifdef __sun__ @@ -60,10 +67,15 @@ #include <sys/ioctl.h> #include <sys/param.h> #include <sys/syscall.h> +#include <sys/vfs.h> +#if defined(CONFIG_BLKZONED) +#include <linux/blkzoned.h> +#endif #include <linux/cdrom.h> #include <linux/fd.h> #include <linux/fs.h> #include <linux/hdreg.h> +#include <linux/magic.h> #include <scsi/sg.h> #ifdef __s390__ #include <asm/dasd.h> @@ -98,24 +110,6 @@ #include <sys/diskslice.h> #endif -#ifdef CONFIG_XFS -#include <xfs/xfs.h> -#endif - -//#define DEBUG_BLOCK - -#ifdef DEBUG_BLOCK -# define DEBUG_BLOCK_PRINT 1 -#else -# define DEBUG_BLOCK_PRINT 0 -#endif -#define DPRINTF(fmt, ...) \ -do { \ - if (DEBUG_BLOCK_PRINT) { \ - printf(fmt, ## __VA_ARGS__); \ - } \ -} while (0) - /* OS X does not have O_DSYNC */ #ifndef O_DSYNC #ifdef O_SYNC @@ -142,7 +136,6 @@ do { \ typedef struct BDRVRawState { int fd; - int lock_fd; bool use_lock; int type; int open_flags; @@ -152,51 +145,87 @@ typedef struct BDRVRawState { uint64_t perm; uint64_t shared_perm; -#ifdef CONFIG_XFS - bool is_xfs:1; -#endif + /* The perms bits whose corresponding bytes are already locked in + * s->fd. */ + uint64_t locked_perm; + uint64_t locked_shared_perm; + + uint64_t aio_max_batch; + + int perm_change_fd; + int perm_change_flags; + BDRVReopenState *reopen_state; + bool has_discard:1; bool has_write_zeroes:1; - bool discard_zeroes:1; bool use_linux_aio:1; - bool page_cache_inconsistent:1; + bool use_linux_io_uring:1; + int page_cache_inconsistent; /* errno from fdatasync failure */ bool has_fallocate; bool needs_alignment; + bool force_alignment; + bool drop_cache; bool check_cache_dropped; + struct { + uint64_t discard_nb_ok; + uint64_t discard_nb_failed; + uint64_t discard_bytes_ok; + } stats; PRManager *pr_mgr; } BDRVRawState; typedef struct BDRVRawReopenState { - int fd; int open_flags; + bool drop_cache; bool check_cache_dropped; } BDRVRawReopenState; -static int fd_open(BlockDriverState *bs); +static int fd_open(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + + /* this is just to ensure s->fd is sane (its called by io ops) */ + if (s->fd >= 0) { + return 0; + } + return -EIO; +} + static int64_t raw_getlength(BlockDriverState *bs); typedef struct RawPosixAIOData { BlockDriverState *bs; + int aio_type; int aio_fildes; - union { - struct iovec *aio_iov; - void *aio_ioctl_buf; - }; - int aio_niov; - uint64_t aio_nbytes; -#define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ + off_t aio_offset; - int aio_type; + uint64_t aio_nbytes; + union { struct { + struct iovec *iov; + int niov; + } io; + struct { + uint64_t cmd; + void *buf; + } ioctl; + struct { int aio_fd2; off_t aio_offset2; - }; + } copy_range; struct { PreallocMode prealloc; Error **errp; - }; + } truncate; + struct { + unsigned int *nr_zones; + BlockZoneDescriptor *zones; + } zone_report; + struct { + unsigned long op; + } zone_mgmt; }; } RawPosixAIOData; @@ -204,8 +233,22 @@ typedef struct RawPosixAIOData { static int cdrom_reopen(BlockDriverState *bs); #endif +/* + * Elide EAGAIN and EACCES details when failing to lock, as this + * indicates that the specified file region is already locked by + * another process, which is considered a common scenario. + */ +#define raw_lock_error_setg_errno(errp, err, fmt, ...) \ + do { \ + if ((err) == EAGAIN || (err) == EACCES) { \ + error_setg((errp), (fmt), ## __VA_ARGS__); \ + } else { \ + error_setg_errno((errp), (err), (fmt), ## __VA_ARGS__); \ + } \ + } while (0) + #if defined(__NetBSD__) -static int raw_normalize_devicepath(const char **filename) +static int raw_normalize_devicepath(const char **filename, Error **errp) { static char namebuf[PATH_MAX]; const char *dp, *fname; @@ -214,8 +257,7 @@ static int raw_normalize_devicepath(const char **filename) fname = *filename; dp = strrchr(fname, '/'); if (lstat(fname, &sb) < 0) { - fprintf(stderr, "%s: stat failed: %s\n", - fname, strerror(errno)); + error_setg_file_open(errp, errno, fname); return -errno; } @@ -229,14 +271,13 @@ static int raw_normalize_devicepath(const char **filename) snprintf(namebuf, PATH_MAX, "%.*s/r%s", (int)(dp - fname), fname, dp + 1); } - fprintf(stderr, "%s is a block device", fname); *filename = namebuf; - fprintf(stderr, ", using %s\n", *filename); + warn_report("%s is a block device, using %s", fname, *filename); return 0; } #else -static int raw_normalize_devicepath(const char **filename) +static int raw_normalize_devicepath(const char **filename, Error **errp) { return 0; } @@ -292,6 +333,39 @@ static int probe_physical_blocksize(int fd, unsigned int *blk_size) #endif } +/* + * Returns true if no alignment restrictions are necessary even for files + * opened with O_DIRECT. + * + * raw_probe_alignment() probes the required alignment and assume that 1 means + * the probing failed, so it falls back to a safe default of 4k. This can be + * avoided if we know that byte alignment is okay for the file. + */ +static bool dio_byte_aligned(int fd) +{ +#ifdef __linux__ + struct statfs buf; + int ret; + + ret = fstatfs(fd, &buf); + if (ret == 0 && buf.f_type == NFS_SUPER_MAGIC) { + return true; + } +#endif + return false; +} + +static bool raw_needs_alignment(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + + if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) { + return true; + } + + return s->force_alignment; +} + /* Check if read is allowed with given memory buffer and length. * * This function is used to check O_DIRECT memory buffer and request alignment. @@ -321,7 +395,8 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) { BDRVRawState *s = bs->opaque; char *buf; - size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize()); + size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size()); + size_t alignments[] = {1, 512, 1024, 2048, 4096}; /* For SCSI generic devices the alignment is not really used. With buffered I/O, we don't have any restrictions. */ @@ -337,36 +412,57 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) { bs->bl.request_alignment = 0; } -#ifdef CONFIG_XFS - if (s->is_xfs) { - struct dioattr da; - if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) { - bs->bl.request_alignment = da.d_miniosz; - /* The kernel returns wrong information for d_mem */ - /* s->buf_align = da.d_mem; */ - } + +#ifdef __linux__ + /* + * The XFS ioctl definitions are shipped in extra packages that might + * not always be available. Since we just need the XFS_IOC_DIOINFO ioctl + * here, we simply use our own definition instead: + */ + struct xfs_dioattr { + uint32_t d_mem; + uint32_t d_miniosz; + uint32_t d_maxiosz; + } da; + if (ioctl(fd, _IOR('X', 30, struct xfs_dioattr), &da) >= 0) { + bs->bl.request_alignment = da.d_miniosz; + /* The kernel returns wrong information for d_mem */ + /* s->buf_align = da.d_mem; */ } #endif - /* If we could not get the sizes so far, we can only guess them */ - if (!s->buf_align) { + /* + * If we could not get the sizes so far, we can only guess them. First try + * to detect request alignment, since it is more likely to succeed. Then + * try to detect buf_align, which cannot be detected in some cases (e.g. + * Gluster). If buf_align cannot be detected, we fallback to the value of + * request_alignment. + */ + + if (!bs->bl.request_alignment) { + int i; size_t align; - buf = qemu_memalign(max_align, 2 * max_align); - for (align = 512; align <= max_align; align <<= 1) { - if (raw_is_io_aligned(fd, buf + align, max_align)) { - s->buf_align = align; + buf = qemu_memalign(max_align, max_align); + for (i = 0; i < ARRAY_SIZE(alignments); i++) { + align = alignments[i]; + if (raw_is_io_aligned(fd, buf, align)) { + /* Fallback to safe value. */ + bs->bl.request_alignment = (align != 1) ? align : max_align; break; } } qemu_vfree(buf); } - if (!bs->bl.request_alignment) { + if (!s->buf_align) { + int i; size_t align; - buf = qemu_memalign(s->buf_align, max_align); - for (align = 512; align <= max_align; align <<= 1) { - if (raw_is_io_aligned(fd, buf, align)) { - bs->bl.request_alignment = align; + buf = qemu_memalign(max_align, 2 * max_align); + for (i = 0; i < ARRAY_SIZE(alignments); i++) { + align = alignments[i]; + if (raw_is_io_aligned(fd, buf + align, max_align)) { + /* Fallback to request_alignment. */ + s->buf_align = (align != 1) ? align : bs->bl.request_alignment; break; } } @@ -379,13 +475,54 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) } } -static void raw_parse_flags(int bdrv_flags, int *open_flags) +static int check_hdev_writable(int fd) { +#if defined(BLKROGET) + /* Linux block devices can be configured "read-only" using blockdev(8). + * This is independent of device node permissions and therefore open(2) + * with O_RDWR succeeds. Actual writes fail with EPERM. + * + * bdrv_open() is supposed to fail if the disk is read-only. Explicitly + * check for read-only block devices so that Linux block devices behave + * properly. + */ + struct stat st; + int readonly = 0; + + if (fstat(fd, &st)) { + return -errno; + } + + if (!S_ISBLK(st.st_mode)) { + return 0; + } + + if (ioctl(fd, BLKROGET, &readonly) < 0) { + return -errno; + } + + if (readonly) { + return -EACCES; + } +#endif /* defined(BLKROGET) */ + return 0; +} + +static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers) +{ + bool read_write = false; assert(open_flags != NULL); *open_flags |= O_BINARY; *open_flags &= ~O_ACCMODE; - if (bdrv_flags & BDRV_O_RDWR) { + + if (bdrv_flags & BDRV_O_AUTO_RDONLY) { + read_write = has_writers; + } else if (bdrv_flags & BDRV_O_RDWR) { + read_write = true; + } + + if (read_write) { *open_flags |= O_RDWR; } else { *open_flags |= O_RDONLY; @@ -416,7 +553,12 @@ static QemuOptsList raw_runtime_opts = { { .name = "aio", .type = QEMU_OPT_STRING, - .help = "host AIO implementation (threads, native)", + .help = "host AIO implementation (threads, native, io_uring)", + }, + { + .name = "aio-max-batch", + .type = QEMU_OPT_NUMBER, + .help = "AIO max batch size (0 = auto handled by AIO backend, default: 0)", }, { .name = "locking", @@ -428,6 +570,13 @@ static QemuOptsList raw_runtime_opts = { .type = QEMU_OPT_STRING, .help = "id of persistent reservation manager object (default: none)", }, +#if defined(__linux__) + { + .name = "drop-cache", + .type = QEMU_OPT_BOOL, + .help = "invalidate page cache during live migration (default: on)", + }, +#endif { .name = "x-check-cache-dropped", .type = QEMU_OPT_BOOL, @@ -437,6 +586,8 @@ static QemuOptsList raw_runtime_opts = { }, }; +static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL }; + static int raw_open_common(BlockDriverState *bs, QDict *options, int bdrv_flags, int open_flags, bool device, Error **errp) @@ -452,24 +603,28 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, OnOffAuto locking; opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!qemu_opts_absorb_qdict(opts, options, errp)) { ret = -EINVAL; goto fail; } filename = qemu_opt_get(opts, "filename"); - ret = raw_normalize_devicepath(&filename); + ret = raw_normalize_devicepath(&filename, errp); if (ret != 0) { - error_setg_errno(errp, -ret, "Could not normalize device path"); goto fail; } - aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO) - ? BLOCKDEV_AIO_OPTIONS_NATIVE - : BLOCKDEV_AIO_OPTIONS_THREADS; + if (bdrv_flags & BDRV_O_NATIVE_AIO) { + aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE; +#ifdef CONFIG_LINUX_IO_URING + } else if (bdrv_flags & BDRV_O_IO_URING) { + aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING; +#endif + } else { + aio_default = BLOCKDEV_AIO_OPTIONS_THREADS; + } + aio = qapi_enum_parse(&BlockdevAioOptions_lookup, qemu_opt_get(opts, "aio"), aio_default, &local_err); @@ -478,7 +633,13 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, ret = -EINVAL; goto fail; } + s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE); +#ifdef CONFIG_LINUX_IO_URING + s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING); +#endif + + s->aio_max_batch = qemu_opt_get_number(opts, "aio-max-batch", 0); locking = qapi_enum_parse(&OnOffAuto_lookup, qemu_opt_get(opts, "locking"), @@ -492,11 +653,10 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, case ON_OFF_AUTO_ON: s->use_lock = true; if (!qemu_has_ofd_lock()) { - fprintf(stderr, - "File lock requested but OFD locking syscall is " - "unavailable, falling back to POSIX file locks.\n" - "Due to the implementation, locks can be lost " - "unexpectedly.\n"); + warn_report("File lock requested but OFD locking syscall is " + "unavailable, falling back to POSIX file locks"); + error_printf("Due to the implementation, locks can be lost " + "unexpectedly.\n"); } break; case ON_OFF_AUTO_OFF: @@ -519,17 +679,18 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } } + s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true); s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped", false); s->open_flags = open_flags; - raw_parse_flags(bdrv_flags, &s->open_flags); + raw_parse_flags(bdrv_flags, &s->open_flags, false); s->fd = -1; - fd = qemu_open(filename, s->open_flags, 0644); - if (fd < 0) { - ret = -errno; - error_setg_errno(errp, errno, "Could not open '%s'", filename); + fd = qemu_open(filename, s->open_flags, errp); + ret = fd < 0 ? -errno : 0; + + if (ret < 0) { if (ret == -EROFS) { ret = -EACCES; } @@ -537,34 +698,25 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } s->fd = fd; - s->lock_fd = -1; - if (s->use_lock) { - fd = qemu_open(filename, s->open_flags); - if (fd < 0) { - ret = -errno; - error_setg_errno(errp, errno, "Could not open '%s' for locking", - filename); - qemu_close(s->fd); + /* Check s->open_flags rather than bdrv_flags due to auto-read-only */ + if (s->open_flags & O_RDWR) { + ret = check_hdev_writable(s->fd); + if (ret < 0) { + error_setg_errno(errp, -ret, "The device is not writable"); goto fail; } - s->lock_fd = fd; } + s->perm = 0; s->shared_perm = BLK_PERM_ALL; #ifdef CONFIG_LINUX_AIO /* Currently Linux does AIO only for files opened with O_DIRECT */ - if (s->use_linux_aio) { - if (!(s->open_flags & O_DIRECT)) { - error_setg(errp, "aio=native was specified, but it requires " - "cache.direct=on, which was not specified."); - ret = -EINVAL; - goto fail; - } - if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) { - error_prepend(errp, "Unable to use native AIO: "); - goto fail; - } + if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) { + error_setg(errp, "aio=native was specified, but it requires " + "cache.direct=on, which was not specified."); + ret = -EINVAL; + goto fail; } #else if (s->use_linux_aio) { @@ -575,11 +727,17 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } #endif /* !defined(CONFIG_LINUX_AIO) */ +#ifndef CONFIG_LINUX_IO_URING + if (s->use_linux_io_uring) { + error_setg(errp, "aio=io_uring was specified, but is not supported " + "in this build."); + ret = -EINVAL; + goto fail; + } +#endif /* !defined(CONFIG_LINUX_IO_URING) */ + s->has_discard = true; s->has_write_zeroes = true; - if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { - s->needs_alignment = true; - } if (fstat(s->fd, &st) < 0) { ret = -errno; @@ -588,44 +746,43 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } if (!device) { - if (S_ISBLK(st.st_mode)) { - warn_report("Opening a block device as a file using the '%s' " - "driver is deprecated", bs->drv->format_name); - } else if (S_ISCHR(st.st_mode)) { - warn_report("Opening a character device as a file using the '%s' " - "driver is deprecated", bs->drv->format_name); - } else if (!S_ISREG(st.st_mode)) { - error_setg(errp, "A regular file was expected by the '%s' driver, " - "but something else was given", bs->drv->format_name); + if (!S_ISREG(st.st_mode)) { + error_setg(errp, "'%s' driver requires '%s' to be a regular file", + bs->drv->format_name, bs->filename); ret = -EINVAL; goto fail; } else { - s->discard_zeroes = true; s->has_fallocate = true; } } else { if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) { - error_setg(errp, "'%s' driver expects either " - "a character or block device", bs->drv->format_name); + error_setg(errp, "'%s' driver requires '%s' to be either " + "a character or block device", + bs->drv->format_name, bs->filename); ret = -EINVAL; goto fail; } } +#ifdef CONFIG_BLKZONED + /* + * The kernel page cache does not reliably work for writes to SWR zones + * of zoned block device because it can not guarantee the order of writes. + */ + if ((bs->bl.zoned != BLK_Z_NONE) && + (!(s->open_flags & O_DIRECT))) { + error_setg(errp, "The driver supports zoned devices, and it requires " + "cache.direct=on, which was not specified."); + return -EINVAL; /* No host kernel page cache */ + } +#endif if (S_ISBLK(st.st_mode)) { -#ifdef BLKDISCARDZEROES - unsigned int arg; - if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { - s->discard_zeroes = true; - } -#endif #ifdef __linux__ /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do * not rely on the contents of discarded blocks unless using O_DIRECT. * Same for BLKZEROOUT. */ if (!(bs->open_flags & BDRV_O_NOCACHE)) { - s->discard_zeroes = false; s->has_write_zeroes = false; } #endif @@ -638,19 +795,21 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, * so QEMU makes sure all IO operations on the device are aligned * to sector size, or else FreeBSD will reject them with EINVAL. */ - s->needs_alignment = true; + s->force_alignment = true; } #endif + s->needs_alignment = raw_needs_alignment(bs); -#ifdef CONFIG_XFS - if (platform_test_xfs_fd(s->fd)) { - s->is_xfs = true; + bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK; + if (S_ISREG(st.st_mode)) { + /* When extending regular files, we get zeros from the OS */ + bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE; } -#endif - - bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP; ret = 0; fail: + if (ret < 0 && s->fd != -1) { + qemu_close(s->fd); + } if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { unlink(filename); } @@ -680,43 +839,74 @@ typedef enum { * file; if @unlock == true, also unlock the unneeded bytes. * @shared_perm_lock_bits is the mask of all permissions that are NOT shared. */ -static int raw_apply_lock_bytes(int fd, +static int raw_apply_lock_bytes(BDRVRawState *s, int fd, uint64_t perm_lock_bits, uint64_t shared_perm_lock_bits, bool unlock, Error **errp) { int ret; int i; + uint64_t locked_perm, locked_shared_perm; + + if (s) { + locked_perm = s->locked_perm; + locked_shared_perm = s->locked_shared_perm; + } else { + /* + * We don't have the previous bits, just lock/unlock for each of the + * requested bits. + */ + if (unlock) { + locked_perm = BLK_PERM_ALL; + locked_shared_perm = BLK_PERM_ALL; + } else { + locked_perm = 0; + locked_shared_perm = 0; + } + } PERM_FOREACH(i) { int off = RAW_LOCK_PERM_BASE + i; - if (perm_lock_bits & (1ULL << i)) { + uint64_t bit = (1ULL << i); + if ((perm_lock_bits & bit) && !(locked_perm & bit)) { ret = qemu_lock_fd(fd, off, 1, false); if (ret) { - error_setg(errp, "Failed to lock byte %d", off); + raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d", + off); return ret; + } else if (s) { + s->locked_perm |= bit; } - } else if (unlock) { + } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) { ret = qemu_unlock_fd(fd, off, 1); if (ret) { - error_setg(errp, "Failed to unlock byte %d", off); + error_setg_errno(errp, -ret, "Failed to unlock byte %d", off); return ret; + } else if (s) { + s->locked_perm &= ~bit; } } } PERM_FOREACH(i) { int off = RAW_LOCK_SHARED_BASE + i; - if (shared_perm_lock_bits & (1ULL << i)) { + uint64_t bit = (1ULL << i); + if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) { ret = qemu_lock_fd(fd, off, 1, false); if (ret) { - error_setg(errp, "Failed to lock byte %d", off); + raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d", + off); return ret; + } else if (s) { + s->locked_shared_perm |= bit; } - } else if (unlock) { + } else if (unlock && (locked_shared_perm & bit) && + !(shared_perm_lock_bits & bit)) { ret = qemu_unlock_fd(fd, off, 1); if (ret) { - error_setg(errp, "Failed to unlock byte %d", off); + error_setg_errno(errp, -ret, "Failed to unlock byte %d", off); return ret; + } else if (s) { + s->locked_shared_perm &= ~bit; } } } @@ -737,9 +927,10 @@ static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm, ret = qemu_lock_fd_test(fd, off, 1, true); if (ret) { char *perm_name = bdrv_perm_names(p); - error_setg(errp, - "Failed to get \"%s\" lock", - perm_name); + + raw_lock_error_setg_errno(errp, -ret, + "Failed to get \"%s\" lock", + perm_name); g_free(perm_name); return ret; } @@ -752,9 +943,10 @@ static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm, ret = qemu_lock_fd_test(fd, off, 1, true); if (ret) { char *perm_name = bdrv_perm_names(p); - error_setg(errp, - "Failed to get shared \"%s\" lock", - perm_name); + + raw_lock_error_setg_errno(errp, -ret, + "Failed to get shared \"%s\" lock", + perm_name); g_free(perm_name); return ret; } @@ -780,15 +972,25 @@ static int raw_handle_perm_lock(BlockDriverState *bs, return 0; } - assert(s->lock_fd > 0); - switch (op) { case RAW_PL_PREPARE: - ret = raw_apply_lock_bytes(s->lock_fd, s->perm | new_perm, + if ((s->perm | new_perm) == s->perm && + (s->shared_perm & new_shared) == s->shared_perm) + { + /* + * We are going to unlock bytes, it should not fail. If it fail due + * to some fs-dependent permission-unrelated reasons (which occurs + * sometimes on NFS and leads to abort in bdrv_replace_child) we + * can't prevent such errors by any check here. And we ignore them + * anyway in ABORT and COMMIT. + */ + return 0; + } + ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm, ~s->shared_perm | ~new_shared, false, errp); if (!ret) { - ret = raw_check_lock_bytes(s->lock_fd, new_perm, new_shared, errp); + ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp); if (!ret) { return 0; } @@ -796,78 +998,67 @@ static int raw_handle_perm_lock(BlockDriverState *bs, "Is another process using the image [%s]?\n", bs->filename); } - op = RAW_PL_ABORT; /* fall through to unlock bytes. */ case RAW_PL_ABORT: - raw_apply_lock_bytes(s->lock_fd, s->perm, ~s->shared_perm, + raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm, true, &local_err); if (local_err) { /* Theoretically the above call only unlocks bytes and it cannot * fail. Something weird happened, report it. */ - error_report_err(local_err); + warn_report_err(local_err); } break; case RAW_PL_COMMIT: - raw_apply_lock_bytes(s->lock_fd, new_perm, ~new_shared, + raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared, true, &local_err); if (local_err) { /* Theoretically the above call only unlocks bytes and it cannot * fail. Something weird happened, report it. */ - error_report_err(local_err); + warn_report_err(local_err); } break; } return ret; } -static int raw_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) +/* Sets a specific flag */ +static int fcntl_setfl(int fd, int flag) { - BDRVRawState *s; - BDRVRawReopenState *rs; - QemuOpts *opts; - int ret = 0; - Error *local_err = NULL; - - assert(state != NULL); - assert(state->bs != NULL); - - s = state->bs->opaque; - - state->opaque = g_new0(BDRVRawReopenState, 1); - rs = state->opaque; - rs->fd = -1; + int flags; - /* Handle options changes */ - opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, state->options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto out; + flags = fcntl(fd, F_GETFL); + if (flags == -1) { + return -errno; } - - rs->check_cache_dropped = - qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false); - - /* This driver's reopen function doesn't currently allow changing - * other options, so let's put them back in the original QDict and - * bdrv_reopen_prepare() will detect changes and complain. */ - qemu_opts_to_qdict(opts, state->options); - - if (s->type == FTYPE_CD) { - rs->open_flags |= O_NONBLOCK; + if (fcntl(fd, F_SETFL, flags | flag) == -1) { + return -errno; } + return 0; +} - raw_parse_flags(state->flags, &rs->open_flags); - +static int raw_reconfigure_getfd(BlockDriverState *bs, int flags, + int *open_flags, uint64_t perm, bool force_dup, + Error **errp) +{ + BDRVRawState *s = bs->opaque; + int fd = -1; + int ret; + bool has_writers = perm & + (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE); int fcntl_flags = O_APPEND | O_NONBLOCK; #ifdef O_NOATIME fcntl_flags |= O_NOATIME; #endif + *open_flags = 0; + if (s->type == FTYPE_CD) { + *open_flags |= O_NONBLOCK; + } + + raw_parse_flags(flags, open_flags, has_writers); + #ifdef O_ASYNC /* Not all operating systems have O_ASYNC, and those that don't * will not let us track the state into rs->open_flags (typically @@ -877,46 +1068,91 @@ static int raw_reopen_prepare(BDRVReopenState *state, assert((s->open_flags & O_ASYNC) == 0); #endif - if ((rs->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) { + if (!force_dup && *open_flags == s->open_flags) { + /* We're lucky, the existing fd is fine */ + return s->fd; + } + + if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) { /* dup the original fd */ - rs->fd = qemu_dup(s->fd); - if (rs->fd >= 0) { - ret = fcntl_setfl(rs->fd, rs->open_flags); + fd = qemu_dup(s->fd); + if (fd >= 0) { + ret = fcntl_setfl(fd, *open_flags); if (ret) { - qemu_close(rs->fd); - rs->fd = -1; + qemu_close(fd); + fd = -1; } } } /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */ - if (rs->fd == -1) { - const char *normalized_filename = state->bs->filename; - ret = raw_normalize_devicepath(&normalized_filename); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not normalize device path"); - } else { - assert(!(rs->open_flags & O_CREAT)); - rs->fd = qemu_open(normalized_filename, rs->open_flags); - if (rs->fd == -1) { - error_setg_errno(errp, errno, "Could not reopen file"); - ret = -1; + if (fd == -1) { + const char *normalized_filename = bs->filename; + ret = raw_normalize_devicepath(&normalized_filename, errp); + if (ret >= 0) { + fd = qemu_open(normalized_filename, *open_flags, errp); + if (fd == -1) { + return -1; } } } - /* Fail already reopen_prepare() if we can't get a working O_DIRECT - * alignment with the new fd. */ - if (rs->fd != -1) { - raw_probe_alignment(state->bs, rs->fd, &local_err); - if (local_err) { - qemu_close(rs->fd); - rs->fd = -1; - error_propagate(errp, local_err); - ret = -EINVAL; + if (fd != -1 && (*open_flags & O_RDWR)) { + ret = check_hdev_writable(fd); + if (ret < 0) { + qemu_close(fd); + error_setg_errno(errp, -ret, "The device is not writable"); + return -1; } } + return fd; +} + +static int raw_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + BDRVRawState *s; + BDRVRawReopenState *rs; + QemuOpts *opts; + int ret; + + assert(state != NULL); + assert(state->bs != NULL); + + s = state->bs->opaque; + + state->opaque = g_new0(BDRVRawReopenState, 1); + rs = state->opaque; + + /* Handle options changes */ + opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); + if (!qemu_opts_absorb_qdict(opts, state->options, errp)) { + ret = -EINVAL; + goto out; + } + + rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true); + rs->check_cache_dropped = + qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false); + + /* This driver's reopen function doesn't currently allow changing + * other options, so let's put them back in the original QDict and + * bdrv_reopen_prepare() will detect changes and complain. */ + qemu_opts_to_qdict(opts, state->options); + + /* + * As part of reopen prepare we also want to create new fd by + * raw_reconfigure_getfd(). But it wants updated "perm", when in + * bdrv_reopen_multiple() .bdrv_reopen_prepare() callback called prior to + * permission update. Happily, permission update is always a part + * (a separate stage) of bdrv_reopen_multiple() so we can rely on this + * fact and reconfigure fd in raw_check_perm(). + */ + + s->reopen_state = state; + ret = 0; + out: qemu_opts_del(opts); return ret; @@ -927,118 +1163,360 @@ static void raw_reopen_commit(BDRVReopenState *state) BDRVRawReopenState *rs = state->opaque; BDRVRawState *s = state->bs->opaque; + s->drop_cache = rs->drop_cache; s->check_cache_dropped = rs->check_cache_dropped; s->open_flags = rs->open_flags; - - qemu_close(s->fd); - s->fd = rs->fd; - g_free(state->opaque); state->opaque = NULL; + + assert(s->reopen_state == state); + s->reopen_state = NULL; } static void raw_reopen_abort(BDRVReopenState *state) { BDRVRawReopenState *rs = state->opaque; + BDRVRawState *s = state->bs->opaque; /* nothing to do if NULL, we didn't get far enough */ if (rs == NULL) { return; } - if (rs->fd >= 0) { - qemu_close(rs->fd); - rs->fd = -1; - } g_free(state->opaque); state->opaque = NULL; + + assert(s->reopen_state == state); + s->reopen_state = NULL; } -static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd) +static int hdev_get_max_hw_transfer(int fd, struct stat *st) { #ifdef BLKSECTGET - int max_bytes = 0; - short max_sectors = 0; - if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) { - return max_bytes; - } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) { - return max_sectors << BDRV_SECTOR_BITS; + if (S_ISBLK(st->st_mode)) { + unsigned short max_sectors = 0; + if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) { + return max_sectors * 512; + } } else { - return -errno; + int max_bytes = 0; + if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) { + return max_bytes; + } } + return -errno; #else return -ENOSYS; #endif } -static int hdev_get_max_segments(const struct stat *st) +/* + * Get a sysfs attribute value as character string. + */ +#ifdef CONFIG_LINUX +static int get_sysfs_str_val(struct stat *st, const char *attribute, + char **val) { + g_autofree char *sysfspath = NULL; + size_t len; + + if (!S_ISBLK(st->st_mode)) { + return -ENOTSUP; + } + + sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s", + major(st->st_rdev), minor(st->st_rdev), + attribute); + if (!g_file_get_contents(sysfspath, val, &len, NULL)) { + return -ENOENT; + } + + /* The file is ended with '\n' */ + char *p; + p = *val; + if (*(p + len - 1) == '\n') { + *(p + len - 1) = '\0'; + } + return 0; +} +#endif + +#if defined(CONFIG_BLKZONED) +static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned) { + g_autofree char *val = NULL; + int ret; + + ret = get_sysfs_str_val(st, "zoned", &val); + if (ret < 0) { + return ret; + } + + if (strcmp(val, "host-managed") == 0) { + *zoned = BLK_Z_HM; + } else if (strcmp(val, "host-aware") == 0) { + *zoned = BLK_Z_HA; + } else if (strcmp(val, "none") == 0) { + *zoned = BLK_Z_NONE; + } else { + return -ENOTSUP; + } + return 0; +} +#endif /* defined(CONFIG_BLKZONED) */ + +/* + * Get a sysfs attribute value as a long integer. + */ #ifdef CONFIG_LINUX - char buf[32]; +static long get_sysfs_long_val(struct stat *st, const char *attribute) +{ + g_autofree char *str = NULL; const char *end; - char *sysfspath; + long val; int ret; - int fd = -1; - long max_segments; - sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments", - major(st->st_rdev), minor(st->st_rdev)); - fd = open(sysfspath, O_RDONLY); - if (fd == -1) { - ret = -errno; - goto out; - } - do { - ret = read(fd, buf, sizeof(buf) - 1); - } while (ret == -1 && errno == EINTR); + ret = get_sysfs_str_val(st, attribute, &str); if (ret < 0) { - ret = -errno; - goto out; - } else if (ret == 0) { - ret = -EIO; - goto out; + return ret; } - buf[ret] = 0; + /* The file is ended with '\n', pass 'end' to accept that. */ - ret = qemu_strtol(buf, &end, 10, &max_segments); - if (ret == 0 && end && *end == '\n') { - ret = max_segments; + ret = qemu_strtol(str, &end, 10, &val); + if (ret == 0 && end && *end == '\0') { + ret = val; } + return ret; +} +#endif -out: - if (fd != -1) { - close(fd); +static int hdev_get_max_segments(int fd, struct stat *st) +{ +#ifdef CONFIG_LINUX + int ret; + + if (S_ISCHR(st->st_mode)) { + if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) { + return ret; + } + return -ENOTSUP; } - g_free(sysfspath); - return ret; + return get_sysfs_long_val(st, "max_segments"); #else return -ENOTSUP; #endif } -static void raw_refresh_limits(BlockDriverState *bs, Error **errp) +#if defined(CONFIG_BLKZONED) +/* + * If the reset_all flag is true, then the wps of zone whose state is + * not readonly or offline should be all reset to the start sector. + * Else, take the real wp of the device. + */ +static int get_zones_wp(BlockDriverState *bs, int fd, int64_t offset, + unsigned int nrz, bool reset_all) { - BDRVRawState *s = bs->opaque; - struct stat st; + struct blk_zone *blkz; + size_t rep_size; + uint64_t sector = offset >> BDRV_SECTOR_BITS; + BlockZoneWps *wps = bs->wps; + unsigned int j = offset / bs->bl.zone_size; + unsigned int n = 0, i = 0; + int ret; + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); + g_autofree struct blk_zone_report *rep = NULL; - if (!fstat(s->fd, &st)) { - if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) { - int ret = hdev_get_max_transfer_length(bs, s->fd); - if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) { - bs->bl.max_transfer = pow2floor(ret); - } - ret = hdev_get_max_segments(&st); - if (ret > 0) { - bs->bl.max_transfer = MIN(bs->bl.max_transfer, - ret * getpagesize()); + rep = g_malloc(rep_size); + blkz = (struct blk_zone *)(rep + 1); + while (n < nrz) { + memset(rep, 0, rep_size); + rep->sector = sector; + rep->nr_zones = nrz - n; + + do { + ret = ioctl(fd, BLKREPORTZONE, rep); + } while (ret != 0 && errno == EINTR); + if (ret != 0) { + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", + fd, offset, errno); + return -errno; + } + + if (!rep->nr_zones) { + break; + } + + for (i = 0; i < rep->nr_zones; ++i, ++n, ++j) { + /* + * The wp tracking cares only about sequential writes required and + * sequential write preferred zones so that the wp can advance to + * the right location. + * Use the most significant bit of the wp location to indicate the + * zone type: 0 for SWR/SWP zones and 1 for conventional zones. + */ + if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) { + wps->wp[j] |= 1ULL << 63; + } else { + switch(blkz[i].cond) { + case BLK_ZONE_COND_FULL: + case BLK_ZONE_COND_READONLY: + /* Zone not writable */ + wps->wp[j] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS; + break; + case BLK_ZONE_COND_OFFLINE: + /* Zone not writable nor readable */ + wps->wp[j] = (blkz[i].start) << BDRV_SECTOR_BITS; + break; + default: + if (reset_all) { + wps->wp[j] = blkz[i].start << BDRV_SECTOR_BITS; + } else { + wps->wp[j] = blkz[i].wp << BDRV_SECTOR_BITS; + } + break; + } } } + sector = blkz[i - 1].start + blkz[i - 1].len; + } + + return 0; +} + +static void update_zones_wp(BlockDriverState *bs, int fd, int64_t offset, + unsigned int nrz) +{ + if (get_zones_wp(bs, fd, offset, nrz, 0) < 0) { + error_report("update zone wp failed"); + } +} + +static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st, + Error **errp) +{ + BDRVRawState *s = bs->opaque; + BlockZoneModel zoned; + int ret; + + ret = get_sysfs_zoned_model(st, &zoned); + if (ret < 0 || zoned == BLK_Z_NONE) { + goto no_zoned; + } + bs->bl.zoned = zoned; + + ret = get_sysfs_long_val(st, "max_open_zones"); + if (ret >= 0) { + bs->bl.max_open_zones = ret; + } + + ret = get_sysfs_long_val(st, "max_active_zones"); + if (ret >= 0) { + bs->bl.max_active_zones = ret; + } + + /* + * The zoned device must at least have zone size and nr_zones fields. + */ + ret = get_sysfs_long_val(st, "chunk_sectors"); + if (ret < 0) { + error_setg_errno(errp, -ret, "Unable to read chunk_sectors " + "sysfs attribute"); + goto no_zoned; + } else if (!ret) { + error_setg(errp, "Read 0 from chunk_sectors sysfs attribute"); + goto no_zoned; + } + bs->bl.zone_size = ret << BDRV_SECTOR_BITS; + + ret = get_sysfs_long_val(st, "nr_zones"); + if (ret < 0) { + error_setg_errno(errp, -ret, "Unable to read nr_zones " + "sysfs attribute"); + goto no_zoned; + } else if (!ret) { + error_setg(errp, "Read 0 from nr_zones sysfs attribute"); + goto no_zoned; + } + bs->bl.nr_zones = ret; + + ret = get_sysfs_long_val(st, "zone_append_max_bytes"); + if (ret > 0) { + bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS; } + ret = get_sysfs_long_val(st, "physical_block_size"); + if (ret >= 0) { + bs->bl.write_granularity = ret; + } + + /* The refresh_limits() function can be called multiple times. */ + g_free(bs->wps); + bs->wps = g_malloc(sizeof(BlockZoneWps) + + sizeof(int64_t) * bs->bl.nr_zones); + ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 0); + if (ret < 0) { + error_setg_errno(errp, -ret, "report wps failed"); + goto no_zoned; + } + qemu_co_mutex_init(&bs->wps->colock); + return; + +no_zoned: + bs->bl.zoned = BLK_Z_NONE; + g_free(bs->wps); + bs->wps = NULL; +} +#else /* !defined(CONFIG_BLKZONED) */ +static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st, + Error **errp) +{ + bs->bl.zoned = BLK_Z_NONE; +} +#endif /* !defined(CONFIG_BLKZONED) */ + +static void raw_refresh_limits(BlockDriverState *bs, Error **errp) +{ + BDRVRawState *s = bs->opaque; + struct stat st; + + s->needs_alignment = raw_needs_alignment(bs); raw_probe_alignment(bs, s->fd, errp); + bs->bl.min_mem_alignment = s->buf_align; - bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize()); + bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size()); + + /* + * Maximum transfers are best effort, so it is okay to ignore any + * errors. That said, based on the man page errors in fstat would be + * very much unexpected; the only possible case seems to be ENOMEM. + */ + if (fstat(s->fd, &st)) { + return; + } + +#if defined(__APPLE__) && (__MACH__) + struct statfs buf; + + if (!fstatfs(s->fd, &buf)) { + bs->bl.opt_transfer = buf.f_iosize; + bs->bl.pdiscard_alignment = buf.f_bsize; + } +#endif + + if (bdrv_is_sg(bs) || S_ISBLK(st.st_mode)) { + int ret = hdev_get_max_hw_transfer(s->fd, &st); + + if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) { + bs->bl.max_hw_transfer = ret; + } + + ret = hdev_get_max_segments(s->fd, &st); + if (ret > 0) { + bs->bl.max_hw_iov = ret; + } + } + + raw_refresh_zoned_limits(bs, &st, errp); } static int check_for_dasd(int fd) @@ -1062,9 +1540,12 @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) BDRVRawState *s = bs->opaque; int ret; - /* If DASD, get blocksizes */ + /* If DASD or zoned devices, get blocksizes */ if (check_for_dasd(s->fd) < 0) { - return -ENOTSUP; + /* zoned devices are not DASD */ + if (bs->bl.zoned == BLK_Z_NONE) { + return -ENOTSUP; + } } ret = probe_logical_blocksize(s->fd, &bsz->log); if (ret < 0) { @@ -1114,29 +1595,37 @@ static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) } #endif -static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) +#if defined(__linux__) +static int handle_aiocb_ioctl(void *opaque) { + RawPosixAIOData *aiocb = opaque; int ret; - ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf); + ret = RETRY_ON_EINTR( + ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf) + ); if (ret == -1) { return -errno; } return 0; } +#endif /* linux */ -static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb) +static int handle_aiocb_flush(void *opaque) { + RawPosixAIOData *aiocb = opaque; BDRVRawState *s = aiocb->bs->opaque; int ret; if (s->page_cache_inconsistent) { - return -EIO; + return -s->page_cache_inconsistent; } ret = qemu_fdatasync(aiocb->aio_fildes); if (ret == -1) { + trace_file_flush_fdatasync_failed(errno); + /* There is no clear definition of the semantics of a failing fsync(), * so we may have to assume the worst. The sad truth is that this * assumption is correct for Linux. Some pages are now probably marked @@ -1151,7 +1640,7 @@ static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb) * Obviously, this doesn't affect O_DIRECT, which bypasses the page * cache. */ if ((s->open_flags & O_DIRECT) == 0) { - s->page_cache_inconsistent = true; + s->page_cache_inconsistent = errno; } return -errno; } @@ -1196,18 +1685,17 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) { ssize_t len; - do { - if (aiocb->aio_type & QEMU_AIO_WRITE) - len = qemu_pwritev(aiocb->aio_fildes, - aiocb->aio_iov, - aiocb->aio_niov, - aiocb->aio_offset); - else - len = qemu_preadv(aiocb->aio_fildes, - aiocb->aio_iov, - aiocb->aio_niov, - aiocb->aio_offset); - } while (len == -1 && errno == EINTR); + len = RETRY_ON_EINTR( + (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ? + qemu_pwritev(aiocb->aio_fildes, + aiocb->io.iov, + aiocb->io.niov, + aiocb->aio_offset) : + qemu_preadv(aiocb->aio_fildes, + aiocb->io.iov, + aiocb->io.niov, + aiocb->aio_offset) + ); if (len == -1) { return -errno; @@ -1227,7 +1715,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) ssize_t len; while (offset < aiocb->aio_nbytes) { - if (aiocb->aio_type & QEMU_AIO_WRITE) { + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { len = pwrite(aiocb->aio_fildes, (const char *)buf + offset, aiocb->aio_nbytes - offset, @@ -1261,8 +1749,9 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) return offset; } -static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) +static int handle_aiocb_rw(void *opaque) { + RawPosixAIOData *aiocb = opaque; ssize_t nbytes; char *buf; @@ -1271,8 +1760,9 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) * If there is just a single buffer, and it is properly aligned * we can just use plain pread/pwrite without any problems. */ - if (aiocb->aio_niov == 1) { - return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base); + if (aiocb->io.niov == 1) { + nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base); + goto out; } /* * We have more than one iovec, and all are properly aligned. @@ -1284,7 +1774,7 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) nbytes = handle_aiocb_rw_vector(aiocb); if (nbytes == aiocb->aio_nbytes || (nbytes < 0 && nbytes != -ENOSYS)) { - return nbytes; + goto out; } preadv_present = false; } @@ -1302,32 +1792,33 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) */ buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes); if (buf == NULL) { - return -ENOMEM; + nbytes = -ENOMEM; + goto out; } if (aiocb->aio_type & QEMU_AIO_WRITE) { char *p = buf; int i; - for (i = 0; i < aiocb->aio_niov; ++i) { - memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len); - p += aiocb->aio_iov[i].iov_len; + for (i = 0; i < aiocb->io.niov; ++i) { + memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len); + p += aiocb->io.iov[i].iov_len; } assert(p - buf == aiocb->aio_nbytes); } nbytes = handle_aiocb_rw_linear(aiocb, buf); - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { char *p = buf; size_t count = aiocb->aio_nbytes, copy; int i; - for (i = 0; i < aiocb->aio_niov && count; ++i) { + for (i = 0; i < aiocb->io.niov && count; ++i) { copy = count; - if (copy > aiocb->aio_iov[i].iov_len) { - copy = aiocb->aio_iov[i].iov_len; + if (copy > aiocb->io.iov[i].iov_len) { + copy = aiocb->io.iov[i].iov_len; } - memcpy(aiocb->aio_iov[i].iov_base, p, copy); + memcpy(aiocb->io.iov[i].iov_base, p, copy); assert(count >= copy); p += copy; count -= copy; @@ -1336,49 +1827,24 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) } qemu_vfree(buf); - return nbytes; -} - -#ifdef CONFIG_XFS -static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) -{ - struct xfs_flock64 fl; - int err; - - memset(&fl, 0, sizeof(fl)); - fl.l_whence = SEEK_SET; - fl.l_start = offset; - fl.l_len = bytes; - - if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) { - err = errno; - DPRINTF("cannot write zero range (%s)\n", strerror(errno)); - return -err; - } - - return 0; -} - -static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) -{ - struct xfs_flock64 fl; - int err; - - memset(&fl, 0, sizeof(fl)); - fl.l_whence = SEEK_SET; - fl.l_start = offset; - fl.l_len = bytes; - - if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { - err = errno; - DPRINTF("cannot punch hole (%s)\n", strerror(errno)); - return -err; +out: + if (nbytes == aiocb->aio_nbytes) { + return 0; + } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) { + if (aiocb->aio_type & QEMU_AIO_WRITE) { + return -EINVAL; + } else { + iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes, + 0, aiocb->aio_nbytes - nbytes); + return 0; + } + } else { + assert(nbytes < 0); + return nbytes; } - - return 0; } -#endif +#if defined(CONFIG_FALLOCATE) || defined(BLKZEROOUT) || defined(BLKDISCARD) static int translate_err(int err) { if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP || @@ -1387,6 +1853,7 @@ static int translate_err(int err) } return err; } +#endif #ifdef CONFIG_FALLOCATE static int do_fallocate(int fd, int mode, off_t offset, off_t len) @@ -1410,28 +1877,32 @@ static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) } #ifdef BLKZEROOUT - do { - uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; - if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { - return 0; - } - } while (errno == EINTR); + /* The BLKZEROOUT implementation in the kernel doesn't set + * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow + * fallbacks. */ + if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) { + do { + uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; + if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { + return 0; + } + } while (errno == EINTR); - ret = translate_err(-errno); + ret = translate_err(-errno); + if (ret == -ENOTSUP) { + s->has_write_zeroes = false; + } + } #endif - if (ret == -ENOTSUP) { - s->has_write_zeroes = false; - } return ret; } -static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) +static int handle_aiocb_write_zeroes(void *opaque) { -#if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS) - BDRVRawState *s = aiocb->bs->opaque; -#endif + RawPosixAIOData *aiocb = opaque; #ifdef CONFIG_FALLOCATE + BDRVRawState *s = aiocb->bs->opaque; int64_t len; #endif @@ -1439,20 +1910,21 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) return handle_aiocb_write_zeroes_block(aiocb); } -#ifdef CONFIG_XFS - if (s->is_xfs) { - return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); - } -#endif - #ifdef CONFIG_FALLOCATE_ZERO_RANGE if (s->has_write_zeroes) { int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE, aiocb->aio_offset, aiocb->aio_nbytes); - if (ret == 0 || ret != -ENOTSUP) { + if (ret == -ENOTSUP) { + s->has_write_zeroes = false; + } else if (ret == 0 || ret != -EINVAL) { return ret; } - s->has_write_zeroes = false; + /* + * Note: Some file systems do not like unaligned byte ranges, and + * return EINVAL in such a case, though they should not do it according + * to the man-page of fallocate(). Thus we simply ignore this return + * value and try the other fallbacks instead. + */ } #endif @@ -1467,6 +1939,17 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) return ret; } s->has_fallocate = false; + } else if (ret == -EINVAL) { + /* + * Some file systems like older versions of GPFS do not like un- + * aligned byte ranges, and return EINVAL in such a case, though + * they should not do it according to the man-page of fallocate(). + * Warn about the bad filesystem and try the final fallback instead. + */ + warn_report_once("Your file system is misbehaving: " + "fallocate(FALLOC_FL_PUNCH_HOLE) returned EINVAL. " + "Please report this bug to your file system " + "vendor."); } else if (ret != -ENOTSUP) { return ret; } else { @@ -1478,7 +1961,7 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) #ifdef CONFIG_FALLOCATE /* Last resort: we are trying to extend the file with zeroed data. This * can be done via fallocate(fd, 0) */ - len = bdrv_getlength(aiocb->bs); + len = raw_getlength(aiocb->bs); if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) { int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); if (ret == 0 || ret != -ENOTSUP) { @@ -1491,33 +1974,29 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) return -ENOTSUP; } -static ssize_t handle_aiocb_write_zeroes_unmap(RawPosixAIOData *aiocb) +static int handle_aiocb_write_zeroes_unmap(void *opaque) { + RawPosixAIOData *aiocb = opaque; BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque; - int ret; /* First try to write zeros and unmap at the same time */ #ifdef CONFIG_FALLOCATE_PUNCH_HOLE - ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - aiocb->aio_offset, aiocb->aio_nbytes); - if (ret != -ENOTSUP) { + int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + aiocb->aio_offset, aiocb->aio_nbytes); + switch (ret) { + case -ENOTSUP: + case -EINVAL: + case -EBUSY: + break; + default: return ret; } #endif -#ifdef CONFIG_XFS - if (s->is_xfs) { - /* xfs_discard() guarantees that the discarded area reads as all-zero - * afterwards, so we can use it here. */ - return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes); - } -#endif - /* If we couldn't manage to unmap while guaranteed that the area reads as * all-zero afterwards, just write zeroes without unmapping */ - ret = handle_aiocb_write_zeroes(aiocb); - return ret; + return handle_aiocb_write_zeroes(aiocb); } #ifndef HAVE_COPY_FILE_RANGE @@ -1534,18 +2013,161 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, } #endif -static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb) +/* + * parse_zone - Fill a zone descriptor + */ +#if defined(CONFIG_BLKZONED) +static inline int parse_zone(struct BlockZoneDescriptor *zone, + const struct blk_zone *blkz) { + zone->start = blkz->start << BDRV_SECTOR_BITS; + zone->length = blkz->len << BDRV_SECTOR_BITS; + zone->wp = blkz->wp << BDRV_SECTOR_BITS; + +#ifdef HAVE_BLK_ZONE_REP_CAPACITY + zone->cap = blkz->capacity << BDRV_SECTOR_BITS; +#else + zone->cap = blkz->len << BDRV_SECTOR_BITS; +#endif + + switch (blkz->type) { + case BLK_ZONE_TYPE_SEQWRITE_REQ: + zone->type = BLK_ZT_SWR; + break; + case BLK_ZONE_TYPE_SEQWRITE_PREF: + zone->type = BLK_ZT_SWP; + break; + case BLK_ZONE_TYPE_CONVENTIONAL: + zone->type = BLK_ZT_CONV; + break; + default: + error_report("Unsupported zone type: 0x%x", blkz->type); + return -ENOTSUP; + } + + switch (blkz->cond) { + case BLK_ZONE_COND_NOT_WP: + zone->state = BLK_ZS_NOT_WP; + break; + case BLK_ZONE_COND_EMPTY: + zone->state = BLK_ZS_EMPTY; + break; + case BLK_ZONE_COND_IMP_OPEN: + zone->state = BLK_ZS_IOPEN; + break; + case BLK_ZONE_COND_EXP_OPEN: + zone->state = BLK_ZS_EOPEN; + break; + case BLK_ZONE_COND_CLOSED: + zone->state = BLK_ZS_CLOSED; + break; + case BLK_ZONE_COND_READONLY: + zone->state = BLK_ZS_RDONLY; + break; + case BLK_ZONE_COND_FULL: + zone->state = BLK_ZS_FULL; + break; + case BLK_ZONE_COND_OFFLINE: + zone->state = BLK_ZS_OFFLINE; + break; + default: + error_report("Unsupported zone state: 0x%x", blkz->cond); + return -ENOTSUP; + } + return 0; +} +#endif + +#if defined(CONFIG_BLKZONED) +static int handle_aiocb_zone_report(void *opaque) +{ + RawPosixAIOData *aiocb = opaque; + int fd = aiocb->aio_fildes; + unsigned int *nr_zones = aiocb->zone_report.nr_zones; + BlockZoneDescriptor *zones = aiocb->zone_report.zones; + /* zoned block devices use 512-byte sectors */ + uint64_t sector = aiocb->aio_offset / 512; + + struct blk_zone *blkz; + size_t rep_size; + unsigned int nrz; + int ret; + unsigned int n = 0, i = 0; + + nrz = *nr_zones; + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); + g_autofree struct blk_zone_report *rep = NULL; + rep = g_malloc(rep_size); + + blkz = (struct blk_zone *)(rep + 1); + while (n < nrz) { + memset(rep, 0, rep_size); + rep->sector = sector; + rep->nr_zones = nrz - n; + + do { + ret = ioctl(fd, BLKREPORTZONE, rep); + } while (ret != 0 && errno == EINTR); + if (ret != 0) { + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", + fd, sector, errno); + return -errno; + } + + if (!rep->nr_zones) { + break; + } + + for (i = 0; i < rep->nr_zones; i++, n++) { + ret = parse_zone(&zones[n], &blkz[i]); + if (ret != 0) { + return ret; + } + + /* The next report should start after the last zone reported */ + sector = blkz[i].start + blkz[i].len; + } + } + + *nr_zones = n; + return 0; +} +#endif + +#if defined(CONFIG_BLKZONED) +static int handle_aiocb_zone_mgmt(void *opaque) { + RawPosixAIOData *aiocb = opaque; + int fd = aiocb->aio_fildes; + uint64_t sector = aiocb->aio_offset / 512; + int64_t nr_sectors = aiocb->aio_nbytes / 512; + struct blk_zone_range range; + int ret; + + /* Execute the operation */ + range.sector = sector; + range.nr_sectors = nr_sectors; + do { + ret = ioctl(fd, aiocb->zone_mgmt.op, &range); + } while (ret != 0 && errno == EINTR); + + return ret < 0 ? -errno : ret; +} +#endif + +static int handle_aiocb_copy_range(void *opaque) +{ + RawPosixAIOData *aiocb = opaque; uint64_t bytes = aiocb->aio_nbytes; off_t in_off = aiocb->aio_offset; - off_t out_off = aiocb->aio_offset2; + off_t out_off = aiocb->copy_range.aio_offset2; while (bytes) { ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off, - aiocb->aio_fd2, &out_off, + aiocb->copy_range.aio_fd2, &out_off, bytes, 0); trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off, - aiocb->aio_fd2, out_off, bytes, 0, ret); + aiocb->copy_range.aio_fd2, out_off, bytes, + 0, ret); if (ret == 0) { /* No progress (e.g. when beyond EOF), let the caller fall back to * buffer I/O. */ @@ -1566,9 +2188,10 @@ static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb) return 0; } -static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) +static int handle_aiocb_discard(void *opaque) { - int ret = -EOPNOTSUPP; + RawPosixAIOData *aiocb = opaque; + int ret = -ENOTSUP; BDRVRawState *s = aiocb->bs->opaque; if (!s->has_discard) { @@ -1584,37 +2207,79 @@ static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) } } while (errno == EINTR); - ret = -errno; + ret = translate_err(-errno); #endif } else { -#ifdef CONFIG_XFS - if (s->is_xfs) { - return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes); - } -#endif - #ifdef CONFIG_FALLOCATE_PUNCH_HOLE ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, aiocb->aio_offset, aiocb->aio_nbytes); + ret = translate_err(ret); +#elif defined(__APPLE__) && (__MACH__) + fpunchhole_t fpunchhole; + fpunchhole.fp_flags = 0; + fpunchhole.reserved = 0; + fpunchhole.fp_offset = aiocb->aio_offset; + fpunchhole.fp_length = aiocb->aio_nbytes; + if (fcntl(s->fd, F_PUNCHHOLE, &fpunchhole) == -1) { + ret = errno == ENODEV ? -ENOTSUP : -errno; + } else { + ret = 0; + } #endif } - ret = translate_err(ret); if (ret == -ENOTSUP) { s->has_discard = false; } return ret; } -static int handle_aiocb_truncate(RawPosixAIOData *aiocb) +/* + * Help alignment probing by allocating the first block. + * + * When reading with direct I/O from unallocated area on Gluster backed by XFS, + * reading succeeds regardless of request length. In this case we fallback to + * safe alignment which is not optimal. Allocating the first block avoids this + * fallback. + * + * fd may be opened with O_DIRECT, but we don't know the buffer alignment or + * request alignment, so we use safe values. + * + * Returns: 0 on success, -errno on failure. Since this is an optimization, + * caller may ignore failures. + */ +static int allocate_first_block(int fd, size_t max_size) { + size_t write_size = (max_size < MAX_BLOCKSIZE) + ? BDRV_SECTOR_SIZE + : MAX_BLOCKSIZE; + size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size()); + void *buf; + ssize_t n; + int ret; + + buf = qemu_memalign(max_align, write_size); + memset(buf, 0, write_size); + + n = RETRY_ON_EINTR(pwrite(fd, buf, write_size, 0)); + + ret = (n == -1) ? -errno : 0; + + qemu_vfree(buf); + return ret; +} + +static int handle_aiocb_truncate(void *opaque) +{ + RawPosixAIOData *aiocb = opaque; int result = 0; int64_t current_length = 0; char *buf = NULL; struct stat st; int fd = aiocb->aio_fildes; int64_t offset = aiocb->aio_offset; - Error **errp = aiocb->errp; + PreallocMode prealloc = aiocb->truncate.prealloc; + Error **errp = aiocb->truncate.errp; if (fstat(fd, &st) < 0) { result = -errno; @@ -1623,12 +2288,12 @@ static int handle_aiocb_truncate(RawPosixAIOData *aiocb) } current_length = st.st_size; - if (current_length > offset && aiocb->prealloc != PREALLOC_MODE_OFF) { + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { error_setg(errp, "Cannot use preallocation for shrinking files"); return -ENOTSUP; } - switch (aiocb->prealloc) { + switch (prealloc) { #ifdef CONFIG_POSIX_FALLOCATE case PREALLOC_MODE_FALLOC: /* @@ -1643,6 +2308,17 @@ static int handle_aiocb_truncate(RawPosixAIOData *aiocb) /* posix_fallocate() doesn't set errno. */ error_setg_errno(errp, -result, "Could not preallocate new data"); + } else if (current_length == 0) { + /* + * posix_fallocate() uses fallocate() if the filesystem + * supports it, or fallback to manually writing zeroes. If + * fallocate() was used, unaligned reads from the fallocated + * area in raw_probe_alignment() will succeed, hence we need to + * allocate the first block. + * + * Optimize future alignment probing; ignore failures. + */ + allocate_first_block(fd, offset); } } else { result = 0; @@ -1704,12 +2380,15 @@ static int handle_aiocb_truncate(RawPosixAIOData *aiocb) if (ftruncate(fd, offset) != 0) { result = -errno; error_setg_errno(errp, -result, "Could not resize file"); + } else if (current_length == 0 && offset > current_length) { + /* Optimize future alignment probing; ignore failures. */ + allocate_first_block(fd, offset); } return result; default: result = -ENOTSUP; error_setg(errp, "Unsupported preallocation mode: %s", - PreallocMode_str(aiocb->prealloc)); + PreallocMode_str(prealloc)); return result; } @@ -1725,169 +2404,184 @@ out: return result; } -static int aio_worker(void *arg) +static int coroutine_fn raw_thread_pool_submit(ThreadPoolFunc func, void *arg) { - RawPosixAIOData *aiocb = arg; - ssize_t ret = 0; + return thread_pool_submit_co(func, arg); +} - switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { - case QEMU_AIO_READ: - ret = handle_aiocb_rw(aiocb); - if (ret >= 0 && ret < aiocb->aio_nbytes) { - iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret, - 0, aiocb->aio_nbytes - ret); +/* + * Check if all memory in this vector is sector aligned. + */ +static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) +{ + int i; + size_t alignment = bdrv_min_mem_align(bs); + size_t len = bs->bl.request_alignment; + IO_CODE(); - ret = aiocb->aio_nbytes; - } - if (ret == aiocb->aio_nbytes) { - ret = 0; - } else if (ret >= 0 && ret < aiocb->aio_nbytes) { - ret = -EINVAL; + for (i = 0; i < qiov->niov; i++) { + if ((uintptr_t) qiov->iov[i].iov_base % alignment) { + return false; } - break; - case QEMU_AIO_WRITE: - ret = handle_aiocb_rw(aiocb); - if (ret == aiocb->aio_nbytes) { - ret = 0; - } else if (ret >= 0 && ret < aiocb->aio_nbytes) { - ret = -EINVAL; + if (qiov->iov[i].iov_len % len) { + return false; } - break; - case QEMU_AIO_FLUSH: - ret = handle_aiocb_flush(aiocb); - break; - case QEMU_AIO_IOCTL: - ret = handle_aiocb_ioctl(aiocb); - break; - case QEMU_AIO_DISCARD: - ret = handle_aiocb_discard(aiocb); - break; - case QEMU_AIO_WRITE_ZEROES: - ret = handle_aiocb_write_zeroes(aiocb); - break; - case QEMU_AIO_WRITE_ZEROES | QEMU_AIO_DISCARD: - ret = handle_aiocb_write_zeroes_unmap(aiocb); - break; - case QEMU_AIO_COPY_RANGE: - ret = handle_aiocb_copy_range(aiocb); - break; - case QEMU_AIO_TRUNCATE: - ret = handle_aiocb_truncate(aiocb); - break; - default: - fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); - ret = -EINVAL; - break; } - g_free(aiocb); - return ret; + return true; } -static int paio_submit_co_full(BlockDriverState *bs, int fd, - int64_t offset, int fd2, int64_t offset2, - QEMUIOVector *qiov, - int bytes, int type) +#ifdef CONFIG_LINUX_IO_URING +static inline bool raw_check_linux_io_uring(BDRVRawState *s) { - RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); - ThreadPool *pool; - - acb->bs = bs; - acb->aio_type = type; - acb->aio_fildes = fd; - acb->aio_fd2 = fd2; - acb->aio_offset2 = offset2; - - acb->aio_nbytes = bytes; - acb->aio_offset = offset; + Error *local_err = NULL; + AioContext *ctx; - if (qiov) { - acb->aio_iov = qiov->iov; - acb->aio_niov = qiov->niov; - assert(qiov->size == bytes); + if (!s->use_linux_io_uring) { + return false; } - trace_file_paio_submit_co(offset, bytes, type); - pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); - return thread_pool_submit_co(pool, aio_worker, acb); + ctx = qemu_get_current_aio_context(); + if (unlikely(!aio_setup_linux_io_uring(ctx, &local_err))) { + error_reportf_err(local_err, "Unable to use linux io_uring, " + "falling back to thread pool: "); + s->use_linux_io_uring = false; + return false; + } + return true; } +#endif -static inline int paio_submit_co(BlockDriverState *bs, int fd, - int64_t offset, QEMUIOVector *qiov, - int bytes, int type) +#ifdef CONFIG_LINUX_AIO +static inline bool raw_check_linux_aio(BDRVRawState *s) { - return paio_submit_co_full(bs, fd, offset, -1, 0, qiov, bytes, type); + Error *local_err = NULL; + AioContext *ctx; + + if (!s->use_linux_aio) { + return false; + } + + ctx = qemu_get_current_aio_context(); + if (unlikely(!aio_setup_linux_aio(ctx, &local_err))) { + error_reportf_err(local_err, "Unable to use Linux AIO, " + "falling back to thread pool: "); + s->use_linux_aio = false; + return false; + } + return true; } +#endif -static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, +static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, uint64_t bytes, QEMUIOVector *qiov, int type) { BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; + int ret; + uint64_t offset = *offset_ptr; if (fd_open(bs) < 0) return -EIO; +#if defined(CONFIG_BLKZONED) + if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && + bs->bl.zoned != BLK_Z_NONE) { + qemu_co_mutex_lock(&bs->wps->colock); + if (type & QEMU_AIO_ZONE_APPEND) { + int index = offset / bs->bl.zone_size; + offset = bs->wps->wp[index]; + } + } +#endif /* - * Check if the underlying device requires requests to be aligned, - * and if the request we are trying to submit is aligned or not. - * If this is the case tell the low-level driver that it needs - * to copy the buffer. + * When using O_DIRECT, the request must be aligned to be able to use + * either libaio or io_uring interface. If not fail back to regular thread + * pool read/write code which emulates this for us if we + * set QEMU_AIO_MISALIGNED. */ - if (s->needs_alignment) { - if (!bdrv_qiov_is_aligned(bs, qiov)) { - type |= QEMU_AIO_MISALIGNED; + if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) { + type |= QEMU_AIO_MISALIGNED; +#ifdef CONFIG_LINUX_IO_URING + } else if (raw_check_linux_io_uring(s)) { + assert(qiov->size == bytes); + ret = luring_co_submit(bs, s->fd, offset, qiov, type); + goto out; +#endif #ifdef CONFIG_LINUX_AIO - } else if (s->use_linux_aio) { - LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); - assert(qiov->size == bytes); - return laio_co_submit(bs, aio, s->fd, offset, qiov, type); + } else if (raw_check_linux_aio(s)) { + assert(qiov->size == bytes); + ret = laio_co_submit(s->fd, offset, qiov, type, + s->aio_max_batch); + goto out; #endif - } } - return paio_submit_co(bs, s->fd, offset, qiov, bytes, type); -} + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = type, + .aio_offset = offset, + .aio_nbytes = bytes, + .io = { + .iov = qiov->iov, + .niov = qiov->niov, + }, + }; -static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, - int flags) -{ - return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ); -} + assert(qiov->size == bytes); + ret = raw_thread_pool_submit(handle_aiocb_rw, &acb); + goto out; /* Avoid the compiler err of unused label */ -static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, - int flags) -{ - assert(flags == 0); - return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE); -} +out: +#if defined(CONFIG_BLKZONED) + if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && + bs->bl.zoned != BLK_Z_NONE) { + BlockZoneWps *wps = bs->wps; + if (ret == 0) { + uint64_t *wp = &wps->wp[offset / bs->bl.zone_size]; + if (!BDRV_ZT_IS_CONV(*wp)) { + if (type & QEMU_AIO_ZONE_APPEND) { + *offset_ptr = *wp; + trace_zbd_zone_append_complete(bs, *offset_ptr + >> BDRV_SECTOR_BITS); + } + /* Advance the wp if needed */ + if (offset + bytes > *wp) { + *wp = offset + bytes; + } + } + } else { + /* + * write and append write are not allowed to cross zone boundaries + */ + update_zones_wp(bs, s->fd, offset, 1); + } -static void raw_aio_plug(BlockDriverState *bs) -{ -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - if (s->use_linux_aio) { - LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); - laio_io_plug(bs, aio); + qemu_co_mutex_unlock(&wps->colock); } #endif + return ret; } -static void raw_aio_unplug(BlockDriverState *bs) +static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - if (s->use_linux_aio) { - LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); - laio_io_unplug(bs, aio); - } -#endif + return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ); } -static int raw_co_flush_to_disk(BlockDriverState *bs) +static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE); +} + +static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; int ret; ret = fd_open(bs); @@ -1895,23 +2589,18 @@ static int raw_co_flush_to_disk(BlockDriverState *bs) return ret; } - return paio_submit_co(bs, s->fd, 0, NULL, 0, QEMU_AIO_FLUSH); -} + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_FLUSH, + }; -static void raw_aio_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - if (s->use_linux_aio) { - Error *local_err; - if (!aio_setup_linux_aio(new_context, &local_err)) { - error_reportf_err(local_err, "Unable to use native AIO, " - "falling back to thread pool: "); - s->use_linux_aio = false; - } +#ifdef CONFIG_LINUX_IO_URING + if (raw_check_linux_io_uring(s)) { + return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH); } #endif + return raw_thread_pool_submit(handle_aiocb_flush, &acb); } static void raw_close(BlockDriverState *bs) @@ -1919,13 +2608,12 @@ static void raw_close(BlockDriverState *bs) BDRVRawState *s = bs->opaque; if (s->fd >= 0) { +#if defined(CONFIG_BLKZONED) + g_free(bs->wps); +#endif qemu_close(s->fd); s->fd = -1; } - if (s->lock_fd >= 0) { - qemu_close(s->lock_fd); - s->lock_fd = -1; - } } /** @@ -1938,25 +2626,25 @@ static int coroutine_fn raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, PreallocMode prealloc, Error **errp) { - RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); - ThreadPool *pool; + RawPosixAIOData acb; - *acb = (RawPosixAIOData) { + acb = (RawPosixAIOData) { .bs = bs, .aio_fildes = fd, .aio_type = QEMU_AIO_TRUNCATE, .aio_offset = offset, - .prealloc = prealloc, - .errp = errp, + .truncate = { + .prealloc = prealloc, + .errp = errp, + }, }; - /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */ - pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); - return thread_pool_submit_co(pool, aio_worker, acb); + return raw_thread_pool_submit(handle_aiocb_truncate, &acb); } static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, - PreallocMode prealloc, Error **errp) + bool exact, PreallocMode prealloc, + BdrvRequestFlags flags, Error **errp) { BDRVRawState *s = bs->opaque; struct stat st; @@ -1969,6 +2657,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, } if (S_ISREG(st.st_mode)) { + /* Always resizes to the exact @offset */ return raw_regular_truncate(bs, s->fd, offset, prealloc, errp); } @@ -1979,7 +2668,12 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, } if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { - if (offset > raw_getlength(bs)) { + int64_t cur_length = raw_getlength(bs); + + if (offset != cur_length && exact) { + error_setg(errp, "Cannot resize device files"); + return -ENOTSUP; + } else if (offset > cur_length) { error_setg(errp, "Cannot grow device files"); return -EINVAL; } @@ -2086,39 +2780,37 @@ static int64_t raw_getlength(BlockDriverState *bs) again: #endif if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) { + size = 0; #ifdef DIOCGMEDIASIZE - if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) -#elif defined(DIOCGPART) - { - struct partinfo pi; - if (ioctl(fd, DIOCGPART, &pi) == 0) - size = pi.media_size; - else - size = 0; + if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) { + size = 0; } - if (size == 0) #endif -#if defined(__APPLE__) && defined(__MACH__) - { +#ifdef DIOCGPART + if (size == 0) { + struct partinfo pi; + if (ioctl(fd, DIOCGPART, &pi) == 0) { + size = pi.media_size; + } + } +#endif +#if defined(DKIOCGETBLOCKCOUNT) && defined(DKIOCGETBLOCKSIZE) + if (size == 0) { uint64_t sectors = 0; uint32_t sector_size = 0; if (ioctl(fd, DKIOCGETBLOCKCOUNT, §ors) == 0 && ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) == 0) { size = sectors * sector_size; - } else { - size = lseek(fd, 0LL, SEEK_END); - if (size < 0) { - return -errno; - } } } -#else - size = lseek(fd, 0LL, SEEK_END); +#endif + if (size == 0) { + size = lseek(fd, 0LL, SEEK_END); + } if (size < 0) { return -errno; } -#endif #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) switch(s->type) { case FTYPE_CD: @@ -2160,7 +2852,12 @@ static int64_t raw_getlength(BlockDriverState *bs) } #endif -static int64_t raw_get_allocated_file_size(BlockDriverState *bs) +static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs) +{ + return raw_getlength(bs); +} + +static int64_t coroutine_fn raw_co_get_allocated_file_size(BlockDriverState *bs) { struct stat st; BDRVRawState *s = bs->opaque; @@ -2190,12 +2887,19 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) if (!file_opts->has_preallocation) { file_opts->preallocation = PREALLOC_MODE_OFF; } + if (!file_opts->has_extent_size_hint) { + file_opts->extent_size_hint = 1 * MiB; + } + if (file_opts->extent_size_hint > UINT32_MAX) { + result = -EINVAL; + error_setg(errp, "Extent size hint is too large"); + goto out; + } /* Create file */ - fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644); + fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp); if (fd < 0) { result = -errno; - error_setg_errno(errp, -result, "Could not create file"); goto out; } @@ -2213,7 +2917,7 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE; /* Step one: Take locks */ - result = raw_apply_lock_bytes(fd, perm, ~shared, false, errp); + result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp); if (result < 0) { goto out_close; } @@ -2247,6 +2951,27 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) } #endif } +#ifdef FS_IOC_FSSETXATTR + /* + * Try to set the extent size hint. Failure is not fatal, and a warning is + * only printed if the option was explicitly specified. + */ + { + struct fsxattr attr; + result = ioctl(fd, FS_IOC_FSGETXATTR, &attr); + if (result == 0) { + attr.fsx_xflags |= FS_XFLAG_EXTSIZE; + attr.fsx_extsize = file_opts->extent_size_hint; + result = ioctl(fd, FS_IOC_FSSETXATTR, &attr); + } + if (result < 0 && file_opts->has_extent_size_hint && + file_opts->extent_size_hint) + { + warn_report("Failed to set extent size hint: %s", + strerror(errno)); + } + } +#endif /* Resize and potentially preallocate the file to the desired * final size */ @@ -2257,13 +2982,13 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) } out_unlock: - raw_apply_lock_bytes(fd, 0, 0, true, &local_err); + raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err); if (local_err) { /* The above call should not fail, and if it does, that does * not mean the whole creation operation has failed. So * report it the user for their convenience, but do not report * it to the caller. */ - error_report_err(local_err); + warn_report_err(local_err); } out_close: @@ -2275,11 +3000,14 @@ out: return result; } -static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, - Error **errp) +static int coroutine_fn GRAPH_RDLOCK +raw_co_create_opts(BlockDriver *drv, const char *filename, + QemuOpts *opts, Error **errp) { BlockdevCreateOptions options; int64_t total_size = 0; + int64_t extent_size_hint = 0; + bool has_extent_size_hint = false; bool nocow = false; PreallocMode prealloc; char *buf = NULL; @@ -2291,6 +3019,11 @@ static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, /* Read out options */ total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), BDRV_SECTOR_SIZE); + if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) { + has_extent_size_hint = true; + extent_size_hint = + qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1); + } nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); prealloc = qapi_enum_parse(&PreallocMode_lookup, buf, @@ -2310,11 +3043,35 @@ static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, .preallocation = prealloc, .has_nocow = true, .nocow = nocow, + .has_extent_size_hint = has_extent_size_hint, + .extent_size_hint = extent_size_hint, }, }; return raw_co_create(&options, errp); } +static int coroutine_fn raw_co_delete_file(BlockDriverState *bs, + Error **errp) +{ + struct stat st; + int ret; + + if (!(stat(bs->filename, &st) == 0) || !S_ISREG(st.st_mode)) { + error_setg_errno(errp, ENOENT, "%s is not a regular file", + bs->filename); + return -ENOENT; + } + + ret = unlink(bs->filename); + if (ret < 0) { + ret = -errno; + error_setg_errno(errp, -ret, "Error when deleting file %s", + bs->filename); + } + + return ret; +} + /* * Find allocation range in @bs around offset @start. * May change underlying file descriptor's file offset. @@ -2419,7 +3176,8 @@ static int find_allocation(BlockDriverState *bs, off_t start, * the specified offset) that are known to be in the same * allocated/unallocated state. * - * 'bytes' is the max value 'pnum' should be set to. + * 'bytes' is a soft cap for 'pnum'. If the information is free, 'pnum' may + * well exceed it. */ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, bool want_zero, @@ -2431,6 +3189,8 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, off_t data = 0, hole = 0; int ret; + assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment)); + ret = fd_open(bs); if (ret < 0) { return ret; @@ -2455,12 +3215,26 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, } else if (data == offset) { /* On a data extent, compute bytes to the end of the extent, * possibly including a partial sector at EOF. */ - *pnum = MIN(bytes, hole - offset); + *pnum = hole - offset; + + /* + * We are not allowed to return partial sectors, though, so + * round up if necessary. + */ + if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) { + int64_t file_length = raw_getlength(bs); + if (file_length > 0) { + /* Ignore errors, this is just a safeguard */ + assert(hole == file_length); + } + *pnum = ROUND_UP(*pnum, bs->bl.request_alignment); + } + ret = BDRV_BLOCK_DATA; } else { /* On a hole, compute bytes to the beginning of the next extent. */ assert(hole == offset); - *pnum = MIN(bytes, data - offset); + *pnum = data - offset; ret = BDRV_BLOCK_ZERO; } *map = offset; @@ -2521,10 +3295,13 @@ static void check_cache_dropped(BlockDriverState *bs, Error **errp) vec_end = DIV_ROUND_UP(length, page_size); for (i = 0; i < vec_end; i++) { if (vec[i] & 0x1) { - error_setg(errp, "page cache still in use!"); break; } } + if (i < vec_end) { + error_setg(errp, "page cache still in use!"); + break; + } } if (window) { @@ -2535,8 +3312,8 @@ static void check_cache_dropped(BlockDriverState *bs, Error **errp) } #endif /* __linux__ */ -static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs, - Error **errp) +static void coroutine_fn GRAPH_RDLOCK +raw_co_invalidate_cache(BlockDriverState *bs, Error **errp) { BDRVRawState *s = bs->opaque; int ret; @@ -2547,6 +3324,10 @@ static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs, return; } + if (!s->drop_cache) { + return; + } + if (s->open_flags & O_DIRECT) { return; /* No host kernel page cache */ } @@ -2582,36 +3363,350 @@ static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs, #endif /* !__linux__ */ } +static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) +{ + if (ret) { + s->stats.discard_nb_failed++; + } else { + s->stats.discard_nb_ok++; + s->stats.discard_bytes_ok += nbytes; + } +} + +/* + * zone report - Get a zone block device's information in the form + * of an array of zone descriptors. + * zones is an array of zone descriptors to hold zone information on reply; + * offset can be any byte within the entire size of the device; + * nr_zones is the maximum number of sectors the command should operate on. + */ +#if defined(CONFIG_BLKZONED) +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones) { + BDRVRawState *s = bs->opaque; + RawPosixAIOData acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_ZONE_REPORT, + .aio_offset = offset, + .zone_report = { + .nr_zones = nr_zones, + .zones = zones, + }, + }; + + trace_zbd_zone_report(bs, *nr_zones, offset >> BDRV_SECTOR_BITS); + return raw_thread_pool_submit(handle_aiocb_zone_report, &acb); +} +#endif + +/* + * zone management operations - Execute an operation on a zone + */ +#if defined(CONFIG_BLKZONED) +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, + int64_t offset, int64_t len) { + BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; + int64_t zone_size, zone_size_mask; + const char *op_name; + unsigned long zo; + int ret; + BlockZoneWps *wps = bs->wps; + int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS; + + zone_size = bs->bl.zone_size; + zone_size_mask = zone_size - 1; + if (offset & zone_size_mask) { + error_report("sector offset %" PRId64 " is not aligned to zone size " + "%" PRId64 "", offset / 512, zone_size / 512); + return -EINVAL; + } + + if (((offset + len) < capacity && len & zone_size_mask) || + offset + len > capacity) { + error_report("number of sectors %" PRId64 " is not aligned to zone size" + " %" PRId64 "", len / 512, zone_size / 512); + return -EINVAL; + } + + uint32_t i = offset / bs->bl.zone_size; + uint32_t nrz = len / bs->bl.zone_size; + uint64_t *wp = &wps->wp[i]; + if (BDRV_ZT_IS_CONV(*wp) && len != capacity) { + error_report("zone mgmt operations are not allowed for conventional zones"); + return -EIO; + } + + switch (op) { + case BLK_ZO_OPEN: + op_name = "BLKOPENZONE"; + zo = BLKOPENZONE; + break; + case BLK_ZO_CLOSE: + op_name = "BLKCLOSEZONE"; + zo = BLKCLOSEZONE; + break; + case BLK_ZO_FINISH: + op_name = "BLKFINISHZONE"; + zo = BLKFINISHZONE; + break; + case BLK_ZO_RESET: + op_name = "BLKRESETZONE"; + zo = BLKRESETZONE; + break; + default: + error_report("Unsupported zone op: 0x%x", op); + return -ENOTSUP; + } + + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_ZONE_MGMT, + .aio_offset = offset, + .aio_nbytes = len, + .zone_mgmt = { + .op = zo, + }, + }; + + trace_zbd_zone_mgmt(bs, op_name, offset >> BDRV_SECTOR_BITS, + len >> BDRV_SECTOR_BITS); + ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb); + if (ret != 0) { + update_zones_wp(bs, s->fd, offset, nrz); + error_report("ioctl %s failed %d", op_name, ret); + return ret; + } + + if (zo == BLKRESETZONE && len == capacity) { + ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 1); + if (ret < 0) { + error_report("reporting single wp failed"); + return ret; + } + } else if (zo == BLKRESETZONE) { + for (unsigned int j = 0; j < nrz; ++j) { + wp[j] = offset + j * zone_size; + } + } else if (zo == BLKFINISHZONE) { + for (unsigned int j = 0; j < nrz; ++j) { + /* The zoned device allows the last zone smaller that the + * zone size. */ + wp[j] = MIN(offset + (j + 1) * zone_size, offset + len); + } + } + + return ret; +} +#endif + +#if defined(CONFIG_BLKZONED) +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, + int64_t *offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags) { + assert(flags == 0); + int64_t zone_size_mask = bs->bl.zone_size - 1; + int64_t iov_len = 0; + int64_t len = 0; + + if (*offset & zone_size_mask) { + error_report("sector offset %" PRId64 " is not aligned to zone size " + "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512); + return -EINVAL; + } + + int64_t wg = bs->bl.write_granularity; + int64_t wg_mask = wg - 1; + for (int i = 0; i < qiov->niov; i++) { + iov_len = qiov->iov[i].iov_len; + if (iov_len & wg_mask) { + error_report("len of IOVector[%d] %" PRId64 " is not aligned to " + "block size %" PRId64 "", i, iov_len, wg); + return -EINVAL; + } + len += iov_len; + } + + trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS); + return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND); +} +#endif + static coroutine_fn int -raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) +raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, + bool blkdev) { BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; + int ret; + + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_DISCARD, + .aio_offset = offset, + .aio_nbytes = bytes, + }; + + if (blkdev) { + acb.aio_type |= QEMU_AIO_BLKDEV; + } - return paio_submit_co(bs, s->fd, offset, NULL, bytes, QEMU_AIO_DISCARD); + ret = raw_thread_pool_submit(handle_aiocb_discard, &acb); + raw_account_discard(s, bytes, ret); + return ret; } -static int coroutine_fn raw_co_pwrite_zeroes( - BlockDriverState *bs, int64_t offset, - int bytes, BdrvRequestFlags flags) +static coroutine_fn int +raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) +{ + return raw_do_pdiscard(bs, offset, bytes, false); +} + +static int coroutine_fn +raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes, + BdrvRequestFlags flags, bool blkdev) { BDRVRawState *s = bs->opaque; - int operation = QEMU_AIO_WRITE_ZEROES; + RawPosixAIOData acb; + ThreadPoolFunc *handler; + +#ifdef CONFIG_FALLOCATE + if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) { + BdrvTrackedRequest *req; + + /* + * This is a workaround for a bug in the Linux XFS driver, + * where writes submitted through the AIO interface will be + * discarded if they happen beyond a concurrently running + * fallocate() that increases the file length (i.e., both the + * write and the fallocate() happen beyond the EOF). + * + * To work around it, we extend the tracked request for this + * zero write until INT64_MAX (effectively infinity), and mark + * it as serializing. + * + * We have to enable this workaround for all filesystems and + * AIO modes (not just XFS with aio=native), because for + * remote filesystems we do not know the host configuration. + */ + + req = bdrv_co_get_self_request(bs); + assert(req); + assert(req->type == BDRV_TRACKED_WRITE); + assert(req->offset <= offset); + assert(req->offset + req->bytes >= offset + bytes); + + req->bytes = BDRV_MAX_LENGTH - req->offset; + + bdrv_check_request(req->offset, req->bytes, &error_abort); + + bdrv_make_request_serialising(req, bs->bl.request_alignment); + } +#endif + + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_WRITE_ZEROES, + .aio_offset = offset, + .aio_nbytes = bytes, + }; + + if (blkdev) { + acb.aio_type |= QEMU_AIO_BLKDEV; + } + if (flags & BDRV_REQ_NO_FALLBACK) { + acb.aio_type |= QEMU_AIO_NO_FALLBACK; + } if (flags & BDRV_REQ_MAY_UNMAP) { - operation |= QEMU_AIO_DISCARD; + acb.aio_type |= QEMU_AIO_DISCARD; + handler = handle_aiocb_write_zeroes_unmap; + } else { + handler = handle_aiocb_write_zeroes; } - return paio_submit_co(bs, s->fd, offset, NULL, bytes, operation); + return raw_thread_pool_submit(handler, &acb); } -static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +static int coroutine_fn raw_co_pwrite_zeroes( + BlockDriverState *bs, int64_t offset, + int64_t bytes, BdrvRequestFlags flags) { - BDRVRawState *s = bs->opaque; + return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false); +} - bdi->unallocated_blocks_are_zero = s->discard_zeroes; +static int coroutine_fn +raw_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ return 0; } +static ImageInfoSpecific *raw_get_specific_info(BlockDriverState *bs, + Error **errp) +{ + ImageInfoSpecificFile *file_info = g_new0(ImageInfoSpecificFile, 1); + ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1); + + *spec_info = (ImageInfoSpecific){ + .type = IMAGE_INFO_SPECIFIC_KIND_FILE, + .u.file.data = file_info, + }; + +#ifdef FS_IOC_FSGETXATTR + { + BDRVRawState *s = bs->opaque; + struct fsxattr attr; + int ret; + + ret = ioctl(s->fd, FS_IOC_FSGETXATTR, &attr); + if (!ret && attr.fsx_extsize != 0) { + file_info->has_extent_size_hint = true; + file_info->extent_size_hint = attr.fsx_extsize; + } + } +#endif + + return spec_info; +} + +static BlockStatsSpecificFile get_blockstats_specific_file(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + return (BlockStatsSpecificFile) { + .discard_nb_ok = s->stats.discard_nb_ok, + .discard_nb_failed = s->stats.discard_nb_failed, + .discard_bytes_ok = s->stats.discard_bytes_ok, + }; +} + +static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs) +{ + BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1); + + stats->driver = BLOCKDEV_DRIVER_FILE; + stats->u.file = get_blockstats_specific_file(bs); + + return stats; +} + +#if defined(HAVE_HOST_BLOCK_DEVICE) +static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs) +{ + BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1); + + stats->driver = BLOCKDEV_DRIVER_HOST_DEVICE; + stats->u.host_device = get_blockstats_specific_file(bs); + + return stats; +} +#endif /* HAVE_HOST_BLOCK_DEVICE */ + static QemuOptsList raw_create_opts = { .name = "raw-create-opts", .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), @@ -2629,7 +3724,16 @@ static QemuOptsList raw_create_opts = { { .name = BLOCK_OPT_PREALLOC, .type = QEMU_OPT_STRING, - .help = "Preallocation mode (allowed values: off, falloc, full)" + .help = "Preallocation mode (allowed values: off" +#ifdef CONFIG_POSIX_FALLOCATE + ", falloc" +#endif + ", full)" + }, + { + .name = BLOCK_OPT_EXTENT_SIZE_HINT, + .type = QEMU_OPT_SIZE, + .help = "Extent size hint for the image file, 0 to disable" }, { /* end of list */ } } @@ -2638,12 +3742,72 @@ static QemuOptsList raw_create_opts = { static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared, Error **errp) { - return raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp); + BDRVRawState *s = bs->opaque; + int input_flags = s->reopen_state ? s->reopen_state->flags : bs->open_flags; + int open_flags; + int ret; + + /* We may need a new fd if auto-read-only switches the mode */ + ret = raw_reconfigure_getfd(bs, input_flags, &open_flags, perm, + false, errp); + if (ret < 0) { + return ret; + } else if (ret != s->fd) { + Error *local_err = NULL; + + /* + * Fail already check_perm() if we can't get a working O_DIRECT + * alignment with the new fd. + */ + raw_probe_alignment(bs, ret, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return -EINVAL; + } + + s->perm_change_fd = ret; + s->perm_change_flags = open_flags; + } + + /* Prepare permissions on old fd to avoid conflicts between old and new, + * but keep everything locked that new will need. */ + ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp); + if (ret < 0) { + goto fail; + } + + /* Copy locks to the new fd */ + if (s->perm_change_fd && s->use_lock) { + ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared, + false, errp); + if (ret < 0) { + raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL); + goto fail; + } + } + return 0; + +fail: + if (s->perm_change_fd) { + qemu_close(s->perm_change_fd); + } + s->perm_change_fd = 0; + return ret; } static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared) { BDRVRawState *s = bs->opaque; + + /* For reopen, we have already switched to the new fd (.bdrv_set_perm is + * called after .bdrv_reopen_commit) */ + if (s->perm_change_fd && s->fd != s->perm_change_fd) { + qemu_close(s->fd); + s->fd = s->perm_change_fd; + s->open_flags = s->perm_change_flags; + } + s->perm_change_fd = 0; + raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL); s->perm = perm; s->shared_perm = shared; @@ -2651,27 +3815,35 @@ static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared) static void raw_abort_perm_update(BlockDriverState *bs) { + BDRVRawState *s = bs->opaque; + + /* For reopen, .bdrv_reopen_abort is called afterwards and will close + * the file descriptor. */ + if (s->perm_change_fd) { + qemu_close(s->perm_change_fd); + } + s->perm_change_fd = 0; + raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL); } -static int coroutine_fn raw_co_copy_range_from( - BlockDriverState *bs, BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, uint64_t bytes, +static int coroutine_fn GRAPH_RDLOCK raw_co_copy_range_from( + BlockDriverState *bs, BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, read_flags, write_flags); } -static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, - BdrvChild *src, - uint64_t src_offset, - BdrvChild *dst, - uint64_t dst_offset, - uint64_t bytes, - BdrvRequestFlags read_flags, - BdrvRequestFlags write_flags) +static int coroutine_fn GRAPH_RDLOCK +raw_co_copy_range_to(BlockDriverState *bs, + BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { + RawPosixAIOData acb; BDRVRawState *s = bs->opaque; BDRVRawState *src_s; @@ -2684,8 +3856,20 @@ static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) { return -EIO; } - return paio_submit_co_full(bs, src_s->fd, src_offset, s->fd, dst_offset, - NULL, bytes, QEMU_AIO_COPY_RANGE); + + acb = (RawPosixAIOData) { + .bs = bs, + .aio_type = QEMU_AIO_COPY_RANGE, + .aio_fildes = src_s->fd, + .aio_offset = src_offset, + .aio_nbytes = bytes, + .copy_range = { + .aio_fd2 = s->fd, + .aio_offset2 = dst_offset, + }, + }; + + return raw_thread_pool_submit(handle_aiocb_copy_range, &acb); } BlockDriver bdrv_file = { @@ -2706,6 +3890,7 @@ BlockDriver bdrv_file = { .bdrv_co_block_status = raw_co_block_status, .bdrv_co_invalidate_cache = raw_co_invalidate_cache, .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes, + .bdrv_co_delete_file = raw_co_delete_file, .bdrv_co_preadv = raw_co_preadv, .bdrv_co_pwritev = raw_co_pwritev, @@ -2714,38 +3899,45 @@ BlockDriver bdrv_file = { .bdrv_co_copy_range_from = raw_co_copy_range_from, .bdrv_co_copy_range_to = raw_co_copy_range_to, .bdrv_refresh_limits = raw_refresh_limits, - .bdrv_io_plug = raw_aio_plug, - .bdrv_io_unplug = raw_aio_unplug, - .bdrv_attach_aio_context = raw_aio_attach_aio_context, - - .bdrv_co_truncate = raw_co_truncate, - .bdrv_getlength = raw_getlength, - .bdrv_get_info = raw_get_info, - .bdrv_get_allocated_file_size - = raw_get_allocated_file_size, + + .bdrv_co_truncate = raw_co_truncate, + .bdrv_co_getlength = raw_co_getlength, + .bdrv_co_get_info = raw_co_get_info, + .bdrv_get_specific_info = raw_get_specific_info, + .bdrv_co_get_allocated_file_size = raw_co_get_allocated_file_size, + .bdrv_get_specific_stats = raw_get_specific_stats, .bdrv_check_perm = raw_check_perm, .bdrv_set_perm = raw_set_perm, .bdrv_abort_perm_update = raw_abort_perm_update, .create_opts = &raw_create_opts, + .mutable_opts = mutable_opts, }; /***********************************************/ /* host device */ +#if defined(HAVE_HOST_BLOCK_DEVICE) + #if defined(__APPLE__) && defined(__MACH__) static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize, int flags); + +#if !defined(MAC_OS_VERSION_12_0) \ + || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_VERSION_12_0) +#define IOMainPort IOMasterPort +#endif + static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator) { kern_return_t kernResult = KERN_FAILURE; - mach_port_t masterPort; + mach_port_t mainPort; CFMutableDictionaryRef classesToMatch; const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass}; char *mediaType = NULL; - kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort ); + kernResult = IOMainPort(MACH_PORT_NULL, &mainPort); if ( KERN_SUCCESS != kernResult ) { - printf( "IOMasterPort returned %d\n", kernResult ); + printf("IOMainPort returned %d\n", kernResult); } int index; @@ -2758,7 +3950,7 @@ static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator) } CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey), kCFBooleanTrue); - kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch, + kernResult = IOServiceGetMatchingServices(mainPort, classesToMatch, mediaIterator); if (kernResult != KERN_SUCCESS) { error_report("Note: IOServiceGetMatchingServices returned %d", @@ -2768,7 +3960,7 @@ static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator) /* If a match was found, leave the loop */ if (*mediaIterator != 0) { - DPRINTF("Matching using %s\n", matching_array[index]); + trace_file_FindEjectableOpticalMedia(matching_array[index]); mediaType = g_strdup(matching_array[index]); break; } @@ -2816,7 +4008,7 @@ static bool setup_cdrom(char *bsd_path, Error **errp) for (index = 0; index < num_of_test_partitions; index++) { snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path, index); - fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE); + fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE, NULL); if (fd >= 0) { partition_found = true; qemu_close(fd); @@ -2828,7 +4020,7 @@ static bool setup_cdrom(char *bsd_path, Error **errp) if (partition_found == false) { error_setg(errp, "Failed to find a working partition on disc"); } else { - DPRINTF("Using %s as optical disc\n", test_partition); + trace_file_setup_cdrom(test_partition); pstrcpy(bsd_path, MAXPATHLEN, test_partition); } return partition_found; @@ -2862,39 +4054,6 @@ static int hdev_probe_device(const char *filename) return 0; } -static int check_hdev_writable(BDRVRawState *s) -{ -#if defined(BLKROGET) - /* Linux block devices can be configured "read-only" using blockdev(8). - * This is independent of device node permissions and therefore open(2) - * with O_RDWR succeeds. Actual writes fail with EPERM. - * - * bdrv_open() is supposed to fail if the disk is read-only. Explicitly - * check for read-only block devices so that Linux block devices behave - * properly. - */ - struct stat st; - int readonly = 0; - - if (fstat(s->fd, &st)) { - return -errno; - } - - if (!S_ISBLK(st.st_mode)) { - return 0; - } - - if (ioctl(s->fd, BLKROGET, &readonly) < 0) { - return -errno; - } - - if (readonly) { - return -EACCES; - } -#endif /* defined(BLKROGET) */ - return 0; -} - static void hdev_parse_filename(const char *filename, QDict *options, Error **errp) { @@ -2923,8 +4082,7 @@ static bool hdev_is_sg(BlockDriverState *bs) ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid); if (ret >= 0) { - DPRINTF("SG device found: type=%d, version=%d\n", - scsiid.scsi_type, sg_version); + trace_file_hdev_is_sg(scsiid.scsi_type, sg_version); return true; } @@ -2937,7 +4095,6 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVRawState *s = bs->opaque; - Error *local_err = NULL; int ret; #if defined(__APPLE__) && defined(__MACH__) @@ -3002,9 +4159,8 @@ hdev_open_Mac_error: s->type = FTYPE_FILE; - ret = raw_open_common(bs, options, flags, 0, true, &local_err); + ret = raw_open_common(bs, options, flags, 0, true, errp); if (ret < 0) { - error_propagate(errp, local_err); #if defined(__APPLE__) && defined(__MACH__) if (*bsd_path) { filename = bsd_path; @@ -3020,81 +4176,63 @@ hdev_open_Mac_error: /* Since this does ioctl the device must be already opened */ bs->sg = hdev_is_sg(bs); - if (flags & BDRV_O_RDWR) { - ret = check_hdev_writable(s); - if (ret < 0) { - raw_close(bs); - error_setg_errno(errp, -ret, "The device is not writable"); - return ret; - } - } - return ret; } #if defined(__linux__) - -static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, - unsigned long int req, void *buf, - BlockCompletionFunc *cb, void *opaque) +static int coroutine_fn +hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) { BDRVRawState *s = bs->opaque; - RawPosixAIOData *acb; - ThreadPool *pool; + RawPosixAIOData acb; + int ret; - if (fd_open(bs) < 0) - return NULL; + ret = fd_open(bs); + if (ret < 0) { + return ret; + } if (req == SG_IO && s->pr_mgr) { struct sg_io_hdr *io_hdr = buf; if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT || io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) { - return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs), - s->fd, io_hdr, cb, opaque); + return pr_manager_execute(s->pr_mgr, qemu_get_current_aio_context(), + s->fd, io_hdr); } } - acb = g_new(RawPosixAIOData, 1); - acb->bs = bs; - acb->aio_type = QEMU_AIO_IOCTL; - acb->aio_fildes = s->fd; - acb->aio_offset = 0; - acb->aio_ioctl_buf = buf; - acb->aio_ioctl_cmd = req; - pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); - return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); -} -#endif /* linux */ - -static int fd_open(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; + acb = (RawPosixAIOData) { + .bs = bs, + .aio_type = QEMU_AIO_IOCTL, + .aio_fildes = s->fd, + .aio_offset = 0, + .ioctl = { + .buf = buf, + .cmd = req, + }, + }; - /* this is just to ensure s->fd is sane (its called by io ops) */ - if (s->fd >= 0) - return 0; - return -EIO; + return raw_thread_pool_submit(handle_aiocb_ioctl, &acb); } +#endif /* linux */ static coroutine_fn int -hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) +hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) { BDRVRawState *s = bs->opaque; int ret; ret = fd_open(bs); if (ret < 0) { + raw_account_discard(s, bytes, ret); return ret; } - return paio_submit_co(bs, s->fd, offset, NULL, bytes, - QEMU_AIO_DISCARD | QEMU_AIO_BLKDEV); + return raw_do_pdiscard(bs, offset, bytes, true); } static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, BdrvRequestFlags flags) + int64_t offset, int64_t bytes, BdrvRequestFlags flags) { - BDRVRawState *s = bs->opaque; - int operation = QEMU_AIO_WRITE_ZEROES | QEMU_AIO_BLKDEV; int rc; rc = fd_open(bs); @@ -3102,73 +4240,7 @@ static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, return rc; } - if (flags & BDRV_REQ_MAY_UNMAP) { - operation |= QEMU_AIO_DISCARD; - } - - return paio_submit_co(bs, s->fd, offset, NULL, bytes, operation); -} - -static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts, - Error **errp) -{ - int fd; - int ret = 0; - struct stat stat_buf; - int64_t total_size = 0; - bool has_prefix; - - /* This function is used by both protocol block drivers and therefore either - * of these prefixes may be given. - * The return value has to be stored somewhere, otherwise this is an error - * due to -Werror=unused-value. */ - has_prefix = - strstart(filename, "host_device:", &filename) || - strstart(filename, "host_cdrom:" , &filename); - - (void)has_prefix; - - ret = raw_normalize_devicepath(&filename); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not normalize device path"); - return ret; - } - - /* Read out options */ - total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - - fd = qemu_open(filename, O_WRONLY | O_BINARY); - if (fd < 0) { - ret = -errno; - error_setg_errno(errp, -ret, "Could not open device"); - return ret; - } - - if (fstat(fd, &stat_buf) < 0) { - ret = -errno; - error_setg_errno(errp, -ret, "Could not stat device"); - } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) { - error_setg(errp, - "The given file is neither a block nor a character device"); - ret = -ENODEV; - } else if (lseek(fd, 0, SEEK_END) < total_size) { - error_setg(errp, "Device is too small"); - ret = -ENOSPC; - } - - if (!ret && total_size) { - uint8_t buf[BDRV_SECTOR_SIZE] = { 0 }; - int64_t zero_size = MIN(BDRV_SECTOR_SIZE, total_size); - if (lseek(fd, 0, SEEK_SET) == -1) { - ret = -errno; - } else { - ret = qemu_write_full(fd, buf, zero_size); - ret = ret == zero_size ? 0 : -errno; - } - } - qemu_close(fd); - return ret; + return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true); } static BlockDriver bdrv_host_device = { @@ -3183,8 +4255,9 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_co_create_opts = hdev_co_create_opts, - .create_opts = &raw_create_opts, + .bdrv_co_create_opts = bdrv_co_create_opts_simple, + .create_opts = &bdrv_create_opts_simple, + .mutable_opts = mutable_opts, .bdrv_co_invalidate_cache = raw_co_invalidate_cache, .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, @@ -3195,15 +4268,13 @@ static BlockDriver bdrv_host_device = { .bdrv_co_copy_range_from = raw_co_copy_range_from, .bdrv_co_copy_range_to = raw_co_copy_range_to, .bdrv_refresh_limits = raw_refresh_limits, - .bdrv_io_plug = raw_aio_plug, - .bdrv_io_unplug = raw_aio_unplug, - .bdrv_attach_aio_context = raw_aio_attach_aio_context, - - .bdrv_co_truncate = raw_co_truncate, - .bdrv_getlength = raw_getlength, - .bdrv_get_info = raw_get_info, - .bdrv_get_allocated_file_size - = raw_get_allocated_file_size, + + .bdrv_co_truncate = raw_co_truncate, + .bdrv_co_getlength = raw_co_getlength, + .bdrv_co_get_info = raw_co_get_info, + .bdrv_get_specific_info = raw_get_specific_info, + .bdrv_co_get_allocated_file_size = raw_co_get_allocated_file_size, + .bdrv_get_specific_stats = hdev_get_specific_stats, .bdrv_check_perm = raw_check_perm, .bdrv_set_perm = raw_set_perm, .bdrv_abort_perm_update = raw_abort_perm_update, @@ -3212,7 +4283,15 @@ static BlockDriver bdrv_host_device = { /* generic scsi device */ #ifdef __linux__ - .bdrv_aio_ioctl = hdev_aio_ioctl, + .bdrv_co_ioctl = hdev_co_ioctl, +#endif + + /* zoned device */ +#if defined(CONFIG_BLKZONED) + /* zone management operations */ + .bdrv_co_zone_report = raw_co_zone_report, + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, + .bdrv_co_zone_append = raw_co_zone_append, #endif }; @@ -3222,6 +4301,12 @@ static void cdrom_parse_filename(const char *filename, QDict *options, { bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options); } + +static void cdrom_refresh_limits(BlockDriverState *bs, Error **errp) +{ + bs->bl.has_variable_length = true; + raw_refresh_limits(bs, errp); +} #endif #ifdef __linux__ @@ -3242,7 +4327,7 @@ static int cdrom_probe_device(const char *filename) int prio = 0; struct stat st; - fd = qemu_open(filename, O_RDONLY | O_NONBLOCK); + fd = qemu_open(filename, O_RDONLY | O_NONBLOCK, NULL); if (fd < 0) { goto out; } @@ -3262,7 +4347,7 @@ out: return prio; } -static bool cdrom_is_inserted(BlockDriverState *bs) +static bool coroutine_fn cdrom_co_is_inserted(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; int ret; @@ -3271,7 +4356,7 @@ static bool cdrom_is_inserted(BlockDriverState *bs) return ret == CDS_DISC_OK; } -static void cdrom_eject(BlockDriverState *bs, bool eject_flag) +static void coroutine_fn cdrom_co_eject(BlockDriverState *bs, bool eject_flag) { BDRVRawState *s = bs->opaque; @@ -3284,7 +4369,7 @@ static void cdrom_eject(BlockDriverState *bs, bool eject_flag) } } -static void cdrom_lock_medium(BlockDriverState *bs, bool locked) +static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked) { BDRVRawState *s = bs->opaque; @@ -3309,32 +4394,27 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_co_create_opts = hdev_co_create_opts, - .create_opts = &raw_create_opts, + .bdrv_co_create_opts = bdrv_co_create_opts_simple, + .create_opts = &bdrv_create_opts_simple, + .mutable_opts = mutable_opts, .bdrv_co_invalidate_cache = raw_co_invalidate_cache, - .bdrv_co_preadv = raw_co_preadv, .bdrv_co_pwritev = raw_co_pwritev, .bdrv_co_flush_to_disk = raw_co_flush_to_disk, - .bdrv_refresh_limits = raw_refresh_limits, - .bdrv_io_plug = raw_aio_plug, - .bdrv_io_unplug = raw_aio_unplug, - .bdrv_attach_aio_context = raw_aio_attach_aio_context, + .bdrv_refresh_limits = cdrom_refresh_limits, - .bdrv_co_truncate = raw_co_truncate, - .bdrv_getlength = raw_getlength, - .has_variable_length = true, - .bdrv_get_allocated_file_size - = raw_get_allocated_file_size, + .bdrv_co_truncate = raw_co_truncate, + .bdrv_co_getlength = raw_co_getlength, + .bdrv_co_get_allocated_file_size = raw_co_get_allocated_file_size, /* removable device support */ - .bdrv_is_inserted = cdrom_is_inserted, - .bdrv_eject = cdrom_eject, - .bdrv_lock_medium = cdrom_lock_medium, + .bdrv_co_is_inserted = cdrom_co_is_inserted, + .bdrv_co_eject = cdrom_co_eject, + .bdrv_co_lock_medium = cdrom_co_lock_medium, /* generic scsi device */ - .bdrv_aio_ioctl = hdev_aio_ioctl, + .bdrv_co_ioctl = hdev_co_ioctl, }; #endif /* __linux__ */ @@ -3343,14 +4423,12 @@ static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVRawState *s = bs->opaque; - Error *local_err = NULL; int ret; s->type = FTYPE_CD; - ret = raw_open_common(bs, options, flags, 0, true, &local_err); + ret = raw_open_common(bs, options, flags, 0, true, errp); if (ret) { - error_propagate(errp, local_err); return ret; } @@ -3378,7 +4456,7 @@ static int cdrom_reopen(BlockDriverState *bs) */ if (s->fd >= 0) qemu_close(s->fd); - fd = qemu_open(bs->filename, s->open_flags, 0644); + fd = qemu_open(bs->filename, s->open_flags, NULL); if (fd < 0) { s->fd = -1; return -EIO; @@ -3390,12 +4468,12 @@ static int cdrom_reopen(BlockDriverState *bs) return 0; } -static bool cdrom_is_inserted(BlockDriverState *bs) +static bool coroutine_fn cdrom_co_is_inserted(BlockDriverState *bs) { return raw_getlength(bs) > 0; } -static void cdrom_eject(BlockDriverState *bs, bool eject_flag) +static void coroutine_fn cdrom_co_eject(BlockDriverState *bs, bool eject_flag) { BDRVRawState *s = bs->opaque; @@ -3415,7 +4493,7 @@ static void cdrom_eject(BlockDriverState *bs, bool eject_flag) cdrom_reopen(bs); } -static void cdrom_lock_medium(BlockDriverState *bs, bool locked) +static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked) { BDRVRawState *s = bs->opaque; @@ -3442,30 +4520,28 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_co_create_opts = hdev_co_create_opts, - .create_opts = &raw_create_opts, + .bdrv_co_create_opts = bdrv_co_create_opts_simple, + .create_opts = &bdrv_create_opts_simple, + .mutable_opts = mutable_opts, .bdrv_co_preadv = raw_co_preadv, .bdrv_co_pwritev = raw_co_pwritev, .bdrv_co_flush_to_disk = raw_co_flush_to_disk, - .bdrv_refresh_limits = raw_refresh_limits, - .bdrv_io_plug = raw_aio_plug, - .bdrv_io_unplug = raw_aio_unplug, - .bdrv_attach_aio_context = raw_aio_attach_aio_context, + .bdrv_refresh_limits = cdrom_refresh_limits, - .bdrv_co_truncate = raw_co_truncate, - .bdrv_getlength = raw_getlength, - .has_variable_length = true, - .bdrv_get_allocated_file_size - = raw_get_allocated_file_size, + .bdrv_co_truncate = raw_co_truncate, + .bdrv_co_getlength = raw_co_getlength, + .bdrv_co_get_allocated_file_size = raw_co_get_allocated_file_size, /* removable device support */ - .bdrv_is_inserted = cdrom_is_inserted, - .bdrv_eject = cdrom_eject, - .bdrv_lock_medium = cdrom_lock_medium, + .bdrv_co_is_inserted = cdrom_co_is_inserted, + .bdrv_co_eject = cdrom_co_eject, + .bdrv_co_lock_medium = cdrom_co_lock_medium, }; #endif /* __FreeBSD__ */ +#endif /* HAVE_HOST_BLOCK_DEVICE */ + static void bdrv_file_init(void) { /* @@ -3473,6 +4549,7 @@ static void bdrv_file_init(void) * registered last will get probed first. */ bdrv_register(&bdrv_file); +#if defined(HAVE_HOST_BLOCK_DEVICE) bdrv_register(&bdrv_host_device); #ifdef __linux__ bdrv_register(&bdrv_host_cdrom); @@ -3480,6 +4557,7 @@ static void bdrv_file_init(void) #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) bdrv_register(&bdrv_host_cdrom); #endif +#endif /* HAVE_HOST_BLOCK_DEVICE */ } block_init(bdrv_file_init); |