/*
 * Support for RAM backed by mmapped host memory.
 *
 * Copyright (c) 2015 Red Hat, Inc.
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */

#ifdef CONFIG_LINUX
#include <linux/mman.h>
#else  /* !CONFIG_LINUX */
#define MAP_SYNC              0x0
#define MAP_SHARED_VALIDATE   0x0
#endif /* CONFIG_LINUX */

#include "qemu/osdep.h"
#include "qemu/mmap-alloc.h"
#include "qemu/host-utils.h"

#define HUGETLBFS_MAGIC       0x958458f6

#ifdef CONFIG_LINUX
#include <sys/vfs.h>
#endif

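/*
 * Return the page size used by mappings backed by @fd: the hugetlbfs huge
 * page size when @fd lives on hugetlbfs (Linux), QEMU_VMALLOC_ALIGN on
 * SPARC Linux, and the host's real page size in all other cases.
 */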
size_t qemu_fd_getpagesize(int fd)
{
#ifdef CONFIG_LINUX
    struct statfs fs;
    int ret;

    if (fd != -1) {
        do {
            ret = fstatfs(fd, &fs);
        } while (ret != 0 && errno == EINTR);

        if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) {
            return fs.f_bsize;
        }
    }
#ifdef __sparc__
    /* SPARC Linux needs greater alignment than the pagesize */
    return QEMU_VMALLOC_ALIGN;
#endif
#endif

    return qemu_real_host_page_size;
}

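/*
 * Like qemu_fd_getpagesize(), but keyed on a filesystem path: return the
 * huge page size if @mem_path is on hugetlbfs, exit with an error if the
 * path cannot be statfs()'d, and fall back to the host page size otherwise.
 */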
size_t qemu_mempath_getpagesize(const char *mem_path)
{
#ifdef CONFIG_LINUX
    struct statfs fs;
    int ret;

    if (mem_path) {
        do {
            ret = statfs(mem_path, &fs);
        } while (ret != 0 && errno == EINTR);

        if (ret != 0) {
            fprintf(stderr, "Couldn't statfs() memory path: %s\n",
                    strerror(errno));
            exit(1);
        }

        if (fs.f_type == HUGETLBFS_MAGIC) {
            /* It's a hugepage, return the huge page size */
            return fs.f_bsize;
        }
    }
#ifdef __sparc__
    /* SPARC Linux needs greater alignment than the pagesize */
    return QEMU_VMALLOC_ALIGN;
#endif
#endif

    return qemu_real_host_page_size;
}

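/*
 * Map a RAM block of @size bytes, aligned to @align, backed by @fd at
 * @map_offset (or by anonymous memory when @fd is -1).  The block is carved
 * out of a larger PROT_NONE reservation so that a guard page follows it.
 * @readonly and @shared select the protection and MAP_SHARED vs. MAP_PRIVATE;
 * @is_pmem requests MAP_SYNC semantics for persistent memory when the host
 * supports them.  Returns the mapped address, or MAP_FAILED on error.
 */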
void *qemu_ram_mmap(int fd,
                    size_t size,
                    size_t align,
                    bool readonly,
                    bool shared,
                    bool is_pmem,
                    off_t map_offset)
{
    int prot;
    int flags;
    int map_sync_flags = 0;
    int guardfd;
    size_t offset;
    size_t pagesize;
    size_t total;
    void *guardptr;
    void *ptr;

    /*
     * Note: this always allocates at least one extra page of virtual address
     * space, even if size is already aligned.
     */
    total = size + align;
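    /*
     * For illustration (values assumed, not from a real caller): with
     * size = 6 MiB and align = 2 MiB, 8 MiB of address space is reserved.
     * Any 2 MiB-aligned start inside that reservation leaves room for the
     * full 6 MiB block plus at least one trailing page for the guard page,
     * since align >= pagesize is asserted below.
     */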

#if defined(__powerpc64__) && defined(__linux__)
    /* On ppc64, mappings in the same segment (aka slice) must share the same
     * page size.  Since we will be re-allocating part of this segment from
     * the supplied fd, we make sure to use the same page size by mmap'ing
     * the supplied fd itself.  In this case, set MAP_NORESERVE to avoid
     * allocating backing store memory.
     * We do this unless we are using the system page size, in which case
     * anonymous memory is OK.
     */
    flags = MAP_PRIVATE;
    pagesize = qemu_fd_getpagesize(fd);
    if (fd == -1 || pagesize == qemu_real_host_page_size) {
        guardfd = -1;
        flags |= MAP_ANONYMOUS;
    } else {
        guardfd = fd;
        flags |= MAP_NORESERVE;
    }
#else
    guardfd = -1;
    pagesize = qemu_real_host_page_size;
    flags = MAP_PRIVATE | MAP_ANONYMOUS;
#endif

    guardptr = mmap(0, total, PROT_NONE, flags, guardfd, 0);

    if (guardptr == MAP_FAILED) {
        return MAP_FAILED;
    }

    assert(is_power_of_2(align));
    /* Always align to host page size */
    assert(align >= pagesize);

    flags = MAP_FIXED;
    flags |= fd == -1 ? MAP_ANONYMOUS : 0;
    flags |= shared ? MAP_SHARED : MAP_PRIVATE;
    if (shared && is_pmem) {
        map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE;
    }

    offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
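    /*
     * For illustration (addresses assumed): if guardptr came back as
     * 0x7f0000201000 and align is 2 MiB (0x200000), QEMU_ALIGN_UP rounds it
     * up to 0x7f0000400000, so offset is 0x1ff000 and the block is mapped at
     * the first aligned address inside the reservation.
     */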

    prot = PROT_READ | (readonly ? 0 : PROT_WRITE);

    ptr = mmap(guardptr + offset, size, prot,
               flags | map_sync_flags, fd, map_offset);

    if (ptr == MAP_FAILED && map_sync_flags) {
        if (errno == ENOTSUP) {
            char *proc_link, *file_name;
            int len;
            proc_link = g_strdup_printf("/proc/self/fd/%d", fd);
            file_name = g_malloc0(PATH_MAX);
            len = readlink(proc_link, file_name, PATH_MAX - 1);
            if (len < 0) {
                len = 0;
            }
            file_name[len] = '\0';
            fprintf(stderr, "Warning: requesting persistence across crashes "
                    "for backend file %s failed. Proceeding without "
                    "persistence, data might become corrupted in case of host "
                    "crash.\n", file_name);
            g_free(proc_link);
            g_free(file_name);
        }
        /*
         * If the map failed with MAP_SHARED_VALIDATE | MAP_SYNC, retry
         * without these flags for compatibility with hosts that do not
         * support them.
         */
        ptr = mmap(guardptr + offset, size, prot, flags, fd, map_offset);
    }

    if (ptr == MAP_FAILED) {
        munmap(guardptr, total);
        return MAP_FAILED;
    }

    if (offset > 0) {
        munmap(guardptr, offset);
    }

    /*
     * Leave a single PROT_NONE page allocated after the RAM block, to serve
     * as a guard page against potential buffer overflows.
     */
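    /*
     * For illustration (same assumed values as above): after subtracting
     * offset, "total" is the distance from ptr to the end of the
     * reservation; anything past ptr + size + one page is unmapped, so
     * exactly one PROT_NONE guard page remains after the block.
     */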
    total -= offset;
    if (total > size + pagesize) {
        munmap(ptr + size + pagesize, total - size - pagesize);
    }

    return ptr;
}
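
/*
 * A minimal usage sketch (illustrative only; fd, size and the error handling
 * are assumptions, not taken from this file):
 *
 *     size_t align = qemu_fd_getpagesize(fd);
 *     void *block = qemu_ram_mmap(fd, size, align, false, true, false, 0);
 *
 *     if (block == MAP_FAILED) {
 *         fprintf(stderr, "qemu_ram_mmap: %s\n", strerror(errno));
 *     } else {
 *         ... use the block as guest RAM, then ...
 *         qemu_ram_munmap(fd, block, size);
 *     }
 */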

void qemu_ram_munmap(int fd, void *ptr, size_t size)
{
    size_t pagesize;

    if (ptr) {
        /* Unmap both the RAM block and the guard page */
#if defined(__powerpc64__) && defined(__linux__)
        pagesize = qemu_fd_getpagesize(fd);
#else
        pagesize = qemu_real_host_page_size;
#endif
        munmap(ptr, size + pagesize);
    }
}