/*
 * Simple C functions to supplement the C library
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"
#include "host/cpuinfo.h"

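/*
 * Out-of-line checks dispatch through this function pointer.  This is
 * a tentative definition; the initialized definition is at the bottom
 * of the file, and when the x86 acceleration paths below are compiled
 * in, a constructor re-points it at the best variant for the running
 * CPU (see select_accel_cpuinfo).
 */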
static bool (*buffer_is_zero_accel)(const void *, size_t);

static bool buffer_is_zero_int_lt256(const void *buf, size_t len)
{
    uint64_t t;
    const uint64_t *p, *e;

    /*
     * Use unaligned memory access functions to handle
     * the beginning and end of the buffer.
     */
    if (unlikely(len <= 8)) {
        /* Our caller ensures len >= 4; the two 4-byte loads overlap. */
        return (ldl_he_p(buf) | ldl_he_p(buf + len - 4)) == 0;
    }

    t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
    p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
    e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);

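    /*
     * Example (illustrative addresses): for buf == 0x1003 and
     * len == 200, the unaligned loads above cover 0x1003-0x100a and
     * 0x10c3-0x10ca, while p == 0x1008 and e == 0x10c8, so the
     * aligned middle plus the two overlapping loads read every byte.
     */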
    /* Read 0 to 31 aligned words from the middle. */
    while (p < e) {
        t |= *p++;
    }
    return t == 0;
}

static bool buffer_is_zero_int_ge256(const void *buf, size_t len)
{
    /*
     * Use unaligned memory access functions to handle
     * the beginning and end of the buffer.
     */
    uint64_t t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
    const uint64_t *p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
    const uint64_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);

    /* Collect a partial block at the tail end. */
    t |= e[-7] | e[-6] | e[-5] | e[-4] | e[-3] | e[-2] | e[-1];

    /*
     * Loop over 64 byte blocks.
     * With the head and tail removed, e - p >= 30,
     * so the loop must iterate at least 3 times.
     */
    do {
        if (t) {
            return false;
        }
        t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
        p += 8;
    } while (p < e - 7);

    return t == 0;
}

#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
#include <immintrin.h>

/*
 * Helper for preventing the compiler from reassociating chains of
 * binary vector operations.  The empty asm declares each accumulator
 * as read and written, so the two independent OR chains below are
 * kept separate and retain their instruction-level parallelism.
 */
#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))

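/*
 * For example (illustrative), without the barrier a compiler could
 * rewrite
 *     v |= e[-1]; w |= e[-2];
 *     v |= e[-3]; w |= e[-4];
 * as the single dependency chain v |= e[-1] | e[-2] | e[-3] | e[-4],
 * serializing loads that the two-accumulator form keeps independent.
 */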
/* Note that these vectorized functions may assume len >= 256. */

static bool __attribute__((target("sse2")))
buffer_zero_sse2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail. */
    __m128i v = *(__m128i_u *)(buf);
    __m128i w = *(__m128i_u *)(buf + len - 16);
    /* Align head/tail to 16-byte boundaries. */
    const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
    const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
    __m128i zero = { 0 };

    /* Collect a partial block at tail end. */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-7]; v |= w;

    /*
     * Loop over complete 128-byte blocks.
     * With the head and tail removed, e - p >= 14, so the loop
     * must iterate at least once.
     */
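    /*
     * Note that the compare at the top of the loop tests the
     * accumulator from the *previous* iteration (initially the
     * head/tail gathered above), so the branch does not have to
     * wait for the loads issued in the current iteration.
     */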
    do {
        v = _mm_cmpeq_epi8(v, zero);
        if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
        p += 8;
    } while (p < e - 7);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF;
}

#ifdef CONFIG_AVX2_OPT
static bool __attribute__((target("avx2")))
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail. */
    __m256i v = *(__m256i_u *)(buf);
    __m256i w = *(__m256i_u *)(buf + len - 32);
    /* Align head/tail to 32-byte boundaries. */
    const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
    const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
    __m256i zero = { 0 };

    /* Collect a partial block at tail end. */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-7]; v |= w;

    /*
     * Loop over complete 256-byte blocks.  As in buffer_zero_sse2,
     * the compare tests the accumulator from the previous iteration.
     */
    for (; p < e - 7; p += 8) {
        /* PTEST is not profitable here. */
        v = _mm256_cmpeq_epi8(v, zero);
        if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
    }

    return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF;
}
#endif /* CONFIG_AVX2_OPT */

static unsigned __attribute__((noinline))
select_accel_cpuinfo(unsigned info)
{
    /* Array is sorted in order of algorithm preference. */
    static const struct {
        unsigned bit;
        bool (*fn)(const void *, size_t);
    } all[] = {
#ifdef CONFIG_AVX2_OPT
        { CPUINFO_AVX2, buffer_zero_avx2 },
#endif
        { CPUINFO_SSE2, buffer_zero_sse2 },
        { CPUINFO_ALWAYS, buffer_is_zero_int_ge256 },
    };

    for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) {
        if (info & all[i].bit) {
            buffer_is_zero_accel = all[i].fn;
            return all[i].bit;
        }
    }
    return 0;
}

static unsigned used_accel;

static void __attribute__((constructor)) init_accel(void)
{
    used_accel = select_accel_cpuinfo(cpuinfo_init());
}

/* init_accel() above runs at startup, so no initializer is needed. */
#define INIT_ACCEL NULL

bool test_buffer_is_zero_next_accel(void)
{
    /*
     * Accumulate the accelerators that we've already tested, and
     * remove them from the set to test this round.  We'll get back
     * a zero from select_accel_cpuinfo when there are no more.
     */
    unsigned used = select_accel_cpuinfo(cpuinfo & ~used_accel);
    used_accel |= used;
    return used;
}
#else
bool test_buffer_is_zero_next_accel(void)
{
    return false;
}

#define INIT_ACCEL buffer_is_zero_int_ge256
#endif

static bool (*buffer_is_zero_accel)(const void *, size_t) = INIT_ACCEL;

bool buffer_is_zero_ool(const void *buf, size_t len)
{
    if (unlikely(len == 0)) {
        return true;
    }
    if (!buffer_is_zero_sample3(buf, len)) {
        return false;
    }
    /* The three bytes sampled above cover the whole buffer for len <= 3. */
    if (unlikely(len <= 3)) {
        return true;
    }

    if (likely(len >= 256)) {
        return buffer_is_zero_accel(buf, len);
    }
    return buffer_is_zero_int_lt256(buf, len);
}

bool buffer_is_zero_ge256(const void *buf, size_t len)
{
    return buffer_is_zero_accel(buf, len);
}
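
/*
 * Usage sketch (assumes the buffer_is_zero and
 * test_buffer_is_zero_next_accel declarations from qemu/cutils.h,
 * in the style of the unit tests): exercise every accelerator the
 * host supports by re-running the check until none remain.
 *
 *     char buf[1024] = { 0 };
 *     do {
 *         assert(buffer_is_zero(buf, sizeof(buf)));
 *         buf[sizeof(buf) / 2] = 1;
 *         assert(!buffer_is_zero(buf, sizeof(buf)));
 *         buf[sizeof(buf) / 2] = 0;
 *     } while (test_buffer_is_zero_next_accel());
 */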