/*
 * Simple C functions to supplement the C library
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"
#include "host/cpuinfo.h"

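/*
 * Implementation used for large buffers.  This is a forward declaration;
 * when the SIMD variants below are compiled in, it is (re)assigned by
 * select_accel_cpuinfo().  The defining declaration, with its initial
 * value, is at the bottom of the file.
 */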
static bool (*buffer_is_zero_accel)(const void *, size_t);

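/*
 * Portable scalar implementation: OR bytes (for tiny buffers) or 64-bit
 * words into an accumulator and test it, using unaligned loads for the
 * head and tail and aligned 64-byte blocks for the middle.
 */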
static bool buffer_is_zero_integer(const void *buf, size_t len)
{
    if (unlikely(len < 8)) {
        /* For a very small buffer, simply accumulate all the bytes. */
        const unsigned char *p = buf;
        const unsigned char *e = buf + len;
        unsigned char t = 0;

        do {
            t |= *p++;
        } while (p < e);

        return t == 0;
    } else {
        /* Otherwise, use the unaligned memory access functions to
           handle the beginning and end of the buffer, with a couple
           of loops handling the middle aligned section. */
        uint64_t t = ldq_he_p(buf);
        const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8);
        const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);

        for (; p + 8 <= e; p += 8) {
            __builtin_prefetch(p + 8);
            if (t) {
                return false;
            }
            t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
        }
        while (p < e) {
            t |= *p++;
        }
        t |= ldq_he_p(buf + len - 8);

        return t == 0;
    }
}

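/*
 * Accelerated x86 variants, selected at runtime via cpuinfo.  As an aside:
 * CONFIG_AVX2_OPT is QEMU's build-time flag for AVX2 intrinsic support via
 * the "target" attribute, while __SSE2__ is defined by the compiler when
 * SSE2 is part of the baseline (e.g. on x86-64).
 */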
#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
#include <immintrin.h>

/*
 * Note that the SSE2 variant requires len >= 64 and the AVX2 variant
 * requires len >= 128; the dispatch below only calls them with len >= 256.
 */

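/*
 * SSE2: load an unaligned 16-byte head, then loop over 16-byte-aligned
 * blocks of 64 bytes, then finish with the aligned and unaligned tails.
 * The test of the accumulator is deferred by one iteration, so the OR of
 * one block overlaps with the compare of the previous one.
 */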
static bool __attribute__((target("sse2")))
buffer_zero_sse2(const void *buf, size_t len)
{
    __m128i t = _mm_loadu_si128(buf);
    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
    __m128i zero = _mm_setzero_si128();

    /* Loop over 16-byte aligned blocks of 64. */
    while (likely(p <= e)) {
        __builtin_prefetch(p);
        t = _mm_cmpeq_epi8(t, zero);
        if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
            return false;
        }
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the aligned tail. */
    t |= e[-3];
    t |= e[-2];
    t |= e[-1];

    /* Finish the unaligned tail. */
    t |= _mm_loadu_si128(buf + len - 16);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
}

#ifdef CONFIG_AVX2_OPT
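/*
 * AVX2: same structure as the SSE2 variant, but with 32-byte vectors and
 * 128-byte blocks, using _mm256_testz_si256() in place of the byte compare
 * plus movemask.
 */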
static bool __attribute__((target("avx2")))
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Begin with an unaligned head of 32 bytes. */
    __m256i t = _mm256_loadu_si256(buf);
    __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
    __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);

    /* Loop over 32-byte aligned blocks of 128. */
    while (p <= e) {
        __builtin_prefetch(p);
        if (unlikely(!_mm256_testz_si256(t, t))) {
            return false;
        }
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the last block of 128 unaligned. */
    t |= _mm256_loadu_si256(buf + len - 4 * 32);
    t |= _mm256_loadu_si256(buf + len - 3 * 32);
    t |= _mm256_loadu_si256(buf + len - 2 * 32);
    t |= _mm256_loadu_si256(buf + len - 1 * 32);

    return _mm256_testz_si256(t, t);
}
#endif /* CONFIG_AVX2_OPT */

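/*
 * Install the most-preferred implementation whose cpuinfo bit is set in
 * @info.  Returns the bit that was chosen, or 0 if none of the candidates
 * matched.
 */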
static unsigned __attribute__((noinline))
select_accel_cpuinfo(unsigned info)
{
    /* Array is sorted in order of algorithm preference. */
    static const struct {
        unsigned bit;
        bool (*fn)(const void *, size_t);
    } all[] = {
#ifdef CONFIG_AVX2_OPT
        { CPUINFO_AVX2, buffer_zero_avx2 },
#endif
        { CPUINFO_SSE2, buffer_zero_sse2 },
        { CPUINFO_ALWAYS, buffer_is_zero_integer },
    };

    for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) {
        if (info & all[i].bit) {
            buffer_is_zero_accel = all[i].fn;
            return all[i].bit;
        }
    }
    return 0;
}

static unsigned used_accel;

static void __attribute__((constructor)) init_accel(void)
{
    used_accel = select_accel_cpuinfo(cpuinfo_init());
}

#define INIT_ACCEL NULL

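/*
 * Test hook: switch buffer_is_zero_accel to the next-preferred implementation
 * that has not been exercised yet.  Returns false once all have been used.
 */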
bool test_buffer_is_zero_next_accel(void)
{
    /*
     * Accumulate the accelerators that we've already tested, and
     * remove them from the set to test this round. We'll get back
     * a zero from select_accel_cpuinfo when there are no more.
     */
    unsigned used = select_accel_cpuinfo(cpuinfo & ~used_accel);
    used_accel |= used;
    return used;
}
#else
bool test_buffer_is_zero_next_accel(void)
{
    return false;
}

#define INIT_ACCEL buffer_is_zero_integer
#endif

static bool (*buffer_is_zero_accel)(const void *, size_t) = INIT_ACCEL;

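/*
 * General entry point: performs the cheap byte-sampling check
 * (buffer_is_zero_sample3(), declared in "qemu/cutils.h") itself, then
 * dispatches to the scalar or accelerated implementation based on length.
 */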
bool buffer_is_zero_ool(const void *buf, size_t len)
{
    if (unlikely(len == 0)) {
        return true;
    }
    if (!buffer_is_zero_sample3(buf, len)) {
        return false;
    }
    /* All bytes are covered for any len <= 3. */
    if (unlikely(len <= 3)) {
        return true;
    }

    if (likely(len >= 256)) {
        return buffer_is_zero_accel(buf, len);
    }
    return buffer_is_zero_integer(buf, len);
}

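/*
 * Fast path for callers that can guarantee len >= 256 and, by convention,
 * have already done the byte-sampling check: go straight to the selected
 * implementation.
 */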
bool buffer_is_zero_ge256(const void *buf, size_t len)
{
    return buffer_is_zero_accel(buf, len);
}
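
/*
 * Usage sketch (not part of this file): callers normally go through the
 * buffer_is_zero() wrapper declared in "qemu/cutils.h" rather than calling
 * the _ool/_ge256 entry points directly, e.g. to avoid writing out pages
 * that are entirely zero:
 *
 *     uint8_t page[4096];
 *     memset(page, 0, sizeof(page));
 *     if (buffer_is_zero(page, sizeof(page))) {
 *         ... nothing to write for this page ...
 *     }
 */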