/*
 * Xor Based Zero Run Length Encoding
 *
 * Copyright 2013 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *  Orit Wasserman
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/host-utils.h"
#include "xbzrle.h"

#if defined(CONFIG_AVX512BW_OPT)
/* Declares __m512i and the _mm512_* intrinsics used below. */
#include <immintrin.h>
#include "host/cpuinfo.h"

/*
 * AVX512BW variant of the encoder.
 *
 * Compares old_buf and new_buf 64 bytes at a time and emits the same
 * "zrun nzrun" stream as the generic encoder below.  The last pass of the
 * outer loop handles the slen % 64 residual bytes through a load mask;
 * lanes outside the mask are taken from the all-zero register r in both
 * loads.
 *
 * @old_buf: previous contents
 * @new_buf: current contents
 * @slen:    number of bytes to compare
 * @dst:     output buffer for the encoded stream
 * @dlen:    capacity of dst in bytes
 *
 * Returns the number of bytes written to dst, or -1 if the encoding does
 * not fit in dlen bytes.
 */
static int __attribute__((target("avx512bw")))
xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
                            uint8_t *dst, int dlen)
{
    uint32_t zrun_len = 0, nzrun_len = 0;
    int d = 0, i = 0, num = 0;
    uint8_t *nzrun_start = NULL;
    /* add 1 to include residual part in main loop */
    uint32_t count512s = (slen >> 6) + 1;
    /* countResidual is tail of data, i.e., countResidual = slen % 64 */
    uint32_t count_residual = slen & 0b111111;
    bool never_same = true;
    uint64_t mask_residual = 1;
    mask_residual <<= count_residual;
    mask_residual -= 1;
    __m512i r = _mm512_set1_epi32(0);

    while (count512s) {
        int bytes_to_check = 64;
        uint64_t mask = 0xffffffffffffffff;
        if (count512s == 1) {
            bytes_to_check = count_residual;
            mask = mask_residual;
        }
        __m512i old_data = _mm512_mask_loadu_epi8(r, mask, old_buf + i);
        __m512i new_data = _mm512_mask_loadu_epi8(r, mask, new_buf + i);
        /* bit n of comp is set when byte n is identical in both buffers */
        uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
        count512s--;

        bool is_same = (comp & 0x1);
        while (bytes_to_check) {
            /* overflow */
            if (d + 2 > dlen) {
                return -1;
            }
            if (is_same) {
                if (nzrun_len) {
                    /* flush the nzrun carried over from the previous chunk */
                    d += uleb128_encode_small(dst + d, nzrun_len);
                    if (d + nzrun_len > dlen) {
                        return -1;
                    }
                    nzrun_start = new_buf + i - nzrun_len;
                    memcpy(dst + d, nzrun_start, nzrun_len);
                    d += nzrun_len;
                    nzrun_len = 0;
                }
                /* 64 data at a time for speed */
                if (count512s && (comp == 0xffffffffffffffff)) {
                    i += 64;
                    zrun_len += 64;
                    break;
                }
                never_same = false;
                num = ctz64(~comp);
                num = (num < bytes_to_check) ? num : bytes_to_check;
                zrun_len += num;
                bytes_to_check -= num;
                comp >>= num;
                i += num;
                if (bytes_to_check) {
                    /*
                     * still has different data after same data
                     *
                     * The nzrun flush above may have consumed the space
                     * verified at the top of the loop, so re-check before
                     * writing the length (assumed to take at most 2 bytes,
                     * as the other "d + 2" checks do).  A non-empty nzrun
                     * must follow, so failing here never rejects an
                     * encoding that would otherwise have fit.
                     */
                    if (d + 2 > dlen) {
                        return -1;
                    }
                    d += uleb128_encode_small(dst + d, zrun_len);
                    zrun_len = 0;
                } else {
                    break;
                }
            }

            if (never_same || zrun_len) {
                /*
                 * never_same only acts if
                 * data begins with diff in first count512s
                 */
                d += uleb128_encode_small(dst + d, zrun_len);
                zrun_len = 0;
                never_same = false;
            }
            /* has diff, 64 data at a time for speed */
            if ((bytes_to_check == 64) && (comp == 0x0)) {
                i += 64;
                nzrun_len += 64;
                break;
            }
            num = ctz64(comp);
            num = (num < bytes_to_check) ? num : bytes_to_check;
            nzrun_len += num;
            bytes_to_check -= num;
            comp >>= num;
            i += num;
            if (bytes_to_check) {
                /* mask like 111000 */
                /*
                 * A zrun length may already have been written during this
                 * iteration; make sure the nzrun length itself still fits
                 * before writing it.
                 */
                if (d + 2 > dlen) {
                    return -1;
                }
                d += uleb128_encode_small(dst + d, nzrun_len);
                /* overflow */
                if (d + nzrun_len > dlen) {
                    return -1;
                }
                nzrun_start = new_buf + i - nzrun_len;
                memcpy(dst + d, nzrun_start, nzrun_len);
                d += nzrun_len;
                nzrun_len = 0;
                is_same = true;
            }
        }
    }

    if (nzrun_len != 0) {
        /*
         * Trailing nzrun: nothing has verified the space for its length
         * since the last loop-top check, so do it here before writing.
         */
        if (d + 2 > dlen) {
            return -1;
        }
        d += uleb128_encode_small(dst + d, nzrun_len);
        /* overflow */
        if (d + nzrun_len > dlen) {
            return -1;
        }
        nzrun_start = new_buf + i - nzrun_len;
        memcpy(dst + d, nzrun_start, nzrun_len);
        d += nzrun_len;
    }
    return d;
}

/* Generic encoder; defined below under the name remapped by the #define. */
static int xbzrle_encode_buffer_int(uint8_t *old_buf, uint8_t *new_buf,
                                    int slen, uint8_t *dst, int dlen);

/* Encoder implementation selected by init_accel(). */
static int (*accel_func)(uint8_t *, uint8_t *, int, uint8_t *, int);

/*
 * Select the encoder implementation: the AVX512BW variant when
 * cpuinfo_init() reports CPUINFO_AVX512BW, the generic one otherwise.
 */
static void __attribute__((constructor)) init_accel(void)
{
    unsigned info = cpuinfo_init();
    if (info & CPUINFO_AVX512BW) {
        accel_func = xbzrle_encode_buffer_avx512;
    } else {
        accel_func = xbzrle_encode_buffer_int;
    }
}

/* Public entry point: forwards to the implementation chosen by init_accel(). */
int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
                         uint8_t *dst, int dlen)
{
    return accel_func(old_buf, new_buf, slen, dst, dlen);
}

/*
 * From here on the generic definition below is compiled as
 * xbzrle_encode_buffer_int, matching the static prototype above.
 */
#define xbzrle_encode_buffer xbzrle_encode_buffer_int
#endif

/*
  page = zrun nzrun
       | zrun nzrun page

  zrun = length

  nzrun = length byte...

  length = uleb128 encoded integer
 */
/*
 * Generic encoder.
 *
 * Walks old_buf and new_buf in lock step and writes alternating runs to dst:
 * the length of a run of identical bytes (zrun), then the length of a run of
 * differing bytes followed by those bytes taken from new_buf (nzrun).
 * A trailing zrun is not emitted.
 *
 * old_buf, new_buf and slen must all be multiples of sizeof(long); this is
 * asserted.
 *
 * Returns 0 if the buffers are identical, -1 if the encoding does not fit
 * in dlen bytes, otherwise the number of bytes written to dst.
 */
int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
                         uint8_t *dst, int dlen)
{
    uint32_t zrun_len = 0, nzrun_len = 0;
    int d = 0, i = 0;
    long res;
    uint8_t *nzrun_start = NULL;

    g_assert(!(((uintptr_t)old_buf | (uintptr_t)new_buf | slen) %
               sizeof(long)));

    while (i < slen) {
        /* overflow */
        if (d + 2 > dlen) {
            return -1;
        }

        /* not aligned to sizeof(long) */
        res = (slen - i) % sizeof(long);
        while (res && old_buf[i] == new_buf[i]) {
            zrun_len++;
            i++;
            res--;
        }

        /* word at a time for speed */
        if (!res) {
            while (i < slen &&
                   (*(long *)(old_buf + i)) == (*(long *)(new_buf + i))) {
                i += sizeof(long);
                zrun_len += sizeof(long);
            }

            /* go over the rest */
            while (i < slen && old_buf[i] == new_buf[i]) {
                zrun_len++;
                i++;
            }
        }

        /* buffer unchanged */
        if (zrun_len == slen) {
            return 0;
        }

        /* skip last zero run */
        if (i == slen) {
            return d;
        }

        d += uleb128_encode_small(dst + d, zrun_len);

        zrun_len = 0;
        nzrun_start = new_buf + i;

        /* overflow */
        if (d + 2 > dlen) {
            return -1;
        }
        /* not aligned to sizeof(long) */
        res = (slen - i) % sizeof(long);
        while (res && old_buf[i] != new_buf[i]) {
            i++;
            nzrun_len++;
            res--;
        }

        /* word at a time for speed, use of 32-bit long okay */
        if (!res) {
            /* truncation to 32-bit long okay */
            unsigned long mask = (unsigned long)0x0101010101010101ULL;
            while (i < slen) {
                unsigned long xor;
                xor = *(unsigned long *)(old_buf + i)
                    ^ *(unsigned long *)(new_buf + i);
                /* non-zero when some byte of xor is zero (bytes are equal) */
                if ((xor - mask) & ~xor & (mask << 7)) {
                    /* found the end of an nzrun within the current long */
                    while (old_buf[i] != new_buf[i]) {
                        nzrun_len++;
                        i++;
                    }
                    break;
                } else {
                    i += sizeof(long);
                    nzrun_len += sizeof(long);
                }
            }
        }

        d += uleb128_encode_small(dst + d, nzrun_len);
        /* overflow */
        if (d + nzrun_len > dlen) {
            return -1;
        }
        memcpy(dst + d, nzrun_start, nzrun_len);
        d += nzrun_len;
        nzrun_len = 0;
    }

    return d;
}

/*
 * Decode an encoded stream into dst.
 *
 * For each "zrun nzrun" pair, the zrun length bytes of dst are skipped
 * (left untouched) and the nzrun bytes are copied from src.
 *
 * @src:  encoded stream
 * @slen: length of src in bytes
 * @dst:  buffer to patch in place
 * @dlen: capacity of dst in bytes
 *
 * Returns the offset in dst just past the last decoded run, or -1 if the
 * stream is malformed (truncated, a zero-length zrun other than the first,
 * a zero-length nzrun) or would run past dlen or slen.
 */
int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
{
    int i = 0, d = 0;
    int ret;
    uint32_t count = 0;

    while (i < slen) {

        /* zrun */
        if ((slen - i) < 2) {
            return -1;
        }

        ret = uleb128_decode_small(src + i, &count);
        if (ret < 0 || (i && !count)) {
            return -1;
        }
        i += ret;
        d += count;

        /* overflow */
        if (d > dlen) {
            return -1;
        }

        /* nzrun */
        if ((slen - i) < 2) {
            return -1;
        }

        ret = uleb128_decode_small(src + i, &count);
        if (ret < 0 || !count) {
            return -1;
        }
        i += ret;

        /* overflow */
        if (d + count > dlen || i + count > slen) {
            return -1;
        }

        memcpy(dst + d, src + i, count);
        d += count;
        i += count;
    }

    return d;
}