diff options
author | Siddhesh Poyarekar <siddhesh.poyarekar@linaro.org> | 2018-03-13 15:16:06 +0000 |
---|---|---|
committer | Siddhesh Poyarekar <siddhesh.poyarekar@linaro.org> | 2018-03-15 19:14:08 +0530 |
commit | 73fc89fb7f99900064f5f8eb2bb4477e51934f53 (patch) | |
tree | 15c66d5bb84dd630730a23627148b7fc8ea4af69 | |
parent | f98f2a6780d686ca3d44f8011c7823d42d9b083a (diff) |
src/aarch64/memcmp.S: Import changes from newlib
Optimized memcmp
This is an optimized memcmp for AArch64. This is a complete rewrite
using a different algorithm. The previous version split into cases
where both inputs were aligned, the inputs were mutually aligned and
unaligned using a byte loop. The new version combines all these cases,
while small inputs of less than 8 bytes are handled separately.
This allows the main code to be sped up using unaligned loads since
there are now at least 8 bytes to be compared. After the first 8 bytes
align the first input. This ensures each iteration does at most one
unaligned access and mutually aligned inputs behave as aligned.
After the main loop, process the last 8 bytes using unaligned accesses.
This improves performance of (mutually) aligned cases by 25% and
unaligned by >500% (yes >6 times faster) on large inputs.
ChangeLog:
2017-06-28 Wilco Dijkstra <wdijkstr@arm.com>
* newlib/libc/machine/aarch64/memcmp.S (memcmp):
Rewrite of optimized memcmp.
Change-Id: I0d1d20e44260f9cdfbbe803b8064c477d0c1dc85
-rw-r--r-- | src/aarch64/memcmp.S | 223 |
1 files changed, 112 insertions, 111 deletions
diff --git a/src/aarch64/memcmp.S b/src/aarch64/memcmp.S index abba416..29877af 100644 --- a/src/aarch64/memcmp.S +++ b/src/aarch64/memcmp.S @@ -26,137 +26,138 @@ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/* Assumptions: +/* + * Copyright (c) 2017 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. * - * ARMv8-a, AArch64 + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. + */ /* Parameters and result. */ #define src1 x0 #define src2 x1 #define limit x2 -#define result x0 +#define result w0 /* Internal variables. */ #define data1 x3 #define data1w w3 #define data2 x4 #define data2w w4 -#define has_nul x5 -#define diff x6 -#define endloop x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define pos x11 -#define limit_wd x12 -#define mask x13 +#define tmp1 x5 -def_fn memcmp p2align=6 - cbz limit, .Lret0 - eor tmp1, src1, src2 - tst tmp1, #7 - b.ne .Lmisaligned8 - ands tmp1, src1, #7 - b.ne .Lmutual_align - add limit_wd, limit, #7 - lsr limit_wd, limit_wd, #3 - /* Start of performance-critical section -- one 64B cache line. */ -.Lloop_aligned: - ldr data1, [src1], #8 - ldr data2, [src2], #8 -.Lstart_realigned: - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, ne /* Last Dword or differences. */ - cbz endloop, .Lloop_aligned - /* End of performance-critical section -- one 64B cache line. */ - - /* Not reached the limit, must have found a diff. */ - cbnz limit_wd, .Lnot_limit - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq .Lnot_limit - - lsl limit, limit, #3 /* Bits -> bytes. */ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask + .macro def_fn f p2align=0 + .text + .p2align \p2align + .global \f + .type \f, %function +\f: + .endm - orr diff, diff, mask -.Lnot_limit: +/* Small inputs of less than 8 bytes are handled separately. This allows the + main code to be sped up using unaligned loads since there are now at least + 8 bytes to be compared. If the first 8 bytes are equal, align src1. + This ensures each iteration does at most one unaligned access even if both + src1 and src2 are unaligned, and mutually aligned inputs behave as if + aligned. After the main loop, process the last 8 bytes using unaligned + accesses. */ -#ifndef __AARCH64EB__ - rev diff, diff +def_fn memcmp p2align=6 + subs limit, limit, 8 + b.lo .Lless8 + + /* Limit >= 8, so check first 8 bytes using unaligned loads. */ + ldr data1, [src1], 8 + ldr data2, [src2], 8 + and tmp1, src1, 7 + add limit, limit, tmp1 + cmp data1, data2 + bne .Lreturn + + /* Align src1 and adjust src2 with bytes not yet done. */ + sub src1, src1, tmp1 + sub src2, src2, tmp1 + + subs limit, limit, 8 + b.ls .Llast_bytes + + /* Loop performing 8 bytes per iteration using aligned src1. + Limit is pre-decremented by 8 and must be larger than zero. + Exit if <= 8 bytes left to do or if the data is not equal. */ + .p2align 4 +.Lloop8: + ldr data1, [src1], 8 + ldr data2, [src2], 8 + subs limit, limit, 8 + ccmp data1, data2, 0, hi /* NZCV = 0b0000. */ + b.eq .Lloop8 + + cmp data1, data2 + bne .Lreturn + + /* Compare last 1-8 bytes using unaligned access. */ +.Llast_bytes: + ldr data1, [src1, limit] + ldr data2, [src2, limit] + + /* Compare data bytes and set return value to 0, -1 or 1. */ +.Lreturn: +#ifndef __AARCH64EB__ rev data1, data1 rev data2, data2 #endif - /* The MS-non-zero bit of DIFF marks either the first bit - that is different, or the end of the significant data. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, diff - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 + cmp data1, data2 +.Lret_eq: + cset result, ne + cneg result, result, lo + ret + + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ +.Lless8: + adds limit, limit, 4 + b.lo .Lless4 + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 + cmp data1w, data2w + b.ne .Lreturn + sub limit, limit, 4 +.Lless4: + adds limit, limit, 4 + beq .Lret_eq +.Lbyte_loop: + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.eq .Lbyte_loop + sub result, data1w, data2w ret -.Lmutual_align: - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - add limit, limit, tmp1 /* Adjust the limit for the extra. */ - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - add limit_wd, limit, #7 - orr data1, data1, tmp2 - orr data2, data2, tmp2 - lsr limit_wd, limit_wd, #3 - b .Lstart_realigned - -.Lret0: - mov result, #0 - ret - - .p2align 6 -.Lmisaligned8: - sub limit, limit, #1 -1: - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq 1b - sub result, data1, data2 - ret .size memcmp, . - memcmp |