src/aarch64/memcmp.S: Import changes from newlib

Optimized memcmp This is an optimized memcmp for AArch64. This is a complete rewrite using a different algorithm. The previous version split into cases where both inputs were aligned, the inputs were mutually aligned and unaligned using a byte loop. The new version combines all these cases, while small inputs of less than 8 bytes are handled separately. This allows the main code to be sped up using unaligned loads since there are now at least 8 bytes to be compared. After the first 8 bytes align the first input. This ensures each iteration does at most one unaligned access and mutually aligned inputs behave as aligned. After the main loop, process the last 8 bytes using unaligned accesses. This improves performance of (mutually) aligned cases by 25% and unaligned by >500% (yes >6 times faster) on large inputs. ChangeLog: 2017-06-28 Wilco Dijkstra <wdijkstr@arm.com> * newlib/libc/machine/aarch64/memcmp.S (memcmp): Rewrite of optimized memcmp. Change-Id: I0d1d20e44260f9cdfbbe803b8064c477d0c1dc85
author: Siddhesh Poyarekar <siddhesh.poyarekar@linaro.org> 2018-03-13 15:16:06 +0000
committer: Siddhesh Poyarekar <siddhesh.poyarekar@linaro.org> 2018-03-15 19:14:08 +0530
commit: 73fc89fb7f99900064f5f8eb2bb4477e51934f53 (patch)
tree: 15c66d5bb84dd630730a23627148b7fc8ea4af69
parent: f98f2a6780d686ca3d44f8011c7823d42d9b083a (diff)
1 files changed, 112 insertions, 111 deletions
diff --git a/src/aarch64/memcmp.S b/src/aarch64/memcmp.S
index abba416..29877af 100644
--- a/src/aarch64/memcmp.S
+++ b/src/aarch64/memcmp.S
@@ -26,137 +26,138 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
 
-/* Assumptions:
+/*
+ * Copyright (c) 2017 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
  *
- * ARMv8-a, AArch64
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-	.macro def_fn f p2align=0
-	.text
-	.p2align \p2align
-	.global \f
-	.type \f, %function
-\f:
-	.endm
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ */
 
 /* Parameters and result.  */
 #define src1		x0
 #define src2		x1
 #define limit		x2
-#define result		x0
+#define result		w0
 
 /* Internal variables.  */
 #define data1		x3
 #define data1w		w3
 #define data2		x4
 #define data2w		w4
-#define has_nul		x5
-#define diff		x6
-#define endloop		x7
-#define tmp1		x8
-#define tmp2		x9
-#define tmp3		x10
-#define pos		x11
-#define limit_wd	x12
-#define mask		x13
+#define tmp1		x5
 
-def_fn memcmp p2align=6
-	cbz	limit, .Lret0
-	eor	tmp1, src1, src2
-	tst	tmp1, #7
-	b.ne	.Lmisaligned8
-	ands	tmp1, src1, #7
-	b.ne	.Lmutual_align
-	add	limit_wd, limit, #7
-	lsr	limit_wd, limit_wd, #3
-	/* Start of performance-critical section  -- one 64B cache line.  */
-.Lloop_aligned:
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-.Lstart_realigned:
-	subs	limit_wd, limit_wd, #1
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
-	cbz	endloop, .Lloop_aligned
-	/* End of performance-critical section  -- one 64B cache line.  */
-
-	/* Not reached the limit, must have found a diff.  */
-	cbnz	limit_wd, .Lnot_limit
-
-	/* Limit % 8 == 0 => all bytes significant.  */
-	ands	limit, limit, #7
-	b.eq	.Lnot_limit
-
-	lsl	limit, limit, #3	/* Bits -> bytes.  */
-	mov	mask, #~0
-#ifdef __AARCH64EB__
-	lsr	mask, mask, limit
-#else
-	lsl	mask, mask, limit
-#endif
-	bic	data1, data1, mask
-	bic	data2, data2, mask
+        .macro def_fn f p2align=0
+        .text
+        .p2align \p2align
+        .global \f
+        .type \f, %function
+\f:
+        .endm
 
-	orr	diff, diff, mask
-.Lnot_limit:
+/* Small inputs of less than 8 bytes are handled separately.  This allows the
+   main code to be sped up using unaligned loads since there are now at least
+   8 bytes to be compared.  If the first 8 bytes are equal, align src1.
+   This ensures each iteration does at most one unaligned access even if both
+   src1 and src2 are unaligned, and mutually aligned inputs behave as if
+   aligned.  After the main loop, process the last 8 bytes using unaligned
+   accesses.  */
 
-#ifndef	__AARCH64EB__
-	rev	diff, diff
+def_fn memcmp p2align=6
+	subs	limit, limit, 8
+	b.lo	.Lless8
+
+	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	and	tmp1, src1, 7
+	add	limit, limit, tmp1
+	cmp	data1, data2
+	bne	.Lreturn
+
+	/* Align src1 and adjust src2 with bytes not yet done.  */
+	sub	src1, src1, tmp1
+	sub	src2, src2, tmp1
+
+	subs	limit, limit, 8
+	b.ls	.Llast_bytes
+
+	/* Loop performing 8 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 8 and must be larger than zero.
+	   Exit if <= 8 bytes left to do or if the data is not equal.  */
+	.p2align 4
+.Lloop8:
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	subs	limit, limit, 8
+	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */
+	b.eq	.Lloop8
+
+	cmp	data1, data2
+	bne	.Lreturn
+
+	/* Compare last 1-8 bytes using unaligned access.  */
+.Llast_bytes:
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+
+	/* Compare data bytes and set return value to 0, -1 or 1.  */
+.Lreturn:
+#ifndef __AARCH64EB__
 	rev	data1, data1
 	rev	data2, data2
 #endif
-	/* The MS-non-zero bit of DIFF marks either the first bit
-	   that is different, or the end of the significant data.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
-	clz	pos, diff
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
+	cmp     data1, data2
+.Lret_eq:
+	cset	result, ne
+	cneg	result, result, lo
+        ret
+
+	.p2align 4
+	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+.Lless8:
+	adds	limit, limit, 4
+	b.lo	.Lless4
+	ldr	data1w, [src1], 4
+	ldr	data2w, [src2], 4
+	cmp	data1w, data2w
+	b.ne	.Lreturn
+	sub	limit, limit, 4
+.Lless4:
+	adds	limit, limit, 4
+	beq	.Lret_eq
+.Lbyte_loop:
+	ldrb	data1w, [src1], 1
+	ldrb	data2w, [src2], 1
+	subs	limit, limit, 1
+	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
+	b.eq	.Lbyte_loop
+	sub	result, data1w, data2w
 	ret
 
-.Lmutual_align:
-	/* Sources are mutually aligned, but are not currently at an
-	   alignment boundary.  Round down the addresses and then mask off
-	   the bytes that precede the start point.  */
-	bic	src1, src1, #7
-	bic	src2, src2, #7
-	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
-	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
-	ldr	data1, [src1], #8
-	neg	tmp1, tmp1		/* Bits to alignment -64.  */
-	ldr	data2, [src2], #8
-	mov	tmp2, #~0
-#ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#endif
-	add	limit_wd, limit, #7
-	orr	data1, data1, tmp2
-	orr	data2, data2, tmp2
-	lsr	limit_wd, limit_wd, #3
-	b	.Lstart_realigned
-
-.Lret0:
-	mov	result, #0
-	ret
-
-	.p2align 6
-.Lmisaligned8:
-	sub	limit, limit, #1
-1:
-	/* Perhaps we can do better than this.  */
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	limit, limit, #1
-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	1b
-	sub	result, data1, data2
-	ret
 	.size memcmp, . - memcmp
author	Siddhesh Poyarekar <siddhesh.poyarekar@linaro.org>	2018-03-13 15:16:06 +0000
committer	Siddhesh Poyarekar <siddhesh.poyarekar@linaro.org>	2018-03-15 19:14:08 +0530
commit	73fc89fb7f99900064f5f8eb2bb4477e51934f53 (patch)
tree	15c66d5bb84dd630730a23627148b7fc8ea4af69
parent	f98f2a6780d686ca3d44f8011c7823d42d9b083a (diff)