aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRagesh Radhakrishnan <ragesh.r@linaro.org>2013-10-14 18:00:08 +0530
committerRagesh Radhakrishnan <ragesh.r@linaro.org>2013-12-04 16:19:35 +0530
commit3cabf63c28d8117d7cc6b776ca01a8ff0c72ecfa (patch)
treef94912800f2cb779489e9c6e582b1ec513fafc29
parent337a5f8b00da8767dc699425b10a72ab5473bb51 (diff)
downloadlibjpeg-turbo-3cabf63c28d8117d7cc6b776ca01a8ff0c72ecfa.tar.gz
Add armv8 port idct slow
ARMv8-specific instruction changes are incorporated into the ARMv7 NEON idct_slow code: instructions like orrs do not exist in ARMv8, so the condition flags need to be updated with an alternate instruction like cmp. RTSM simulator specific changes are incorporated, as some instructions, particularly those related to integer saturation, do not work as expected. vtrn (an ARMv7 instruction) is split into trn1 and trn2, due to which the transpose_4x4 macro is modified and a new transpose macro replaces the original vtrn instruction for ARMv8. Extra ins instructions are added to move data to the higher half of 128-bit registers, because the lower registers do not overlap the higher registers as they do in ARMv7. Data movement to the higher half of registers can be optimized further.
-rw-r--r--simd/jsimd_arm_neon_64.S771
1 files changed, 771 insertions, 0 deletions
diff --git a/simd/jsimd_arm_neon_64.S b/simd/jsimd_arm_neon_64.S
new file mode 100644
index 0000000..1797318
--- /dev/null
+++ b/simd/jsimd_arm_neon_64.S
@@ -0,0 +1,771 @@
+ /*
+ * ARMv8 NEON optimizations for libjpeg-turbo
+ * This file is a copy of the armv7 neon version but ported to armv8.
+ *
+ * Copyright (C) 2009-2011 Nokia Corporation and/or it's subsidiary(-ies).
+ * All rights reserved.
+ * Author Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2013, Linaro Limited
+ * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
+#endif
+
+.text
+
+.arch armv8-a+fp+simd /* enable the FP and Advanced SIMD instructions used by this file */
+#define RESPECT_STRICT_ALIGNMENT 1 /* NOTE(review): not referenced by the code visible here - confirm it is still needed */
+
+#define RTSM_SQSHRN_SIM_ISSUE /* when defined, the final descale in jsimd_idct_islow_neon uses the 8h -> 8b/16b sqrshrn forms */
+/*****************************************************************************/
+
+/*
+ * asm_function fname
+ *
+ * Opens the definition of an exported function: starts a '.func' region
+ * (to be closed by the caller with '.endfunc'), makes the symbol global and
+ * emits its label. On Apple targets the symbol gets a leading underscore;
+ * on ELF targets it is additionally marked hidden and typed as a function.
+ */
+.macro asm_function fname
+#ifndef __APPLE__
+    .global \fname
+    .func \fname
+#ifdef __ELF__
+    .type \fname, %function
+    .hidden \fname
+#endif
+\fname:
+#else
+    .globl _\fname
+    .func _\fname
+_\fname:
+#endif
+.endm
+
+/* Transpose elements of single 128 bit registers.
+ * The two halves of \x0 are treated as two separate vectors: the upper half
+ * is moved into the low part of \x1, then trn1/trn2 are applied between the
+ * two halves using the \literal arrangement. \x0 receives the trn1 result and
+ * \x1 the trn2 result. \xi is a scratch register (clobbered) and \xilen is
+ * the element suffix used for the 'ins' moves (callers here pass .2d). */
+.macro transpose_single x0,x1,xi,xilen,literal
+ ins \xi\xilen[0], \x0\xilen[0] /* xi = copy of the low element of x0 (trn1 below overwrites x0) */
+ ins \x1\xilen[0], \x0\xilen[1] /* x1 low element = high element of x0 */
+ trn1 \x0\literal, \x0\literal, \x1\literal
+ trn2 \x1\literal, \xi\literal, \x1\literal /* uses the saved copy of the original x0 */
+.endm
+
+
+/* Transpose elements of 2 different registers.
+ * Two-instruction replacement for a single in-place transpose of a register
+ * pair: \x0 receives trn1(\x0, \x1) and \x1 receives trn2(\x0, \x1), both
+ * computed from the original values. \xi is a scratch register (clobbered);
+ * \xilen is the arrangement used to copy \x0 into it and \literal is the
+ * arrangement of the transpose itself. */
+.macro transpose x0,x1,xi,xilen,literal
+ mov \xi\xilen, \x0\xilen /* keep the original x0: trn1 overwrites it before trn2 runs */
+ trn1 \x0\literal, \x0\literal, \x1\literal
+ trn2 \x1\literal, \xi\literal, \x1\literal
+.endm
+/* Transpose a block of 4x4 coefficients in four 64-bit registers.
+ * This is the second step: trn1/trn2 between the register pairs (x0, x2)
+ * and (x1, x3), using the arrangement suffix passed for each register.
+ * xi is a scratch register (clobbered); xilen is the arrangement used to
+ * copy a register into it. */
+
+.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen
+ mov \xi\xilen, \x0\xilen /* save x0 before trn1 overwrites it */
+ trn1 \x0\x0len, \x0\x0len, \x2\x2len
+ trn2 \x2\x2len, \xi\x0len, \x2\x2len
+ mov \xi\xilen, \x1\xilen /* save x1 before trn1 overwrites it */
+ trn1 \x1\x1len, \x1\x1len, \x3\x3len
+ trn2 \x3\x3len, \xi\x1len, \x3\x3len
+.endm
+
+/*
+ * First step of the 4x4 transpose: trn1/trn2 between the register pairs
+ * (x0, x1) and (x2, x3).
+ *
+ * Each register is always used with its own arrangement suffix (xNlen); the
+ * scratch copy in xi is read with the suffix of the register it was copied
+ * from. xi is clobbered; xilen is the arrangement used for the copy itself.
+ */
+.macro transpose_4x4_16 x0,x0len, x1,x1len, x2,x2len, x3,x3len, xi,xilen
+    mov \xi\xilen, \x0\xilen /* save x0 before trn1 overwrites it */
+    trn1 \x0\x0len, \x0\x0len, \x1\x1len
+    trn2 \x1\x1len, \xi\x0len, \x1\x1len
+    mov \xi\xilen, \x2\xilen /* save x2 before trn1 overwrites it */
+    trn1 \x2\x2len, \x2\x2len, \x3\x3len
+    trn2 \x3\x3len, \xi\x2len, \x3\x3len
+.endm
+
+.macro transpose_4x4 x0, x1, x2, x3,x5 /* x0..x3: the four rows (low 64 bits each); x5: scratch register, clobbered */
+ transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b /* step 1: 16-bit trn on pairs (x0,x1) and (x2,x3) */
+ transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b /* step 2: 32-bit trn on pairs (x0,x2) and (x1,x3) */
+.endm
+
+
+
+
+#define CENTERJSAMPLE 128 /* bias added to every 8-bit output sample (signed -> unsigned conversion) */
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
+ * JSAMPARRAY output_buf, JDIMENSION output_col)
+ *
+ * The FIX_x_xxx constants below are the multipliers x.xxx expressed in fixed
+ * point with 13 fractional bits (value = round(x.xxx * 8192)), which matches
+ * the '<< 13' scaling applied to the even part in the code. The *_MINUS_*
+ * and *_PLUS_* names are pre-combined differences and sums of those values.
+ */
+
+#define FIX_0_298631336 (2446)
+#define FIX_0_390180644 (3196)
+#define FIX_0_541196100 (4433)
+#define FIX_0_765366865 (6270)
+#define FIX_0_899976223 (7373)
+#define FIX_1_175875602 (9633)
+#define FIX_1_501321110 (12299)
+#define FIX_1_847759065 (15137)
+#define FIX_1_961570560 (16069)
+#define FIX_2_053119869 (16819)
+#define FIX_2_562915447 (20995)
+#define FIX_3_072711026 (25172)
+
+#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
+#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
+#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
+#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
+#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
+#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
+#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
+#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
+
+/*
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
+ *
+ * This is a C preprocessor macro kept as documentation; nothing in the
+ * assembly visible in this file expands it. It takes the eight inputs of one
+ * 1-D transform (xrow0..xrow7) and leaves the odd-part results in tmp0..tmp3
+ * and the even-part results in tmp10..tmp13, all before descaling.
+ *
+ * It relies on names supplied by the expansion context: the types DCTELEM
+ * and INT32, the MULTIPLY() helper and the output variables tmp0..tmp3 and
+ * tmp10..tmp13. The statement order appears to follow the instruction order
+ * of the NEON passes (note the 'reloadable constants' remark) - confirm
+ * before reordering anything.
+ */
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
+{ \
+ DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
+ INT32 q1, q2, q3, q4, q5, q6, q7; \
+ INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
+ \
+ /* 1-D iDCT input data */ \
+ row0 = xrow0; \
+ row1 = xrow1; \
+ row2 = xrow2; \
+ row3 = xrow3; \
+ row4 = xrow4; \
+ row5 = xrow5; \
+ row6 = xrow6; \
+ row7 = xrow7; \
+ \
+ q5 = row7 + row3; \
+ q4 = row5 + row1; \
+ q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
+ MULTIPLY(q4, FIX_1_175875602); \
+ q7 = MULTIPLY(q5, FIX_1_175875602) + \
+ MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
+ q2 = MULTIPLY(row2, FIX_0_541196100) + \
+ MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
+ q4 = q6; \
+ q3 = ((INT32) row0 - (INT32) row4) << 13; \
+ q6 += MULTIPLY(row5, -FIX_2_562915447) + \
+ MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
+ /* now we can use q1 (reloadable constants have been used up) */ \
+ q1 = q3 + q2; \
+ q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
+ MULTIPLY(row1, -FIX_0_899976223); \
+ q5 = q7; \
+ q1 = q1 + q6; \
+ q7 += MULTIPLY(row7, -FIX_0_899976223) + \
+ MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
+ \
+ /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
+ tmp11_plus_tmp2 = q1; \
+ row1 = 0; \
+ \
+ q1 = q1 - q6; \
+ q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
+ MULTIPLY(row3, -FIX_2_562915447); \
+ q1 = q1 - q6; \
+ q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
+ MULTIPLY(row6, FIX_0_541196100); \
+ q3 = q3 - q2; \
+ \
+ /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
+ tmp11_minus_tmp2 = q1; \
+ \
+ q1 = ((INT32) row0 + (INT32) row4) << 13; \
+ q2 = q1 + q6; \
+ q1 = q1 - q6; \
+ \
+ /* pick up the results */ \
+ tmp0 = q4; \
+ tmp1 = q5; \
+ tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
+ tmp3 = q7; \
+ tmp10 = q2; \
+ tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
+ tmp12 = q3; \
+ tmp13 = q1; \
+}
+
+#define XFIX_0_899976223 v0.4h[0]
+#define XFIX_0_541196100 v0.4h[1]
+#define XFIX_2_562915447 v0.4h[2]
+#define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3]
+#define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0]
+#define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1]
+#define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2]
+#define XFIX_1_175875602 v1.4h[3]
+#define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0]
+#define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1]
+#define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2]
+#define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3]
+
+.balign 16
+jsimd_idct_islow_neon_consts:
+ .short FIX_0_899976223 /* v0.4h[0] */
+ .short FIX_0_541196100 /* v0.4h[1] */
+ .short FIX_2_562915447 /* v0.4h[2] */
+ .short FIX_0_298631336_MINUS_0_899976223 /* v0.4h[3] */
+ .short FIX_1_501321110_MINUS_0_899976223 /* v1.4h[0] */
+ .short FIX_2_053119869_MINUS_2_562915447 /* v1.4h[1] */
+ .short FIX_0_541196100_PLUS_0_765366865 /* v1.4h[2] */
+ .short FIX_1_175875602 /* v1.4h[3] */
+ /* reloadable constants: v2 doubles as a temporary in the iDCT passes, so
+ * the function re-reads these four values (table offset 16) into v2
+ * before each pass; the order must match the XFIX_* lane aliases */
+ .short FIX_1_175875602_MINUS_0_390180644 /* v2.4h[0] */
+ .short FIX_0_541196100_MINUS_1_847759065 /* v2.4h[1] */
+ .short FIX_3_072711026_MINUS_2_562915447 /* v2.4h[2] */
+ .short FIX_1_175875602_MINUS_1_961570560 /* v2.4h[3] */
+
+/******************************************************************************
+*
+* jsimd_idct_islow_neon
+*
+* void jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
+*                             JSAMPARRAY output_buf, JDIMENSION output_col)
+*
+* Dequantizes one 8x8 block of coefficients, runs the two 1-D iDCT passes on
+* it and stores eight rows of eight 8-bit samples.
+*
+* In:     x0 = dct_table   (8x8 16-bit multipliers, read sequentially)
+*         x1 = coef_block  (8x8 16-bit coefficients, read sequentially)
+*         x2 = output_buf  (array of 8 row pointers)
+*         x3 = output_col  (byte offset added to every row pointer; only the
+*                           low 32 bits are used)
+* Out:    nothing in registers; 8 bytes are written at output_buf[i] +
+*         output_col for i = 0..7.
+* Clobb:  x0-x4, x15, v0-v7, v16-v31, condition flags. v8-v15 are used as
+*         temporaries; their low 64 bits are saved on entry and restored
+*         before returning (64 bytes of stack).
+*
+* Register layout: ROWnL / ROWnR hold the left / right four coefficients of
+* row n in the low 64 bits of separate registers. The 'ins ..2d[1]'
+* instructions build 128-bit rows where a full row is needed.
+*
+* Flow: pass 1 on the left 4x8 half runs interleaved with a scalar test of
+* the right 4x8 half. If rows 1-7 of the right half are zero the code jumps
+* to 3:, which either replicates row 0 (row 0 non-zero) and joins the normal
+* pass 2 at 1:, or runs the reduced pass 2 at 4: (right half entirely zero).
+* All paths end in the common epilogue at 2:.
+*
+*****************************************************************************/
+
+asm_function jsimd_idct_islow_neon
+    DCT_TABLE .req x0
+    COEF_BLOCK .req x1
+    OUTPUT_BUF .req x2
+    OUTPUT_COL .req x3
+    TMP1 .req x0
+    TMP2 .req x1
+    TMP3 .req x2
+    TMP4 .req x15
+
+    ROW0L .req v16
+    ROW0R .req v17
+    ROW1L .req v18
+    ROW1R .req v19
+    ROW2L .req v20
+    ROW2R .req v21
+    ROW3L .req v22
+    ROW3R .req v23
+    ROW4L .req v24
+    ROW4R .req v25
+    ROW5L .req v26
+    ROW5R .req v27
+    ROW6L .req v28
+    ROW6R .req v29
+    ROW7L .req v30
+    ROW7R .req v31
+
+    /* Load and dequantize the block, two rows per load */
+    adr x15, jsimd_idct_islow_neon_consts
+    ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
+    ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
+    ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
+    mul v16.4h, v16.4h, v0.4h
+    mul v17.4h, v17.4h, v1.4h
+    ins v16.2d[1], v17.2d[0] /* 128 bit q8 */
+    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
+    mul v18.4h, v18.4h, v2.4h
+    mul v19.4h, v19.4h, v3.4h
+    ins v18.2d[1], v19.2d[0] /* 128 bit q9 */
+    ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
+    mul v20.4h, v20.4h, v4.4h
+    mul v21.4h, v21.4h, v5.4h
+    ins v20.2d[1], v21.2d[0] /* 128 bit q10 */
+    ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
+    mul v22.4h, v22.4h, v6.4h
+    mul v23.4h, v23.4h, v7.4h
+    ins v22.2d[1], v23.2d[0] /* 128 bit q11 */
+    /* No post-increment on the last load: COEF_BLOCK must stay at
+     * block start + 96, which is what the '-96 + ...' offsets of the
+     * zero checks below are relative to. */
+    ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
+    mul v24.4h, v24.4h, v0.4h
+    mul v25.4h, v25.4h, v1.4h
+    ins v24.2d[1], v25.2d[0] /* 128 bit q12 */
+    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
+    mul v28.4h, v28.4h, v4.4h
+    mul v29.4h, v29.4h, v5.4h
+    ins v28.2d[1], v29.2d[0] /* 128 bit q14 */
+    mul v26.4h, v26.4h, v2.4h
+    mul v27.4h, v27.4h, v3.4h
+    ins v26.2d[1], v27.2d[0] /* 128 bit q13 */
+    /* The constant table holds exactly 12 halfwords (v0, v1, v2) */
+    ld1 {v0.4h, v1.4h, v2.4h}, [x15] /* load constants */
+    add x15, x15, #16 /* x15 -> the four reloadable constants (v2) */
+    mul v30.4h, v30.4h, v6.4h
+    mul v31.4h, v31.4h, v7.4h
+    ins v30.2d[1], v31.2d[0] /* 128 bit q15 */
+    sub sp, sp, #32
+    st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */
+    sub sp, sp, #32
+    st1 {v12.4h-v15.4h}, [sp]
+    /* 1-D IDCT, pass 1, left 4x8 half */
+    add v4.4h, ROW7L.4h, ROW3L.4h
+    add v5.4h, ROW5L.4h, ROW1L.4h
+    smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560
+    smlal v12.4s, v5.4h, XFIX_1_175875602
+    smull v14.4s, v4.4h, XFIX_1_175875602
+    /* Check for zero coefficients in the right 4x8 half (interleaved
+     * with the NEON code). x0 accumulates the OR of the right halves of
+     * rows 1-7, one 64-bit load (four coefficients) per row; DCT_TABLE,
+     * which shares x0, is no longer needed at this point. x4 is a
+     * caller-saved scratch register, so it is not preserved. */
+    smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644
+    ssubl v6.4s, ROW0L.4h, ROW4L.4h
+      ldr x0, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+    smull v4.4s, ROW2L.4h, XFIX_0_541196100
+    smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
+    mov v8.16b, v12.16b
+    smlsl v12.4s, ROW5L.4h, XFIX_2_562915447
+      ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+    smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
+    shl v6.4s, v6.4s, #13
+      orr x0, x0, x4
+    smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
+    add v2.4s, v6.4s, v4.4s
+      ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+    mov v10.16b, v14.16b
+    add v2.4s, v2.4s, v12.4s
+      orr x0, x0, x4
+    smlsl v14.4s, ROW7L.4h, XFIX_0_899976223
+    smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
+    rshrn ROW1L.4h, v2.4s, #11
+      ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+    sub v2.4s, v2.4s, v12.4s
+    smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
+      orr x0, x0, x4
+    smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
+    sub v2.4s, v2.4s, v12.4s
+    smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
+      ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+    smlal v12.4s, ROW6L.4h, XFIX_0_541196100
+    sub v6.4s, v6.4s, v4.4s
+      orr x0, x0, x4
+    rshrn ROW6L.4h, v2.4s, #11
+    add v2.4s, v6.4s, v10.4s
+      ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+    sub v6.4s, v6.4s, v10.4s
+    saddl v10.4s, ROW0L.4h, ROW4L.4h
+      orr x0, x0, x4
+    rshrn ROW2L.4h, v2.4s, #11
+    rshrn ROW5L.4h, v6.4s, #11
+      ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+    shl v10.4s, v10.4s, #13
+    smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
+      orr x0, x0, x4
+    add v4.4s, v10.4s, v12.4s
+    sub v2.4s, v10.4s, v12.4s
+    add v12.4s, v4.4s, v14.4s
+      ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] /* x4 = row 0, right half */
+    sub v4.4s, v4.4s, v14.4s
+    add v10.4s, v2.4s, v8.4s
+    sub v6.4s, v2.4s, v8.4s
+    rshrn ROW7L.4h, v4.4s, #11
+    rshrn ROW3L.4h, v10.4s, #11
+    rshrn ROW0L.4h, v12.4s, #11
+    rshrn ROW4L.4h, v6.4s, #11
+      /* If rows 1-7 of the right half are all zero, hand row 0 over in
+       * x0 (it is tested again at 3:) and take the sparse path.
+       * csel leaves the flags of the cmp intact for the branch. */
+      cmp x0, #0
+      csel x0, x4, x0, eq
+      b.eq 3f /* Go to do some special handling for the sparse right 4x8 half */
+
+
+    /* 1-D IDCT, pass 1, right 4x8 half */
+    ld1 {v2.4h}, [x15] /* reload constants */
+    add v10.4h, ROW7R.4h, ROW3R.4h
+    add v8.4h, ROW5R.4h, ROW1R.4h
+    /* Transpose ROW6L <-> ROW7L (v3 available as a free register) */
+    transpose ROW6L,ROW7L,v3,.16b,.4h
+    smull v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560
+    smlal v12.4s, v8.4h, XFIX_1_175875602
+    /* Transpose ROW2L <-> ROW3L (v3 available as a free register) */
+    transpose ROW2L,ROW3L,v3,.16b,.4h
+    smull v14.4s, v10.4h, XFIX_1_175875602
+    smlal v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644
+    /* Transpose ROW0L <-> ROW1L (v3 available as a free register) */
+    transpose ROW0L,ROW1L,v3,.16b,.4h
+    ssubl v6.4s, ROW0R.4h, ROW4R.4h
+    smull v4.4s, ROW2R.4h, XFIX_0_541196100
+    smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
+    /* Transpose ROW4L <-> ROW5L (v3 available as a free register) */
+    transpose ROW4L,ROW5L,v3,.16b,.4h
+    mov v8.16b, v12.16b
+    smlsl v12.4s, ROW5R.4h, XFIX_2_562915447
+    smlal v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447
+    /* Transpose ROW1L <-> ROW3L (v3 available as a free register) */
+    transpose ROW1L,ROW3L,v3,.16b,.2s
+    shl v6.4s, v6.4s, #13
+    smlsl v8.4s, ROW1R.4h, XFIX_0_899976223
+    /* Transpose ROW4L <-> ROW6L (v3 available as a free register) */
+    transpose ROW4L,ROW6L,v3,.16b,.2s
+    add v2.4s, v6.4s, v4.4s
+    mov v10.16b, v14.16b
+    add v2.4s, v2.4s, v12.4s
+    /* Transpose ROW0L <-> ROW2L (v3 available as a free register) */
+    transpose ROW0L,ROW2L,v3,.16b,.2s
+    smlsl v14.4s, ROW7R.4h, XFIX_0_899976223
+    smlal v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223
+    rshrn ROW1R.4h, v2.4s, #11
+    /* Transpose ROW5L <-> ROW7L (v3 available as a free register) */
+    transpose ROW5L,ROW7L,v3,.16b,.2s
+    sub v2.4s, v2.4s, v12.4s
+    smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
+    smlsl v10.4s, ROW3R.4h, XFIX_2_562915447
+    sub v2.4s, v2.4s, v12.4s
+    smull v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865
+    smlal v12.4s, ROW6R.4h, XFIX_0_541196100
+    sub v6.4s, v6.4s, v4.4s
+    rshrn ROW6R.4h, v2.4s, #11
+    add v2.4s, v6.4s, v10.4s
+    sub v6.4s, v6.4s, v10.4s
+    saddl v10.4s, ROW0R.4h, ROW4R.4h
+    rshrn ROW2R.4h, v2.4s, #11
+    rshrn ROW5R.4h, v6.4s, #11
+    shl v10.4s, v10.4s, #13
+    smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
+    add v4.4s, v10.4s, v12.4s
+    sub v2.4s, v10.4s, v12.4s
+    add v12.4s, v4.4s, v14.4s
+    sub v4.4s, v4.4s, v14.4s
+    add v10.4s, v2.4s, v8.4s
+    /* The difference goes to v6 (source of ROW4R below); v12 must keep
+     * the sum computed above, which is the source of ROW0R. */
+    sub v6.4s, v2.4s, v8.4s
+    rshrn ROW7R.4h, v4.4s, #11
+    rshrn ROW3R.4h, v10.4s, #11
+    rshrn ROW0R.4h, v12.4s, #11
+    rshrn ROW4R.4h, v6.4s, #11
+    /* Transpose right 4x8 half */
+    transpose ROW6R, ROW7R,v3,.16b,.4h
+    transpose ROW2R, ROW3R,v3,.16b,.4h
+    transpose ROW0R, ROW1R,v3,.16b,.4h
+    transpose ROW4R, ROW5R,v3,.16b,.4h
+    transpose ROW1R, ROW3R,v3,.16b,.2s
+    transpose ROW4R, ROW6R,v3,.16b,.2s
+    transpose ROW0R, ROW2R,v3,.16b,.2s
+    transpose ROW5R, ROW7R,v3,.16b,.2s
+
+
+1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
+    ld1 {v2.4h}, [x15] /* reload constants */
+    smull v12.4s, ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
+    smlal v12.4s, ROW1L.4h, XFIX_1_175875602
+    smlal v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
+    smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
+    smull v14.4s, ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
+    smlal v14.4s, ROW3L.4h, XFIX_1_175875602
+    smlal v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
+    smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
+    ssubl v6.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
+    smull v4.4s, ROW2L.4h, XFIX_0_541196100
+    smlal v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */
+    mov v8.16b, v12.16b
+    smlsl v12.4s, ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
+    smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
+    shl v6.4s, v6.4s, #13
+    smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
+    add v2.4s, v6.4s, v4.4s
+    mov v10.16b, v14.16b
+    add v2.4s, v2.4s, v12.4s
+    smlsl v14.4s, ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
+    smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
+    shrn ROW1L.4h, v2.4s, #16
+    sub v2.4s, v2.4s, v12.4s
+    smlal v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
+    smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
+    sub v2.4s, v2.4s, v12.4s
+    smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
+    smlal v12.4s, ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
+    sub v6.4s, v6.4s, v4.4s
+    shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
+    add v2.4s, v6.4s, v10.4s
+    sub v6.4s, v6.4s, v10.4s
+    saddl v10.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
+    shrn ROW2L.4h, v2.4s, #16
+    shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
+    shl v10.4s, v10.4s, #13
+    smlal v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
+    add v4.4s, v10.4s, v12.4s
+    sub v2.4s, v10.4s, v12.4s
+    add v12.4s, v4.4s, v14.4s
+    sub v4.4s, v4.4s, v14.4s
+    add v10.4s, v2.4s, v8.4s
+    sub v6.4s, v2.4s, v8.4s
+    shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
+    shrn ROW3L.4h, v10.4s, #16
+    shrn ROW0L.4h, v12.4s, #16
+    shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
+    /* 1-D IDCT, pass 2, right 4x8 half */
+    ld1 {v2.4h}, [x15] /* reload constants */
+    smull v12.4s, ROW5R.4h, XFIX_1_175875602
+    smlal v12.4s, ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
+    smlal v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
+    smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
+    smull v14.4s, ROW7R.4h, XFIX_1_175875602
+    smlal v14.4s, ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
+    smlal v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
+    smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
+    ssubl v6.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
+    smull v4.4s, ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
+    smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
+    mov v8.16b, v12.16b
+    smlsl v12.4s, ROW5R.4h, XFIX_2_562915447
+    smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
+    shl v6.4s, v6.4s, #13
+    smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
+    add v2.4s, v6.4s, v4.4s
+    mov v10.16b, v14.16b
+    add v2.4s, v2.4s, v12.4s
+    smlsl v14.4s, ROW7R.4h, XFIX_0_899976223
+    smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
+    shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
+    sub v2.4s, v2.4s, v12.4s
+    smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
+    smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
+    sub v2.4s, v2.4s, v12.4s
+    smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */
+    smlal v12.4s, ROW6R.4h, XFIX_0_541196100
+    sub v6.4s, v6.4s, v4.4s
+    shrn ROW6R.4h, v2.4s, #16
+    add v2.4s, v6.4s, v10.4s
+    sub v6.4s, v6.4s, v10.4s
+    saddl v10.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
+    shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
+    shrn ROW5R.4h, v6.4s, #16
+    shl v10.4s, v10.4s, #13
+    smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
+    add v4.4s, v10.4s, v12.4s
+    sub v2.4s, v10.4s, v12.4s
+    add v12.4s, v4.4s, v14.4s
+    sub v4.4s, v4.4s, v14.4s
+    add v10.4s, v2.4s, v8.4s
+    sub v6.4s, v2.4s, v8.4s
+    shrn ROW7R.4h, v4.4s, #16
+    shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
+    shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
+    shrn ROW4R.4h, v6.4s, #16
+
+2: /* Descale to 8-bit and range limit */
+    ins v16.2d[1], v17.2d[0]
+    ins v18.2d[1], v19.2d[0]
+    ins v20.2d[1], v21.2d[0]
+    ins v22.2d[1], v23.2d[0]
+    /* NOTE(review): pass 2 leaves 16-bit lanes (shrn ... .4h), which is what
+     * the RTSM_SQSHRN_SIM_ISSUE variants narrow to 8 bits. The #else variants
+     * here and below treat the data as 32-bit lanes - confirm they are
+     * intended before ever building without RTSM_SQSHRN_SIM_ISSUE. */
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+    sqrshrn v16.8b, v16.8h, #2
+    sqrshrn2 v16.16b, v18.8h, #2
+    sqrshrn v18.8b, v20.8h, #2
+    sqrshrn2 v18.16b, v22.8h, #2
+#else
+    sqrshrn v16.4h, v16.4s, #2
+    sqrshrn2 v16.8h, v18.4s, #2
+    sqrshrn v18.4h, v20.4s, #2
+    sqrshrn2 v18.8h, v22.4s, #2
+#endif
+    /* Restore NEON registers (reverse order of the saves in the prologue) */
+    ld1 {v12.4h-v15.4h}, [sp], 32
+    ld1 {v8.4h-v11.4h}, [sp], 32
+    ins v24.2d[1], v25.2d[0]
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+    sqrshrn v20.8b, v24.8h, #2
+#else
+    sqrshrn v20.4h, v24.4s, #2
+#endif
+    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+    transpose v16,v18,v3,.16b,.8h
+    ins v26.2d[1], v27.2d[0]
+    ins v28.2d[1], v29.2d[0]
+    ins v30.2d[1], v31.2d[0]
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+    sqrshrn2 v20.16b, v26.8h, #2
+    sqrshrn v22.8b, v28.8h, #2
+#else
+    sqrshrn2 v20.8h, v26.4s, #2
+    sqrshrn v22.4h, v28.4s, #2
+#endif
+    movi v0.16b, #(CENTERJSAMPLE)
+#ifdef RTSM_SQSHRN_SIM_ISSUE
+    sqrshrn2 v22.16b, v30.8h, #2
+#else
+    sqrshrn2 v22.8h, v30.4s, #2
+#endif
+    transpose_single v16,v17,v3,.2d,.8b
+    transpose_single v18,v19,v3,.2d,.8b
+    add v16.8b, v16.8b, v0.8b
+    add v17.8b, v17.8b, v0.8b
+    add v18.8b, v18.8b, v0.8b
+    add v19.8b, v19.8b, v0.8b
+    transpose v20,v22,v3,.16b,.8h
+    /* Store results to the output buffer */
+
+    /* output_col is a 32-bit argument (JDIMENSION); the procedure call
+     * standard leaves the upper 32 bits of x3 unspecified, so clear them
+     * before the value is used in 64-bit address arithmetic. */
+    uxtw x3, w3
+    ldp TMP1, TMP2, [OUTPUT_BUF],16
+    add TMP1, TMP1, OUTPUT_COL
+    add TMP2, TMP2, OUTPUT_COL
+    st1 {v16.8b}, [TMP1]
+    transpose_single v20,v21,v3,.2d,.8b
+    st1 {v17.8b}, [TMP2]
+    ldp TMP1, TMP2, [OUTPUT_BUF],16
+    add TMP1, TMP1, OUTPUT_COL
+    add TMP2, TMP2, OUTPUT_COL
+    st1 {v18.8b}, [TMP1]
+    add v20.8b, v20.8b, v0.8b
+    add v21.8b, v21.8b, v0.8b
+    st1 {v19.8b}, [TMP2]
+    ldp TMP1, TMP2, [OUTPUT_BUF],16
+    ldp TMP3, TMP4, [OUTPUT_BUF] /* TMP3 aliases OUTPUT_BUF: this is its last use */
+    add TMP1, TMP1, OUTPUT_COL
+    add TMP2, TMP2, OUTPUT_COL
+    add TMP3, TMP3, OUTPUT_COL
+    add TMP4, TMP4, OUTPUT_COL
+    transpose_single v22, v23, v3, .2d,.8b
+    st1 {v20.8b}, [TMP1]
+    add v22.8b, v22.8b, v0.8b
+    add v23.8b, v23.8b, v0.8b
+    st1 {v21.8b}, [TMP2]
+    st1 {v22.8b}, [TMP3]
+    st1 {v23.8b}, [TMP4]
+    ret /* plain return through x30; this is not a call */
+
+3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+    /* Transpose left 4x8 half */
+    transpose ROW6L,ROW7L,v3,.16b,.4h
+    transpose ROW2L,ROW3L,v3,.16b,.4h
+    transpose ROW0L,ROW1L,v3,.16b,.4h
+    transpose ROW4L,ROW5L,v3,.16b,.4h
+    shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */
+    transpose ROW1L,ROW3L,v3,.16b,.2s
+    transpose ROW4L,ROW6L,v3,.16b,.2s
+    transpose ROW0L,ROW2L,v3,.16b,.2s
+    transpose ROW5L,ROW7L,v3,.16b,.2s
+    /* x0 = right half of row 0 here (set just before the branch to 3:) */
+    cbz x0, 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
+
+    /* Only row 0 is non-zero for the right 4x8 half */
+    dup ROW1R.4h, ROW0R.4h[1]
+    dup ROW2R.4h, ROW0R.4h[2]
+    dup ROW3R.4h, ROW0R.4h[3]
+    dup ROW4R.4h, ROW0R.4h[0]
+    dup ROW5R.4h, ROW0R.4h[1]
+    dup ROW6R.4h, ROW0R.4h[2]
+    dup ROW7R.4h, ROW0R.4h[3]
+    dup ROW0R.4h, ROW0R.4h[0] /* last: the dups above still read ROW0R */
+    b 1b /* Go to 'normal' second pass */
+
+4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+    ld1 {v2.4h}, [x15] /* reload constants */
+    smull v12.4s, ROW1L.4h, XFIX_1_175875602
+    smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
+    smull v14.4s, ROW3L.4h, XFIX_1_175875602
+    smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
+    smull v4.4s, ROW2L.4h, XFIX_0_541196100
+    sshll v6.4s, ROW0L.4h, #13
+    mov v8.16b, v12.16b
+    smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
+    smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
+    add v2.4s, v6.4s, v4.4s
+    mov v10.16b, v14.16b
+    smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
+    add v2.4s, v2.4s, v12.4s
+    add v12.4s, v12.4s, v12.4s
+    smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
+    shrn ROW1L.4h, v2.4s, #16
+    sub v2.4s, v2.4s, v12.4s
+    smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
+    sub v6.4s, v6.4s, v4.4s
+    shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
+    add v2.4s, v6.4s, v10.4s
+    sub v6.4s, v6.4s, v10.4s
+    sshll v10.4s, ROW0L.4h, #13
+    shrn ROW2L.4h, v2.4s, #16
+    shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
+    add v4.4s, v10.4s, v12.4s
+    sub v2.4s, v10.4s, v12.4s
+    add v12.4s, v4.4s, v14.4s
+    sub v4.4s, v4.4s, v14.4s
+    add v10.4s, v2.4s, v8.4s
+    sub v6.4s, v2.4s, v8.4s
+    shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
+    shrn ROW3L.4h, v10.4s, #16
+    shrn ROW0L.4h, v12.4s, #16
+    shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
+    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+    ld1 {v2.4h}, [x15] /* reload constants */
+    smull v12.4s, ROW5L.4h, XFIX_1_175875602
+    smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560
+    smull v14.4s, ROW7L.4h, XFIX_1_175875602
+    smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644
+    smull v4.4s, ROW6L.4h, XFIX_0_541196100
+    sshll v6.4s, ROW4L.4h, #13
+    mov v8.16b, v12.16b
+    smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447
+    smlsl v8.4s, ROW5L.4h, XFIX_0_899976223
+    add v2.4s, v6.4s, v4.4s
+    mov v10.16b, v14.16b
+    smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223
+    add v2.4s, v2.4s, v12.4s
+    add v12.4s, v12.4s, v12.4s
+    smlsl v10.4s, ROW7L.4h, XFIX_2_562915447
+    shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
+    sub v2.4s, v2.4s, v12.4s
+    smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865
+    sub v6.4s, v6.4s, v4.4s
+    shrn ROW6R.4h, v2.4s, #16
+    add v2.4s, v6.4s, v10.4s
+    sub v6.4s, v6.4s, v10.4s
+    sshll v10.4s, ROW4L.4h, #13
+    shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
+    shrn ROW5R.4h, v6.4s, #16
+    add v4.4s, v10.4s, v12.4s
+    sub v2.4s, v10.4s, v12.4s
+    add v12.4s, v4.4s, v14.4s
+    sub v4.4s, v4.4s, v14.4s
+    add v10.4s, v2.4s, v8.4s
+    sub v6.4s, v2.4s, v8.4s
+    shrn ROW7R.4h, v4.4s, #16
+    shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
+    shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
+    shrn ROW4R.4h, v6.4s, #16
+    b 2b /* Go to epilogue */
+
+
+
+
+    .unreq DCT_TABLE
+    .unreq COEF_BLOCK
+    .unreq OUTPUT_BUF
+    .unreq OUTPUT_COL
+    .unreq TMP1
+    .unreq TMP2
+    .unreq TMP3
+    .unreq TMP4
+
+    .unreq ROW0L
+    .unreq ROW0R
+    .unreq ROW1L
+    .unreq ROW1R
+    .unreq ROW2L
+    .unreq ROW2R
+    .unreq ROW3L
+    .unreq ROW3R
+    .unreq ROW4L
+    .unreq ROW4R
+    .unreq ROW5L
+    .unreq ROW5R
+    .unreq ROW6L
+    .unreq ROW6R
+    .unreq ROW7L
+    .unreq ROW7R
+.endfunc