diff options
-rw-r--r-- | simd/jsimd_arm_neon_64.S | 771 |
1 files changed, 771 insertions, 0 deletions
diff --git a/simd/jsimd_arm_neon_64.S b/simd/jsimd_arm_neon_64.S new file mode 100644 index 0000000..1797318 --- /dev/null +++ b/simd/jsimd_arm_neon_64.S @@ -0,0 +1,771 @@ + /* + * ARMv8 NEON optimizations for libjpeg-turbo + * This file is a copy of the armv7 neon version but ported to armv8. + * + * Copyright (C) 2009-2011 Nokia Corporation and/or it's subsidiary(-ies). + * All rights reserved. + * Author Siarhei Siamashka <siarhei.siamashka@nokia.com> + * Copyright (C) 2013, Linaro Limited + * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ +#endif + +.text + +.arch armv8-a+fp+simd +#define RESPECT_STRICT_ALIGNMENT 1 + +#define RTSM_SQSHRN_SIM_ISSUE +/*****************************************************************************/ + +/* Supplementary macro for setting function attributes */ +.macro asm_function fname +#ifdef __APPLE__ + .func _\fname + .globl _\fname +_\fname: +#else + .func \fname + .global \fname +#ifdef __ELF__ + .hidden \fname + .type \fname, %function +#endif +\fname: +#endif +.endm + +/* Transpose elements of single 128 bit registers */ +.macro transpose_single x0,x1,xi,xilen,literal + ins \xi\xilen[0], \x0\xilen[0] + ins \x1\xilen[0], \x0\xilen[1] + trn1 \x0\literal, \x0\literal, \x1\literal + trn2 \x1\literal, \xi\literal, \x1\literal +.endm + + +/* Transpose elements of 2 differnet registers */ +.macro transpose x0,x1,xi,xilen,literal + mov \xi\xilen, \x0\xilen + trn1 \x0\literal, \x0\literal, \x1\literal + trn2 \x1\literal, \xi\literal, \x1\literal +.endm +/* Transpose a block of 4x4 coefficients in four 64-bit registers */ + +.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen + mov \xi\xilen, \x0\xilen + trn1 \x0\x0len, \x0\x0len, \x2\x2len + trn2 \x2\x2len, \xi\x0len, \x2\x2len + mov \xi\xilen, \x1\xilen + trn1 \x1\x1len, \x1\x1len, \x3\x3len + trn2 \x3\x3len, \xi\x1len, \x3\x3len +.endm + +.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen + mov \xi\xilen, \x0\xilen + trn1 \x0\x0len, \x0\x0len, \x1\x1len + trn2 \x1\x2len, \xi\x0len, \x1\x2len + mov \xi\xilen, \x2\xilen + trn1 \x2\x2len, \x2\x2len, \x3\x3len + trn2 \x3\x2len, \xi\x1len, \x3\x3len +.endm + +.macro transpose_4x4 x0, x1, x2, x3,x5 + transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b + transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b +.endm + + + + +#define CENTERJSAMPLE 128 + 
+/*****************************************************************************/ + +/* + * Perform dequantization and inverse DCT on one block of coefficients. + * + * GLOBAL(void) + * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, + * JSAMPARRAY output_buf, JDIMENSION output_col) + */ + +#define FIX_0_298631336 (2446) +#define FIX_0_390180644 (3196) +#define FIX_0_541196100 (4433) +#define FIX_0_765366865 (6270) +#define FIX_0_899976223 (7373) +#define FIX_1_175875602 (9633) +#define FIX_1_501321110 (12299) +#define FIX_1_847759065 (15137) +#define FIX_1_961570560 (16069) +#define FIX_2_053119869 (16819) +#define FIX_2_562915447 (20995) +#define FIX_3_072711026 (25172) + +#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) +#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) +#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) +#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) +#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) +#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) +#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) +#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) + +/* + * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. 
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm' + */ +#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ +{ \ + DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ + INT32 q1, q2, q3, q4, q5, q6, q7; \ + INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \ + \ + /* 1-D iDCT input data */ \ + row0 = xrow0; \ + row1 = xrow1; \ + row2 = xrow2; \ + row3 = xrow3; \ + row4 = xrow4; \ + row5 = xrow5; \ + row6 = xrow6; \ + row7 = xrow7; \ + \ + q5 = row7 + row3; \ + q4 = row5 + row1; \ + q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ + MULTIPLY(q4, FIX_1_175875602); \ + q7 = MULTIPLY(q5, FIX_1_175875602) + \ + MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ + q2 = MULTIPLY(row2, FIX_0_541196100) + \ + MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ + q4 = q6; \ + q3 = ((INT32) row0 - (INT32) row4) << 13; \ + q6 += MULTIPLY(row5, -FIX_2_562915447) + \ + MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ + /* now we can use q1 (reloadable constants have been used up) */ \ + q1 = q3 + q2; \ + q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ + MULTIPLY(row1, -FIX_0_899976223); \ + q5 = q7; \ + q1 = q1 + q6; \ + q7 += MULTIPLY(row7, -FIX_0_899976223) + \ + MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ + \ + /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ + tmp11_plus_tmp2 = q1; \ + row1 = 0; \ + \ + q1 = q1 - q6; \ + q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ + MULTIPLY(row3, -FIX_2_562915447); \ + q1 = q1 - q6; \ + q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ + MULTIPLY(row6, FIX_0_541196100); \ + q3 = q3 - q2; \ + \ + /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ + tmp11_minus_tmp2 = q1; \ + \ + q1 = ((INT32) row0 + (INT32) row4) << 13; \ + q2 = q1 + q6; \ + q1 = q1 - q6; \ + \ + /* pick up the results */ \ + tmp0 = q4; \ + tmp1 = q5; \ + tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ + tmp3 = q7; \ + tmp10 = q2; \ + tmp11 
= (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ + tmp12 = q3; \ + tmp13 = q1; \ +} + +#define XFIX_0_899976223 v0.4h[0] +#define XFIX_0_541196100 v0.4h[1] +#define XFIX_2_562915447 v0.4h[2] +#define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3] +#define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0] +#define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1] +#define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2] +#define XFIX_1_175875602 v1.4h[3] +#define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0] +#define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1] +#define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2] +#define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3] + +.balign 16 +jsimd_idct_islow_neon_consts: + .short FIX_0_899976223 /* d0[0] */ + .short FIX_0_541196100 /* d0[1] */ + .short FIX_2_562915447 /* d0[2] */ + .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ + .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ + .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ + .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ + .short FIX_1_175875602 /* d1[3] */ + /* reloadable constants */ + .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ + .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ + .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ + .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ + +/****************************************************************************** +* +* jsimd_idct_islow_neon +* +*****************************************************************************/ + +asm_function jsimd_idct_islow_neon + DCT_TABLE .req x0 + COEF_BLOCK .req x1 + OUTPUT_BUF .req x2 + OUTPUT_COL .req x3 + TMP1 .req x0 + TMP2 .req x1 + TMP3 .req x2 + TMP4 .req x15 + + ROW0L .req v16 + ROW0R .req v17 + ROW1L .req v18 + ROW1R .req v19 + ROW2L .req v20 + ROW2R .req v21 + ROW3L .req v22 + ROW3R .req v23 + ROW4L .req v24 + ROW4R .req v25 + ROW5L .req v26 + ROW5R .req v27 + ROW6L .req v28 + ROW6R .req v29 + ROW7L .req v30 + ROW7R .req v31 + + adr x15, jsimd_idct_islow_neon_consts 
+ ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 + ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32 + mul v16.4h,v16.4h,v0.4h + mul v17.4h,v17.4h,v1.4h + ins v16.2d[1],v17.2d[0] /* 128 bit q8 */ + ld1 {v4.4h,v5.4h,v6.4h,v7.4h}, [DCT_TABLE], 32 + mul v18.4h,v18.4h,v2.4h + mul v19.4h,v19.4h,v3.4h + ins v18.2d[1],v19.2d[0] /* 128 bit q9 */ + ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32 + mul v20.4h,v20.4h,v4.4h + mul v21.4h,v21.4h,v5.4h + ins v20.2d[1],v21.2d[0] /* 128 bit q10 */ + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 + mul v22.4h,v22.4h,v6.4h + mul v23.4h,v23.4h,v7.4h + ins v22.2d[1],v23.2d[0] /* 128 bit q11 */ + ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK], 32 + mul v24.4h, v24.4h, v0.4h + mul v25.4h, v25.4h, v1.4h + ins v24.2d[1],v25.2d[0] /* 128 bit q12 */ + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 + mul v28.4h, v28.4h, v4.4h + mul v29.4h, v29.4h, v5.4h + ins v28.2d[1],v29.2d[0] /* 128 bit q14 */ + mul v26.4h, v26.4h, v2.4h + mul v27.4h, v27.4h, v3.4h + ins v26.2d[1],v27.2d[0] /* 128 bit q13 */ + ld1 {v0.4h,v1.4h, v2.4h,v3.4h}, [x15] /* load constants */ + add x15, x15, #16 + mul v30.4h, v30.4h, v6.4h + mul v31.4h, v31.4h, v7.4h + ins v30.2d[1],v31.2d[0] /* 128 bit q15 */ + sub sp, sp, #32 + st1 {v8.4h-v11.4h}, [sp]/* save NEON registers */ + sub sp, sp, #32 + st1 {v12.4h-v15.4h}, [sp] + /* 1-D IDCT, pass 1, left 4x8 half */ + add v4.4h, ROW7L.4h, ROW3L.4h + add v5.4h, ROW5L.4h, ROW1L.4h + smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560 + smlal v12.4s, v5.4h, XFIX_1_175875602 + smull v14.4s, v4.4h, XFIX_1_175875602 + /* check for the zero coeffecients in the right 4*8 half */ + /*push {x4, x5}*/ /*--------> need to be fixed */ + stp x4,x5,[sp,-16]! 
+ mov x5, #0 + smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644 + ssubl v6.4s, ROW0L.4h, ROW4L.4h + ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] + smull v4.4s, ROW2L.4h, XFIX_0_541196100 + smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065 + orr x0, x4, x5 + mov v8.16b, v12.16b + smlsl v12.4s, ROW5L.4h, XFIX_2_562915447 + ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 + shl v6.4s, v6.4s, #13 + orr x0, x0, x4 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 + orr x0, x0 , x5 + add v2.4s, v6.4s, v4.4s + ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] + mov v10.16b, v14.16b + add v2.4s, v2.4s, v12.4s + orr x0, x0, x4 + smlsl v14.4s, ROW7L.4h, XFIX_0_899976223 + orr x0, x0, x5 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 + rshrn ROW1L.4h, v2.4s, #11 + ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] + sub v2.4s, v2.4s, v12.4s + smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447 + orr x0, x0, x4 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 + orr x0, x0, x5 + sub v2.4s, v2.4s, v12.4s + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 + ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] + smlal v12.4s, ROW6L.4h, XFIX_0_541196100 + sub v6.4s, v6.4s, v4.4s + orr x0, x0, x4 + rshrn ROW6L.4h, v2.4s, #11 + orr x0, x0, x5 + add v2.4s, v6.4s, v10.4s + ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] + sub v6.4s, v6.4s, v10.4s + saddl v10.4s, ROW0L.4h, ROW4L.4h + orr x0, x0, x4 + rshrn ROW2L.4h, v2.4s, #11 + orr x0, x0, x5 + rshrn ROW5L.4h, v6.4s, #11 + ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] + shl v10.4s, v10.4s, #13 + smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223 + orr x0, x0, x4 + add v4.4s, v10.4s, v12.4s + orr x0, x0, x5 + sub v2.4s, v10.4s, v12.4s + add v12.4s, v4.4s, v14.4s + ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] + sub v4.4s, v4.4s, v14.4s + add v10.4s, v2.4s, v8.4s + orr x0, x4, x5 + sub v6.4s, v2.4s, v8.4s + /*pop {x4, x5} */ + ldp x4, x5, [sp], 16 + 
rshrn ROW7L.4h, v4.4s, #11 + rshrn ROW3L.4h, v10.4s, #11 + rshrn ROW0L.4h, v12.4s, #11 + rshrn ROW4L.4h, v6.4s, #11 + cmp x0, #0 /* orrs instruction removed; x0 holds the OR of all right-half coefficients gathered above */ + beq 3f /* Go to do some special handling for the sparse right 4x8 half */ + + + /* 1-D IDCT, pass 1, right 4x8 half */ + ld1 {v2.4h}, [x15] /* reload constants */ + add v10.4h, ROW7R.4h, ROW3R.4h + add v8.4h, ROW5R.4h, ROW1R.4h + /* Transpose ROW6L <-> ROW7L (v3 available free register) */ + transpose ROW6L,ROW7L,v3,.16b,.4h + smull v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560 + smlal v12.4s, v8.4h, XFIX_1_175875602 + /* Transpose ROW2L <-> ROW3L (v3 available free register) */ + transpose ROW2L,ROW3L,v3,.16b,.4h + smull v14.4s, v10.4h, XFIX_1_175875602 + smlal v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644 + /* Transpose ROW0L <-> ROW1L (v3 available free register) */ + transpose ROW0L,ROW1L,v3,.16b,.4h + ssubl v6.4s, ROW0R.4h, ROW4R.4h + smull v4.4s, ROW2R.4h, XFIX_0_541196100 + smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 + /* Transpose ROW4L <-> ROW5L (v3 available free register) */ + transpose ROW4L,ROW5L,v3,.16b,.4h + mov v8.16b, v12.16b + smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 + smlal v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447 + /* Transpose ROW1L <-> ROW3L (v3 available free register) */ + transpose ROW1L,ROW3L,v3,.16b,.2s + shl v6.4s, v6.4s, #13 + smlsl v8.4s, ROW1R.4h, XFIX_0_899976223 + /* Transpose ROW4L <-> ROW6L (v3 available free register) */ + transpose ROW4L,ROW6L,v3,.16b,.2s + add v2.4s, v6.4s, v4.4s + mov v10.16b, v14.16b + add v2.4s, v2.4s, v12.4s + /* Transpose ROW0L <-> ROW2L (v3 available free register) */ + transpose ROW0L,ROW2L,v3,.16b,.2s + smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 + smlal v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223 + rshrn ROW1R.4h, v2.4s, #11 + /* Transpose ROW5L <-> ROW7L (v3 available free register) */ + transpose ROW5L,ROW7L,v3,.16b,.2s + sub v2.4s, v2.4s, v12.4s + smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 + smlsl
v10.4s, ROW3R.4h, XFIX_2_562915447 + sub v2.4s, v2.4s, v12.4s + smull v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865 + smlal v12.4s, ROW6R.4h, XFIX_0_541196100 + sub v6.4s, v6.4s, v4.4s + rshrn ROW6R.4h, v2.4s, #11 + add v2.4s, v6.4s, v10.4s + sub v6.4s, v6.4s, v10.4s + saddl v10.4s, ROW0R.4h, ROW4R.4h + rshrn ROW2R.4h, v2.4s, #11 + rshrn ROW5R.4h, v6.4s, #11 + shl v10.4s, v10.4s, #13 + smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 + add v4.4s, v10.4s, v12.4s + sub v2.4s, v10.4s, v12.4s + add v12.4s, v4.4s, v14.4s + sub v4.4s, v4.4s, v14.4s + add v10.4s, v2.4s, v8.4s + sub v6.4s, v2.4s, v8.4s /* bugfix: was 'sub v12.4s, v2.4s, v8.4s', which clobbered v12 (still needed as the ROW0R source below) and left v6 stale for ROW4R; now matches the left-half sequence */ + rshrn ROW7R.4h, v4.4s, #11 + rshrn ROW3R.4h, v10.4s, #11 + rshrn ROW0R.4h, v12.4s, #11 + rshrn ROW4R.4h, v6.4s, #11 +/* Transpose right 4x8 half */ + transpose ROW6R, ROW7R,v3,.16b,.4h + transpose ROW2R, ROW3R,v3,.16b,.4h + transpose ROW0R, ROW1R,v3,.16b,.4h + transpose ROW4R, ROW5R,v3,.16b,.4h + transpose ROW1R, ROW3R,v3,.16b,.2s + transpose ROW4R, ROW6R,v3,.16b,.2s + transpose ROW0R, ROW2R,v3,.16b,.2s + transpose ROW5R, ROW7R,v3,.16b,.2s + + +1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ + ld1 {v2.4h}, [x15] /* reload constants */ + smull v12.4s, ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */ + smlal v12.4s, ROW1L.4h, XFIX_1_175875602 + smlal v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */ + smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 + smull v14.4s, ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */ + smlal v14.4s, ROW3L.4h, XFIX_1_175875602 + smlal v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */ + smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 + ssubl v6.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ + smull v4.4s, ROW2L.4h, XFIX_0_541196100 + smlal v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */ + mov v8.16b, v12.16b + smlsl v12.4s, ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */ + smlal v12.4s,
ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 + shl v6.4s, v6.4s, #13 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 + add v2.4s, v6.4s, v4.4s + mov v10.16b, v14.16b + add v2.4s, v2.4s, v12.4s + smlsl v14.4s, ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */ + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 + shrn ROW1L.4h, v2.4s, #16 + sub v2.4s, v2.4s, v12.4s + smlal v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */ + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 + sub v2.4s, v2.4s, v12.4s + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 + smlal v12.4s, ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */ + sub v6.4s, v6.4s, v4.4s + shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ + add v2.4s, v6.4s, v10.4s + sub v6.4s, v6.4s, v10.4s + saddl v10.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ + shrn ROW2L.4h, v2.4s, #16 + shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ + shl v10.4s, v10.4s, #13 + smlal v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */ + add v4.4s, v10.4s, v12.4s + sub v2.4s, v10.4s, v12.4s + add v12.4s, v4.4s, v14.4s + sub v4.4s, v4.4s, v14.4s + add v10.4s, v2.4s, v8.4s + sub v6.4s, v2.4s, v8.4s + shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ + shrn ROW3L.4h, v10.4s, #16 + shrn ROW0L.4h, v12.4s, #16 + shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ + /* 1-D IDCT,pass 2, right 4x8 half */ + ld1 {v2.4h}, [x15] /* reload constants */ + smull v12.4s, ROW5R.4h, XFIX_1_175875602 + smlal v12.4s, ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */ + smlal v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560 + smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */ + smull v14.4s, ROW7R.4h, XFIX_1_175875602 + smlal v14.4s, ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */ + smlal v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644 + smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> 
ROW1R.4h */ + ssubl v6.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */ + smull v4.4s, ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */ + smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 + mov v8.16b, v12.16b + smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 + smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */ + shl v6.4s, v6.4s, #13 + smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */ + add v2.4s, v6.4s, v4.4s + mov v10.16b, v14.16b + add v2.4s, v2.4s, v12.4s + smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 + smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */ + shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ + sub v2.4s, v2.4s, v12.4s + smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 + smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */ + sub v2.4s, v2.4s, v12.4s + smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */ + smlal v12.4s, ROW6R.4h, XFIX_0_541196100 + sub v6.4s, v6.4s, v4.4s + shrn ROW6R.4h, v2.4s, #16 + add v2.4s, v6.4s, v10.4s + sub v6.4s, v6.4s, v10.4s + saddl v10.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */ + shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ + shrn ROW5R.4h, v6.4s, #16 + shl v10.4s, v10.4s, #13 + smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 + add v4.4s, v10.4s, v12.4s + sub v2.4s, v10.4s, v12.4s + add v12.4s, v4.4s, v14.4s + sub v4.4s, v4.4s, v14.4s + add v10.4s, v2.4s, v8.4s + sub v6.4s, v2.4s, v8.4s + shrn ROW7R.4h, v4.4s, #16 + shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ + shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ + shrn ROW4R.4h, v6.4s, #16 + +2: /* Descale to 8-bit and range limit */ + ins v16.2d[1], v17.2d[0] + ins v18.2d[1], v19.2d[0] + ins v20.2d[1], v21.2d[0] + ins v22.2d[1], v23.2d[0] +#ifdef RTSM_SQSHRN_SIM_ISSUE + sqrshrn v16.8b, v16.8h, #2 + sqrshrn2 v16.16b, v18.8h, #2 + sqrshrn v18.8b, v20.8h, #2 + sqrshrn2 v18.16b, 
v22.8h, #2 +#else + sqrshrn v16.4h, v16.4s, #2 + sqrshrn2 v16.8h, v18.4s, #2 + sqrshrn v18.4h, v20.4s, #2 + sqrshrn2 v18.8h, v22.4s, #2 +#endif + /* restore NEON callee-saved registers (low 64 bits of v8-v15, saved in the prologue in the same order) */ + + ld1 {v12.4h-v15.4h}, [sp], 32 + ld1 {v8.4h-v11.4h}, [sp], 32 + ins v24.2d[1], v25.2d[0] +#ifdef RTSM_SQSHRN_SIM_ISSUE + + sqrshrn v20.8b, v24.8h, #2 +#else + + sqrshrn v20.4h, v24.4s, #2 +#endif + /* Transpose the final 8-bit samples and do signed->unsigned conversion */ + /*trn1 v16.8h, v16.8h, v18.8h*/ + transpose v16,v18,v3,.16b,.8h + ins v26.2d[1], v27.2d[0] + ins v28.2d[1], v29.2d[0] + ins v30.2d[1], v31.2d[0] +#ifdef RTSM_SQSHRN_SIM_ISSUE + sqrshrn2 v20.16b, v26.8h, #2 + sqrshrn v22.8b, v28.8h, #2 +#else + sqrshrn2 v20.8h, v26.4s, #2 + sqrshrn v22.4h, v28.4s, #2 +#endif + movi v0.16b, #(CENTERJSAMPLE) +#ifdef RTSM_SQSHRN_SIM_ISSUE + sqrshrn2 v22.16b, v30.8h, #2 +#else + sqrshrn2 v22.8h, v30.4s, #2 +#endif + transpose_single v16,v17,v3,.2d,.8b + transpose_single v18,v19,v3,.2d,.8b + add v16.8b, v16.8b, v0.8b + add v17.8b, v17.8b, v0.8b + add v18.8b, v18.8b, v0.8b + add v19.8b, v19.8b, v0.8b + transpose v20,v22,v3,.16b,.8h + /* Store results to the output buffer */ + + ldp TMP1, TMP2, [OUTPUT_BUF],16 + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + st1 {v16.8b}, [TMP1] + transpose_single v20,v21,v3,.2d,.8b + st1 {v17.8b}, [TMP2] + ldp TMP1, TMP2, [OUTPUT_BUF],16 + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + st1 {v18.8b}, [TMP1] + add v20.8b, v20.8b, v0.8b + add v21.8b, v21.8b, v0.8b + st1 {v19.8b}, [TMP2] + ldp TMP1, TMP2, [OUTPUT_BUF],16 + ldp TMP3, TMP4, [OUTPUT_BUF] + add TMP1, TMP1, OUTPUT_COL + add TMP2, TMP2, OUTPUT_COL + add TMP3, TMP3, OUTPUT_COL + add TMP4, TMP4, OUTPUT_COL + transpose_single v22, v23, v3, .2d,.8b + st1 {v20.8b}, [TMP1] + add v22.8b, v22.8b, v0.8b + add v23.8b, v23.8b, v0.8b + st1 {v21.8b}, [TMP2] + st1 {v22.8b}, [TMP3] + st1 {v23.8b}, [TMP4] + ret /* bugfix: was 'blr x30' — blr reaches the caller but needlessly writes x30 and desyncs the return-address predictor; 'ret' is the architectural return */ + +3: /* Left 4x8 half is done, right 4x8 half contains
mostly zeros */ + + /* Transpose left 4x8 half */ + transpose ROW6L,ROW7L,v3,.16b,.4h + transpose ROW2L,ROW3L,v3,.16b,.4h + transpose ROW0L,ROW1L,v3,.16b,.4h + transpose ROW4L,ROW5L,v3,.16b,.4h + shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */ + transpose ROW1L,ROW3L,v3,.16b,.2s + transpose ROW4L,ROW6L,v3,.16b,.2s + transpose ROW0L,ROW2L,v3,.16b,.2s + transpose ROW5L,ROW7L,v3,.16b,.2s + cmp x0, #0 + beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ + + /* Only row 0 is non-zero for the right 4x8 half */ + dup ROW1R.4h, ROW0R.4h[1] + dup ROW2R.4h, ROW0R.4h[2] + dup ROW3R.4h, ROW0R.4h[3] + dup ROW4R.4h, ROW0R.4h[0] + dup ROW5R.4h, ROW0R.4h[1] + dup ROW6R.4h, ROW0R.4h[2] + dup ROW7R.4h, ROW0R.4h[3] + dup ROW0R.4h, ROW0R.4h[0] + b 1b /* Go to 'normal' second pass */ + +4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ + ld1 {v2.4h}, [x15] /* reload constants */ + smull v12.4s, ROW1L.4h, XFIX_1_175875602 + smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 + smull v14.4s, ROW3L.4h, XFIX_1_175875602 + smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 + smull v4.4s, ROW2L.4h, XFIX_0_541196100 + sshll v6.4s, ROW0L.4h, #13 + mov v8.16b, v12.16b + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 + add v2.4s, v6.4s, v4.4s + mov v10.16b, v14.16b + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 + add v2.4s, v2.4s, v12.4s + add v12.4s, v12.4s, v12.4s + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 + shrn ROW1L.4h, v2.4s, #16 + sub v2.4s, v2.4s, v12.4s + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 + sub v6.4s, v6.4s, v4.4s + shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ + add v2.4s, v6.4s, v10.4s + sub v6.4s, v6.4s, v10.4s + sshll v10.4s, ROW0L.4h, #13 + shrn ROW2L.4h, v2.4s, #16 + shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ + add v4.4s, v10.4s, v12.4s + sub v2.4s, v10.4s, v12.4s + add v12.4s, v4.4s, v14.4s + sub v4.4s, v4.4s, 
v14.4s + add v10.4s, v2.4s, v8.4s + sub v6.4s, v2.4s, v8.4s + shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ + shrn ROW3L.4h, v10.4s, #16 + shrn ROW0L.4h, v12.4s, #16 + shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ + /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ + ld1 {v2.4h}, [x15] /* reload constants */ + smull v12.4s, ROW5L.4h, XFIX_1_175875602 + smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 + smull v14.4s, ROW7L.4h, XFIX_1_175875602 + smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 + smull v4.4s, ROW6L.4h, XFIX_0_541196100 + sshll v6.4s, ROW4L.4h, #13 + mov v8.16b, v12.16b + smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 + smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 + add v2.4s, v6.4s, v4.4s + mov v10.16b, v14.16b + smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 + add v2.4s, v2.4s, v12.4s + add v12.4s, v12.4s, v12.4s + smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 + shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ + sub v2.4s, v2.4s, v12.4s + smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 + sub v6.4s, v6.4s, v4.4s + shrn ROW6R.4h, v2.4s, #16 + add v2.4s, v6.4s, v10.4s + sub v6.4s, v6.4s, v10.4s + sshll v10.4s, ROW4L.4h, #13 + shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ + shrn ROW5R.4h, v6.4s, #16 + add v4.4s, v10.4s, v12.4s + sub v2.4s, v10.4s, v12.4s + add v12.4s, v4.4s, v14.4s + sub v4.4s, v4.4s, v14.4s + add v10.4s, v2.4s, v8.4s + sub v6.4s, v2.4s, v8.4s + shrn ROW7R.4h, v4.4s, #16 + shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ + shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ + shrn ROW4R.4h, v6.4s, #16 + b 2b /* Go to epilogue */ + + + + + .unreq DCT_TABLE + .unreq COEF_BLOCK + .unreq OUTPUT_BUF + .unreq OUTPUT_COL + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMP4 + + .unreq ROW0L + .unreq ROW0R + .unreq ROW1L + .unreq ROW1R + .unreq ROW2L + .unreq ROW2R + .unreq ROW3L + .unreq ROW3R + .unreq ROW4L + .unreq ROW4R + .unreq 
ROW5L + .unreq ROW5R + .unreq ROW6L + .unreq ROW6R + .unreq ROW7L + .unreq ROW7R +.endfunc |