aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--simd/jsimd_arm_neon_64.S317
1 files changed, 317 insertions, 0 deletions
diff --git a/simd/jsimd_arm_neon_64.S b/simd/jsimd_arm_neon_64.S
index 1797318..f7fec55 100644
--- a/simd/jsimd_arm_neon_64.S
+++ b/simd/jsimd_arm_neon_64.S
@@ -769,3 +769,320 @@ asm_function jsimd_idct_islow_neon
.unreq ROW7L
.unreq ROW7R
.endfunc
+
+
+/******************************************************************************
+*
+* jsimd_idct_ifast_neon
+*
+******************************************************************************/
+
+
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * This function contains a fast, not so accurate integer implementation of
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
+ * function from jidctfst.c
+ *
+ * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
+ * In the ARM NEON case some extra additions are required, because the
+ * saturating doubling multiply-high instruction (SQDMULH on AArch64, VQDMULH
+ * on 32-bit ARM) cannot handle constants larger than 1. Expressions such as
+ * "x * 1.082392200" therefore have to be converted to "x * 0.082392200 + x",
+ * which introduces an extra addition. Overall, there are 6 extra additions
+ * per 1-D IDCT pass, for a total of 5 SQDMULH and 35 ADD/SUB instructions.
+ */
+
+/*
+ * Multiplier lanes used by jsimd_idct_ifast_neon.  The four constants are
+ * loaded into the low half of v0 and used as the by-element operand of
+ * SQDMULH.  The lanes are written in the architectural "Vn.h[i]" form, which
+ * is accepted by every AArch64 assembler (the older "Vn.4h[i]" spelling is a
+ * GNU as extension).
+ */
+#define XFIX_1_082392200 v0.h[0]
+#define XFIX_1_414213562 v0.h[1]
+#define XFIX_1_847759065 v0.h[2]
+#define XFIX_2_613125930 v0.h[3]
+
+/*
+ * SQDMULH returns the high half of the doubled product, i.e. it multiplies
+ * by (constant / 32768), so only factors below 1 can be represented.  Each
+ * entry therefore holds just the fractional part of its factor:
+ * (F * 128 - 256 * 128) is (F / 256 - 1) scaled by 2^15, and the last entry
+ * subtracts 2 instead of 1.  The code adds x (or 2 * x) back separately.
+ */
+.balign 16
+jsimd_idct_ifast_neon_consts:
+    .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 - 1 */
+    .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 - 1 */
+    .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 - 1 */
+    .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 - 2 */
+
+/*
+ * Local helper macros for jsimd_idct_ifast_neon.  All of them work on fixed
+ * registers of that function and use v18 / x10 as scratch.
+ */
+
+/* Interleave-transpose registers \ra and \rb with element arrangement \arr
+ * (8h, 4s or 8b): \ra receives TRN1(\ra, \rb), \rb receives TRN2(\ra, \rb).
+ * Clobbers v18. */
+.macro ifast_trn ra, rb, arr
+    mov             v18.16b, \ra\().16b
+    trn1            \ra\().\arr, \ra\().\arr, \rb\().\arr
+    trn2            \rb\().\arr, v18.\arr, \rb\().\arr
+.endm
+
+/* Exchange the low 64 bits of \ra with the high 64 bits of \rb.
+ * Clobbers x10. */
+.macro ifast_swap_halves ra, rb
+    umov            x10, \ra\().d[0]
+    ins             \ra\().d[0], \rb\().d[1]
+    ins             \rb\().d[1], x10
+.endm
+
+/* One 1-D IDCT pass over the eight rows held in v8-v15 (16-bit lanes).
+ * Reads the multipliers from v0 (XFIX_* lanes), leaves the results in
+ * v8-v15 and clobbers v1-v6.  5 SQDMULH + 35 ADD/SUB. */
+.macro idct_ifast_1d_pass
+    sub             v2.8h,  v10.8h, v14.8h
+    add             v14.8h, v10.8h, v14.8h
+    sub             v1.8h,  v11.8h, v13.8h
+    add             v13.8h, v11.8h, v13.8h
+    sub             v5.8h,  v9.8h,  v15.8h
+    add             v15.8h, v9.8h,  v15.8h
+    sqdmulh         v4.8h,  v2.8h,  XFIX_1_414213562
+    sqdmulh         v6.8h,  v1.8h,  XFIX_2_613125930
+    add             v3.8h,  v1.8h,  v1.8h       /* 2 * x part of x * 2.613 */
+    sub             v1.8h,  v5.8h,  v1.8h
+    add             v10.8h, v2.8h,  v4.8h       /* x + x * 0.414 */
+    sqdmulh         v4.8h,  v1.8h,  XFIX_1_847759065
+    sub             v2.8h,  v15.8h, v13.8h
+    add             v3.8h,  v3.8h,  v6.8h
+    sqdmulh         v6.8h,  v2.8h,  XFIX_1_414213562
+    add             v1.8h,  v1.8h,  v4.8h
+    sqdmulh         v4.8h,  v5.8h,  XFIX_1_082392200
+    sub             v10.8h, v10.8h, v14.8h
+    add             v2.8h,  v2.8h,  v6.8h
+    sub             v6.8h,  v8.8h,  v12.8h
+    add             v12.8h, v8.8h,  v12.8h
+    add             v9.8h,  v5.8h,  v4.8h
+    add             v5.8h,  v6.8h,  v10.8h
+    sub             v10.8h, v6.8h,  v10.8h
+    add             v6.8h,  v15.8h, v13.8h
+    add             v8.8h,  v12.8h, v14.8h
+    sub             v3.8h,  v6.8h,  v3.8h
+    sub             v12.8h, v12.8h, v14.8h
+    sub             v3.8h,  v3.8h,  v1.8h
+    sub             v1.8h,  v9.8h,  v1.8h
+    add             v2.8h,  v3.8h,  v2.8h
+    sub             v15.8h, v8.8h,  v6.8h
+    add             v1.8h,  v1.8h,  v2.8h
+    add             v8.8h,  v8.8h,  v6.8h
+    add             v14.8h, v5.8h,  v3.8h
+    sub             v9.8h,  v5.8h,  v3.8h
+    sub             v13.8h, v10.8h, v2.8h
+    add             v10.8h, v10.8h, v2.8h
+    sub             v11.8h, v12.8h, v1.8h
+    add             v12.8h, v12.8h, v1.8h
+.endm
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * In:   x0 = DCT_TABLE   pointer to 64 16-bit multipliers
+ *       x1 = COEF_BLOCK  pointer to 64 16-bit coefficients
+ *       x2 = OUTPUT_BUF  pointer to an array of 8 row pointers
+ *       x3 = OUTPUT_COL  byte offset added to every row pointer
+ * Out:  8 bytes are stored at OUTPUT_BUF[i] + OUTPUT_COL for i = 0..7.
+ * Clobbers: x0-x3, x10, x15, v0-v6, v16-v23 (all caller-saved in AAPCS64).
+ * Stack: 64 bytes, used to preserve d8-d15, which AAPCS64 requires a callee
+ *        to keep intact.
+ *
+ * NOTE(review): OUTPUT_COL is used as a full 64-bit value.  If the C
+ * prototype passes it as a 32-bit integer, the upper half of x3 should be
+ * cleared on entry - confirm against the caller.
+ */
+asm_function jsimd_idct_ifast_neon
+
+    DCT_TABLE       .req x0
+    COEF_BLOCK      .req x1
+    OUTPUT_BUF      .req x2
+    OUTPUT_COL      .req x3
+    TMP1            .req x0
+    TMP2            .req x1
+    TMP3            .req x2
+    TMP4            .req x15
+
+    /* v8-v15 hold the working rows, so their callee-saved low halves
+     * (d8-d15) have to be preserved across this function. */
+    stp             d8,  d9,  [sp, #-64]!
+    stp             d10, d11, [sp, #16]
+    stp             d12, d13, [sp, #32]
+    stp             d14, d15, [sp, #48]
+
+    /* Load and dequantize coefficients into NEON registers
+     * with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 |      v8.8h
+     *   1 |      v9.8h
+     *   2 |      v10.8h
+     *   3 |      v11.8h
+     *   4 |      v12.8h
+     *   5 |      v13.8h
+     *   6 |      v14.8h
+     *   7 |      v15.8h
+     * v0-v3 are reused as staging registers for the DCT table.
+     */
+    adr             TMP4, jsimd_idct_ifast_neon_consts
+    ld1             {v8.8h, v9.8h}, [COEF_BLOCK], #32
+    ld1             {v0.8h, v1.8h}, [DCT_TABLE], #32
+    ld1             {v10.8h, v11.8h}, [COEF_BLOCK], #32
+    mul             v8.8h,  v8.8h,  v0.8h
+    ld1             {v2.8h, v3.8h}, [DCT_TABLE], #32
+    mul             v9.8h,  v9.8h,  v1.8h
+    ld1             {v12.8h, v13.8h}, [COEF_BLOCK], #32
+    mul             v10.8h, v10.8h, v2.8h
+    ld1             {v0.8h, v1.8h}, [DCT_TABLE], #32
+    mul             v11.8h, v11.8h, v3.8h
+    ld1             {v14.8h, v15.8h}, [COEF_BLOCK], #32
+    mul             v12.8h, v12.8h, v0.8h
+    ld1             {v2.8h, v3.8h}, [DCT_TABLE], #32
+    mul             v14.8h, v14.8h, v2.8h
+    mul             v13.8h, v13.8h, v1.8h
+    ld1             {v0.4h}, [TMP4]             /* load constants */
+    mul             v15.8h, v15.8h, v3.8h
+
+    /* 1-D IDCT, pass 1 */
+    idct_ifast_1d_pass
+
+    /* Transpose the 8x8 block of 16-bit values between the passes:
+     * 16-bit interleave, 32-bit interleave, then exchange 64-bit halves. */
+    ifast_trn       v8,  v9,  8h
+    ifast_trn       v14, v15, 8h
+    ifast_trn       v10, v11, 8h
+    ifast_trn       v12, v13, 8h
+    ifast_trn       v9,  v11, 4s
+    ifast_trn       v12, v14, 4s
+    ifast_trn       v8,  v10, 4s
+    ifast_trn       v13, v15, 4s
+    ifast_swap_halves v14, v10
+    ifast_swap_halves v13, v9
+    ifast_swap_halves v15, v11
+    ifast_swap_halves v12, v8
+
+    /* 1-D IDCT, pass 2 */
+    idct_ifast_1d_pass
+
+    /* Descale to 8-bit and range limit: saturating narrow of each 16-bit
+     * lane to a signed byte after an arithmetic shift right by 5, then add
+     * 0x80 to every byte.  The source lanes are 16 bits wide, so the
+     * 8h -> 8b / 16b forms are the ones that match the data. */
+    movi            v0.16b, #0x80
+    sqshrn          v8.8b,   v8.8h,  #5
+    sqshrn2         v8.16b,  v9.8h,  #5
+    sqshrn          v9.8b,   v10.8h, #5
+    sqshrn2         v9.16b,  v11.8h, #5
+    sqshrn          v10.8b,  v12.8h, #5
+    sqshrn2         v10.16b, v13.8h, #5
+    sqshrn          v11.8b,  v14.8h, #5
+    sqshrn2         v11.16b, v15.8h, #5
+    add             v8.16b,  v8.16b,  v0.16b
+    add             v9.16b,  v9.16b,  v0.16b
+    add             v10.16b, v10.16b, v0.16b
+    add             v11.16b, v11.16b, v0.16b
+
+    /* Transpose the final 8-bit samples */
+    ifast_trn       v8,  v9,  8h
+    ifast_trn       v10, v11, 8h
+    ifast_trn       v8,  v10, 4s
+    ifast_trn       v9,  v11, 4s
+    /* Split each register into its two 64-bit halves (the high half is
+     * copied to v17/v19/v21/v23) and byte-interleave the pair. */
+    ins             v17.d[0], v8.d[1]
+    ifast_trn       v8,  v17, 8b
+    ins             v19.d[0], v9.d[1]
+    ifast_trn       v9,  v19, 8b
+    ins             v21.d[0], v10.d[1]
+    ifast_trn       v10, v21, 8b
+    ins             v23.d[0], v11.d[1]
+    ifast_trn       v11, v23, 8b
+
+    /* Store results to the output buffer, one 8-byte row per pointer */
+    ldp             TMP1, TMP2, [OUTPUT_BUF], #16
+    add             TMP1, TMP1, OUTPUT_COL
+    add             TMP2, TMP2, OUTPUT_COL
+    st1             {v8.8b},  [TMP1]
+    st1             {v17.8b}, [TMP2]
+    ldp             TMP1, TMP2, [OUTPUT_BUF], #16
+    add             TMP1, TMP1, OUTPUT_COL
+    add             TMP2, TMP2, OUTPUT_COL
+    st1             {v9.8b},  [TMP1]
+    st1             {v19.8b}, [TMP2]
+    ldp             TMP1, TMP2, [OUTPUT_BUF], #16
+    ldp             TMP3, TMP4, [OUTPUT_BUF]    /* TMP3 replaces OUTPUT_BUF */
+    add             TMP1, TMP1, OUTPUT_COL
+    add             TMP2, TMP2, OUTPUT_COL
+    add             TMP3, TMP3, OUTPUT_COL
+    add             TMP4, TMP4, OUTPUT_COL
+    st1             {v10.8b}, [TMP1]
+    st1             {v21.8b}, [TMP2]
+    st1             {v11.8b}, [TMP3]
+    st1             {v23.8b}, [TMP4]
+
+    /* Restore the callee-saved NEON registers and return.  RET is used
+     * rather than BLR x30, which would overwrite the link register. */
+    ldp             d10, d11, [sp, #16]
+    ldp             d12, d13, [sp, #32]
+    ldp             d14, d15, [sp, #48]
+    ldp             d8,  d9,  [sp], #64
+    ret
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+.endfunc