aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSiarhei Siamashka <siarhei.siamashka@nokia.com>2009-12-29 23:33:17 +0200
committerSiarhei Siamashka <siarhei.siamashka@nokia.com>2010-11-10 06:31:31 +0200
commit1c7bb8cbf687028663fc69227a70376e7a4c1724 (patch)
treec65bfa07eed0419790791f0c2d48d48667cb4319
parentd134ba037329b11d2ba8a9d160989967b2bfa86e (diff)
ARM NEON optimized version of 'jpeg_idct_ifast'
It is approximately 4x faster than the original C variant.
-rw-r--r--jdct.h1
-rw-r--r--jidctfst.c359
2 files changed, 360 insertions, 0 deletions
diff --git a/jdct.h b/jdct.h
index 5e26a7b..7a691d4 100644
--- a/jdct.h
+++ b/jdct.h
@@ -104,6 +104,7 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
#if defined(WITH_SIMD) && defined(__ARM_NEON__)
#define jpeg_idct_4x4 jpeg_idct_4x4_neon
+#define jpeg_idct_ifast jpeg_idct_ifast_neon
#endif
/* Extern declarations for the forward and inverse DCT routines. */
diff --git a/jidctfst.c b/jidctfst.c
index dba4216..7c519da 100644
--- a/jidctfst.c
+++ b/jidctfst.c
@@ -2,6 +2,11 @@
* jidctfst.c
*
* Copyright (C) 1994-1998, Thomas G. Lane.
+ *
+ * ARM NEON optimizations
+ * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
+ * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
+ *
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -164,6 +169,358 @@
* Perform dequantization and inverse DCT on one block of coefficients.
*/
+#if defined(WITH_SIMD) && defined(__ARM_NEON__) && (BITS_IN_JSAMPLE == 8)
+
+#define XFIX_1_082392200 ((short)(277 * 128 - 256 * 128))
+#define XFIX_1_414213562 ((short)(362 * 128 - 256 * 128))
+#define XFIX_1_847759065 ((short)(473 * 128 - 256 * 128))
+#define XFIX_2_613125930 ((short)(669 * 128 - 512 * 128))
+
+GLOBAL(void)
+jpeg_idct_ifast_neon (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ JCOEFPTR inptr;
+ IFAST_MULT_TYPE * quantptr;
+ int tmp;
+
+ const static short c[4] = {
+ XFIX_1_082392200, /* d0[0] */
+ XFIX_1_414213562, /* d0[1] */
+ XFIX_1_847759065, /* d0[2] */
+ XFIX_2_613125930 /* d0[3] */
+ };
+
+ quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
+ inptr = coef_block;
+ asm volatile (
+ /* load constants */
+ "vld1.16 {d0}, [%[c]]\n"
+ /* load all coef block:
+ * 0 | d4 d5
+ * 1 | d6 d7
+ * 2 | d8 d9
+ * 3 | d10 d11
+ * 4 | d12 d13
+ * 5 | d14 d15
+ * 6 | d16 d17
+ * 7 | d18 d19
+ */
+ "vld1.16 {d4, d5, d6, d7}, [%[inptr]]!\n"
+ "vld1.16 {d8, d9, d10, d11}, [%[inptr]]!\n"
+ "vld1.16 {d12, d13, d14, d15}, [%[inptr]]!\n"
+ "vld1.16 {d16, d17, d18, d19}, [%[inptr]]!\n"
+ /* dequantize */
+ "vld1.16 {d20, d21, d22, d23}, [%[quantptr]]!\n"
+ "vmul.s16 q2, q2, q10\n"
+ "vld1.16 {d24, d25, d26, d27}, [%[quantptr]]!\n"
+ "vmul.s16 q3, q3, q11\n"
+ "vmul.s16 q4, q4, q12\n"
+ "vld1.16 {d28, d29, d30, d31}, [%[quantptr]]!\n"
+ "vmul.s16 q5, q5, q13\n"
+ "vmul.s16 q6, q6, q14\n"
+ "vld1.16 {d20, d21, d22, d23}, [%[quantptr]]!\n"
+ "vmul.s16 q7, q7, q15\n"
+ "vmul.s16 q8, q8, q10\n"
+ "vmul.s16 q9, q9, q11\n"
+
+ ".macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7,"
+ " t10, t11, t12, t13, t14\n"
+ "vsub.s16 \\t10, \\x0, \\x4\n"
+ "vadd.s16 \\x4, \\x0, \\x4\n"
+ "vswp.s16 \\t10, \\x0\n"
+ "vsub.s16 \\t11, \\x2, \\x6\n"
+ "vadd.s16 \\x6, \\x2, \\x6\n"
+ "vswp.s16 \\t11, \\x2\n"
+ "vsub.s16 \\t10, \\x3, \\x5\n"
+ "vadd.s16 \\x5, \\x3, \\x5\n"
+ "vswp.s16 \\t10, \\x3\n"
+ "vsub.s16 \\t11, \\x1, \\x7\n"
+ "vadd.s16 \\x7, \\x1, \\x7\n"
+ "vswp.s16 \\t11, \\x1\n"
+
+ "vqdmulh.s16 \\t13, \\x2, d0[1]\n"
+ "vadd.s16 \\t12, \\x3, \\x3\n"
+ "vadd.s16 \\x2, \\x2, \\t13\n"
+ "vqdmulh.s16 \\t13, \\x3, d0[3]\n"
+ "vsub.s16 \\t10, \\x1, \\x3\n"
+ "vadd.s16 \\t12, \\t12, \\t13\n"
+ "vqdmulh.s16 \\t13, \\t10, d0[2]\n"
+ "vsub.s16 \\t11, \\x7, \\x5\n"
+ "vadd.s16 \\t10, \\t10, \\t13\n"
+ "vqdmulh.s16 \\t13, \\t11, d0[1]\n"
+ "vadd.s16 \\t11, \\t11, \\t13\n"
+
+ "vqdmulh.s16 \\t13, \\x1, d0[0]\n"
+ "vsub.s16 \\x2, \\x6, \\x2\n"
+ "vsub.s16 \\t14, \\x0, \\x2\n"
+ "vadd.s16 \\x2, \\x0, \\x2\n"
+ "vadd.s16 \\x0, \\x4, \\x6\n"
+ "vsub.s16 \\x4, \\x4, \\x6\n"
+ "vadd.s16 \\x1, \\x1, \\t13\n"
+ "vadd.s16 \\t13, \\x7, \\x5\n"
+ "vsub.s16 \\t12, \\t13, \\t12\n"
+ "vsub.s16 \\t12, \\t12, \\t10\n"
+ "vadd.s16 \\t11, \\t12, \\t11\n"
+ "vsub.s16 \\t10, \\x1, \\t10\n"
+ "vadd.s16 \\t10, \\t10, \\t11\n"
+
+ "vsub.s16 \\x7, \\x0, \\t13\n"
+ "vadd.s16 \\x0, \\x0, \\t13\n"
+ "vadd.s16 \\x6, \\t14, \\t12\n"
+ "vsub.s16 \\x1, \\t14, \\t12\n"
+ "vsub.s16 \\x5, \\x2, \\t11\n"
+ "vadd.s16 \\x2, \\x2, \\t11\n"
+ "vsub.s16 \\x3, \\x4, \\t10\n"
+ "vadd.s16 \\x4, \\x4, \\t10\n"
+ ".endm\n"
+
+ ".macro transpose_4x4 x0, x1, x2, x3\n"
+ "vtrn.16 \\x0, \\x1\n"
+ "vtrn.16 \\x2, \\x3\n"
+ "vtrn.32 \\x0, \\x2\n"
+ "vtrn.32 \\x1, \\x3\n"
+ ".endm\n"
+
+ /* pass 1 */
+ "idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14\n"
+ /* transpose */
+ "transpose_4x4 d4, d6, d8, d10\n"
+ "transpose_4x4 d5, d7, d9, d11\n"
+ "transpose_4x4 d12, d14, d16, d18\n"
+ "transpose_4x4 d13, d15, d17, d19\n"
+ "vswp d12, d5\n"
+ "vswp d14, d7\n"
+ "vswp d16, d9\n"
+ "vswp d18, d11\n"
+
+    /* pass 2 */
+ "idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14\n"
+ /* transpose */
+ "transpose_4x4 d4, d6, d8, d10\n"
+ "transpose_4x4 d5, d7, d9, d11\n"
+ "transpose_4x4 d12, d14, d16, d18\n"
+ "transpose_4x4 d13, d15, d17, d19\n"
+ "vswp d12, d5\n"
+ "vswp d14, d7\n"
+ "vswp d16, d9\n"
+ "vswp d18, d11\n"
+
+ /* descale and range limit */
+ "vmov.s16 q15, #(0x80 << 5)\n"
+ "vqadd.s16 q2, q2, q15\n"
+ "vqadd.s16 q3, q3, q15\n"
+ "vqadd.s16 q4, q4, q15\n"
+ "vqadd.s16 q5, q5, q15\n"
+ "vqadd.s16 q6, q6, q15\n"
+ "vqadd.s16 q7, q7, q15\n"
+ "vqadd.s16 q8, q8, q15\n"
+ "vqadd.s16 q9, q9, q15\n"
+ "vqshrun.s16 d4, q2, #5\n"
+ "vqshrun.s16 d6, q3, #5\n"
+ "vqshrun.s16 d8, q4, #5\n"
+ "vqshrun.s16 d10, q5, #5\n"
+ "vqshrun.s16 d12, q6, #5\n"
+ "vqshrun.s16 d14, q7, #5\n"
+ "vqshrun.s16 d16, q8, #5\n"
+ "vqshrun.s16 d18, q9, #5\n"
+
+ /* store results to the output buffer */
+ ".irp x, d4, d6, d8, d10, d12, d14, d16, d18\n"
+ "ldr %[tmp], [%[output_buf]], #4\n"
+ "add %[tmp], %[tmp], %[output_col]\n"
+ "vst1.8 {\\x}, [%[tmp]]!\n"
+ ".endr\n"
+ : [inptr] "+&r" (inptr),
+ [quantptr] "+&r" (quantptr),
+ [tmp] "=&r" (tmp),
+ [output_buf] "+&r" (output_buf)
+ : [c] "r" (c),
+ [output_col] "r" (output_col)
+ : "cc", "memory",
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+ "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+ "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+ "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
+}
+
+#if 0
+
+/* Macro which is similar to VQDMULH NEON instruction */
+#define XMULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const) * 2, 16))
+
+/*
+ * A slightly modified C code (which maps to NEON instructions better),
+ * which was used as a reference implementation for converting to NEON.
+ */
+GLOBAL(void)
+jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ DCTELEM q10, q11, q12, q13, q14;
+ short * inptr;
+ IFAST_MULT_TYPE * quantptr;
+ short * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ short workspace[DCTSIZE2]; /* buffers data between passes */
+ SHIFT_TEMPS /* for DESCALE */
+ ISHIFT_TEMPS /* for IDESCALE */
+ JCOEF dequantized_input[DCTSIZE*8];
+
+ /* Pass 0: dequantize data */
+ quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
+ inptr = coef_block;
+ for (ctr = 0; ctr < 64; ctr++)
+ dequantized_input[ctr] = DEQUANTIZE(inptr[ctr], quantptr[ctr]);
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ /* preprocess input data, converting them to sums and differences */
+ inptr = dequantized_input;
+ for (ctr = 0; ctr < 8; ctr++) {
+ int sum = inptr[ctr + DCTSIZE*0] + inptr[ctr + DCTSIZE*4];
+ int diff = inptr[ctr + DCTSIZE*0] - inptr[ctr + DCTSIZE*4];
+ inptr[ctr + DCTSIZE*0] = diff;
+ inptr[ctr + DCTSIZE*4] = sum;
+ sum = inptr[ctr + DCTSIZE*2] + inptr[ctr + DCTSIZE*6];
+ diff = inptr[ctr + DCTSIZE*2] - inptr[ctr + DCTSIZE*6];
+ inptr[ctr + DCTSIZE*2] = diff;
+ inptr[ctr + DCTSIZE*6] = sum;
+ sum = inptr[ctr + DCTSIZE*3] + inptr[ctr + DCTSIZE*5];
+ diff = inptr[ctr + DCTSIZE*3] - inptr[ctr + DCTSIZE*5];
+ inptr[ctr + DCTSIZE*3] = diff;
+ inptr[ctr + DCTSIZE*5] = sum;
+ sum = inptr[ctr + DCTSIZE*1] + inptr[ctr + DCTSIZE*7];
+ diff = inptr[ctr + DCTSIZE*1] - inptr[ctr + DCTSIZE*7];
+ inptr[ctr + DCTSIZE*1] = diff;
+ inptr[ctr + DCTSIZE*7] = sum;
+ }
+ wsptr = workspace;
+ for (ctr = DCTSIZE; ctr > 0; ctr--) {
+
+ q13 = XMULTIPLY(inptr[DCTSIZE*2], XFIX_1_414213562);
+ q12 = inptr[DCTSIZE*3] + inptr[DCTSIZE*3];
+ inptr[DCTSIZE*2] += q13;
+ q13 = XMULTIPLY(inptr[DCTSIZE*3], XFIX_2_613125930);
+ q10 = inptr[DCTSIZE*1] - inptr[DCTSIZE*3];
+ q12 += q13;
+ q13 = XMULTIPLY(q10, XFIX_1_847759065);
+ q11 = inptr[DCTSIZE*7] - inptr[DCTSIZE*5];
+ q10 += q13;
+ q13 = XMULTIPLY(q11, XFIX_1_414213562);
+ q11 += q13;
+
+ q13 = XMULTIPLY(inptr[DCTSIZE*1], XFIX_1_082392200);
+
+ inptr[DCTSIZE*2] = inptr[DCTSIZE*6] - inptr[DCTSIZE*2];
+ q14 = inptr[DCTSIZE*0] - inptr[DCTSIZE*2];
+ inptr[DCTSIZE*2] = inptr[DCTSIZE*0] + inptr[DCTSIZE*2];
+ inptr[DCTSIZE*0] = inptr[DCTSIZE*4] + inptr[DCTSIZE*6];
+ inptr[DCTSIZE*4] = inptr[DCTSIZE*4] - inptr[DCTSIZE*6];
+
+ inptr[DCTSIZE*1] += q13;
+
+ q13 = inptr[DCTSIZE*7] + inptr[DCTSIZE*5];
+ q12 = q13 - q12 - q10;
+ q11 = q12 + q11;
+ q10 = q11 + inptr[DCTSIZE*1] - q10;
+
+ wsptr[7] = (int) (inptr[DCTSIZE*0] - q13);
+ wsptr[0] = (int) (inptr[DCTSIZE*0] + q13);
+ wsptr[6] = (int) (q14 + q12);
+ wsptr[1] = (int) (q14 - q12);
+ wsptr[5] = (int) (inptr[DCTSIZE*2] - q11);
+ wsptr[2] = (int) (inptr[DCTSIZE*2] + q11);
+ wsptr[3] = (int) (inptr[DCTSIZE*4] - q10);
+ wsptr[4] = (int) (inptr[DCTSIZE*4] + q10);
+
+ inptr++; /* advance pointers to next column */
+ wsptr += DCTSIZE;
+ }
+
+ /* Pass 2: process rows from work array, store into output array. */
+ /* Note that we must descale the results by a factor of 8 == 2**3, */
+ /* and also undo the PASS1_BITS scaling. */
+ inptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++) {
+ int sum = inptr[ctr + DCTSIZE*0] + inptr[ctr + DCTSIZE*4];
+ int diff = inptr[ctr + DCTSIZE*0] - inptr[ctr + DCTSIZE*4];
+ inptr[ctr + DCTSIZE*0] = diff;
+ inptr[ctr + DCTSIZE*4] = sum;
+ sum = inptr[ctr + DCTSIZE*2] + inptr[ctr + DCTSIZE*6];
+ diff = inptr[ctr + DCTSIZE*2] - inptr[ctr + DCTSIZE*6];
+ inptr[ctr + DCTSIZE*2] = diff;
+ inptr[ctr + DCTSIZE*6] = sum;
+ sum = inptr[ctr + DCTSIZE*3] + inptr[ctr + DCTSIZE*5];
+ diff = inptr[ctr + DCTSIZE*3] - inptr[ctr + DCTSIZE*5];
+ inptr[ctr + DCTSIZE*3] = diff;
+ inptr[ctr + DCTSIZE*5] = sum;
+ sum = inptr[ctr + DCTSIZE*1] + inptr[ctr + DCTSIZE*7];
+ diff = inptr[ctr + DCTSIZE*1] - inptr[ctr + DCTSIZE*7];
+ inptr[ctr + DCTSIZE*1] = diff;
+ inptr[ctr + DCTSIZE*7] = sum;
+ }
+
+ for (ctr = 0; ctr < DCTSIZE; ctr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ q13 = XMULTIPLY(inptr[DCTSIZE*2], XFIX_1_414213562);
+ q12 = inptr[DCTSIZE*3] + inptr[DCTSIZE*3];
+ inptr[DCTSIZE*2] += q13;
+ q13 = XMULTIPLY(inptr[DCTSIZE*3], XFIX_2_613125930);
+ q10 = inptr[DCTSIZE*1] - inptr[DCTSIZE*3];
+ q12 += q13;
+ q13 = XMULTIPLY(q10, XFIX_1_847759065);
+ q11 = inptr[DCTSIZE*7] - inptr[DCTSIZE*5];
+ q10 += q13;
+ q13 = XMULTIPLY(q11, XFIX_1_414213562);
+ q11 += q13;
+
+ q13 = XMULTIPLY(inptr[DCTSIZE*1], XFIX_1_082392200);
+
+ inptr[DCTSIZE*2] = inptr[DCTSIZE*6] - inptr[DCTSIZE*2];
+ q14 = inptr[DCTSIZE*0] - inptr[DCTSIZE*2];
+ inptr[DCTSIZE*2] = inptr[DCTSIZE*0] + inptr[DCTSIZE*2];
+ inptr[DCTSIZE*0] = inptr[DCTSIZE*4] + inptr[DCTSIZE*6];
+ inptr[DCTSIZE*4] = inptr[DCTSIZE*4] - inptr[DCTSIZE*6];
+
+ inptr[DCTSIZE*1] += q13;
+
+ q13 = inptr[DCTSIZE*7] + inptr[DCTSIZE*5];
+ q12 = q13 - q12 - q10;
+ q11 = q12 + q11;
+ q10 = q11 + inptr[DCTSIZE*1] - q10;
+
+ /* Final output stage: scale down by a factor of 8 and range-limit */
+ outptr[7] = range_limit[IDESCALE(inptr[DCTSIZE*0] - q13, PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[0] = range_limit[IDESCALE(inptr[DCTSIZE*0] + q13, PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[6] = range_limit[IDESCALE(q14 + q12, PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[1] = range_limit[IDESCALE(q14 - q12, PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[5] = range_limit[IDESCALE(inptr[DCTSIZE*2] - q11, PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[2] = range_limit[IDESCALE(inptr[DCTSIZE*2] + q11, PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[3] = range_limit[IDESCALE(inptr[DCTSIZE*4] - q10, PASS1_BITS+3)
+ & RANGE_MASK];
+ outptr[4] = range_limit[IDESCALE(inptr[DCTSIZE*4] + q10, PASS1_BITS+3)
+ & RANGE_MASK];
+
+ inptr++; /* advance pointers to next column */
+ }
+}
+
+#endif
+
+#else
+
GLOBAL(void)
jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block,
@@ -365,4 +722,6 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
}
}
+#endif
+
#endif /* DCT_IFAST_SUPPORTED */