aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Siarhei Siamashka <siarhei.siamashka@nokia.com> 2010-01-26 17:54:31 +0200
committer Siarhei Siamashka <siarhei.siamashka@nokia.com> 2010-11-10 06:31:41 +0200
commit 6f625a0844943d365ba13c5b00c0285c5139b49b (patch)
tree 00693d643c1f78383a7707904666247b5599cd0f
parent 1c7bb8cbf687028663fc69227a70376e7a4c1724 (diff)
ARM NEON optimized version of 'jpeg_fdct_ifast'
It is approximately 4x faster than the original C variant.
-rw-r--r-- jdct.h 1
-rw-r--r-- jfdctfst.c 266
2 files changed, 267 insertions, 0 deletions
diff --git a/jdct.h b/jdct.h
index 7a691d4..e8ba04d 100644
--- a/jdct.h
+++ b/jdct.h
@@ -105,6 +105,7 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
#if defined(WITH_SIMD) && defined(__ARM_NEON__)
#define jpeg_idct_4x4 jpeg_idct_4x4_neon
#define jpeg_idct_ifast jpeg_idct_ifast_neon
+#define jpeg_fdct_ifast jpeg_fdct_ifast_neon
#endif
/* Extern declarations for the forward and inverse DCT routines. */
diff --git a/jfdctfst.c b/jfdctfst.c
index ccb378a..2aa1139 100644
--- a/jfdctfst.c
+++ b/jfdctfst.c
@@ -2,6 +2,11 @@
* jfdctfst.c
*
* Copyright (C) 1994-1996, Thomas G. Lane.
+ *
+ * ARM NEON optimizations
+ * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
+ * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
+ *
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -110,6 +115,265 @@
* Perform the forward DCT on one block of samples.
*/
+#if defined(WITH_SIMD) && defined(__ARM_NEON__) && (BITS_IN_JSAMPLE == 8)
+
+#define XFIX_0_382683433 ((short)(98 * 128))
+#define XFIX_0_541196100 ((short)(139 * 128))
+#define XFIX_0_707106781 ((short)(181 * 128))
+#define XFIX_1_306562965 ((short)(334 * 128 - 256 * 128))
+ /* Each XFIX_* is an n/256 fraction scaled by a further 128. XFIX_1_306562965 keeps only the part above 1.0 (334 * 128 would not fit in a short); the code using it adds the unscaled operand back (tmp15 + tmp13). */
+/* Scalar counterpart of the NEON VQDMULH instruction (doubled product, then DESCALE by 16 bits); in this hunk it is used only by the C branch of jpeg_fdct_ifast_neon */
+#define XMULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const) * 2, 16))
+
+GLOBAL(void)
+jpeg_fdct_ifast_neon (DCTELEM * data) /* in-place forward DCT of the 64-element block at data, using ARM NEON */
+{
+#if 1 /* 1: NEON inline assembly; the #else branch is the scalar C version whose statements the assembly comments quote */
+ JCOEFPTR inptr, outptr; /* NOTE(review): data is loaded/stored as 64 16-bit elements; assumes DCTELEM is a 16-bit type -- confirm */
+ const static short c[4] = { /* multipliers, loaded into d0 and used as the vqdmulh scalars d0[0]..d0[3] */
+ XFIX_0_382683433, /* d0[0] */
+ XFIX_0_541196100, /* d0[1] */
+ XFIX_0_707106781, /* d0[2] */
+ XFIX_1_306562965 /* d0[3] */
+ };
+ /* input and output pointers start at the same address: the block is transformed in place */
+ inptr = outptr = data;
+ asm volatile ( /* NOTE(review): the .macro definitions inside are presumably assembler-global; emitting this statement twice would redefine them -- confirm */
+ /* load constants */
+ "vld1.16 {d0}, [%[c]]\n"
+ /* load all coef block:
+ * 0 | d4 d5
+ * 1 | d6 d7
+ * 2 | d8 d9
+ * 3 | d10 d11
+ * 4 | d12 d13
+ * 5 | d14 d15
+ * 6 | d16 d17
+ * 7 | d18 d19
+ */
+ "vld1.16 {d4, d5, d6, d7}, [%[inptr]]!\n"
+ "vld1.16 {d8, d9, d10, d11}, [%[inptr]]!\n"
+ "vld1.16 {d12, d13, d14, d15}, [%[inptr]]!\n"
+ "vld1.16 {d16, d17, d18, d19}, [%[inptr]]!\n"
+ /* transpose_4x4: transposes a 4x4 matrix of 16-bit elements held in four D registers */
+ ".macro transpose_4x4 x0, x1, x2, x3\n"
+ "vtrn.16 \\x0, \\x1\n"
+ "vtrn.16 \\x2, \\x3\n"
+ "vtrn.32 \\x0, \\x2\n"
+ "vtrn.32 \\x1, \\x3\n"
+ ".endm\n"
+ /* transpose_all: 8x8 transpose of the block in d4-d19, built from four 4x4 transposes plus swaps of the two off-diagonal quadrants */
+ ".macro transpose_all\n"
+ "transpose_4x4 d4, d6, d8, d10\n"
+ "transpose_4x4 d5, d7, d9, d11\n"
+ "transpose_4x4 d12, d14, d16, d18\n"
+ "transpose_4x4 d13, d15, d17, d19\n"
+ "vswp d12, d5\n"
+ "vswp d14, d7\n"
+ "vswp d16, d9\n"
+ "vswp d18, d11\n"
+ ".endm\n"
+ /* idct_helper: despite its name, one 1-D pass of this forward DCT; x0-x7 are the eight data vectors (input and output), t10-t15 are scratch */
+ ".macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7,"
+ " t10, t11, t12, t13, t14, t15\n"
+ /* preprocess input data, converting them to sums and differences */
+ "vsub.s16 \\t10, \\x0, \\x7\n"
+ "vadd.s16 \\x0, \\x0, \\x7\n"
+ "vswp.s16 \\t10, \\x7\n"
+ "vsub.s16 \\t11, \\x1, \\x6\n"
+ "vadd.s16 \\x1, \\x1, \\x6\n"
+ "vswp.s16 \\t11, \\x6\n"
+ "vsub.s16 \\t10, \\x2, \\x5\n"
+ "vadd.s16 \\x2, \\x2, \\x5\n"
+ "vswp.s16 \\t10, \\x5\n"
+ "vsub.s16 \\t11, \\x3, \\x4\n"
+ "vadd.s16 \\x3, \\x3, \\x4\n"
+ "vswp.s16 \\t11, \\x4\n"
+ /* from here on x0-x3 hold the sums and x4-x7 the differences computed above; the quoted C statements call them dataptr[0..7] */
+ "vsub.s16 \\t12, \\x1, \\x2\n" /* tmp12 = dataptr[1] - dataptr[2]; */
+ "vsub.s16 \\t13, \\x0, \\x3\n" /* tmp13 = dataptr[0] - dataptr[3]; */
+ "vadd.s16 \\t12, \\t12, \\t13\n" /* tmp12 = tmp12 + tmp13; */
+ "vqdmulh.s16 \\t12, \\t12, d0[2]\n" /* tmp12 = XMULTIPLY(tmp12, XFIX_0_707106781); */
+ "vadd.s16 \\t11, \\x1, \\x2\n" /* tmp11 = dataptr[1] + dataptr[2]; */
+ "vadd.s16 \\t10, \\x0, \\x3\n" /* tmp10 = dataptr[0] + dataptr[3]; */
+
+ "vadd.s16 \\x3, \\x4, \\x5\n" /* wsptr[3*DCTSIZE] = dataptr[4] + dataptr[5]; */
+ "vadd.s16 \\x0, \\t10, \\t11\n" /* wsptr[0*DCTSIZE] = tmp10 + tmp11; */
+ "vsub.s16 \\x4, \\t10, \\t11\n" /* wsptr[4*DCTSIZE] = tmp10 - tmp11; */
+ "vadd.s16 \\t11, \\x5, \\x6\n" /* tmp11 = dataptr[5] + dataptr[6]; */
+ "vqdmulh.s16 \\t11, \\t11, d0[2]\n" /* tmp11 = XMULTIPLY(tmp11, XFIX_0_707106781); */
+ "vadd.s16 \\t15, \\x6, \\x7\n" /* tmp15 = dataptr[6] + dataptr[7]; */
+ "vsub.s16 \\t14, \\x3, \\t15\n" /* tmp14 = wsptr[3*DCTSIZE] - tmp15; */
+ "vqdmulh.s16 \\t14, \\t14, d0[0]\n" /* tmp14 = XMULTIPLY(tmp14, XFIX_0_382683433); */
+ "vadd.s16 \\x2, \\t13, \\t12\n" /* wsptr[2*DCTSIZE] = tmp13 + tmp12; */
+ "vsub.s16 \\x6, \\t13, \\t12\n" /* wsptr[6*DCTSIZE] = tmp13 - tmp12; */
+ "vqdmulh.s16 \\t13, \\t15, d0[3]\n" /* tmp13 = XMULTIPLY(tmp15, XFIX_1_306562965); */
+ "vqdmulh.s16 \\x3, \\x3, d0[1]\n" /* wsptr[3*DCTSIZE] = XMULTIPLY(wsptr[3*DCTSIZE], XFIX_0_541196100); */
+
+ "vadd.s16 \\t12, \\x7, \\t11\n" /* tmp12 = dataptr[7] + tmp11; */
+ "vsub.s16 \\t11, \\x7, \\t11\n" /* tmp11 = dataptr[7] - tmp11; */
+ "vadd.s16 \\t15, \\t15, \\t14\n" /* tmp15 = tmp15 + tmp14; */
+ "vadd.s16 \\t15, \\t15, \\t13\n" /* tmp15 = tmp15 + tmp13; */
+ "vadd.s16 \\x3, \\x3, \\t14\n" /* wsptr[3*DCTSIZE] = wsptr[3*DCTSIZE] + tmp14; */
+
+ "vadd.s16 \\x5, \\t11, \\x3\n" /* wsptr[5*DCTSIZE] = tmp11 + wsptr[3*DCTSIZE]; */
+ "vsub.s16 \\x3, \\t11, \\x3\n" /* wsptr[3*DCTSIZE] = tmp11 - wsptr[3*DCTSIZE]; */
+ "vadd.s16 \\x1, \\t12, \\t15\n" /* wsptr[1*DCTSIZE] = tmp12 + tmp15; */
+ "vsub.s16 \\x7, \\t12, \\t15\n" /* wsptr[7*DCTSIZE] = tmp12 - tmp15; */
+ ".endm\n"
+ /* two passes of transpose + 1-D transform over q2-q9; the two transposes cancel, so the result is stored in the original orientation */
+ "transpose_all\n"
+ "idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14, q15\n"
+ "transpose_all\n"
+ "idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14, q15\n"
+
+ /* store results to the output buffer */
+ "vst1.16 {d4, d5, d6, d7}, [%[outptr]]!\n"
+ "vst1.16 {d8, d9, d10, d11}, [%[outptr]]!\n"
+ "vst1.16 {d12, d13, d14, d15}, [%[outptr]]!\n"
+ "vst1.16 {d16, d17, d18, d19}, [%[outptr]]!\n"
+ /* both pointers are advanced by the post-increment addressing above, hence read-write operands */
+ : [inptr] "+&r" (inptr),
+ [outptr] "+&r" (outptr)
+ : [c] "r" (c)
+ : "cc", "memory", /* "memory": c[] and the data block are accessed through pointers, not through memory operands */
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+ "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+ "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+ "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
+#else /* scalar C version; not compiled while the #if above is 1 */
+ DCTELEM tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ DCTELEM *dataptr, *wsptr;
+ DCTELEM workspace[64];
+ int ctr;
+ /* Pass 1: process rows. */
+
+ /* preprocess input data, converting them to sums and differences */
+ dataptr = data;
+ for (ctr = 0; ctr < 8; ctr++, dataptr += DCTSIZE) {
+ int sum = dataptr[0] + dataptr[7];
+ int diff = dataptr[0] - dataptr[7];
+ dataptr[7] = diff;
+ dataptr[0] = sum;
+ sum = dataptr[1] + dataptr[6];
+ diff = dataptr[1] - dataptr[6];
+ dataptr[6] = diff;
+ dataptr[1] = sum;
+ sum = dataptr[2] + dataptr[5];
+ diff = dataptr[2] - dataptr[5];
+ dataptr[5] = diff;
+ dataptr[2] = sum;
+ sum = dataptr[3] + dataptr[4];
+ diff = dataptr[3] - dataptr[4];
+ dataptr[4] = diff;
+ dataptr[3] = sum;
+ }
+ /* transform each row of data, writing the eight results down one column of workspace (i.e. transposed) */
+ dataptr = data;
+ wsptr = workspace;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ tmp12 = dataptr[1] - dataptr[2];
+ tmp13 = dataptr[0] - dataptr[3];
+ tmp12 = tmp12 + tmp13;
+ tmp12 = XMULTIPLY(tmp12, XFIX_0_707106781);
+ tmp11 = dataptr[1] + dataptr[2];
+ tmp10 = dataptr[0] + dataptr[3];
+
+ wsptr[3*DCTSIZE] = dataptr[4] + dataptr[5];
+ wsptr[0*DCTSIZE] = tmp10 + tmp11;
+ wsptr[4*DCTSIZE] = tmp10 - tmp11;
+ tmp11 = dataptr[5] + dataptr[6];
+ tmp11 = XMULTIPLY(tmp11, XFIX_0_707106781);
+ tmp15 = dataptr[6] + dataptr[7];
+ tmp14 = wsptr[3*DCTSIZE] - tmp15;
+ tmp14 = XMULTIPLY(tmp14, XFIX_0_382683433);
+ wsptr[2*DCTSIZE] = tmp13 + tmp12;
+ wsptr[6*DCTSIZE] = tmp13 - tmp12;
+ tmp13 = XMULTIPLY(tmp15, XFIX_1_306562965);
+ wsptr[3*DCTSIZE] = XMULTIPLY(wsptr[3*DCTSIZE], XFIX_0_541196100);
+
+ tmp12 = dataptr[7] + tmp11;
+ tmp11 = dataptr[7] - tmp11;
+ tmp15 = tmp15 + tmp14;
+ tmp15 = tmp15 + tmp13;
+ wsptr[3*DCTSIZE] = wsptr[3*DCTSIZE] + tmp14;
+
+ wsptr[5*DCTSIZE] = tmp11 + wsptr[3*DCTSIZE];
+ wsptr[3*DCTSIZE] = tmp11 - wsptr[3*DCTSIZE];
+ wsptr[1*DCTSIZE] = tmp12 + tmp15;
+ wsptr[7*DCTSIZE] = tmp12 - tmp15;
+
+ dataptr += DCTSIZE; /* advance input pointer to the next row of data */
+ wsptr++;
+ }
+
+ /* Pass 2: process columns. */
+
+ /* preprocess input data, converting them to sums and differences */
+ dataptr = workspace;
+ for (ctr = 0; ctr < 8; ctr++, dataptr += DCTSIZE) {
+ int sum = dataptr[0] + dataptr[7];
+ int diff = dataptr[0] - dataptr[7];
+ dataptr[7] = diff;
+ dataptr[0] = sum;
+ sum = dataptr[1] + dataptr[6];
+ diff = dataptr[1] - dataptr[6];
+ dataptr[6] = diff;
+ dataptr[1] = sum;
+ sum = dataptr[2] + dataptr[5];
+ diff = dataptr[2] - dataptr[5];
+ dataptr[5] = diff;
+ dataptr[2] = sum;
+ sum = dataptr[3] + dataptr[4];
+ diff = dataptr[3] - dataptr[4];
+ dataptr[4] = diff;
+ dataptr[3] = sum;
+ }
+ /* transform each workspace row and write it back down one column of data, which undoes the pass-1 transposition */
+ dataptr = workspace;
+ wsptr = data;
+ for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ tmp12 = dataptr[1] - dataptr[2];
+ tmp13 = dataptr[0] - dataptr[3];
+ tmp12 = tmp12 + tmp13;
+ tmp12 = XMULTIPLY(tmp12, XFIX_0_707106781);
+ tmp11 = dataptr[1] + dataptr[2];
+ tmp10 = dataptr[0] + dataptr[3];
+
+ wsptr[3*DCTSIZE] = dataptr[4] + dataptr[5];
+ wsptr[0*DCTSIZE] = tmp10 + tmp11;
+ wsptr[4*DCTSIZE] = tmp10 - tmp11;
+ tmp11 = dataptr[5] + dataptr[6];
+ tmp11 = XMULTIPLY(tmp11, XFIX_0_707106781);
+ tmp15 = dataptr[6] + dataptr[7];
+ tmp14 = wsptr[3*DCTSIZE] - tmp15;
+ tmp14 = XMULTIPLY(tmp14, XFIX_0_382683433);
+ wsptr[2*DCTSIZE] = tmp13 + tmp12;
+ wsptr[6*DCTSIZE] = tmp13 - tmp12;
+ tmp13 = XMULTIPLY(tmp15, XFIX_1_306562965);
+ wsptr[3*DCTSIZE] = XMULTIPLY(wsptr[3*DCTSIZE], XFIX_0_541196100);
+
+ tmp12 = dataptr[7] + tmp11;
+ tmp11 = dataptr[7] - tmp11;
+ tmp15 = tmp15 + tmp14;
+ tmp15 = tmp15 + tmp13;
+ wsptr[3*DCTSIZE] = wsptr[3*DCTSIZE] + tmp14;
+
+ wsptr[5*DCTSIZE] = tmp11 + wsptr[3*DCTSIZE];
+ wsptr[3*DCTSIZE] = tmp11 - wsptr[3*DCTSIZE];
+ wsptr[1*DCTSIZE] = tmp12 + tmp15;
+ wsptr[7*DCTSIZE] = tmp12 - tmp15;
+
+ dataptr += DCTSIZE; /* advance input pointer to the next row of workspace (a column of the original block) */
+ wsptr++;
+ }
+#endif
+}
+ /* Alias the generic entry-point name to the NEON routine defined above; the jdct.h hunk of this patch adds the same #define. */
+#define jpeg_fdct_ifast jpeg_fdct_ifast_neon
+
+#else
+
GLOBAL(void)
jpeg_fdct_ifast (DCTELEM * data)
{
@@ -221,4 +485,6 @@ jpeg_fdct_ifast (DCTELEM * data)
}
}
+#endif
+
#endif /* DCT_IFAST_SUPPORTED */