diff options
author | Siarhei Siamashka <siarhei.siamashka@nokia.com> | 2010-01-26 17:54:31 +0200 |
---|---|---|
committer | Siarhei Siamashka <siarhei.siamashka@nokia.com> | 2010-11-10 06:31:41 +0200 |
commit | 6f625a0844943d365ba13c5b00c0285c5139b49b (patch) | |
tree | 00693d643c1f78383a7707904666247b5599cd0f | |
parent | 1c7bb8cbf687028663fc69227a70376e7a4c1724 (diff) |
ARM NEON optimized version of 'jpeg_fdct_ifast'
It is approximately 4x faster than the original C variant.
-rw-r--r-- | jdct.h | 1 | ||||
-rw-r--r-- | jfdctfst.c | 288 |
2 files changed, 289 insertions, 0 deletions
diff --git a/jdct.h b/jdct.h
--- a/jdct.h
+++ b/jdct.h
@@ -105,6 +105,7 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
 #if defined(WITH_SIMD) && defined(__ARM_NEON__)
 #define jpeg_idct_4x4 jpeg_idct_4x4_neon
 #define jpeg_idct_ifast jpeg_idct_ifast_neon
+#define jpeg_fdct_ifast jpeg_fdct_ifast_neon
 #endif
 
 /* Extern declarations for the forward and inverse DCT routines. */
diff --git a/jfdctfst.c b/jfdctfst.c
--- a/jfdctfst.c
+++ b/jfdctfst.c
@@ -2,6 +2,11 @@
  * jfdctfst.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ *
+ * ARM NEON optimizations
+ * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
+ * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
+ *
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -110,6 +115,287 @@
  * Perform the forward DCT on one block of samples.
  */
 
+#if defined(WITH_SIMD) && defined(__ARM_NEON__) && (BITS_IN_JSAMPLE == 8)
+
+/*
+ * Multiplier constants: 8-bit fractions (n/256) pre-scaled by 128, so that
+ * VQDMULH, which computes (x * c * 2) >> 16, multiplies by n/256.
+ * XFIX_1_306562965 holds only the fractional part (334 - 256)/256; the code
+ * adds the unscaled operand back in to account for the missing 1.0.
+ */
+#define XFIX_0_382683433 ((short)(98 * 128))
+#define XFIX_0_541196100 ((short)(139 * 128))
+#define XFIX_0_707106781 ((short)(181 * 128))
+#define XFIX_1_306562965 ((short)(334 * 128 - 256 * 128))
+
+/* C model of the VQDMULH NEON instruction; used only by the reference code */
+#define XMULTIPLY(var,coef) ((DCTELEM) DESCALE((var) * (coef) * 2, 16))
+
+/*
+ * NEON version of the fast integer forward DCT.
+ *
+ * Loads the 64 elements at 'data', runs two transpose + 1-D DCT passes over
+ * them in registers and stores the result back to 'data' (in place).
+ *
+ * NOTE(review): all loads, stores and arithmetic are 16-bit, so this assumes
+ * DCTELEM is a 16-bit type -- confirm against jdct.h.
+ */
+GLOBAL(void)
+jpeg_fdct_ifast_neon (DCTELEM * data)
+{
+#if 1
+  DCTELEM *inptr, *outptr;
+  static const short c[4] = {
+    XFIX_0_382683433, /* d0[0] */
+    XFIX_0_541196100, /* d0[1] */
+    XFIX_0_707106781, /* d0[2] */
+    XFIX_1_306562965  /* d0[3] */
+  };
+
+  inptr = outptr = data;
+  asm volatile (
+    /* load constants */
+    "vld1.16 {d0}, [%[c]]\n"
+    /* load all coef block:
+     *   0 | d4  d5
+     *   1 | d6  d7
+     *   2 | d8  d9
+     *   3 | d10 d11
+     *   4 | d12 d13
+     *   5 | d14 d15
+     *   6 | d16 d17
+     *   7 | d18 d19
+     */
+    "vld1.16 {d4, d5, d6, d7}, [%[inptr]]!\n"
+    "vld1.16 {d8, d9, d10, d11}, [%[inptr]]!\n"
+    "vld1.16 {d12, d13, d14, d15}, [%[inptr]]!\n"
+    "vld1.16 {d16, d17, d18, d19}, [%[inptr]]!\n"
+
+    ".macro transpose_4x4 x0, x1, x2, x3\n"
+    "vtrn.16 \\x0, \\x1\n"
+    "vtrn.16 \\x2, \\x3\n"
+    "vtrn.32 \\x0, \\x2\n"
+    "vtrn.32 \\x1, \\x3\n"
+    ".endm\n"
+
+    ".macro transpose_all\n"
+    "transpose_4x4 d4, d6, d8, d10\n"
+    "transpose_4x4 d5, d7, d9, d11\n"
+    "transpose_4x4 d12, d14, d16, d18\n"
+    "transpose_4x4 d13, d15, d17, d19\n"
+    "vswp d12, d5\n"
+    "vswp d14, d7\n"
+    "vswp d16, d9\n"
+    "vswp d18, d11\n"
+    ".endm\n"
+
+    ".macro fdct_helper x0, x1, x2, x3, x4, x5, x6, x7,"
+    " t10, t11, t12, t13, t14, t15\n"
+    /* preprocess input data, converting them to sums and differences */
+    "vsub.s16 \\t10, \\x0, \\x7\n"
+    "vadd.s16 \\x0, \\x0, \\x7\n"
+    "vswp.s16 \\t10, \\x7\n"
+    "vsub.s16 \\t11, \\x1, \\x6\n"
+    "vadd.s16 \\x1, \\x1, \\x6\n"
+    "vswp.s16 \\t11, \\x6\n"
+    "vsub.s16 \\t10, \\x2, \\x5\n"
+    "vadd.s16 \\x2, \\x2, \\x5\n"
+    "vswp.s16 \\t10, \\x5\n"
+    "vsub.s16 \\t11, \\x3, \\x4\n"
+    "vadd.s16 \\x3, \\x3, \\x4\n"
+    "vswp.s16 \\t11, \\x4\n"
+
+    "vsub.s16 \\t12, \\x1, \\x2\n"       /* tmp12 = dataptr[1] - dataptr[2]; */
+    "vsub.s16 \\t13, \\x0, \\x3\n"       /* tmp13 = dataptr[0] - dataptr[3]; */
+    "vadd.s16 \\t12, \\t12, \\t13\n"     /* tmp12 = tmp12 + tmp13; */
+    "vqdmulh.s16 \\t12, \\t12, d0[2]\n"  /* tmp12 = XMULTIPLY(tmp12, XFIX_0_707106781); */
+    "vadd.s16 \\t11, \\x1, \\x2\n"       /* tmp11 = dataptr[1] + dataptr[2]; */
+    "vadd.s16 \\t10, \\x0, \\x3\n"       /* tmp10 = dataptr[0] + dataptr[3]; */
+
+    "vadd.s16 \\x3, \\x4, \\x5\n"        /* wsptr[3*DCTSIZE] = dataptr[4] + dataptr[5]; */
+    "vadd.s16 \\x0, \\t10, \\t11\n"      /* wsptr[0*DCTSIZE] = tmp10 + tmp11; */
+    "vsub.s16 \\x4, \\t10, \\t11\n"      /* wsptr[4*DCTSIZE] = tmp10 - tmp11; */
+    "vadd.s16 \\t11, \\x5, \\x6\n"       /* tmp11 = dataptr[5] + dataptr[6]; */
+    "vqdmulh.s16 \\t11, \\t11, d0[2]\n"  /* tmp11 = XMULTIPLY(tmp11, XFIX_0_707106781); */
+    "vadd.s16 \\t15, \\x6, \\x7\n"       /* tmp15 = dataptr[6] + dataptr[7]; */
+    "vsub.s16 \\t14, \\x3, \\t15\n"      /* tmp14 = wsptr[3*DCTSIZE] - tmp15; */
+    "vqdmulh.s16 \\t14, \\t14, d0[0]\n"  /* tmp14 = XMULTIPLY(tmp14, XFIX_0_382683433); */
+    "vadd.s16 \\x2, \\t13, \\t12\n"      /* wsptr[2*DCTSIZE] = tmp13 + tmp12; */
+    "vsub.s16 \\x6, \\t13, \\t12\n"      /* wsptr[6*DCTSIZE] = tmp13 - tmp12; */
+    "vqdmulh.s16 \\t13, \\t15, d0[3]\n"  /* tmp13 = XMULTIPLY(tmp15, XFIX_1_306562965); */
+    "vqdmulh.s16 \\x3, \\x3, d0[1]\n"    /* wsptr[3*DCTSIZE] = XMULTIPLY(wsptr[3*DCTSIZE], XFIX_0_541196100); */
+
+    "vadd.s16 \\t12, \\x7, \\t11\n"      /* tmp12 = dataptr[7] + tmp11; */
+    "vsub.s16 \\t11, \\x7, \\t11\n"      /* tmp11 = dataptr[7] - tmp11; */
+    "vadd.s16 \\t15, \\t15, \\t14\n"     /* tmp15 = tmp15 + tmp14; */
+    "vadd.s16 \\t15, \\t15, \\t13\n"     /* tmp15 = tmp15 + tmp13; */
+    "vadd.s16 \\x3, \\x3, \\t14\n"       /* wsptr[3*DCTSIZE] = wsptr[3*DCTSIZE] + tmp14; */
+
+    "vadd.s16 \\x5, \\t11, \\x3\n"       /* wsptr[5*DCTSIZE] = tmp11 + wsptr[3*DCTSIZE]; */
+    "vsub.s16 \\x3, \\t11, \\x3\n"       /* wsptr[3*DCTSIZE] = tmp11 - wsptr[3*DCTSIZE]; */
+    "vadd.s16 \\x1, \\t12, \\t15\n"      /* wsptr[1*DCTSIZE] = tmp12 + tmp15; */
+    "vsub.s16 \\x7, \\t12, \\t15\n"      /* wsptr[7*DCTSIZE] = tmp12 - tmp15; */
+    ".endm\n"
+
+    "transpose_all\n"
+    "fdct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14, q15\n"
+    "transpose_all\n"
+    "fdct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14, q15\n"
+
+    /* store results to the output buffer */
+    "vst1.16 {d4, d5, d6, d7}, [%[outptr]]!\n"
+    "vst1.16 {d8, d9, d10, d11}, [%[outptr]]!\n"
+    "vst1.16 {d12, d13, d14, d15}, [%[outptr]]!\n"
+    "vst1.16 {d16, d17, d18, d19}, [%[outptr]]!\n"
+
+    /* purge the assembler macros so that the names can be defined again if
+     * this asm block is emitted more than once in the same assembly file */
+    ".purgem transpose_4x4\n"
+    ".purgem transpose_all\n"
+    ".purgem fdct_helper\n"
+
+    : [inptr] "+&r" (inptr),
+      [outptr] "+&r" (outptr)
+    : [c] "r" (c)
+    : "cc", "memory",
+      "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+      "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+      "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+      "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
+#else
+  /* Reference C implementation mirrored by the assembly above; never compiled */
+  DCTELEM tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  DCTELEM *dataptr, *wsptr;
+  DCTELEM workspace[64];
+  int ctr;
+  /* Pass 1: process rows. */
+
+  /* preprocess input data, converting them to sums and differences */
+  dataptr = data;
+  for (ctr = 0; ctr < 8; ctr++, dataptr += DCTSIZE) {
+    int sum = dataptr[0] + dataptr[7];
+    int diff = dataptr[0] - dataptr[7];
+    dataptr[7] = diff;
+    dataptr[0] = sum;
+    sum = dataptr[1] + dataptr[6];
+    diff = dataptr[1] - dataptr[6];
+    dataptr[6] = diff;
+    dataptr[1] = sum;
+    sum = dataptr[2] + dataptr[5];
+    diff = dataptr[2] - dataptr[5];
+    dataptr[5] = diff;
+    dataptr[2] = sum;
+    sum = dataptr[3] + dataptr[4];
+    diff = dataptr[3] - dataptr[4];
+    dataptr[4] = diff;
+    dataptr[3] = sum;
+  }
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    tmp12 = dataptr[1] - dataptr[2];
+    tmp13 = dataptr[0] - dataptr[3];
+    tmp12 = tmp12 + tmp13;
+    tmp12 = XMULTIPLY(tmp12, XFIX_0_707106781);
+    tmp11 = dataptr[1] + dataptr[2];
+    tmp10 = dataptr[0] + dataptr[3];
+
+    wsptr[3*DCTSIZE] = dataptr[4] + dataptr[5];
+    wsptr[0*DCTSIZE] = tmp10 + tmp11;
+    wsptr[4*DCTSIZE] = tmp10 - tmp11;
+    tmp11 = dataptr[5] + dataptr[6];
+    tmp11 = XMULTIPLY(tmp11, XFIX_0_707106781);
+    tmp15 = dataptr[6] + dataptr[7];
+    tmp14 = wsptr[3*DCTSIZE] - tmp15;
+    tmp14 = XMULTIPLY(tmp14, XFIX_0_382683433);
+    wsptr[2*DCTSIZE] = tmp13 + tmp12;
+    wsptr[6*DCTSIZE] = tmp13 - tmp12;
+    tmp13 = XMULTIPLY(tmp15, XFIX_1_306562965);
+    wsptr[3*DCTSIZE] = XMULTIPLY(wsptr[3*DCTSIZE], XFIX_0_541196100);
+
+    tmp12 = dataptr[7] + tmp11;
+    tmp11 = dataptr[7] - tmp11;
+    tmp15 = tmp15 + tmp14;
+    tmp15 = tmp15 + tmp13;
+    wsptr[3*DCTSIZE] = wsptr[3*DCTSIZE] + tmp14;
+
+    wsptr[5*DCTSIZE] = tmp11 + wsptr[3*DCTSIZE];
+    wsptr[3*DCTSIZE] = tmp11 - wsptr[3*DCTSIZE];
+    wsptr[1*DCTSIZE] = tmp12 + tmp15;
+    wsptr[7*DCTSIZE] = tmp12 - tmp15;
+
+    dataptr += DCTSIZE; /* advance input pointer to next row */
+    wsptr++;
+  }
+
+  /* Pass 2: process columns. */
+
+  /* preprocess input data, converting them to sums and differences */
+  dataptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, dataptr += DCTSIZE) {
+    int sum = dataptr[0] + dataptr[7];
+    int diff = dataptr[0] - dataptr[7];
+    dataptr[7] = diff;
+    dataptr[0] = sum;
+    sum = dataptr[1] + dataptr[6];
+    diff = dataptr[1] - dataptr[6];
+    dataptr[6] = diff;
+    dataptr[1] = sum;
+    sum = dataptr[2] + dataptr[5];
+    diff = dataptr[2] - dataptr[5];
+    dataptr[5] = diff;
+    dataptr[2] = sum;
+    sum = dataptr[3] + dataptr[4];
+    diff = dataptr[3] - dataptr[4];
+    dataptr[4] = diff;
+    dataptr[3] = sum;
+  }
+
+  dataptr = workspace;
+  wsptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    tmp12 = dataptr[1] - dataptr[2];
+    tmp13 = dataptr[0] - dataptr[3];
+    tmp12 = tmp12 + tmp13;
+    tmp12 = XMULTIPLY(tmp12, XFIX_0_707106781);
+    tmp11 = dataptr[1] + dataptr[2];
+    tmp10 = dataptr[0] + dataptr[3];
+
+    wsptr[3*DCTSIZE] = dataptr[4] + dataptr[5];
+    wsptr[0*DCTSIZE] = tmp10 + tmp11;
+    wsptr[4*DCTSIZE] = tmp10 - tmp11;
+    tmp11 = dataptr[5] + dataptr[6];
+    tmp11 = XMULTIPLY(tmp11, XFIX_0_707106781);
+    tmp15 = dataptr[6] + dataptr[7];
+    tmp14 = wsptr[3*DCTSIZE] - tmp15;
+    tmp14 = XMULTIPLY(tmp14, XFIX_0_382683433);
+    wsptr[2*DCTSIZE] = tmp13 + tmp12;
+    wsptr[6*DCTSIZE] = tmp13 - tmp12;
+    tmp13 = XMULTIPLY(tmp15, XFIX_1_306562965);
+    wsptr[3*DCTSIZE] = XMULTIPLY(wsptr[3*DCTSIZE], XFIX_0_541196100);
+
+    tmp12 = dataptr[7] + tmp11;
+    tmp11 = dataptr[7] - tmp11;
+    tmp15 = tmp15 + tmp14;
+    tmp15 = tmp15 + tmp13;
+    wsptr[3*DCTSIZE] = wsptr[3*DCTSIZE] + tmp14;
+
+    wsptr[5*DCTSIZE] = tmp11 + wsptr[3*DCTSIZE];
+    wsptr[3*DCTSIZE] = tmp11 - wsptr[3*DCTSIZE];
+    wsptr[1*DCTSIZE] = tmp12 + tmp15;
+    wsptr[7*DCTSIZE] = tmp12 - tmp15;
+
+    dataptr += DCTSIZE; /* advance input pointer to next row */
+    wsptr++;
+  }
+#endif
+}
+
+#define jpeg_fdct_ifast jpeg_fdct_ifast_neon
+
+#else
+
 GLOBAL(void)
 jpeg_fdct_ifast (DCTELEM * data)
 {
@@ -221,4 +507,6 @@ jpeg_fdct_ifast (DCTELEM * data)
   }
 }
 
+#endif
+
 #endif /* DCT_IFAST_SUPPORTED */