diff options
author | Siarhei Siamashka <siarhei.siamashka@nokia.com> | 2009-12-29 23:33:17 +0200 |
---|---|---|
committer | Siarhei Siamashka <siarhei.siamashka@nokia.com> | 2010-11-10 06:31:31 +0200 |
commit | 1c7bb8cbf687028663fc69227a70376e7a4c1724 (patch) | |
tree | c65bfa07eed0419790791f0c2d48d48667cb4319 | |
parent | d134ba037329b11d2ba8a9d160989967b2bfa86e (diff) |
ARM NEON optimized version of 'jpeg_idct_ifast'
Is approximately 4x faster than the original C variant.
-rw-r--r-- | jdct.h | 1 | ||||
-rw-r--r-- | jidctfst.c | 359 |
2 files changed, 360 insertions, 0 deletions
@@ -104,6 +104,7 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */ #if defined(WITH_SIMD) && defined(__ARM_NEON__) #define jpeg_idct_4x4 jpeg_idct_4x4_neon +#define jpeg_idct_ifast jpeg_idct_ifast_neon #endif /* Extern declarations for the forward and inverse DCT routines. */ @@ -2,6 +2,11 @@ * jidctfst.c * * Copyright (C) 1994-1998, Thomas G. Lane. + * + * ARM NEON optimizations + * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. + * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com> + * * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -164,6 +169,358 @@ * Perform dequantization and inverse DCT on one block of coefficients. */ +#if defined(WITH_SIMD) && defined(__ARM_NEON__) && (BITS_IN_JSAMPLE == 8) + +#define XFIX_1_082392200 ((short)(277 * 128 - 256 * 128)) +#define XFIX_1_414213562 ((short)(362 * 128 - 256 * 128)) +#define XFIX_1_847759065 ((short)(473 * 128 - 256 * 128)) +#define XFIX_2_613125930 ((short)(669 * 128 - 512 * 128)) + +GLOBAL(void) +jpeg_idct_ifast_neon (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + JCOEFPTR inptr; + IFAST_MULT_TYPE * quantptr; + int tmp; + + const static short c[4] = { + XFIX_1_082392200, /* d0[0] */ + XFIX_1_414213562, /* d0[1] */ + XFIX_1_847759065, /* d0[2] */ + XFIX_2_613125930 /* d0[3] */ + }; + + quantptr = (IFAST_MULT_TYPE *) compptr->dct_table; + inptr = coef_block; + asm volatile ( + /* load constants */ + "vld1.16 {d0}, [%[c]]\n" + /* load all coef block: + * 0 | d4 d5 + * 1 | d6 d7 + * 2 | d8 d9 + * 3 | d10 d11 + * 4 | d12 d13 + * 5 | d14 d15 + * 6 | d16 d17 + * 7 | d18 d19 + */ + "vld1.16 {d4, d5, d6, d7}, [%[inptr]]!\n" + "vld1.16 {d8, d9, d10, d11}, [%[inptr]]!\n" + "vld1.16 {d12, d13, d14, d15}, [%[inptr]]!\n" + "vld1.16 {d16, d17, d18, d19}, [%[inptr]]!\n" + /* dequantize */ 
+ "vld1.16 {d20, d21, d22, d23}, [%[quantptr]]!\n" + "vmul.s16 q2, q2, q10\n" + "vld1.16 {d24, d25, d26, d27}, [%[quantptr]]!\n" + "vmul.s16 q3, q3, q11\n" + "vmul.s16 q4, q4, q12\n" + "vld1.16 {d28, d29, d30, d31}, [%[quantptr]]!\n" + "vmul.s16 q5, q5, q13\n" + "vmul.s16 q6, q6, q14\n" + "vld1.16 {d20, d21, d22, d23}, [%[quantptr]]!\n" + "vmul.s16 q7, q7, q15\n" + "vmul.s16 q8, q8, q10\n" + "vmul.s16 q9, q9, q11\n" + + ".macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7," + " t10, t11, t12, t13, t14\n" + "vsub.s16 \\t10, \\x0, \\x4\n" + "vadd.s16 \\x4, \\x0, \\x4\n" + "vswp.s16 \\t10, \\x0\n" + "vsub.s16 \\t11, \\x2, \\x6\n" + "vadd.s16 \\x6, \\x2, \\x6\n" + "vswp.s16 \\t11, \\x2\n" + "vsub.s16 \\t10, \\x3, \\x5\n" + "vadd.s16 \\x5, \\x3, \\x5\n" + "vswp.s16 \\t10, \\x3\n" + "vsub.s16 \\t11, \\x1, \\x7\n" + "vadd.s16 \\x7, \\x1, \\x7\n" + "vswp.s16 \\t11, \\x1\n" + + "vqdmulh.s16 \\t13, \\x2, d0[1]\n" + "vadd.s16 \\t12, \\x3, \\x3\n" + "vadd.s16 \\x2, \\x2, \\t13\n" + "vqdmulh.s16 \\t13, \\x3, d0[3]\n" + "vsub.s16 \\t10, \\x1, \\x3\n" + "vadd.s16 \\t12, \\t12, \\t13\n" + "vqdmulh.s16 \\t13, \\t10, d0[2]\n" + "vsub.s16 \\t11, \\x7, \\x5\n" + "vadd.s16 \\t10, \\t10, \\t13\n" + "vqdmulh.s16 \\t13, \\t11, d0[1]\n" + "vadd.s16 \\t11, \\t11, \\t13\n" + + "vqdmulh.s16 \\t13, \\x1, d0[0]\n" + "vsub.s16 \\x2, \\x6, \\x2\n" + "vsub.s16 \\t14, \\x0, \\x2\n" + "vadd.s16 \\x2, \\x0, \\x2\n" + "vadd.s16 \\x0, \\x4, \\x6\n" + "vsub.s16 \\x4, \\x4, \\x6\n" + "vadd.s16 \\x1, \\x1, \\t13\n" + "vadd.s16 \\t13, \\x7, \\x5\n" + "vsub.s16 \\t12, \\t13, \\t12\n" + "vsub.s16 \\t12, \\t12, \\t10\n" + "vadd.s16 \\t11, \\t12, \\t11\n" + "vsub.s16 \\t10, \\x1, \\t10\n" + "vadd.s16 \\t10, \\t10, \\t11\n" + + "vsub.s16 \\x7, \\x0, \\t13\n" + "vadd.s16 \\x0, \\x0, \\t13\n" + "vadd.s16 \\x6, \\t14, \\t12\n" + "vsub.s16 \\x1, \\t14, \\t12\n" + "vsub.s16 \\x5, \\x2, \\t11\n" + "vadd.s16 \\x2, \\x2, \\t11\n" + "vsub.s16 \\x3, \\x4, \\t10\n" + "vadd.s16 \\x4, \\x4, \\t10\n" + ".endm\n" + + ".macro 
transpose_4x4 x0, x1, x2, x3\n" + "vtrn.16 \\x0, \\x1\n" + "vtrn.16 \\x2, \\x3\n" + "vtrn.32 \\x0, \\x2\n" + "vtrn.32 \\x1, \\x3\n" + ".endm\n" + + /* pass 1 */ + "idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14\n" + /* transpose */ + "transpose_4x4 d4, d6, d8, d10\n" + "transpose_4x4 d5, d7, d9, d11\n" + "transpose_4x4 d12, d14, d16, d18\n" + "transpose_4x4 d13, d15, d17, d19\n" + "vswp d12, d5\n" + "vswp d14, d7\n" + "vswp d16, d9\n" + "vswp d18, d11\n" + + /* pass2 */ + "idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14\n" + /* transpose */ + "transpose_4x4 d4, d6, d8, d10\n" + "transpose_4x4 d5, d7, d9, d11\n" + "transpose_4x4 d12, d14, d16, d18\n" + "transpose_4x4 d13, d15, d17, d19\n" + "vswp d12, d5\n" + "vswp d14, d7\n" + "vswp d16, d9\n" + "vswp d18, d11\n" + + /* descale and range limit */ + "vmov.s16 q15, #(0x80 << 5)\n" + "vqadd.s16 q2, q2, q15\n" + "vqadd.s16 q3, q3, q15\n" + "vqadd.s16 q4, q4, q15\n" + "vqadd.s16 q5, q5, q15\n" + "vqadd.s16 q6, q6, q15\n" + "vqadd.s16 q7, q7, q15\n" + "vqadd.s16 q8, q8, q15\n" + "vqadd.s16 q9, q9, q15\n" + "vqshrun.s16 d4, q2, #5\n" + "vqshrun.s16 d6, q3, #5\n" + "vqshrun.s16 d8, q4, #5\n" + "vqshrun.s16 d10, q5, #5\n" + "vqshrun.s16 d12, q6, #5\n" + "vqshrun.s16 d14, q7, #5\n" + "vqshrun.s16 d16, q8, #5\n" + "vqshrun.s16 d18, q9, #5\n" + + /* store results to the output buffer */ + ".irp x, d4, d6, d8, d10, d12, d14, d16, d18\n" + "ldr %[tmp], [%[output_buf]], #4\n" + "add %[tmp], %[tmp], %[output_col]\n" + "vst1.8 {\\x}, [%[tmp]]!\n" + ".endr\n" + : [inptr] "+&r" (inptr), + [quantptr] "+&r" (quantptr), + [tmp] "=&r" (tmp), + [output_buf] "+&r" (output_buf) + : [c] "r" (c), + [output_col] "r" (output_col) + : "cc", "memory", + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", + "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"); +} + +#if 0 + +/* Macro which is 
similar to VQDMULH NEON instruction */ +#define XMULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const) * 2, 16)) + +/* + * A slightly modified C code (which maps to NEON instructions better), + * which was used as a reference implementation for converting to NEON. + */ +GLOBAL(void) +jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + DCTELEM q10, q11, q12, q13, q14; + short * inptr; + IFAST_MULT_TYPE * quantptr; + short * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + short workspace[DCTSIZE2]; /* buffers data between passes */ + SHIFT_TEMPS /* for DESCALE */ + ISHIFT_TEMPS /* for IDESCALE */ + JCOEF dequantized_input[DCTSIZE*8]; + + /* Pass 0: dequantize data */ + quantptr = (IFAST_MULT_TYPE *) compptr->dct_table; + inptr = coef_block; + for (ctr = 0; ctr < 64; ctr++) + dequantized_input[ctr] = DEQUANTIZE(inptr[ctr], quantptr[ctr]); + + /* Pass 1: process columns from input, store into work array. 
*/ + + /* preprocess input data, converting them to sums and differences */ + inptr = dequantized_input; + for (ctr = 0; ctr < 8; ctr++) { + int sum = inptr[ctr + DCTSIZE*0] + inptr[ctr + DCTSIZE*4]; + int diff = inptr[ctr + DCTSIZE*0] - inptr[ctr + DCTSIZE*4]; + inptr[ctr + DCTSIZE*0] = diff; + inptr[ctr + DCTSIZE*4] = sum; + sum = inptr[ctr + DCTSIZE*2] + inptr[ctr + DCTSIZE*6]; + diff = inptr[ctr + DCTSIZE*2] - inptr[ctr + DCTSIZE*6]; + inptr[ctr + DCTSIZE*2] = diff; + inptr[ctr + DCTSIZE*6] = sum; + sum = inptr[ctr + DCTSIZE*3] + inptr[ctr + DCTSIZE*5]; + diff = inptr[ctr + DCTSIZE*3] - inptr[ctr + DCTSIZE*5]; + inptr[ctr + DCTSIZE*3] = diff; + inptr[ctr + DCTSIZE*5] = sum; + sum = inptr[ctr + DCTSIZE*1] + inptr[ctr + DCTSIZE*7]; + diff = inptr[ctr + DCTSIZE*1] - inptr[ctr + DCTSIZE*7]; + inptr[ctr + DCTSIZE*1] = diff; + inptr[ctr + DCTSIZE*7] = sum; + } + wsptr = workspace; + for (ctr = DCTSIZE; ctr > 0; ctr--) { + + q13 = XMULTIPLY(inptr[DCTSIZE*2], XFIX_1_414213562); + q12 = inptr[DCTSIZE*3] + inptr[DCTSIZE*3]; + inptr[DCTSIZE*2] += q13; + q13 = XMULTIPLY(inptr[DCTSIZE*3], XFIX_2_613125930); + q10 = inptr[DCTSIZE*1] - inptr[DCTSIZE*3]; + q12 += q13; + q13 = XMULTIPLY(q10, XFIX_1_847759065); + q11 = inptr[DCTSIZE*7] - inptr[DCTSIZE*5]; + q10 += q13; + q13 = XMULTIPLY(q11, XFIX_1_414213562); + q11 += q13; + + q13 = XMULTIPLY(inptr[DCTSIZE*1], XFIX_1_082392200); + + inptr[DCTSIZE*2] = inptr[DCTSIZE*6] - inptr[DCTSIZE*2]; + q14 = inptr[DCTSIZE*0] - inptr[DCTSIZE*2]; + inptr[DCTSIZE*2] = inptr[DCTSIZE*0] + inptr[DCTSIZE*2]; + inptr[DCTSIZE*0] = inptr[DCTSIZE*4] + inptr[DCTSIZE*6]; + inptr[DCTSIZE*4] = inptr[DCTSIZE*4] - inptr[DCTSIZE*6]; + + inptr[DCTSIZE*1] += q13; + + q13 = inptr[DCTSIZE*7] + inptr[DCTSIZE*5]; + q12 = q13 - q12 - q10; + q11 = q12 + q11; + q10 = q11 + inptr[DCTSIZE*1] - q10; + + wsptr[7] = (int) (inptr[DCTSIZE*0] - q13); + wsptr[0] = (int) (inptr[DCTSIZE*0] + q13); + wsptr[6] = (int) (q14 + q12); + wsptr[1] = (int) (q14 - q12); + wsptr[5] = 
(int) (inptr[DCTSIZE*2] - q11); + wsptr[2] = (int) (inptr[DCTSIZE*2] + q11); + wsptr[3] = (int) (inptr[DCTSIZE*4] - q10); + wsptr[4] = (int) (inptr[DCTSIZE*4] + q10); + + inptr++; /* advance pointers to next column */ + wsptr += DCTSIZE; + } + + /* Pass 2: process rows from work array, store into output array. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. */ + inptr = workspace; + for (ctr = 0; ctr < 8; ctr++) { + int sum = inptr[ctr + DCTSIZE*0] + inptr[ctr + DCTSIZE*4]; + int diff = inptr[ctr + DCTSIZE*0] - inptr[ctr + DCTSIZE*4]; + inptr[ctr + DCTSIZE*0] = diff; + inptr[ctr + DCTSIZE*4] = sum; + sum = inptr[ctr + DCTSIZE*2] + inptr[ctr + DCTSIZE*6]; + diff = inptr[ctr + DCTSIZE*2] - inptr[ctr + DCTSIZE*6]; + inptr[ctr + DCTSIZE*2] = diff; + inptr[ctr + DCTSIZE*6] = sum; + sum = inptr[ctr + DCTSIZE*3] + inptr[ctr + DCTSIZE*5]; + diff = inptr[ctr + DCTSIZE*3] - inptr[ctr + DCTSIZE*5]; + inptr[ctr + DCTSIZE*3] = diff; + inptr[ctr + DCTSIZE*5] = sum; + sum = inptr[ctr + DCTSIZE*1] + inptr[ctr + DCTSIZE*7]; + diff = inptr[ctr + DCTSIZE*1] - inptr[ctr + DCTSIZE*7]; + inptr[ctr + DCTSIZE*1] = diff; + inptr[ctr + DCTSIZE*7] = sum; + } + + for (ctr = 0; ctr < DCTSIZE; ctr++) { + outptr = output_buf[ctr] + output_col; + + q13 = XMULTIPLY(inptr[DCTSIZE*2], XFIX_1_414213562); + q12 = inptr[DCTSIZE*3] + inptr[DCTSIZE*3]; + inptr[DCTSIZE*2] += q13; + q13 = XMULTIPLY(inptr[DCTSIZE*3], XFIX_2_613125930); + q10 = inptr[DCTSIZE*1] - inptr[DCTSIZE*3]; + q12 += q13; + q13 = XMULTIPLY(q10, XFIX_1_847759065); + q11 = inptr[DCTSIZE*7] - inptr[DCTSIZE*5]; + q10 += q13; + q13 = XMULTIPLY(q11, XFIX_1_414213562); + q11 += q13; + + q13 = XMULTIPLY(inptr[DCTSIZE*1], XFIX_1_082392200); + + inptr[DCTSIZE*2] = inptr[DCTSIZE*6] - inptr[DCTSIZE*2]; + q14 = inptr[DCTSIZE*0] - inptr[DCTSIZE*2]; + inptr[DCTSIZE*2] = inptr[DCTSIZE*0] + inptr[DCTSIZE*2]; + inptr[DCTSIZE*0] = inptr[DCTSIZE*4] + inptr[DCTSIZE*6]; + inptr[DCTSIZE*4] 
= inptr[DCTSIZE*4] - inptr[DCTSIZE*6]; + + inptr[DCTSIZE*1] += q13; + + q13 = inptr[DCTSIZE*7] + inptr[DCTSIZE*5]; + q12 = q13 - q12 - q10; + q11 = q12 + q11; + q10 = q11 + inptr[DCTSIZE*1] - q10; + + /* Final output stage: scale down by a factor of 8 and range-limit */ + outptr[7] = range_limit[IDESCALE(inptr[DCTSIZE*0] - q13, PASS1_BITS+3) + & RANGE_MASK]; + outptr[0] = range_limit[IDESCALE(inptr[DCTSIZE*0] + q13, PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[IDESCALE(q14 + q12, PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[IDESCALE(q14 - q12, PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[IDESCALE(inptr[DCTSIZE*2] - q11, PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[IDESCALE(inptr[DCTSIZE*2] + q11, PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[IDESCALE(inptr[DCTSIZE*4] - q10, PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[IDESCALE(inptr[DCTSIZE*4] + q10, PASS1_BITS+3) + & RANGE_MASK]; + + inptr++; /* advance pointers to next column */ + } +} + +#endif + +#else + GLOBAL(void) jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, @@ -365,4 +722,6 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, } } +#endif + #endif /* DCT_IFAST_SUPPORTED */ |