diff options
-rw-r--r-- | jcdctmgr.c | 94 |
1 file changed, 91 insertions, 3 deletions
@@ -4,6 +4,11 @@ * Copyright (C) 1994-1996, Thomas G. Lane. * Copyright (C) 1999-2006, MIYASAKA Masaru. * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB + * + * ARM NEON optimizations + * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. + * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com> + * * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -19,6 +24,10 @@ #include "jdct.h" /* Private declarations for DCT subsystem */ #include "jsimddct.h" +#if defined(__ARM_NEON__) +/* ARM NEON is good at doing floating point calculations for quantization */ +#define ENABLE_FLOAT_QUANTIZATION +#endif /* Private subobject for this module */ @@ -161,12 +170,19 @@ flss (UINT16 val) * routines. */ LOCAL(void) -compute_reciprocal (UINT16 divisor, DCTELEM * dtbl) +compute_reciprocal (UINT16 divisor, DCTELEM * dtbl, int index) { +#ifdef ENABLE_FLOAT_QUANTIZATION + float *dtbl_f = (float *)dtbl; + /* use a "magic" constant instead of 1.0 for getting bit-exact results */ + dtbl_f[index] = ((float)(1.0 + 1.0 / (1 << 23))) / divisor; +#else UDCTELEM2 fq, fr; UDCTELEM c; int b, r; + dtbl += index; + b = flss(divisor) - 1; r = sizeof(DCTELEM) * 8 + b; @@ -189,6 +205,7 @@ compute_reciprocal (UINT16 divisor, DCTELEM * dtbl) dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */ dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r)); /* scale */ dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */ +#endif } /* @@ -232,7 +249,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo) } dtbl = fdct->divisors[qtblno]; for (i = 0; i < DCTSIZE2; i++) { - compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]); + compute_reciprocal(qtbl->quantval[i] << 3, dtbl, i); } break; #endif @@ -269,7 +286,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo) compute_reciprocal( DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i], (INT32) aanscales[i]), - 
CONST_BITS-3), &dtbl[i]); + CONST_BITS-3), dtbl, i); } } break; @@ -371,6 +388,76 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace) METHODDEF(void) quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace) { +#ifdef ENABLE_FLOAT_QUANTIZATION +#ifdef __ARM_NEON__ + if (DCTSIZE == 8 && sizeof(DCTELEM) == 2 && sizeof(JCOEF) == 2) { + static int consts[8] = { + 0x4B189680, 0x4B189680, 0x4B189680, 0x4B189680, + 10000000, 10000000, 10000000, 10000000 + }; + register DCTELEM *workspace_ptr = workspace; + register float *divisors_ptr = divisors; + register JCOEFPTR output_ptr = coef_block; + register int i; + asm volatile ( + "vld1.32 {d0, d1, d2, d3}, [%[consts]]\n" + "mov %[i], #4\n" + "1:\n" + "vld1.16 {d4, d5, d6, d7}, [%[workspace_ptr]]!\n" /* load 16 */ + "vmov q12, q0\n" /* prepare accumulator */ + "vmovl.s16 q4, d4\n" + "vmovl.s16 q5, d5\n" + "vmovl.s16 q6, d6\n" + "vmovl.s16 q7, d7\n" + "vcvt.f32.s32 q4, q4\n" + "vcvt.f32.s32 q5, q5\n" + "vcvt.f32.s32 q6, q6\n" + "vcvt.f32.s32 q7, q7\n" + "vmov q13, q0\n" /* prepare accumulator */ + "vld1.32 {d16, d17, d18, d19}, [%[divisors_ptr], :128]!\n" + "vmov q14, q0\n" /* prepare accumulator */ + "vld1.32 {d20, d21, d22, d23}, [%[divisors_ptr], :128]!\n" + "vmov q15, q0\n" /* prepare accumulator */ + "vmla.f32 q12, q4, q8\n" + "vmla.f32 q13, q5, q9\n" + "vmla.f32 q14, q6, q10\n" + "vmla.f32 q15, q7, q11\n" + "vcvt.s32.f32 q12, q12\n" + "vcvt.s32.f32 q13, q13\n" + "vcvt.s32.f32 q14, q14\n" + "vcvt.s32.f32 q15, q15\n" + "vsub.s32 q12, q1\n" + "vsub.s32 q13, q1\n" + "vsub.s32 q14, q1\n" + "vsub.s32 q15, q1\n" + "vmovn.s32 d4, q12\n" + "vmovn.s32 d5, q13\n" + "vmovn.s32 d6, q14\n" + "vmovn.s32 d7, q15\n" + "vst1.16 {d4, d5, d6, d7}, [%[output_ptr]]!\n" /* store 16 */ + "subs %[i], %[i], #1\n" + "bgt 1b\n" + : [output_ptr] "+&r" (output_ptr), + [workspace_ptr] "+&r" (workspace_ptr), + [divisors_ptr] "+&r" (divisors_ptr), + [i] "=r" (i) + : [consts] "r" (consts) + : "cc", "memory", + 
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", + "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"); + return; + } +#endif + int i; + float *divisors_f = (float *)divisors; + JCOEFPTR output_ptr = coef_block; + for (i = 0; i < DCTSIZE2; i++) { + output_ptr[i] = (JCOEF)((int)( + (divisors_f[i] * workspace[i]) + 10000000.5) - 10000000); + } +#else int i; DCTELEM temp; UDCTELEM recip, corr, shift; @@ -397,6 +484,7 @@ quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace) output_ptr[i] = (JCOEF) temp; } +#endif } |