aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Siarhei Siamashka <siarhei.siamashka@nokia.com> 2010-11-10 06:05:50 +0200
committer: Siarhei Siamashka <siarhei.siamashka@nokia.com> 2010-11-10 06:34:45 +0200
commit: 8a20111152bbe1d7f61fef089672c7f596ba7c81 (patch)
tree: af6bca1557c6e1a38671e2fe01f9705c14039063
parent: 5e5540254137b18be88653a7681d776f26eceb53 (diff)
ARM NEON optimizations for quantization code in 'forward_DCT' (HEAD, master)
Integer divisions are replaced with floating-point multiplications. The code is still bit-exact (the calculations produce the same results over the entire range of possible input values). This optimized function is now almost 3x faster than the original C variant and somewhat improves encoding performance.
-rw-r--r-- jcdctmgr.c 94
1 file changed, 91 insertions, 3 deletions
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 060c58a..93c8265 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -4,6 +4,11 @@
* Copyright (C) 1994-1996, Thomas G. Lane.
* Copyright (C) 1999-2006, MIYASAKA Masaru.
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ *
+ * ARM NEON optimizations
+ * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
+ * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
+ *
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -19,6 +24,10 @@
#include "jdct.h" /* Private declarations for DCT subsystem */
#include "jsimddct.h"
+#if defined(__ARM_NEON__)
+/* ARM NEON is good at doing floating point calculations for quantization */
+#define ENABLE_FLOAT_QUANTIZATION
+#endif
/* Private subobject for this module */
@@ -161,12 +170,19 @@ flss (UINT16 val)
* routines.
*/
LOCAL(void)
-compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
+compute_reciprocal (UINT16 divisor, DCTELEM * dtbl, int index)
{
+#ifdef ENABLE_FLOAT_QUANTIZATION
+ float *dtbl_f = (float *)dtbl;
+ /* use a "magic" constant instead of 1.0 for getting bit-exact results */
+ dtbl_f[index] = ((float)(1.0 + 1.0 / (1 << 23))) / divisor;
+#else
UDCTELEM2 fq, fr;
UDCTELEM c;
int b, r;
+ dtbl += index;
+
b = flss(divisor) - 1;
r = sizeof(DCTELEM) * 8 + b;
@@ -189,6 +205,7 @@ compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */
dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r)); /* scale */
dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
+#endif
}
/*
@@ -232,7 +249,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
}
dtbl = fdct->divisors[qtblno];
for (i = 0; i < DCTSIZE2; i++) {
- compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]);
+ compute_reciprocal(qtbl->quantval[i] << 3, dtbl, i);
}
break;
#endif
@@ -269,7 +286,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
compute_reciprocal(
DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
(INT32) aanscales[i]),
- CONST_BITS-3), &dtbl[i]);
+ CONST_BITS-3), dtbl, i);
}
}
break;
@@ -371,6 +388,76 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
METHODDEF(void)
quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
{
+#ifdef ENABLE_FLOAT_QUANTIZATION
+#ifdef __ARM_NEON__
+ if (DCTSIZE == 8 && sizeof(DCTELEM) == 2 && sizeof(JCOEF) == 2) {
+ static int consts[8] = {
+ 0x4B189680, 0x4B189680, 0x4B189680, 0x4B189680,
+ 10000000, 10000000, 10000000, 10000000
+ };
+ register DCTELEM *workspace_ptr = workspace;
+ register float *divisors_ptr = divisors;
+ register JCOEFPTR output_ptr = coef_block;
+ register int i;
+ asm volatile (
+ "vld1.32 {d0, d1, d2, d3}, [%[consts]]\n"
+ "mov %[i], #4\n"
+ "1:\n"
+ "vld1.16 {d4, d5, d6, d7}, [%[workspace_ptr]]!\n" /* load 16 */
+ "vmov q12, q0\n" /* prepare accumulator */
+ "vmovl.s16 q4, d4\n"
+ "vmovl.s16 q5, d5\n"
+ "vmovl.s16 q6, d6\n"
+ "vmovl.s16 q7, d7\n"
+ "vcvt.f32.s32 q4, q4\n"
+ "vcvt.f32.s32 q5, q5\n"
+ "vcvt.f32.s32 q6, q6\n"
+ "vcvt.f32.s32 q7, q7\n"
+ "vmov q13, q0\n" /* prepare accumulator */
+ "vld1.32 {d16, d17, d18, d19}, [%[divisors_ptr], :128]!\n"
+ "vmov q14, q0\n" /* prepare accumulator */
+ "vld1.32 {d20, d21, d22, d23}, [%[divisors_ptr], :128]!\n"
+ "vmov q15, q0\n" /* prepare accumulator */
+ "vmla.f32 q12, q4, q8\n"
+ "vmla.f32 q13, q5, q9\n"
+ "vmla.f32 q14, q6, q10\n"
+ "vmla.f32 q15, q7, q11\n"
+ "vcvt.s32.f32 q12, q12\n"
+ "vcvt.s32.f32 q13, q13\n"
+ "vcvt.s32.f32 q14, q14\n"
+ "vcvt.s32.f32 q15, q15\n"
+ "vsub.s32 q12, q1\n"
+ "vsub.s32 q13, q1\n"
+ "vsub.s32 q14, q1\n"
+ "vsub.s32 q15, q1\n"
+ "vmovn.s32 d4, q12\n"
+ "vmovn.s32 d5, q13\n"
+ "vmovn.s32 d6, q14\n"
+ "vmovn.s32 d7, q15\n"
+ "vst1.16 {d4, d5, d6, d7}, [%[output_ptr]]!\n" /* store 16 */
+ "subs %[i], %[i], #1\n"
+ "bgt 1b\n"
+ : [output_ptr] "+&r" (output_ptr),
+ [workspace_ptr] "+&r" (workspace_ptr),
+ [divisors_ptr] "+&r" (divisors_ptr),
+ [i] "=r" (i)
+ : [consts] "r" (consts)
+ : "cc", "memory",
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+ "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+ "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+ "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
+ return;
+ }
+#endif
+ int i;
+ float *divisors_f = (float *)divisors;
+ JCOEFPTR output_ptr = coef_block;
+ for (i = 0; i < DCTSIZE2; i++) {
+ output_ptr[i] = (JCOEF)((int)(
+ (divisors_f[i] * workspace[i]) + 10000000.5) - 10000000);
+ }
+#else
int i;
DCTELEM temp;
UDCTELEM recip, corr, shift;
@@ -397,6 +484,7 @@ quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
output_ptr[i] = (JCOEF) temp;
}
+#endif
}