aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- jcdctmgr.c | 94
1 files changed, 91 insertions, 3 deletions
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 060c58a..93c8265 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -4,6 +4,11 @@
* Copyright (C) 1994-1996, Thomas G. Lane.
* Copyright (C) 1999-2006, MIYASAKA Masaru.
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ *
+ * ARM NEON optimizations
+ * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
+ * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
+ *
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -19,6 +24,10 @@
#include "jdct.h" /* Private declarations for DCT subsystem */
#include "jsimddct.h"
+#if defined(__ARM_NEON__)
+/* ARM NEON handles floating-point arithmetic efficiently, so quantize in floating point there */
+#define ENABLE_FLOAT_QUANTIZATION
+#endif
/* Private subobject for this module */
@@ -161,12 +170,19 @@ flss (UINT16 val)
* routines.
*/
LOCAL(void)
-compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
+compute_reciprocal (UINT16 divisor, DCTELEM * dtbl, int index)
{
+#ifdef ENABLE_FLOAT_QUANTIZATION
+ float *dtbl_f = (float *)dtbl;
+ /* use 1.0 + 2^-23 (a "magic" constant just above 1.0) as the numerator for getting bit-exact results */
+ dtbl_f[index] = ((float)(1.0 + 1.0 / (1 << 23))) / divisor;
+#else
UDCTELEM2 fq, fr;
UDCTELEM c;
int b, r;
+ dtbl += index;
+
b = flss(divisor) - 1;
r = sizeof(DCTELEM) * 8 + b;
@@ -189,6 +205,7 @@ compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */
dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r)); /* scale */
dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
+#endif
}
/*
@@ -232,7 +249,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
}
dtbl = fdct->divisors[qtblno];
for (i = 0; i < DCTSIZE2; i++) {
- compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]);
+ compute_reciprocal(qtbl->quantval[i] << 3, dtbl, i);
}
break;
#endif
@@ -269,7 +286,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
compute_reciprocal(
DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
(INT32) aanscales[i]),
- CONST_BITS-3), &dtbl[i]);
+ CONST_BITS-3), dtbl, i);
}
}
break;
@@ -371,6 +388,76 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
METHODDEF(void)
quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
{
+#ifdef ENABLE_FLOAT_QUANTIZATION
+#ifdef __ARM_NEON__
+ if (DCTSIZE == 8 && sizeof(DCTELEM) == 2 && sizeof(JCOEF) == 2) {
+ static int consts[8] = {
+ 0x4B189680, 0x4B189680, 0x4B189680, 0x4B189680,
+ 10000000, 10000000, 10000000, 10000000
+ };
+ register DCTELEM *workspace_ptr = workspace;
+ register float *divisors_ptr = divisors;
+ register JCOEFPTR output_ptr = coef_block;
+ register int i;
+ asm volatile (
+ "vld1.32 {d0, d1, d2, d3}, [%[consts]]\n"
+ "mov %[i], #4\n"
+ "1:\n"
+ "vld1.16 {d4, d5, d6, d7}, [%[workspace_ptr]]!\n" /* load 16 */
+ "vmov q12, q0\n" /* prepare accumulator */
+ "vmovl.s16 q4, d4\n"
+ "vmovl.s16 q5, d5\n"
+ "vmovl.s16 q6, d6\n"
+ "vmovl.s16 q7, d7\n"
+ "vcvt.f32.s32 q4, q4\n"
+ "vcvt.f32.s32 q5, q5\n"
+ "vcvt.f32.s32 q6, q6\n"
+ "vcvt.f32.s32 q7, q7\n"
+ "vmov q13, q0\n" /* prepare accumulator */
+ "vld1.32 {d16, d17, d18, d19}, [%[divisors_ptr], :128]!\n"
+ "vmov q14, q0\n" /* prepare accumulator */
+ "vld1.32 {d20, d21, d22, d23}, [%[divisors_ptr], :128]!\n"
+ "vmov q15, q0\n" /* prepare accumulator */
+ "vmla.f32 q12, q4, q8\n"
+ "vmla.f32 q13, q5, q9\n"
+ "vmla.f32 q14, q6, q10\n"
+ "vmla.f32 q15, q7, q11\n"
+ "vcvt.s32.f32 q12, q12\n"
+ "vcvt.s32.f32 q13, q13\n"
+ "vcvt.s32.f32 q14, q14\n"
+ "vcvt.s32.f32 q15, q15\n"
+ "vsub.s32 q12, q1\n"
+ "vsub.s32 q13, q1\n"
+ "vsub.s32 q14, q1\n"
+ "vsub.s32 q15, q1\n"
+ "vmovn.s32 d4, q12\n"
+ "vmovn.s32 d5, q13\n"
+ "vmovn.s32 d6, q14\n"
+ "vmovn.s32 d7, q15\n"
+ "vst1.16 {d4, d5, d6, d7}, [%[output_ptr]]!\n" /* store 16 */
+ "subs %[i], %[i], #1\n"
+ "bgt 1b\n"
+ : [output_ptr] "+&r" (output_ptr),
+ [workspace_ptr] "+&r" (workspace_ptr),
+ [divisors_ptr] "+&r" (divisors_ptr),
+ [i] "=r" (i)
+ : [consts] "r" (consts)
+ : "cc", "memory",
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+ "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+ "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+ "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
+ return;
+ }
+#endif
+ int i;
+ float *divisors_f = (float *)divisors;
+ JCOEFPTR output_ptr = coef_block;
+ for (i = 0; i < DCTSIZE2; i++) {
+ output_ptr[i] = (JCOEF)((int)(
+ (divisors_f[i] * workspace[i]) + 10000000.5) - 10000000);
+ }
+#else
int i;
DCTELEM temp;
UDCTELEM recip, corr, shift;
@@ -397,6 +484,7 @@ quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
output_ptr[i] = (JCOEF) temp;
}
+#endif
}