ARM NEON optimizations for quantization code in 'forward_DCT' (HEAD, master)

Integer divisions are replaced with floating-point multiplications. The code is still bit-exact (the calculations produce the same results over the entire possible range of input values). The optimized function is now almost 3x faster than the original C variant and improves encoding performance somewhat.
author: Siarhei Siamashka <siarhei.siamashka@nokia.com> 2010-11-10 06:05:50 +0200
committer: Siarhei Siamashka <siarhei.siamashka@nokia.com> 2010-11-10 06:34:45 +0200
commit: 8a20111152bbe1d7f61fef089672c7f596ba7c81 (patch)
tree: af6bca1557c6e1a38671e2fe01f9705c14039063
parent: 5e5540254137b18be88653a7681d776f26eceb53 (diff)
1 files changed, 91 insertions, 3 deletions
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 060c58a..93c8265 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -4,6 +4,11 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ *
+ * ARM NEON optimizations
+ * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
+ * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
+ *
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -19,6 +24,10 @@
 #include "jdct.h"		/* Private declarations for DCT subsystem */
 #include "jsimddct.h"
 
+#if defined(__ARM_NEON__)
+/* ARM NEON is good at doing floating point calculations for quantization */
+#define ENABLE_FLOAT_QUANTIZATION
+#endif
 
 /* Private subobject for this module */
 
@@ -161,12 +170,19 @@ flss (UINT16 val)
  * routines.
  */
 LOCAL(void)
-compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
+compute_reciprocal (UINT16 divisor, DCTELEM * dtbl, int index)
 {
+#ifdef ENABLE_FLOAT_QUANTIZATION
+  float *dtbl_f = (float *)dtbl;
+  /* use a "magic" constant instead of 1.0 for getting bit-exact results */
+  dtbl_f[index] = ((float)(1.0 + 1.0 / (1 << 23))) / divisor;
+#else
   UDCTELEM2 fq, fr;
   UDCTELEM c;
   int b, r;
 
+  dtbl += index;
+
   b = flss(divisor) - 1;
   r  = sizeof(DCTELEM) * 8 + b;
 
@@ -189,6 +205,7 @@ compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
   dtbl[DCTSIZE2 * 1] = (DCTELEM) c;       /* correction + roundfactor */
   dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r));  /* scale */
   dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
+#endif
 }
 
 /*
@@ -232,7 +249,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
       }
       dtbl = fdct->divisors[qtblno];
       for (i = 0; i < DCTSIZE2; i++) {
-	compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]);
+	compute_reciprocal(qtbl->quantval[i] << 3, dtbl, i);
       }
       break;
 #endif
@@ -269,7 +286,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 	  compute_reciprocal(
 	    DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
 				  (INT32) aanscales[i]),
-		    CONST_BITS-3), &dtbl[i]);
+		    CONST_BITS-3), dtbl, i);
 	}
       }
       break;
@@ -371,6 +388,76 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
 METHODDEF(void)
 quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
 {
+#ifdef ENABLE_FLOAT_QUANTIZATION
+#ifdef __ARM_NEON__
+  if (DCTSIZE == 8 && sizeof(DCTELEM) == 2 && sizeof(JCOEF) == 2) {
+    static int consts[8] = {
+      0x4B189680, 0x4B189680, 0x4B189680, 0x4B189680,
+      10000000, 10000000, 10000000, 10000000
+    };
+    register DCTELEM *workspace_ptr = workspace;
+    register float *divisors_ptr = divisors;
+    register JCOEFPTR output_ptr = coef_block;
+    register int i;
+    asm volatile (
+	"vld1.32      {d0, d1, d2, d3}, [%[consts]]\n"
+	"mov          %[i], #4\n"
+	"1:\n"
+	"vld1.16      {d4, d5, d6, d7}, [%[workspace_ptr]]!\n" /* load 16 */
+	"vmov         q12, q0\n" /* prepare accumulator */
+	"vmovl.s16    q4, d4\n"
+	"vmovl.s16    q5, d5\n"
+	"vmovl.s16    q6, d6\n"
+	"vmovl.s16    q7, d7\n"
+	"vcvt.f32.s32 q4, q4\n"
+	"vcvt.f32.s32 q5, q5\n"
+	"vcvt.f32.s32 q6, q6\n"
+	"vcvt.f32.s32 q7, q7\n"
+	"vmov         q13, q0\n" /* prepare accumulator */
+	"vld1.32      {d16, d17, d18, d19}, [%[divisors_ptr], :128]!\n"
+	"vmov         q14, q0\n" /* prepare accumulator */
+	"vld1.32      {d20, d21, d22, d23}, [%[divisors_ptr], :128]!\n"
+	"vmov         q15, q0\n" /* prepare accumulator */
+	"vmla.f32     q12, q4, q8\n"
+	"vmla.f32     q13, q5, q9\n"
+	"vmla.f32     q14, q6, q10\n"
+	"vmla.f32     q15, q7, q11\n"
+	"vcvt.s32.f32 q12, q12\n"
+	"vcvt.s32.f32 q13, q13\n"
+	"vcvt.s32.f32 q14, q14\n"
+	"vcvt.s32.f32 q15, q15\n"
+	"vsub.s32     q12, q1\n"
+	"vsub.s32     q13, q1\n"
+	"vsub.s32     q14, q1\n"
+	"vsub.s32     q15, q1\n"
+	"vmovn.s32    d4, q12\n"
+	"vmovn.s32    d5, q13\n"
+	"vmovn.s32    d6, q14\n"
+	"vmovn.s32    d7, q15\n"
+	"vst1.16      {d4, d5, d6, d7}, [%[output_ptr]]!\n" /* store 16 */
+	"subs         %[i], %[i], #1\n"
+	"bgt          1b\n"
+	: [output_ptr] "+&r" (output_ptr),
+	  [workspace_ptr] "+&r" (workspace_ptr),
+	  [divisors_ptr] "+&r" (divisors_ptr),
+	  [i] "=r" (i)
+	: [consts] "r" (consts)
+	: "cc", "memory",
+	  "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7",
+	  "d8",  "d9",  "d10", "d11", "d12", "d13", "d14", "d15",
+	  "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+	  "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
+    return;
+  }
+#endif
+  int i;
+  float *divisors_f = (float *)divisors;
+  JCOEFPTR output_ptr = coef_block;
+  for (i = 0; i < DCTSIZE2; i++) {
+    output_ptr[i] = (JCOEF)((int)(
+      (divisors_f[i] * workspace[i]) + 10000000.5) - 10000000);
+  }
+#else
   int i;
   DCTELEM temp;
   UDCTELEM recip, corr, shift;
@@ -397,6 +484,7 @@ quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
 
     output_ptr[i] = (JCOEF) temp;
   }
+#endif
 }
author	Siarhei Siamashka <siarhei.siamashka@nokia.com>	2010-11-10 06:05:50 +0200
committer	Siarhei Siamashka <siarhei.siamashka@nokia.com>	2010-11-10 06:34:45 +0200
commit	8a20111152bbe1d7f61fef089672c7f596ba7c81 (patch)
tree	af6bca1557c6e1a38671e2fe01f9705c14039063
parent	5e5540254137b18be88653a7681d776f26eceb53 (diff)