aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Siarhei Siamashka <siarhei.siamashka@nokia.com> 2010-11-10 04:03:36 +0200
committer Siarhei Siamashka <siarhei.siamashka@nokia.com> 2010-11-10 06:34:05 +0200
commit 5e5540254137b18be88653a7681d776f26eceb53 (patch)
tree 225786acaf04ccb1b7b6044dfe71efbdd0a19a54
parent d7f750fa9fb6b55e2ffb3a6d6cafa3cb4494cec4 (diff)
ARM assembly optimizations for 'encode_one_block'
Almost 2x faster than original C variant.
-rw-r--r-- jchuff.c 362
1 files changed, 362 insertions, 0 deletions
diff --git a/jchuff.c b/jchuff.c
index b05c8e7..5fbc53d 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -2,6 +2,11 @@
* jchuff.c
*
* Copyright (C) 1991-1997, Thomas G. Lane.
+ *
+ * ARM optimizations
+ * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
+ * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
+ *
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -505,6 +510,349 @@ flush_bits (working_state * state)
return TRUE;
}
+/*********************************/
+
+#ifdef __arm__
+
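+/*
+ * Ruby script that generates the unrolled ldrh/cmp/strh/movne sequence used
+ * in the inline assembly of find_last_nonzero_index() below:
+ *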
+#!/usr/bin/env ruby
+
+require 'generator'
+
+$jpeg_natural_order = Generator.new(
+[
+ 1, 8, 16, 9, 2, 3, 10,
+ 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34,
+ 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36,
+ 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46,
+ 53, 60, 61, 54, 47, 55, 62, 63
+])
+
+$ldr_reg = Generator.new {|g|
+ while true do
+ ["%[tmp1]", "%[tmp2]", "%[tmp3]"].each {|x| g.yield x}
+ end
+}
+
+$cmp_reg = Generator.new {|g|
+ while true do
+ ["%[tmp1]", "%[tmp2]", "%[tmp3]"].each {|x| g.yield x}
+ end
+}
+
+$idx = Generator.new(1.upto(1000))
+
+def load
+ printf("\t\t\"ldrh %s, [%%[blk], #%d]\\n\"\n",
+ $ldr_reg.next, $jpeg_natural_order.next * 2)
+end
+
+def store
+ r = $cmp_reg.next
+ i = $idx.next
+ printf("\t\t\"cmp %s, #0\\n\"\n", r)
+ printf("\t\t\"strh %s, [%%[out], #%d]\\n\"\n", r, i * 2)
+ printf("\t\t\"movne %%[n], #%d\\n\"\n", i)
+end
+
+load
+load
+while $jpeg_natural_order.next? do
+ load
+ store
+end
+store
+store
+*/
+
+/*
+ * Copy the 63 AC coefficients of 'block' into 'out', reordered through the
+ * jpeg_natural_order table (out[i] = block[jpeg_natural_order[i]] for
+ * i = 1..63), and find the index of the last nonzero one.
+ *
+ * block: source coefficients; 16-bit values are read at the byte offsets
+ *        hard-coded in the loads below (jpeg_natural_order[i] * 2).
+ *        Byte offset 0 (the first coefficient) is never read.
+ * out:   destination; the i-th value read is stored at byte offset i * 2.
+ *        out[0] is never written.
+ *
+ * Returns the largest i in 1..63 for which the stored value is nonzero,
+ * or 0 when all 63 values are zero.
+ *
+ * The body is fully unrolled.  Every coefficient gets one
+ * ldrh / cmp / strh / movne group, and three temporaries are rotated so
+ * that each load is issued well before the cmp/strh that consume it.
+ * strh does not modify the condition flags, so each movne still acts on
+ * the result of the cmp two instructions earlier.  Instructions are
+ * scheduled to make use of ARM Cortex-A8 dual-issue capability.
+ *
+ * NOTE(review): the conditional movne instructions are written without IT
+ * blocks; a Thumb-2 build may need -mimplicit-it or an ARM-mode-only guard
+ * -- confirm against the build flags in use.
+ */
+LOCAL(int)
+find_last_nonzero_index (JCOEFPTR block, JCOEFPTR out)
+{
+ int tmp1, tmp2, tmp3, n = 0; /* n stays 0 unless some movne executes */
+ asm volatile (
+ "ldrh %[tmp1], [%[blk], #2]\n"
+ "ldrh %[tmp2], [%[blk], #16]\n"
+ "ldrh %[tmp3], [%[blk], #32]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #2]\n"
+ "movne %[n], #1\n"
+ "ldrh %[tmp1], [%[blk], #18]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #4]\n"
+ "movne %[n], #2\n"
+ "ldrh %[tmp2], [%[blk], #4]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #6]\n"
+ "movne %[n], #3\n"
+ "ldrh %[tmp3], [%[blk], #6]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #8]\n"
+ "movne %[n], #4\n"
+ "ldrh %[tmp1], [%[blk], #20]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #10]\n"
+ "movne %[n], #5\n"
+ "ldrh %[tmp2], [%[blk], #34]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #12]\n"
+ "movne %[n], #6\n"
+ "ldrh %[tmp3], [%[blk], #48]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #14]\n"
+ "movne %[n], #7\n"
+ "ldrh %[tmp1], [%[blk], #64]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #16]\n"
+ "movne %[n], #8\n"
+ "ldrh %[tmp2], [%[blk], #50]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #18]\n"
+ "movne %[n], #9\n"
+ "ldrh %[tmp3], [%[blk], #36]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #20]\n"
+ "movne %[n], #10\n"
+ "ldrh %[tmp1], [%[blk], #22]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #22]\n"
+ "movne %[n], #11\n"
+ "ldrh %[tmp2], [%[blk], #8]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #24]\n"
+ "movne %[n], #12\n"
+ "ldrh %[tmp3], [%[blk], #10]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #26]\n"
+ "movne %[n], #13\n"
+ "ldrh %[tmp1], [%[blk], #24]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #28]\n"
+ "movne %[n], #14\n"
+ "ldrh %[tmp2], [%[blk], #38]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #30]\n"
+ "movne %[n], #15\n"
+ "ldrh %[tmp3], [%[blk], #52]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #32]\n"
+ "movne %[n], #16\n"
+ "ldrh %[tmp1], [%[blk], #66]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #34]\n"
+ "movne %[n], #17\n"
+ "ldrh %[tmp2], [%[blk], #80]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #36]\n"
+ "movne %[n], #18\n"
+ "ldrh %[tmp3], [%[blk], #96]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #38]\n"
+ "movne %[n], #19\n"
+ "ldrh %[tmp1], [%[blk], #82]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #40]\n"
+ "movne %[n], #20\n"
+ "ldrh %[tmp2], [%[blk], #68]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #42]\n"
+ "movne %[n], #21\n"
+ "ldrh %[tmp3], [%[blk], #54]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #44]\n"
+ "movne %[n], #22\n"
+ "ldrh %[tmp1], [%[blk], #40]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #46]\n"
+ "movne %[n], #23\n"
+ "ldrh %[tmp2], [%[blk], #26]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #48]\n"
+ "movne %[n], #24\n"
+ "ldrh %[tmp3], [%[blk], #12]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #50]\n"
+ "movne %[n], #25\n"
+ "ldrh %[tmp1], [%[blk], #14]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #52]\n"
+ "movne %[n], #26\n"
+ "ldrh %[tmp2], [%[blk], #28]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #54]\n"
+ "movne %[n], #27\n"
+ "ldrh %[tmp3], [%[blk], #42]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #56]\n"
+ "movne %[n], #28\n"
+ "ldrh %[tmp1], [%[blk], #56]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #58]\n"
+ "movne %[n], #29\n"
+ "ldrh %[tmp2], [%[blk], #70]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #60]\n"
+ "movne %[n], #30\n"
+ "ldrh %[tmp3], [%[blk], #84]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #62]\n"
+ "movne %[n], #31\n"
+ "ldrh %[tmp1], [%[blk], #98]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #64]\n"
+ "movne %[n], #32\n"
+ "ldrh %[tmp2], [%[blk], #112]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #66]\n"
+ "movne %[n], #33\n"
+ "ldrh %[tmp3], [%[blk], #114]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #68]\n"
+ "movne %[n], #34\n"
+ "ldrh %[tmp1], [%[blk], #100]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #70]\n"
+ "movne %[n], #35\n"
+ "ldrh %[tmp2], [%[blk], #86]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #72]\n"
+ "movne %[n], #36\n"
+ "ldrh %[tmp3], [%[blk], #72]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #74]\n"
+ "movne %[n], #37\n"
+ "ldrh %[tmp1], [%[blk], #58]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #76]\n"
+ "movne %[n], #38\n"
+ "ldrh %[tmp2], [%[blk], #44]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #78]\n"
+ "movne %[n], #39\n"
+ "ldrh %[tmp3], [%[blk], #30]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #80]\n"
+ "movne %[n], #40\n"
+ "ldrh %[tmp1], [%[blk], #46]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #82]\n"
+ "movne %[n], #41\n"
+ "ldrh %[tmp2], [%[blk], #60]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #84]\n"
+ "movne %[n], #42\n"
+ "ldrh %[tmp3], [%[blk], #74]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #86]\n"
+ "movne %[n], #43\n"
+ "ldrh %[tmp1], [%[blk], #88]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #88]\n"
+ "movne %[n], #44\n"
+ "ldrh %[tmp2], [%[blk], #102]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #90]\n"
+ "movne %[n], #45\n"
+ "ldrh %[tmp3], [%[blk], #116]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #92]\n"
+ "movne %[n], #46\n"
+ "ldrh %[tmp1], [%[blk], #118]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #94]\n"
+ "movne %[n], #47\n"
+ "ldrh %[tmp2], [%[blk], #104]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #96]\n"
+ "movne %[n], #48\n"
+ "ldrh %[tmp3], [%[blk], #90]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #98]\n"
+ "movne %[n], #49\n"
+ "ldrh %[tmp1], [%[blk], #76]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #100]\n"
+ "movne %[n], #50\n"
+ "ldrh %[tmp2], [%[blk], #62]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #102]\n"
+ "movne %[n], #51\n"
+ "ldrh %[tmp3], [%[blk], #78]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #104]\n"
+ "movne %[n], #52\n"
+ "ldrh %[tmp1], [%[blk], #92]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #106]\n"
+ "movne %[n], #53\n"
+ "ldrh %[tmp2], [%[blk], #106]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #108]\n"
+ "movne %[n], #54\n"
+ "ldrh %[tmp3], [%[blk], #120]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #110]\n"
+ "movne %[n], #55\n"
+ "ldrh %[tmp1], [%[blk], #122]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #112]\n"
+ "movne %[n], #56\n"
+ "ldrh %[tmp2], [%[blk], #108]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #114]\n"
+ "movne %[n], #57\n"
+ "ldrh %[tmp3], [%[blk], #94]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #116]\n"
+ "movne %[n], #58\n"
+ "ldrh %[tmp1], [%[blk], #110]\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #118]\n"
+ "movne %[n], #59\n"
+ "ldrh %[tmp2], [%[blk], #124]\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #120]\n"
+ "movne %[n], #60\n"
+ "ldrh %[tmp3], [%[blk], #126]\n"
+ "cmp %[tmp1], #0\n"
+ "strh %[tmp1], [%[out], #122]\n"
+ "movne %[n], #61\n"
+ "cmp %[tmp2], #0\n"
+ "strh %[tmp2], [%[out], #124]\n"
+ "movne %[n], #62\n"
+ "cmp %[tmp3], #0\n"
+ "strh %[tmp3], [%[out], #126]\n"
+ "movne %[n], #63\n"
+ : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), /* early-clobber scratch registers */
+ [n] "+&r" (n) /* read-write: the initial 0 survives if no movne fires */
+ : [blk] "r" (block), [out] "r" (out)
+ : "memory", "cc"); /* the asm stores through 'out' and changes the flags */
+ return n;
+}
+
+/*
+ * Plain C equivalent of the assembly routine above, kept for reference:
+ *
+LOCAL(int)
+find_last_nonzero_index (JCOEFPTR block, JCOEFPTR out)
+{
+ int tmp, i, n = 0;
+ for (i = 1; i < DCTSIZE2; i++) {
+ if ((tmp = block[jpeg_natural_order[i]]) != 0)
+ n = i;
+ out[i] = tmp;
+ }
+ return n;
+}
+*/
+
+#endif
+
/* Encode a single block's worth of coefficients */
LOCAL(boolean)
@@ -518,6 +866,10 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
size_t put_buffer; int put_bits;
int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0];
size_t bytes, bytestocopy; int localbuf = 0;
+#ifdef __arm__
+ int last_nonzero_index, k;
+ JCOEF workspace[DCTSIZE2];
+#endif
put_buffer = state->cur.put_buffer;
put_bits = state->cur.put_bits;
@@ -552,6 +904,15 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
r = 0; \
}}
+#ifdef __arm__
+ last_nonzero_index = find_last_nonzero_index(block, workspace) * 2;
+ block = &workspace[0];
+ for (k = 2; k <= last_nonzero_index; k += 2) {
+ innerloop(k);
+ }
+ /* If the last coef(s) were zero, emit an end-of-block code */
+ if (k < DCTSIZE2 * 2) DUMP_SINGLE_VALUE(actbl, 0x0)
+#else
innerloop(2*1); innerloop(2*8); innerloop(2*16); innerloop(2*9);
innerloop(2*2); innerloop(2*3); innerloop(2*10); innerloop(2*17);
innerloop(2*24); innerloop(2*32); innerloop(2*25); innerloop(2*18);
@@ -571,6 +932,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
/* If the last coef(s) were zero, emit an end-of-block code */
if (r > 0) DUMP_SINGLE_VALUE(actbl, 0x0)
+#endif
state->cur.put_buffer = put_buffer;
state->cur.put_bits = put_bits;