sync to as released debian control files and such for the package (HEAD, master)

author: Tom Gall <tom.gall@linaro.org> 2011-12-23 10:42:23 -0600
committer: Tom Gall <tom.gall@linaro.org> 2011-12-23 10:42:23 -0600
commit: d519a0ef385e8d74fce083497630ffb4e9ba0adc (patch)
tree: 1b145eb57dd8a40f710ff23ec591703b24e3aa0c
parent: f1a90e77f305fc679c9fcbd4c5a11deb48f92c29 (diff)
34 files changed, 1823 insertions, 3765 deletions
diff --git a/debian/README.source b/debian/README.source
new file mode 100644
index 0000000..7544e70
--- /dev/null
+++ b/debian/README.source
@@ -0,0 +1 @@
+Please refer to /usr/share/doc/quilt/README.source for use of quilt.
diff --git a/debian/changelog b/debian/changelog
index 23fdd23..c49311d 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,72 +1,55 @@
-libjpeg-turbo (1.1.90-1inaror6) unreleased; urgency=low
+libjpeg-turbo (1.1.90+svn733-0ubuntu2) precise; urgency=low
 
-  * sync to change 691
-  * 690 NEON-accelerated slow integer inverse DCT
-  * 691 Improve the performance of YCbCr to RGB conversion on ARM
+  * Sync with upstream to svn733.
 
- -- Tom Gall <tom.gall@linaro.org>  Thu, 25 Aug 2011 18:38:04 +0000
+  * Rename libjpeg-test to libjpeg-turbo-test.
+  * Rename libjpeg-turbo-dbg to libjpeg-turbo8-dbg.
+  * Rename libjpeg8-dev to libjpeg-turbo8-dev.
+  * Move the docs into the -dev package, install the upstream changelog
+    in the -dev only.
+  * Split out libturbojpeg.so into its own package, don't let
+    libjpeg-turbo8-dev depend on it.
+  * Fix libjpeg-turbo8-dbg package description.
+  * Install jconfig.h into multiarch include path.
+  * Remove HAVE_STD{LIB,DEF}_H from jconfig.h since they are not used and
+    conflict with autoconf.
+  * libjpeg-turbo8:
+    - Add a symbols file, with a different version for symbols only found
+      in the libjpeg-turbo implementation.
+    - Remove the shlibs file.
+    - Breaks/Replaces libjpeg8 (<< 8c-2ubuntu5).
+  * Copy the exifautotran and jpegexiforient tools from the libjpeg8
+    sources, install into libjpeg-turbo-progs.
+  * Don't install tjbench in libjpeg-turbo-progs to avoid dependency
+    on libturbojpeg.
 
-libjpeg-turbo (1.1.90-1inaro6) natty; urgency=low
+ -- Matthias Klose <doko@ubuntu.com>  Tue, 20 Dec 2011 23:12:52 +0100
 
-  * Release
-  * sync to svn change 689
+libjpeg-turbo (1.1.90+svn722-1ubuntu5) precise; urgency=low
 
- -- Tom Gall <tom.gall@linaro.org>  Tue, 16 Aug 2011 22:58:35 +0000
+  * Remove all usage of diverts in preparation to replace
+    libjpeg8 in precise
+  * small clean up in debian/control
 
-libjpeg-turbo (1.1.1-1inaro5) unreleased; urgency=low
+ -- Tom Gall <tom.gall@linaro.org>  Thu, 01 Dec 2011 09:50:26 -0600
 
-  * Merge in 1.1.90 upstream code 
+libjpeg-turbo (1.1.90+svn722-1ubuntu4) precise; urgency=low
 
- -- Tom Gall <tom.gall@linaro.org>  Mon, 15 Aug 2011 23:34:51 +0000
+  * Switch package to include libjpeg8 compatibility
+  * Supply -dev -dbg and -test debs
 
-libjpeg-turbo (1.1.1-1inaro4) natty; urgency=low
+ -- Tom Gall <tom.gall@linaro.org>  Wed, 16 Nov 2011 22:14:00 +0000
 
-  * add libjpeg-turbo-progs.* to handle collision with libjpeg-8
-  * add dpkg-dev to Pre-Depends so multi-host works during builds
+libjpeg-turbo (1.1.90+svn722-1ubuntu2) oneiric; urgency=low
 
- -- Tom Gall <tom.gall@linaro.org>  Mon, 01 Aug 2011 18:48:55 +0000
+  * 11.11 Release
+  * Sync with upstream to svn722
 
-libjpeg-turbo (1.1.1-1inaro3) natty; urgency=low
+ -- Tom Gall <tom.gall@linaro.org>  Wed, 16 Nov 2011 14:32:12 +0000
 
-  * fix divert to correctly move libjpeg.so*
-  * use multiarch
-  * change location of upstream git tree
+libjpeg-turbo (1.1.90+svn702-0ubuntu1) oneiric; urgency=low
 
- -- Tom Gall <tom.gall@linaro.org>  Thu, 28 Jul 2011 04:34:05 +0000
+  * Initial Release based on svn 702
+  * Initial Release and packaging based on svn 702 (LP: #852207)
 
-libjpeg-turbo (1.1.1-1inaro2) natty; urgency=low
-
-  * release
-  * add timestamp code to cjpeg.c
-  * default cjpeg.c and djpeg.c timestamp code to off
-
- -- Tom Gall <tom.gall@linaro.org>  Mon, 11 Jul 2011 20:32:23 +0000
-
-libjpeg-turbo (1.1.1-0inaro1) natty; urgency=low
-
-  * rebase to 1.1.1
-  * include cross build changes from Michael Edwards
-  * add assembler and compiler options for SIMD_ARM_NEON
-
- -- Tom Gall <tom.gall@linaro.org>  Tue, 05 Jul 2011 21:20:43 +0000
-
-libjpeg-turbo (1.1.0-0inaro2) natty; urgency=low
-
-  * Package current git, dated 05262011
-
- --  <tom.gall@linaro.org>  Thu, 26 May 2011 20:23:01 +0000
-
-libjpeg-turbo (1.1.0-0linaro1) natty; urgency=low
-
-  * Change name of built packages to libjpeg-turbo62 and libjpeg-turbo-progs
-  * fix dpkg-divert usage
-  * Don't bother to install libjpegturbo.so
-  * via Steve Langasek, fix use of strh with it ne for thumb2 gcc4.5 compat
-
- -- Tom Gall <tom.gall@linaro.org>  Fri, 04 Mar 2011 22:51:19 -0600
-
-libjpeg-turbo (1.1.0-0linaro0) natty; urgency=low
-
-  * Package for Linaro
-
- -- Tom Gall <tom.gall@linaro.org>  Mon, 17 Jan 2011 11:11:05 -0600
+ -- Tom Gall <tom.gall@linaro.org>  Tue, 13 Sep 2011 03:53:56 +0000
diff --git a/debian/control b/debian/control
index ec5f951..d62c8f1 100644
--- a/debian/control
+++ b/debian/control
@@ -1,34 +1,88 @@
 Source: libjpeg-turbo
-Priority: extra
-Maintainer: Linaro Developers <linaro-dev@linaro.org>
+Priority: optional
+Section: graphics
+Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com>
 Uploaders: Tom Gall <tom.gall@linaro.org>
-Build-Depends: debhelper (>= 8.1.2ubuntu2), dh-autoreconf, nasm [amd64 i386]
-Standards-Version: 3.9.1
-Section: libs
-Vcs-Git: git://git.linaro.org/people/tomgall/libjpeg-turbo/libjpeg-turbo.git
+Build-Depends: debhelper (>= 8.1.3), dh-autoreconf, nasm [amd64 i386], quilt
+Standards-Version: 3.9.2
+Vcs-Bzr: lp:libjpeg-turbo
+Homepage: http://libjpeg-turbo.virtualgl.org/
 
-Package: libjpeg-turbo62
+Package: libjpeg-turbo8-dev
+Architecture: any
+Section: libdevel
+Multi-Arch: same
+Depends: libjpeg-turbo8 (= ${binary:Version}), libc-dev, ${misc:Depends}
+Conflicts: libjpeg62-dev, libjpeg7-dev
+Replaces: libjpeg62-dev, libjpeg7-dev, libjpeg8-dev (<< 8c-2ubuntu5)
+Provides: libjpeg-dev
+Description: Development files for the IJG JPEG library
+ The Independent JPEG Group's JPEG library is a library for handling
+ JPEG files.
+ .
+ This package contains the static library, headers and documentation.
+
+Package: libjpeg-turbo8
+Architecture: any
+Multi-Arch: same
 Section: libs
+Pre-Depends: multiarch-support
+Depends: ${shlibs:Depends}, ${misc:Depends}
+Breaks: libjpeg8 (<< 8c-2ubuntu5)
+Replaces: libjpeg8 (<< 8c-2ubuntu5)
+Description: IJG JPEG compliant runtime library.
+ Runtime library supporting the Independent JPEG Group's standard
+ for JPEG files.
+ .
+ This package contains the shared library which is a drop in 
+ replacement for libjpeg8, which has better performance than
+ standard libjpeg by use of SIMD and other optimizations.
+
+Package: libturbojpeg
 Architecture: any
 Multi-Arch: same
+Section: libs
+Pre-Depends: multiarch-support
 Depends: ${shlibs:Depends}, ${misc:Depends}
-Pre-Depends: dpkg-dev, ${misc:Pre-Depends}
-Replaces: libjpeg62
-Description: The Independent JPEG Group's JPEG runtime library
- The Independen`t JPEG Group's JPEG library is a library for handling
- JPEG files.
- This variation is from the meego project which includes further
- optimizations.
+Replaces: libjpeg-turbo8 (<< 1.1.90+svn722-1ubuntu6)
+Description: IJG JPEG compliant runtime library.
+ Runtime library supporting the Independent JPEG Group's standard
+ for JPEG files.
  .
- This package contains the shared library.
+ This package contains the libturbojpeg.so library, used by
+ turboVNC and other users of the past TurboJPEG library.
 
 Package: libjpeg-turbo-progs
 Architecture: any
+Depends: ${shlibs:Depends}, ${misc:Depends}, libturbojpeg (= ${binary:Version})
+Replaces: libjpeg-progs (<< 8c-2ubuntu5)
+Provides: libjpeg-progs
 Description: Programs for manipulating JPEG files
  This package contains programs for manipulating JPEG files:
   cjpeg/djpeg: convert to/from the JPEG file format
   rdjpgcom/wrjpgcom: read/write comments in JPEG files
   jpegtran: lossless transformations of JPEG files
   jpegexiforient/exifautotran: manipulate EXIF orientation tag
+
+Package: libjpeg-turbo8-dbg
+Architecture: any
+Section: debug
+Priority: extra
+Depends: libjpeg-turbo8 (= ${binary:Version}), ${misc:Depends}
+Replaces: libjpeg62-dbg, libjpeg7-dbg, libjpeg-turbo-dbg (<< 1.1.90+svn722-1ubuntu6)
+Description: Debugging symbols for the libjpeg-turbo library
+ libjpeg-turbo is an optimized implementation of the Independent JPEG
+ Group's JPEG library, a library for handling JPEG files.
+ .
+ This package contains the debugging symbols for libjpeg-turbo.
+
+Package: libjpeg-turbo-test
+Architecture: any
+Priority: extra
+Section: debug
 Depends: ${shlibs:Depends}, ${misc:Depends}
-Pre-Depends: libjpeg-progs, dpkg-dev, ${misc:Pre-Depends}
+Replaces: libjpeg-test (<< 1.1.90+svn722-1ubuntu6)
+Description: Program for testing libjpeg-turbo
+ This package contains tjunittest which tests
+ the basic functionality of the library and 
+ data files for performing those tests.
diff --git a/debian/copyright b/debian/copyright
index f809f92..8452ed9 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -1,6 +1,6 @@
 Format-Specification: http://svn.debian.org/wsvn/dep/web/deps/dep5.mdwn?op=file&rev=135
 Name: libjpeg-turbo
-Source: git://gitorious.org/meego-image-editor/libjpeg-turbo
+Source: lp:libjpeg-turbo
 
 Files: *
 Copyright: 1999-2006 MIYASAKA Masaru 
@@ -13,17 +13,17 @@ Copyright: 1999-2006 MIYASAKA Masaru
     1998, Thomas G. Lane
     2010 Nokia Corporation
 License:  JPEG
+ .
  In plain English:
  .
  1. We don't promise that this software works.  (But if you find any bugs,
-    please let us know!)
+   please let us know!)
  2. You can use this software for whatever you want.  You don't have to pay us.
  3. You may not pretend that you wrote this software.  If you use it in a
-    program, you must acknowledge somewhere in your documentation that
-    you've used the IJG code.
+   program, you must acknowledge somewhere in your documentation that
+   you've used the IJG code.
  .
  In legalese:
- .
  The authors make NO WARRANTY or representation, either express or implied,
  with respect to this software, its quality, accuracy, merchantability, or
  fitness for a particular purpose.  This software is provided "AS IS", and you,
@@ -58,67 +58,38 @@ License:  JPEG
  We specifically permit and encourage the use of this software as the basis of
  commercial products, provided that all warranty or liability claims are
  assumed by the product vendor.
-
-
-Files: bmp.c, bmp.h, jchuff.ci, jdhuff.*, jpegut.c, jpgtest.cxx, rrtimer.h, rrutil.h, turbojpeg.h, turbojpegl.c
-Copyright: 1998-2005 Julian Smart, Robert Roebling et al
-License: wxWindows
- .
-                wxWindows Library Licence, Version 3.1
-                ======================================
- .
- Copyright (C) 1998-2005 Julian Smart, Robert Roebling et al
- .
- Everyone is permitted to copy and distribute verbatim copies
- of this licence document, but changing it is not allowed.
- .
-                       WXWINDOWS LIBRARY LICENCE
-     TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
- . 
- This library is free software; you can redistribute it and/or modify it
- under the terms of the GNU Library General Public Licence as published by
- the Free Software Foundation; either version 2 of the Licence, or (at
- your option) any later version.
- . 
- This library is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library
- General Public Licence for more details.
- .
- You should have received a copy of the GNU Library General Public Licence
- along with this software, usually in a file named COPYING.LIB.  If not,
- write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, 
- Boston, MA 02110-1301 USA
- .
- EXCEPTION NOTICE
- .
- 1. As a special exception, the copyright holders of this library give
- permission for additional uses of the text contained in this release of
- the library as licenced under the wxWindows Library Licence, applying
- either version 3.1 of the Licence, or (at your option) any later version of
- the Licence as published by the copyright holders of version
- 3.1 of the Licence document.
- .
- 2. The exception is that you may use, copy, link, modify and distribute
- under your own terms, binary object code versions of works based
- on the Library.
- .
- 3. If you copy code from files distributed under the terms of the GNU
- General Public Licence or the GNU Library General Public Licence into a
- copy of this library, as this licence permits, the exception does not
- apply to the code that you add in this way.  To avoid misleading anyone as
- to the status of such modified files, you must delete this exception
- notice from such code and/or adjust the licensing conditions notice
- accordingly.
- .
- 4. If you write modifications of your own for this library, it is your
- choice whether to permit this exception to apply to your modifications. 
- If you do not wish that, you must delete the exception notice from such
- code and/or adjust the licensing conditions notice accordingly.
-
+ .
+ .
+ ansi2knr.c is included in this distribution by permission of L. Peter Deutsch,
+ sole proprietor of its copyright holder, Aladdin Enterprises of Menlo Park, CA
+ .
+ ansi2knr.c is NOT covered by the above copyright and conditions, but instead
+ by the usual distribution terms of the Free Software Foundation; principally,
+ that you must include source code if you redistribute it.  (See the file
+ ansi2knr.c for full details.)  However, since ansi2knr.c is not needed as part
+ of any program generated from the IJG code, this does not limit you more than
+ the foregoing paragraphs do.
+ .
+ The Unix configuration script "configure" was produced with GNU Autoconf.
+ It is copyright by the Free Software Foundation but is freely distributable.
+ The same holds for its supporting scripts (config.guess, config.sub,
+ ltmain.sh).  Another support script, install-sh, is copyright by X Consortium
+ but is also freely distributable.
+ .
+ The IJG distribution formerly included code to read and write GIF files.
+ To avoid entanglement with the Unisys LZW patent, GIF reading support has
+ been removed altogether, and the GIF writer has been simplified to produce
+ "uncompressed GIFs".  This technique does not use the LZW algorithm; the
+ resulting GIF files are larger than usual, but are readable by all standard
+ GIF decoders.
+ .
+ We are required to state that
+    "The Graphics Interchange Format(c) is the Copyright property of
+    CompuServe Incorporated.  GIF(sm) is a Service Mark property of
+    CompuServe Incorporated."
 
 Files: debian/*
-Copyright: 2010 Linaro Limited
+Copyright: 2010, 2011 Linaro Limited
 License: LGPL-2.1
  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Library General Public
diff --git a/debian/extra/Makefile b/debian/extra/Makefile
new file mode 100644
index 0000000..b34f7ac
--- /dev/null
+++ b/debian/extra/Makefile
@@ -0,0 +1,34 @@
+# Build and install the Exif orientation helpers kept in this directory:
+# the jpegexiforient binary, the exifautotran script and their man pages.
+# Every variable below can be overridden on the make command line.
+CFLAGS = -O2 -Wall -g
+CC = cc
+# Programs and directories get mode 755; man pages are data and get 644.
+INSTALL = install -m755 -o root -g root
+INSTALLDIR = install -m755 -o root -g root -d
+INSTALLDATA = install -m644 -o root -g root
+DESTDIR =
+prefix  = /usr/local
+bindir  = $(prefix)/bin
+mandir  = $(prefix)/share/man/man1
+
+# None of these targets names a file, so keep stray files from masking them.
+.PHONY: all clean install
+
+all: jpegexiforient
+
+# CPPFLAGS and LDFLAGS are passed through so externally supplied flags apply.
+jpegexiforient: jpegexiforient.c
+	$(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) -o jpegexiforient jpegexiforient.c
+
+clean:
+	rm -f jpegexiforient
+
+# Depends on "all" so the binary is built before it is copied.
+install: all
+	$(INSTALLDIR) $(DESTDIR)$(bindir)
+	$(INSTALLDIR) $(DESTDIR)$(mandir)
+	$(INSTALL) jpegexiforient    $(DESTDIR)$(bindir)
+	$(INSTALLDATA) jpegexiforient.1  $(DESTDIR)$(mandir)
+	$(INSTALL) exifautotran      $(DESTDIR)$(bindir)
+	$(INSTALLDATA) exifautotran.1    $(DESTDIR)$(mandir)
diff --git a/debian/extra/exifautotran b/debian/extra/exifautotran
new file mode 100644
index 0000000..c8b6a3a
--- /dev/null
+++ b/debian/extra/exifautotran
@@ -0,0 +1,64 @@
+#!/bin/sh
+# exifautotran [list of files]
+#
+# Transforms Exif files so that Orientation becomes 1
+#
+# Each file's Exif Orientation tag is read with jpegexiforient, the matching
+# jpegtran transform is written to a temporary file and copied back over the
+# original, and the tag is then reset to 1.
+
+tempfile=""
+
+# Clean up and stop on a signal; without the explicit exit the shell would
+# run the handler and then carry on with the remaining files.
+trap 'if test -n "$tempfile"; then rm -f "$tempfile"; fi; exit 1' INT QUIT TERM
+
+for i
+do
+ case $i in
+ -v|--version) echo "exifautotran"; exit 0;;
+ -h|--help)
+             cat <<EOF
+exifautotran [list of files]
+
+Transforms Exif files so that Orientation becomes 1
+EOF
+             exit 0;;
+ esac
+
+ # Pick the jpegtran switch for the reported orientation value;
+ # 1 and any unexpected output need no transform.
+ case `jpegexiforient -n "$i"` in
+ 1) transform="";;
+ 2) transform="-flip horizontal";;
+ 3) transform="-rotate 180";;
+ 4) transform="-flip vertical";;
+ 5) transform="-transpose";;
+ 6) transform="-rotate 90";;
+ 7) transform="-transverse";;
+ 8) transform="-rotate 270";;
+ *) transform="";;
+ esac
+ if test -n "$transform"; then
+  tempfile=`mktemp`
+  if test "$?" -ne "0"; then
+    echo "Failed to create temporary file" >&2
+    exit 1
+  fi
+  echo "Executing: jpegtran -copy all $transform $i" >&2
+  # $transform stays unquoted: it may hold a switch plus its argument.
+  if jpegtran -copy all $transform "$i" > "$tempfile"; then
+   # Reset the tag only once the transformed image has replaced the file,
+   # otherwise an untransformed image would be marked with Orientation 1.
+   if cp "$tempfile" "$i"; then
+    jpegexiforient -1 "$i" > /dev/null
+   else
+    echo "Error while replacing $i - orientation tag left unchanged." >&2
+   fi
+  else
+   echo "Error while transforming $i - skipped." >&2
+  fi
+  rm -f "$tempfile"
+  tempfile=""
+ fi
+done
diff --git a/debian/extra/exifautotran.1 b/debian/extra/exifautotran.1
new file mode 100644
index 0000000..6023e55
--- /dev/null
+++ b/debian/extra/exifautotran.1
@@ -0,0 +1,13 @@
+.TH EXIFAUTOTRAN "1" "February 2005" "exifautotran" "User Commands"
+.SH NAME
+exifautotran \- Transforms Exif files so that Orientation becomes 1
+.SH DESCRIPTION
+exifautotran [list of files]
+.PP
+Take a list of files as input and transform them in place so that the
+Orientation becomes 1.
+.SH "AUTHOR"
+    Guido Vollbeding <guido@jpegclub.org>
+.SH "SEE ALSO"
+.BR jpegtran(1),
+.BR jpegexiforient(1)
diff --git a/debian/extra/jpegexiforient.1 b/debian/extra/jpegexiforient.1
new file mode 100644
index 0000000..d2e32ee
--- /dev/null
+++ b/debian/extra/jpegexiforient.1
@@ -0,0 +1,73 @@
+.TH JPEGEXIFORIENT "1" "February 2005" "jpegexiforient" "User Commands"
+.SH NAME
+jpegexiforient \- reads or writes the Exif Orientation Tag
+.SH SYNOPSIS
+.B jpegexiforient
+[\fIswitches\fR] \fIjpegfile\fR
+.SH DESCRIPTION
+.
+This is a utility program to get and set the Exif Orientation Tag.
+It can be used together with jpegtran in scripts for automatic
+orientation correction of digital camera pictures.
+.PP
+The Exif orientation value gives the orientation of the camera
+relative to the scene when the image was captured.  The relation
+of the '0th row' and '0th column' to visual position is shown as
+below.
+.IP
+.nf
+.ft CR
+Value | 0th Row     | 0th Column
+------+-------------+-----------
+  1   | top         | left side
+  2   | top         | right side
+  3   | bottom      | right side
+  4   | bottom      | left side
+  5   | left side   | top
+  6   | right side  | top
+  7   | right side  | bottom
+  8   | left side   | bottom
+.fi
+.PP
+For convenience, here is what the letter F would look like if it were
+tagged correctly and displayed by a program that ignores the orientation
+tag:
+.IP
+.nf
+.ft CB
+  1        2       3      4     
+
+888888  888888      88  88      
+88          88      88  88      
+8888      8888    8888  8888    
+88          88      88  88
+88          88  888888  888888
+
+    5            6           7          8
+                                               
+8888888888  88                  88  8888888888
+88  88      88  88          88  88      88  88
+88          8888888888  8888888888          88
+.fi
+.PP
+jpegexiforient outputs the Exif Orientation Tag in a JPEG Exif file.
+With the options -1 .. -8, it can also be used to set the tag.
+.
+.SS "OPTIONS"
+.TP
+\fB\-\-help\fR
+display this help and exit
+.TP
+\fB\-\-version\fR
+output version information and exit
+.TP
+\fB\-n\fR
+Do not output the trailing newline
+.TP
+\fB\-1\fR .. \fB\-8\fR
+Set orientation value 1 .. 8
+.SH "AUTHOR"
+ Guido Vollbeding <guido@jpegclub.org>
+.SH "SEE ALSO"
+.BR jpegtran(1),
+.BR exifautotran(1)
diff --git a/debian/extra/jpegexiforient.c b/debian/extra/jpegexiforient.c
new file mode 100644
index 0000000..2a9db40
--- /dev/null
+++ b/debian/extra/jpegexiforient.c
@@ -0,0 +1,292 @@
+/*
+ * jpegexiforient.c
+ *
+ * This is a utility program to get and set the Exif Orientation Tag.
+ * It can be used together with jpegtran in scripts for automatic
+ * orientation correction of digital camera pictures.
+ *
+ * The Exif orientation value gives the orientation of the camera
+ * relative to the scene when the image was captured.  The relation
+ * of the '0th row' and '0th column' to visual position is shown as
+ * below.
+ *
+ * Value | 0th Row     | 0th Column
+ * ------+-------------+-----------
+ *   1   | top         | left side
+ *   2   | top         | right side
+ *   3   | bottom      | right side
+ *   4   | bottom      | left side
+ *   5   | left side   | top
+ *   6   | right side  | top
+ *   7   | right side  | bottom
+ *   8   | left side   | bottom
+ *
+ * For convenience, here is what the letter F would look like if it were
+ * tagged correctly and displayed by a program that ignores the orientation
+ * tag:
+ *
+ *   1        2       3      4         5            6           7          8
+ *
+ * 888888  888888      88  88      8888888888  88                  88  8888888888
+ * 88          88      88  88      88  88      88  88          88  88      88  88
+ * 8888      8888    8888  8888    88          8888888888  8888888888          88
+ * 88          88      88  88
+ * 88          88  888888  888888
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+static FILE * myfile;		/* My JPEG file */
+
+/* Buffer for the Exif payload; a 16-bit marker length can never exceed it */
+static unsigned char exif_data[65536L];
+
+/* Return next input byte, or EOF if no more */
+#define NEXTBYTE()  getc(myfile)
+
+/* Error exit handler: the message is discarded and the exit status is 0 */
+#define ERREXIT(msg)  (exit(0))
+
+/* Read one byte, testing for EOF */
+static int
+read_1_byte (void)
+{
+  int c;
+
+  c = NEXTBYTE();
+  if (c == EOF)
+    ERREXIT("Premature EOF in JPEG file");
+  return c;
+}
+
+/* Read 2 bytes, convert to unsigned int */
+/* All 2-byte quantities in JPEG markers are MSB first */
+static unsigned int
+read_2_bytes (void)
+{
+  int c1, c2;
+
+  c1 = NEXTBYTE();
+  if (c1 == EOF)
+    ERREXIT("Premature EOF in JPEG file");
+  c2 = NEXTBYTE();
+  if (c2 == EOF)
+    ERREXIT("Premature EOF in JPEG file");
+  return (((unsigned int) c1) << 8) + ((unsigned int) c2);
+}
+
+/* Decode the 2-byte value at p: MSB first if is_motorola, else LSB first */
+static unsigned int
+get_2_bytes (const unsigned char * p, int is_motorola)
+{
+  if (is_motorola)
+    return (((unsigned int) p[0]) << 8) + ((unsigned int) p[1]);
+  return (((unsigned int) p[1]) << 8) + ((unsigned int) p[0]);
+}
+
+static const char * progname;	/* program name for error messages */
+
+static void
+usage (FILE *out)
+/* print the usage summary to the given stream */
+{
+  fprintf(out, "jpegexiforient reads or writes the Exif Orientation Tag ");
+  fprintf(out, "in a JPEG Exif file.\n");
+
+  fprintf(out, "Usage: %s [switches] jpegfile\n", progname);
+
+  fprintf(out, "Switches:\n");
+  fprintf(out, "  --help     display this help and exit\n");
+  fprintf(out, "  --version  output version information and exit\n");
+  fprintf(out, "  -n         Do not output the trailing newline\n");
+  fprintf(out, "  -1 .. -8   Set orientation value 1 .. 8\n");
+}
+
+/*
+ * The main program.
+ *
+ * Parses the switches, locates the Orientation tag (0x0112) in IFD0 of the
+ * file's Exif APP1 segment, then either rewrites it (-1 .. -8) or reads it,
+ * and prints the resulting value.  Files without a usable Exif Orientation
+ * tag produce no output and exit status 0; usage errors and failed writes
+ * exit with status 1.
+ */
+
+int
+main (int argc, char **argv)
+{
+  int n_flag, set_flag;
+  int argn;			/* index of the argument being examined */
+  unsigned int length, i;
+  int is_motorola; /* Flag for byte order */
+  unsigned int offset, number_of_tags, tagnum;
+
+  progname = argv[0];
+  if (progname == NULL || progname[0] == 0)
+    progname = "jpegexiforient";	/* in case C library doesn't provide it */
+
+  if (argc < 2) { usage(stderr); return 1; }
+
+  n_flag = 0; set_flag = 0;
+
+  /* Consume leading switches; the first other argument is the file name */
+  argn = 1;
+  while (argv[argn][0] == '-') {
+    switch (argv[argn][1]) {
+    case '-':
+      /* Long options are recognized by the first letter after "--" */
+      if (argv[argn][2] == 'h') { usage(stdout); return 0; }
+      if (argv[argn][2] == 'v') { fprintf(stdout,"jpegexiforient\n"); return 0; }
+      /* Reject anything else rather than falling through into -n */
+      usage(stderr); return 1;
+    case 'n':
+      n_flag = 1;
+      break;
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+      set_flag = argv[argn][1] - '0';
+      break;
+    default:
+      usage(stderr); return 1;
+    }
+    if (++argn >= argc) { usage(stderr); return 1; }
+  }
+
+  /* Setting the tag needs read/write access, reading it only read access */
+  myfile = fopen(argv[argn], set_flag ? "rb+" : "rb");
+  if (myfile == NULL) {
+    fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]);
+    return 0;
+  }
+
+  /* Read File head, check for JPEG SOI + Exif APP1 */
+  for (i = 0; i < 4; i++)
+    exif_data[i] = (unsigned char) read_1_byte();
+  if (exif_data[0] != 0xFF ||
+      exif_data[1] != 0xD8 ||
+      exif_data[2] != 0xFF ||
+      exif_data[3] != 0xE1)
+    return 0;
+
+  /* Get the marker parameter length count */
+  length = read_2_bytes();
+  /* Length includes itself, so must be at least 2 */
+  /* Following Exif data length must be at least 6 */
+  if (length < 8)
+    return 0;
+  length -= 8;
+  /* Read Exif head, check for "Exif" */
+  for (i = 0; i < 6; i++)
+    exif_data[i] = (unsigned char) read_1_byte();
+  if (exif_data[0] != 0x45 ||
+      exif_data[1] != 0x78 ||
+      exif_data[2] != 0x69 ||
+      exif_data[3] != 0x66 ||
+      exif_data[4] != 0 ||
+      exif_data[5] != 0)
+    return 0;
+  /* Read Exif body */
+  for (i = 0; i < length; i++)
+    exif_data[i] = (unsigned char) read_1_byte();
+
+  if (length < 12) return 0; /* Length of an IFD entry */
+
+  /* Discover byte order */
+  if (exif_data[0] == 0x49 && exif_data[1] == 0x49)
+    is_motorola = 0;
+  else if (exif_data[0] == 0x4D && exif_data[1] == 0x4D)
+    is_motorola = 1;
+  else
+    return 0;
+
+  /* Check Tag Mark */
+  if (get_2_bytes(exif_data + 2, is_motorola) != 0x002A) return 0;
+
+  /* Get first IFD offset (offset to IFD0); only 16-bit offsets are accepted */
+  if (is_motorola) {
+    if (get_2_bytes(exif_data + 4, is_motorola) != 0) return 0;
+    offset = get_2_bytes(exif_data + 6, is_motorola);
+  } else {
+    if (get_2_bytes(exif_data + 6, is_motorola) != 0) return 0;
+    offset = get_2_bytes(exif_data + 4, is_motorola);
+  }
+  if (offset > length - 2) return 0; /* check end of data segment */
+
+  /* Get the number of directory entries contained in this IFD */
+  number_of_tags = get_2_bytes(exif_data + offset, is_motorola);
+  if (number_of_tags == 0) return 0;
+  offset += 2;
+
+  /* Search for Orientation Tag in IFD0 */
+  for (;;) {
+    if (offset > length - 12) return 0; /* check end of data segment */
+    /* Get Tag number */
+    tagnum = get_2_bytes(exif_data + offset, is_motorola);
+    if (tagnum == 0x0112) break; /* found Orientation Tag */
+    if (--number_of_tags == 0) return 0;
+    offset += 12;
+  }
+
+  if (set_flag) {
+    /* Set the Orientation value */
+    if (is_motorola) {
+      exif_data[offset+2] = 0; /* Format = unsigned short (2 octets) */
+      exif_data[offset+3] = 3;
+      exif_data[offset+4] = 0; /* Number Of Components = 1 */
+      exif_data[offset+5] = 0;
+      exif_data[offset+6] = 0;
+      exif_data[offset+7] = 1;
+      exif_data[offset+8] = 0;
+      exif_data[offset+9] = (unsigned char)set_flag;
+      exif_data[offset+10] = 0;
+      exif_data[offset+11] = 0;
+    } else {
+      exif_data[offset+2] = 3; /* Format = unsigned short (2 octets) */
+      exif_data[offset+3] = 0;
+      exif_data[offset+4] = 1; /* Number Of Components = 1 */
+      exif_data[offset+5] = 0;
+      exif_data[offset+6] = 0;
+      exif_data[offset+7] = 0;
+      exif_data[offset+8] = (unsigned char)set_flag;
+      exif_data[offset+9] = 0;
+      exif_data[offset+10] = 0;
+      exif_data[offset+11] = 0;
+    }
+    /* Overwrite the 10 bytes after the tag number in place: 4 + 2 + 6 is the
+       file position of exif_data[0], the extra 2 skips the tag number */
+    if (fseek(myfile, (long) ((4 + 2 + 6 + 2) + offset), SEEK_SET) != 0 ||
+        fwrite(exif_data + 2 + offset, 1, 10, myfile) != 10 ||
+        fclose(myfile) != 0) {
+      /* Report the failure so callers do not assume the tag was updated */
+      fprintf(stderr, "%s: can't write %s\n", progname, argv[argn]);
+      return 1;
+    }
+  } else {
+    /* Get the Orientation value */
+    if (is_motorola) {
+      if (exif_data[offset+8] != 0) return 0;
+      set_flag = exif_data[offset+9];
+    } else {
+      if (exif_data[offset+9] != 0) return 0;
+      set_flag = exif_data[offset+8];
+    }
+    if (set_flag > 8) return 0;
+  }
+
+  /* Write out Orientation value */
+  if (n_flag)
+    printf("%c", '0' + set_flag);
+  else
+    printf("%c\n", '0' + set_flag);
+
+  /* All done. */
+  return 0;
+}
diff --git a/debian/libjpeg-turbo-only.symbols b/debian/libjpeg-turbo-only.symbols
new file mode 100644
index 0000000..8ffee36
--- /dev/null
+++ b/debian/libjpeg-turbo-only.symbols
@@ -0,0 +1,12 @@
+LIBJPEGTURBO_8.0@LIBJPEGTURBO_8.0
+auxv@LIBJPEG_8.0
+init_simd@LIBJPEG_8.0
+jinit_phuff_decoder@LIBJPEG_8.0
+jinit_phuff_encoder@LIBJPEG_8.0
+jpeg_fill_bit_buffer@LIBJPEG_8.0
+jpeg_gen_optimal_table@LIBJPEG_8.0
+jpeg_huff_decode@LIBJPEG_8.0
+jpeg_make_c_derived_tbl@LIBJPEG_8.0
+jpeg_make_d_derived_tbl@LIBJPEG_8.0
+jpeg_simd_cpu_support@LIBJPEG_8.0
+libjpeg_general_init@LIBJPEG_8.0
diff --git a/debian/libjpeg-turbo-progs.install b/debian/libjpeg-turbo-progs.install
index dea78ec..3129476 100644
--- a/debian/libjpeg-turbo-progs.install
+++ b/debian/libjpeg-turbo-progs.install
@@ -1,2 +1,8 @@
-usr/bin
+usr/bin/cjpeg
+usr/bin/rdjpgcom
+usr/bin/djpeg
+usr/bin/jpegtran
+usr/bin/wrjpgcom
+usr/bin/exifautotran
+usr/bin/jpegexiforient
 usr/share/man/*
diff --git a/debian/libjpeg-turbo-progs.postrm b/debian/libjpeg-turbo-progs.postrm
deleted file mode 100644
index 2c5a1b3..0000000
--- a/debian/libjpeg-turbo-progs.postrm
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/sh
-
-set -e
-
-HOST_MULTIARCH=`dpkg-architecture -qDEB_HOST_MULTIARCH`
-
-if [ remove = "$1" -o abort-install = "$1" -o disappear = "$1" ];
-then
-	dpkg-divert --package libjpeg-turbo-progs --rename \
-		--remove /usr/share/man/man1/wrjpgcom.1.gz 
-	dpkg-divert --package libjpeg-turbo-progs --rename \
-		--remove /usr/share/man/man1/cjpeg.1.gz 
-	dpkg-divert --package libjpeg-turbo-progs --rename \
-		--remove /usr/share/man/man1/djpeg.1.gz 
-	dpkg-divert --package libjpeg-turbo-progs --rename \
-		--remove /usr/share/man/man1/rdjpgcom.1.gz
-	dpkg-divert --package libjpeg-turbo-progs --rename \
-		--remove /usr/share/man/man1/jpegtran.1.gz
-	rmdir /usr/share/man/libjpeg-progs-divert
-	dpkg-divert --package libjpeg-turbo-progs --rename \
-		--remove /usr/bin/rdjpgcom
-	dpkg-divert --package libjpeg-turbo-progs --rename \
-		--remove /usr/bin/wrjpgcom
-	dpkg-divert --package libjpeg-turbo-progs --rename \
-		--remove /usr/bin/jpegtran
-	dpkg-divert --package libjpeg-turbo-progs --rename \
-		--remove /usr/bin/cjpeg
-	dpkg-divert --package libjpeg-turbo-progs --rename \
-		--remove /usr/bin/djpeg
-	rmdir /usr/bin/libjpeg-progs-divert
-fi
-
-#DEBHELPER#
diff --git a/debian/libjpeg-turbo-progs.preinst b/debian/libjpeg-turbo-progs.preinst
deleted file mode 100644
index 0aa3519..0000000
--- a/debian/libjpeg-turbo-progs.preinst
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/sh 
-
-set -e
-
-HOST_MULTIARCH=`dpkg-architecture -qDEB_HOST_MULTIARCH`
-
-if [ install = "$1" ] || dpkg --compare-versions "$2" lt 1.1.1-1linaro3; then
-	if [ ! -d /usr/share/man/libjpeg-progs-divert ] 
-        then
-		mkdir -p /usr/share/man/libjpeg-progs-divert
-        fi
-
- 	dpkg-divert --package libjpeg-turbo-progs --rename --divert /usr/share/man/libjpeg-progs-divert/wrjpgcom.1.gz \
-                --add /usr/share/man/man1/wrjpgcom.1.gz 
- 	dpkg-divert --package libjpeg-turbo-progs --rename --divert /usr/share/man/libjpeg-progs-divert/cjpeg.1.gz \
-                --add /usr/share/man/man1/cjpeg.1.gz 
- 	dpkg-divert --package libjpeg-turbo-progs --rename --divert /usr/share/man/libjpeg-progs-divert/djpeg.1.gz \
-                --add /usr/share/man/man1/djpeg.1.gz 
- 	dpkg-divert --package libjpeg-turbo-progs --rename --divert /usr/share/man/libjpeg-progs-divert/rdjpgcom.1.gz \
-                --add /usr/share/man/man1/rdjpgcom.1.gz
- 	dpkg-divert --package libjpeg-turbo-progs --rename --divert /usr/share/man/libjpeg-progs-divert/jpegtran.1.gz \
-                --add /usr/share/man/man1/jpegtran.1.gz
-
-	if [ ! -d /usr/bin/libjpeg-progs-divert ] 
-        then
-		mkdir -p /usr/bin/libjpeg-progs-divert
-        fi
-
- 	dpkg-divert --package libjpeg-turbo-progs --rename --divert /usr/bin/libjpeg-progs-divert/rdjpgcom \
-		--add /usr/bin/rdjpgcom
- 	dpkg-divert --package libjpeg-turbo-progs --rename --divert /usr/bin/libjpeg-progs-divert/wrjpgcom \
-		--add /usr/bin/wrjpgcom
- 	dpkg-divert --package libjpeg-turbo-progs --rename --divert /usr/bin/libjpeg-progs-divert/jpegtranm \
-		--add /usr/bin/jpegtran
- 	dpkg-divert --package libjpeg-turbo-progs --rename --divert /usr/bin/libjpeg-progs-divert/cjpeg \
-		--add /usr/bin/cjpeg
- 	dpkg-divert --package libjpeg-turbo-progs --rename --divert /usr/bin/libjpeg-progs-divert/djpeg \
-		--add /usr/bin/djpeg
-fi
-
-#DEBHELPER#
diff --git a/debian/libjpeg-turbo-test.install b/debian/libjpeg-turbo-test.install
new file mode 100644
index 0000000..0bf7f9e
--- /dev/null
+++ b/debian/libjpeg-turbo-test.install
@@ -0,0 +1 @@
+usr/bin/tjunittest
diff --git a/debian/libjpeg-turbo62.install b/debian/libjpeg-turbo62.install
deleted file mode 100644
index 58cfa1a..0000000
--- a/debian/libjpeg-turbo62.install
+++ /dev/null
@@ -1,2 +0,0 @@
-usr/lib/*/libjpeg.so.62*
-usr/lib/*/libturbojpeg.so*
diff --git a/debian/libjpeg-turbo62.lintian-overrides b/debian/libjpeg-turbo62.lintian-overrides
deleted file mode 100644
index d8349f2..0000000
--- a/debian/libjpeg-turbo62.lintian-overrides
+++ /dev/null
@@ -1,2 +0,0 @@
-# yes, we specifically want linkers to depends on the standard libjpeg name
-libturbojpeg62: shlibs-declares-dependency-on-other-package libjpeg62 (>=6b1)
diff --git a/debian/libjpeg-turbo62.postrm b/debian/libjpeg-turbo62.postrm
deleted file mode 100644
index 59ea59f..0000000
--- a/debian/libjpeg-turbo62.postrm
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/sh
-
-set -e
-
-HOST_MULTIARCH=`dpkg-architecture -qDEB_HOST_MULTIARCH`
-
-if [ remove = "$1" -o abort-install = "$1" -o disappear = "$1" ];
-then
-	dpkg-divert --package libjpeg-turbo62 --rename \
-		--remove /usr/lib/$HOST_MULTIARCH/libjpeg.so.62.0.0
-	dpkg-divert --package libjpeg-turbo62 --rename \
-		--remove /usr/lib/$HOST_MULTIARCH/libjpeg.so.62
-	rmdir /usr/lib/libjpeg-divert
-fi
-
-#DEBHELPER#
diff --git a/debian/libjpeg-turbo62.preinst b/debian/libjpeg-turbo62.preinst
deleted file mode 100644
index 552dcf1..0000000
--- a/debian/libjpeg-turbo62.preinst
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/sh 
-
-set -e
-
-HOST_MULTIARCH=`dpkg-architecture -qDEB_HOST_MULTIARCH`
-
-if [ install = "$1" ] || dpkg --compare-versions "$2" lt 1.1.0-0linaro2; then
-	if [ ! -d /usr/lib/libjpeg-divert ] 
-        then
-		mkdir -p /usr/lib/libjpeg-divert
-        fi
-
- 	dpkg-divert --package libjpeg-turbo62 --rename --divert /usr/lib/libjpeg-divert/libjpeg.so.62.0.0 \
-                --add /usr/lib/$HOST_MULTIARCH/libjpeg.so.62.0.0
- 	dpkg-divert --package libjpeg-turbo62 --rename --divert /usr/lib/libjpeg-divert/libjpeg.so.62 \
-                --add /usr/lib/$HOST_MULTIARCH/libjpeg.so.62
-
-fi
-
-#DEBHELPER#
diff --git a/debian/libjpeg-turbo62.shlibs b/debian/libjpeg-turbo62.shlibs
deleted file mode 100644
index 113927b..0000000
--- a/debian/libjpeg-turbo62.shlibs
+++ /dev/null
@@ -1 +0,0 @@
-libjpeg         62      libjpeg62 (>=6b1) 
diff --git a/debian/libjpeg-turbo8-dev.install b/debian/libjpeg-turbo8-dev.install
new file mode 100644
index 0000000..95e76ff
--- /dev/null
+++ b/debian/libjpeg-turbo8-dev.install
@@ -0,0 +1,4 @@
+usr/include
+usr/lib/*/libjpeg.a
+usr/lib/*/libturbojpeg.a
+usr/lib/*/libjpeg.so
diff --git a/debian/libjpeg-turbo8.install b/debian/libjpeg-turbo8.install
new file mode 100644
index 0000000..1e5caa0
--- /dev/null
+++ b/debian/libjpeg-turbo8.install
@@ -0,0 +1 @@
+usr/lib/*/libjpeg.so.8*
diff --git a/debian/libjpeg-turbo8.lintian-overrides b/debian/libjpeg-turbo8.lintian-overrides
new file mode 100644
index 0000000..851256e
--- /dev/null
+++ b/debian/libjpeg-turbo8.lintian-overrides
@@ -0,0 +1,4 @@
+# yes, we specifically want linkers to depend on the standard libjpeg name
+libjpeg-turbo8: shlibs-declares-dependency-on-other-package libjpeg8 #MINVER#
+libjpeg-turbo8: symbols-declares-dependency-on-other-package libjpeg8 #MINVER#
+libjpeg-turbo8: package-name-doesnt-match-sonames
diff --git a/debian/libjpeg-turbo8.symbols b/debian/libjpeg-turbo8.symbols
new file mode 100644
index 0000000..f630980
--- /dev/null
+++ b/debian/libjpeg-turbo8.symbols
@@ -0,0 +1,183 @@
+libjpeg.so.8 libjpeg8 #MINVER#
+ LIBJPEGTURBO_8.0@LIBJPEGTURBO_8.0 8c-2ubuntu5~
+ LIBJPEG_8.0@LIBJPEG_8.0 8c
+ auxv@LIBJPEG_8.0 8c-2ubuntu5~
+ (arch=armel armhf i386)init_simd@LIBJPEG_8.0 8c-2ubuntu5~
+ jcopy_block_row@LIBJPEG_8.0 8c
+ jcopy_sample_rows@LIBJPEG_8.0 8c
+ jdiv_round_up@LIBJPEG_8.0 8c
+ jinit_1pass_quantizer@LIBJPEG_8.0 8c
+ jinit_2pass_quantizer@LIBJPEG_8.0 8c
+ jinit_arith_decoder@LIBJPEG_8.0 8c
+ jinit_arith_encoder@LIBJPEG_8.0 8c
+ jinit_c_coef_controller@LIBJPEG_8.0 8c
+ jinit_c_main_controller@LIBJPEG_8.0 8c
+ jinit_c_master_control@LIBJPEG_8.0 8c
+ jinit_c_prep_controller@LIBJPEG_8.0 8c
+ jinit_color_converter@LIBJPEG_8.0 8c
+ jinit_color_deconverter@LIBJPEG_8.0 8c
+ jinit_compress_master@LIBJPEG_8.0 8c
+ jinit_d_coef_controller@LIBJPEG_8.0 8c
+ jinit_d_main_controller@LIBJPEG_8.0 8c
+ jinit_d_post_controller@LIBJPEG_8.0 8c
+ jinit_downsampler@LIBJPEG_8.0 8c
+ jinit_forward_dct@LIBJPEG_8.0 8c
+ jinit_huff_decoder@LIBJPEG_8.0 8c
+ jinit_huff_encoder@LIBJPEG_8.0 8c
+ jinit_input_controller@LIBJPEG_8.0 8c
+ jinit_inverse_dct@LIBJPEG_8.0 8c
+ jinit_marker_reader@LIBJPEG_8.0 8c
+ jinit_marker_writer@LIBJPEG_8.0 8c
+ jinit_master_decompress@LIBJPEG_8.0 8c
+ jinit_memory_mgr@LIBJPEG_8.0 8c
+ jinit_merged_upsampler@LIBJPEG_8.0 8c
+ jinit_phuff_decoder@LIBJPEG_8.0 8c-2ubuntu5~
+ jinit_phuff_encoder@LIBJPEG_8.0 8c-2ubuntu5~
+ jinit_upsampler@LIBJPEG_8.0 8c
+ jpeg_CreateCompress@LIBJPEG_8.0 8c
+ jpeg_CreateDecompress@LIBJPEG_8.0 8c
+ jpeg_abort@LIBJPEG_8.0 8c
+ jpeg_abort_compress@LIBJPEG_8.0 8c
+ jpeg_abort_decompress@LIBJPEG_8.0 8c
+ jpeg_add_quant_table@LIBJPEG_8.0 8c
+ jpeg_alloc_huff_table@LIBJPEG_8.0 8c
+ jpeg_alloc_quant_table@LIBJPEG_8.0 8c
+ jpeg_aritab@LIBJPEG_8.0 8c
+ jpeg_calc_jpeg_dimensions@LIBJPEG_8.0 8c
+ jpeg_calc_output_dimensions@LIBJPEG_8.0 8c
+ jpeg_consume_input@LIBJPEG_8.0 8c
+ jpeg_copy_critical_parameters@LIBJPEG_8.0 8c
+ jpeg_core_output_dimensions@LIBJPEG_8.0 8c
+ jpeg_default_colorspace@LIBJPEG_8.0 8c
+ jpeg_default_qtables@LIBJPEG_8.0 8c
+ jpeg_destroy@LIBJPEG_8.0 8c
+ jpeg_destroy_compress@LIBJPEG_8.0 8c
+ jpeg_destroy_decompress@LIBJPEG_8.0 8c
+ jpeg_fdct_10x10@LIBJPEG_8.0 8c
+ jpeg_fdct_10x5@LIBJPEG_8.0 8c
+ jpeg_fdct_11x11@LIBJPEG_8.0 8c
+ jpeg_fdct_12x12@LIBJPEG_8.0 8c
+ jpeg_fdct_12x6@LIBJPEG_8.0 8c
+ jpeg_fdct_13x13@LIBJPEG_8.0 8c
+ jpeg_fdct_14x14@LIBJPEG_8.0 8c
+ jpeg_fdct_14x7@LIBJPEG_8.0 8c
+ jpeg_fdct_15x15@LIBJPEG_8.0 8c
+ jpeg_fdct_16x16@LIBJPEG_8.0 8c
+ jpeg_fdct_16x8@LIBJPEG_8.0 8c
+ jpeg_fdct_1x1@LIBJPEG_8.0 8c
+ jpeg_fdct_1x2@LIBJPEG_8.0 8c
+ jpeg_fdct_2x1@LIBJPEG_8.0 8c
+ jpeg_fdct_2x2@LIBJPEG_8.0 8c
+ jpeg_fdct_2x4@LIBJPEG_8.0 8c
+ jpeg_fdct_3x3@LIBJPEG_8.0 8c
+ jpeg_fdct_3x6@LIBJPEG_8.0 8c
+ jpeg_fdct_4x2@LIBJPEG_8.0 8c
+ jpeg_fdct_4x4@LIBJPEG_8.0 8c
+ jpeg_fdct_4x8@LIBJPEG_8.0 8c
+ jpeg_fdct_5x10@LIBJPEG_8.0 8c
+ jpeg_fdct_5x5@LIBJPEG_8.0 8c
+ jpeg_fdct_6x12@LIBJPEG_8.0 8c
+ jpeg_fdct_6x3@LIBJPEG_8.0 8c
+ jpeg_fdct_6x6@LIBJPEG_8.0 8c
+ jpeg_fdct_7x14@LIBJPEG_8.0 8c
+ jpeg_fdct_7x7@LIBJPEG_8.0 8c
+ jpeg_fdct_8x16@LIBJPEG_8.0 8c
+ jpeg_fdct_8x4@LIBJPEG_8.0 8c
+ jpeg_fdct_9x9@LIBJPEG_8.0 8c
+ jpeg_fdct_float@LIBJPEG_8.0 8c
+ jpeg_fdct_ifast@LIBJPEG_8.0 8c
+ jpeg_fdct_islow@LIBJPEG_8.0 8c
+ jpeg_fill_bit_buffer@LIBJPEG_8.0 8c-2ubuntu5~
+ jpeg_finish_compress@LIBJPEG_8.0 8c
+ jpeg_finish_decompress@LIBJPEG_8.0 8c
+ jpeg_finish_output@LIBJPEG_8.0 8c
+ jpeg_free_large@LIBJPEG_8.0 8c
+ jpeg_free_small@LIBJPEG_8.0 8c
+ jpeg_gen_optimal_table@LIBJPEG_8.0 8c-2ubuntu5~
+ jpeg_get_large@LIBJPEG_8.0 8c
+ jpeg_get_small@LIBJPEG_8.0 8c
+ jpeg_has_multiple_scans@LIBJPEG_8.0 8c
+ jpeg_huff_decode@LIBJPEG_8.0 8c-2ubuntu5~
+ jpeg_idct_10x10@LIBJPEG_8.0 8c
+ jpeg_idct_10x5@LIBJPEG_8.0 8c
+ jpeg_idct_11x11@LIBJPEG_8.0 8c
+ jpeg_idct_12x12@LIBJPEG_8.0 8c
+ jpeg_idct_12x6@LIBJPEG_8.0 8c
+ jpeg_idct_13x13@LIBJPEG_8.0 8c
+ jpeg_idct_14x14@LIBJPEG_8.0 8c
+ jpeg_idct_14x7@LIBJPEG_8.0 8c
+ jpeg_idct_15x15@LIBJPEG_8.0 8c
+ jpeg_idct_16x16@LIBJPEG_8.0 8c
+ jpeg_idct_16x8@LIBJPEG_8.0 8c
+ jpeg_idct_1x1@LIBJPEG_8.0 8c
+ jpeg_idct_1x2@LIBJPEG_8.0 8c
+ jpeg_idct_2x1@LIBJPEG_8.0 8c
+ jpeg_idct_2x2@LIBJPEG_8.0 8c
+ jpeg_idct_2x4@LIBJPEG_8.0 8c
+ jpeg_idct_3x3@LIBJPEG_8.0 8c
+ jpeg_idct_3x6@LIBJPEG_8.0 8c
+ jpeg_idct_4x2@LIBJPEG_8.0 8c
+ jpeg_idct_4x4@LIBJPEG_8.0 8c
+ jpeg_idct_4x8@LIBJPEG_8.0 8c
+ jpeg_idct_5x10@LIBJPEG_8.0 8c
+ jpeg_idct_5x5@LIBJPEG_8.0 8c
+ jpeg_idct_6x12@LIBJPEG_8.0 8c
+ jpeg_idct_6x3@LIBJPEG_8.0 8c
+ jpeg_idct_6x6@LIBJPEG_8.0 8c
+ jpeg_idct_7x14@LIBJPEG_8.0 8c
+ jpeg_idct_7x7@LIBJPEG_8.0 8c
+ jpeg_idct_8x16@LIBJPEG_8.0 8c
+ jpeg_idct_8x4@LIBJPEG_8.0 8c
+ jpeg_idct_9x9@LIBJPEG_8.0 8c
+ jpeg_idct_float@LIBJPEG_8.0 8c
+ jpeg_idct_ifast@LIBJPEG_8.0 8c
+ jpeg_idct_islow@LIBJPEG_8.0 8c
+ jpeg_input_complete@LIBJPEG_8.0 8c
+ jpeg_make_c_derived_tbl@LIBJPEG_8.0 8c-2ubuntu5~
+ jpeg_make_d_derived_tbl@LIBJPEG_8.0 8c-2ubuntu5~
+ jpeg_mem_available@LIBJPEG_8.0 8c
+ jpeg_mem_dest@LIBJPEG_8.0 8c
+ jpeg_mem_init@LIBJPEG_8.0 8c
+ jpeg_mem_src@LIBJPEG_8.0 8c
+ jpeg_mem_term@LIBJPEG_8.0 8c
+ jpeg_natural_order2@LIBJPEG_8.0 8c
+ jpeg_natural_order3@LIBJPEG_8.0 8c
+ jpeg_natural_order4@LIBJPEG_8.0 8c
+ jpeg_natural_order5@LIBJPEG_8.0 8c
+ jpeg_natural_order6@LIBJPEG_8.0 8c
+ jpeg_natural_order7@LIBJPEG_8.0 8c
+ jpeg_natural_order@LIBJPEG_8.0 8c
+ jpeg_new_colormap@LIBJPEG_8.0 8c
+ jpeg_open_backing_store@LIBJPEG_8.0 8c
+ jpeg_quality_scaling@LIBJPEG_8.0 8c
+ jpeg_read_coefficients@LIBJPEG_8.0 8c
+ jpeg_read_header@LIBJPEG_8.0 8c
+ jpeg_read_raw_data@LIBJPEG_8.0 8c
+ jpeg_read_scanlines@LIBJPEG_8.0 8c
+ jpeg_resync_to_restart@LIBJPEG_8.0 8c
+ jpeg_save_markers@LIBJPEG_8.0 8c
+ jpeg_set_colorspace@LIBJPEG_8.0 8c
+ jpeg_set_defaults@LIBJPEG_8.0 8c
+ jpeg_set_linear_quality@LIBJPEG_8.0 8c
+ jpeg_set_marker_processor@LIBJPEG_8.0 8c
+ jpeg_set_quality@LIBJPEG_8.0 8c
+ (arch=i386)jpeg_simd_cpu_support@LIBJPEG_8.0 8c-2ubuntu5~
+ jpeg_simple_progression@LIBJPEG_8.0 8c
+ jpeg_start_compress@LIBJPEG_8.0 8c
+ jpeg_start_decompress@LIBJPEG_8.0 8c
+ jpeg_start_output@LIBJPEG_8.0 8c
+ jpeg_std_error@LIBJPEG_8.0 8c
+ jpeg_std_message_table@LIBJPEG_8.0 8c
+ jpeg_stdio_dest@LIBJPEG_8.0 8c
+ jpeg_stdio_src@LIBJPEG_8.0 8c
+ jpeg_suppress_tables@LIBJPEG_8.0 8c
+ jpeg_write_coefficients@LIBJPEG_8.0 8c
+ jpeg_write_m_byte@LIBJPEG_8.0 8c
+ jpeg_write_m_header@LIBJPEG_8.0 8c
+ jpeg_write_marker@LIBJPEG_8.0 8c
+ jpeg_write_raw_data@LIBJPEG_8.0 8c
+ jpeg_write_scanlines@LIBJPEG_8.0 8c
+ jpeg_write_tables@LIBJPEG_8.0 8c
+ jround_up@LIBJPEG_8.0 8c
+ jzero_far@LIBJPEG_8.0 8c
+ libjpeg_general_init@LIBJPEG_8.0 8c-2ubuntu5~
diff --git a/debian/libturbojpeg.install b/debian/libturbojpeg.install
new file mode 100644
index 0000000..bb72f28
--- /dev/null
+++ b/debian/libturbojpeg.install
@@ -0,0 +1 @@
+usr/lib/*/libturbojpeg.so
diff --git a/debian/libturbojpeg.postinst b/debian/libturbojpeg.postinst
new file mode 100644
index 0000000..bf1f031
--- /dev/null
+++ b/debian/libturbojpeg.postinst
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+set -e
+
+if [ "$1" = "configure" ]; then
+	ldconfig
+fi
+
+#DEBHELPER#
diff --git a/debian/libturbojpeg.postrm b/debian/libturbojpeg.postrm
new file mode 100644
index 0000000..84f9e91
--- /dev/null
+++ b/debian/libturbojpeg.postrm
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+set -e
+
+if [ "$1" = "remove" ]; then
+	ldconfig
+fi
+
+#DEBHELPER#
diff --git a/debian/patches/FixLibraryStartup.patch b/debian/patches/FixLibraryStartup.patch
new file mode 100644
index 0000000..05ffd90
--- /dev/null
+++ b/debian/patches/FixLibraryStartup.patch
@@ -0,0 +1,730 @@
+diff -uNr -x .bzr libjpeg-turbo-1.1.90+svn702/Makefile.am libjpeg-turbo.now/Makefile.am
+--- libjpeg-turbo-1.1.90+svn702/Makefile.am	2011-05-26 10:45:06.000000000 +0000
++++ libjpeg-turbo.now/Makefile.am	2011-09-22 18:25:08.000000000 +0000
+@@ -7,7 +7,7 @@
+ HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
+ 	jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h jpegcomp.h
+ 
+-libjpeg_la_SOURCES = $(HDRS) jcapimin.c jcapistd.c jccoefct.c jccolor.c \
++libjpeg_la_SOURCES = $(HDRS) jlibinit.c jcapimin.c jcapistd.c jccoefct.c jccolor.c \
+ 	jcdctmgr.c jchuff.c jcinit.c jcmainct.c jcmarker.c jcmaster.c \
+ 	jcomapi.c jcparam.c jcphuff.c jcprepct.c jcsample.c jctrans.c \
+ 	jdapimin.c jdapistd.c jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c \
+diff -uNr -x .bzr libjpeg-turbo-1.1.90+svn702/jlibinit.c libjpeg-turbo.now/jlibinit.c
+--- libjpeg-turbo-1.1.90+svn702/jlibinit.c	1970-01-01 00:00:00.000000000 +0000
++++ libjpeg-turbo.now/jlibinit.c	2011-09-22 18:25:08.000000000 +0000
+@@ -0,0 +1,149 @@
++/*
++ * jlibinit.c
++ *
++ * Copyright 2011 Tom Gall <tom.gall@linaro.org> for Linaro Limited
++ *
++ * This file is for library initialization
++ * Alg notes:
++ *   At the time the library is loaded, libjpeg_general_init is called
++ *     this will run before main, in the case that the library is NOT
++ *     loaded via dlopen
++ *   libjpeg_general_init will call libjpeg_arch_specific_init which
++ *     if there is work to do should be implemented by each architecture
++ *     otherwise the empty stub will be called.
++ * 
++ *   Example: In the case of arm, we want to query the auxv to 
++ *   determine if neon hardware is present. Further if present
++ *   functional pointers can be setup such that the simd versions
++ *   will be used.
++ */
++
++#include <link.h>
++#include <elf.h>
++#include <malloc.h>
++#include <unistd.h>
++#include <stdio.h>
++
++#include "jinclude.h"
++#include "jpeglib.h"
++#include "jsimd.h"
++
++#if defined(__linux__) || defined(__APPLE__)
++LOCAL(void) libjpeg_arch_specific_init(void);
++
++GLOBAL(void __attribute__ ((constructor))) libjpeg_general_init(void);
++
++#if !defined(__APPLE__)
++volatile ElfW(auxv_t) *auxv = NULL;
++/* Read /proc/self/auxv into the global auxv (once); returns it, or 0 on failure. */
++LOCAL(ElfW(auxv_t) *)get_auxv(void)
++{
++  FILE *auxv_f;
++  ElfW(auxv_t) auxv_struct, *buf;
++  size_t i = 0, max = (size_t)getpagesize() / sizeof(ElfW(auxv_t));
++
++  if(auxv == NULL) {
++    auxv_f = fopen("/proc/self/auxv", "r");
++    if(auxv_f == 0) {
++       perror("Error opening file for reading");
++       return 0;
++    }
++    /* zero-filled, so the copy ends in AT_NULL (0) even after a short read */
++    buf = (ElfW(auxv_t) *)calloc(max, sizeof(ElfW(auxv_t)));
++    /* never fill the last slot: it stays the AT_NULL terminator */
++    while(buf != NULL && i + 1 < max &&
++          fread(&auxv_struct, sizeof(auxv_struct), 1, auxv_f) == 1 &&
++          auxv_struct.a_type != AT_NULL)
++      buf[i++] = auxv_struct;
++    fclose(auxv_f);  /* the stream is only needed while copying */
++    auxv = buf;      /* stays NULL when the allocation failed */
++  }
++  return (ElfW(auxv_t) *)auxv;
++}
++#else
++
++/* For OSes without an auxv implementation
++ */
++LOCAL(ElfW(auxv_t) *)get_auxv(void)
++{
++  return NULL;  /* stub: nothing is read here, so there is no vector to return */
++}
++#endif /* if not on apple */
++
++/* 
++ * Runs immediately after library load (and before main(), unless
++ * dlopen is used).
++ */
++GLOBAL(void __attribute__ ((constructor))) libjpeg_general_init(void) 
++{
++  // Architecture independent library init
++
++  get_auxv();
++
++  // call to arch specific init
++  libjpeg_arch_specific_init();
++}
++
++#if defined(__arm__) || defined(__i386__)
++
++LOCAL(void) libjpeg_arch_specific_init(void) 
++{
++
++  init_simd();
++
++}
++#elif defined(__x86_64__)
++
++LOCAL(void) libjpeg_arch_specific_init(void) 
++{
++
++}
++
++#else
++
++/*
++ * empty stub for architectures that don't define an arch specific
++ * init routine.
++ */
++LOCAL(void) libjpeg_arch_specific_init(void) 
++{
++
++}
++
++#endif /* __arch__ */
++
++
++
++#elif defined(_WIN32) /* Windows */
++LOCAL(void) libjpeg_arch_specific_init(void); /* defined after DllMain */
++GLOBAL (BOOL) WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason,
++      LPVOID lpReserved)
++{
++  switch(fdwReason) {
++    case DLL_PROCESS_ATTACH:
++      /* library is being loaded into the process: call the arch specific init */
++      libjpeg_arch_specific_init();
++      break;
++    case DLL_PROCESS_DETACH:
++      break;  /* nothing to tear down on unload */
++  }
++  /* DLL entry point: always report success to the loader */
++  return (TRUE);
++}
++
++#if defined(__i386__)
++
++LOCAL(void) libjpeg_arch_specific_init(void) 
++{
++
++  init_simd();
++
++}
++#elif defined(__x86_64__)
++
++LOCAL(void) libjpeg_arch_specific_init(void) 
++{
++
++}
++#endif /* arch for Windows */
++#endif /* __linux__ __APPLE__  */
+diff -uNr -x .bzr libjpeg-turbo-1.1.90+svn702/jsimd.h libjpeg-turbo.now/jsimd.h
+--- libjpeg-turbo-1.1.90+svn702/jsimd.h	2011-02-18 20:43:04.000000000 +0000
++++ libjpeg-turbo.now/jsimd.h	2011-09-22 18:25:08.000000000 +0000
+@@ -37,6 +37,8 @@
+ #define jsimd_h2v1_merged_upsample        jSH2V1MUp
+ #endif /* NEED_SHORT_EXTERNAL_NAMES */
+ 
++
++EXTERN(void) init_simd JPP((void));
+ EXTERN(int) jsimd_can_rgb_ycc JPP((void));
+ EXTERN(int) jsimd_can_rgb_gray JPP((void));
+ EXTERN(int) jsimd_can_ycc_rgb JPP((void));
+diff -uNr -x .bzr libjpeg-turbo-1.1.90+svn702/simd/jsimd_arm.c libjpeg-turbo.now/simd/jsimd_arm.c
+--- libjpeg-turbo-1.1.90+svn702/simd/jsimd_arm.c	2011-08-22 13:48:01.000000000 +0000
++++ libjpeg-turbo.now/simd/jsimd_arm.c	2011-09-22 18:25:08.000000000 +0000
+@@ -15,6 +15,13 @@
+  * Based on the stubs from 'jsimd_none.c'
+  */
+ 
++#include <stdio.h>
++#include <link.h>
++#include <elf.h>
++#include <asm/hwcap.h>
++#include <string.h>
++#include <ctype.h>
++
+ #define JPEG_INTERNALS
+ #include "../jinclude.h"
+ #include "../jpeglib.h"
+@@ -23,105 +30,30 @@
+ #include "../jsimddct.h"
+ #include "jsimd.h"
+ 
+-#include <stdio.h>
+-#include <string.h>
+-#include <ctype.h>
+-
+ static unsigned int simd_support = ~0;
+ 
+-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+-
+-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+-
+-LOCAL(int)
+-check_feature (char *buffer, char *feature)
+-{
+-  char *p;
+-  if (*feature == 0)
+-    return 0;
+-  if (strncmp(buffer, "Features", 8) != 0)
+-    return 0;
+-  buffer += 8;
+-  while (isspace(*buffer))
+-    buffer++;
+-
+-  /* Check if 'feature' is present in the buffer as a separate word */
+-  while ((p = strstr(buffer, feature))) {
+-    if (p > buffer && !isspace(*(p - 1))) {
+-      buffer++;
+-      continue;
+-    }
+-    p += strlen(feature);
+-    if (*p != 0 && !isspace(*p)) {
+-      buffer++;
+-      continue;
+-    }
+-    return 1;
+-  }
+-  return 0;
+-}
+-
+-LOCAL(int)
+-parse_proc_cpuinfo (int bufsize)
+-{
+-  char *buffer = (char *)malloc(bufsize);
+-  FILE *fd;
+-  simd_support = 0;
+-
+-  if (!buffer)
+-    return 0;
+-
+-  fd = fopen("/proc/cpuinfo", "r");
+-  if (fd) {
+-    while (fgets(buffer, bufsize, fd)) {
+-      if (!strchr(buffer, '\n') && !feof(fd)) {
+-        /* "impossible" happened - insufficient size of the buffer! */
+-        fclose(fd);
+-        free(buffer);
+-        return 0;
+-      }
+-      if (check_feature(buffer, "neon"))
+-        simd_support |= JSIMD_ARM_NEON;
+-    }
+-    fclose(fd);
+-  }
+-  free(buffer);
+-  return 1;
+-}
+-
+-#endif
++extern volatile ElfW(auxv_t) *auxv;
+ 
+ /*
+  * Check what SIMD accelerations are supported.
+- *
+- * FIXME: This code is racy under a multi-threaded environment.
++ * this is called once and ONLY once from libjpeg_general_init
+  */
+-LOCAL(void)
++GLOBAL(void)
+ init_simd (void)
+ {
+-  char *env = NULL;
+-#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+-  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+-#endif
++  char *env=NULL;
++  ElfW(auxv_t) *tauxv=(ElfW(auxv_t) *)auxv;
+ 
+   if (simd_support != ~0)
+     return;
+-
+   simd_support = 0;
+ 
+-#if defined(__ARM_NEON__)
+-  simd_support |= JSIMD_ARM_NEON;
+-#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+-  /* We still have a chance to use NEON regardless of globally used
+-   * -mcpu/-mfpu options passed to gcc by performing runtime detection via
+-   * /proc/cpuinfo parsing on linux/android */
+-  while (!parse_proc_cpuinfo(bufsize)) {
+-    bufsize *= 2;
+-    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+-      break;
+-  }
+-#endif
++  /* scan for AT_HWCAP, stopping at the AT_NULL end marker */
++  while (tauxv && tauxv->a_type != AT_HWCAP && tauxv->a_type != AT_NULL)
++    tauxv++;
++  if (tauxv && tauxv->a_type == AT_HWCAP && (tauxv->a_un.a_val & HWCAP_NEON))
++    simd_support |= JSIMD_ARM_NEON;
++  
+   /* Force different settings through environment variables */
+   env = getenv("JSIMD_FORCE_ARM_NEON");
+   if ((env != NULL) && (strcmp(env, "1") == 0))
+@@ -134,8 +66,6 @@
+ GLOBAL(int)
+ jsimd_can_rgb_ycc (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (BITS_IN_JSAMPLE != 8)
+     return 0;
+@@ -153,16 +83,12 @@
+ GLOBAL(int)
+ jsimd_can_rgb_gray (void)
+ {
+-  init_simd();
+-
+   return 0;
+ }
+ 
+ GLOBAL(int)
+ jsimd_can_ycc_rgb (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (BITS_IN_JSAMPLE != 8)
+     return 0;
+@@ -260,16 +186,12 @@
+ GLOBAL(int)
+ jsimd_can_h2v2_downsample (void)
+ {
+-  init_simd();
+-
+   return 0;
+ }
+ 
+ GLOBAL(int)
+ jsimd_can_h2v1_downsample (void)
+ {
+-  init_simd();
+-
+   return 0;
+ }
+ 
+@@ -288,16 +210,12 @@
+ GLOBAL(int)
+ jsimd_can_h2v2_upsample (void)
+ {
+-  init_simd();
+-
+   return 0;
+ }
+ 
+ GLOBAL(int)
+ jsimd_can_h2v1_upsample (void)
+ {
+-  init_simd();
+-
+   return 0;
+ }
+ 
+@@ -320,16 +238,12 @@
+ GLOBAL(int)
+ jsimd_can_h2v2_fancy_upsample (void)
+ {
+-  init_simd();
+-
+   return 0;
+ }
+ 
+ GLOBAL(int)
+ jsimd_can_h2v1_fancy_upsample (void)
+ {
+-  init_simd();
+-
+   return 0;
+ }
+ 
+@@ -352,16 +266,12 @@
+ GLOBAL(int)
+ jsimd_can_h2v2_merged_upsample (void)
+ {
+-  init_simd();
+-
+   return 0;
+ }
+ 
+ GLOBAL(int)
+ jsimd_can_h2v1_merged_upsample (void)
+ {
+-  init_simd();
+-
+   return 0;
+ }
+ 
+@@ -384,8 +294,6 @@
+ GLOBAL(int)
+ jsimd_can_convsamp (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -405,8 +313,6 @@
+ GLOBAL(int)
+ jsimd_can_convsamp_float (void)
+ {
+-  init_simd();
+-
+   return 0;
+ }
+ 
+@@ -427,16 +333,12 @@
+ GLOBAL(int)
+ jsimd_can_fdct_islow (void)
+ {
+-  init_simd();
+-
+   return 0;
+ }
+ 
+ GLOBAL(int)
+ jsimd_can_fdct_ifast (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -452,8 +354,6 @@
+ GLOBAL(int)
+ jsimd_can_fdct_float (void)
+ {
+-  init_simd();
+-
+   return 0;
+ }
+ 
+@@ -477,8 +377,6 @@
+ GLOBAL(int)
+ jsimd_can_quantize (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -496,8 +394,6 @@
+ GLOBAL(int)
+ jsimd_can_quantize_float (void)
+ {
+-  init_simd();
+-
+   return 0;
+ }
+ 
+@@ -518,8 +414,6 @@
+ GLOBAL(int)
+ jsimd_can_idct_2x2 (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -541,8 +435,6 @@
+ GLOBAL(int)
+ jsimd_can_idct_4x4 (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -582,8 +474,6 @@
+ GLOBAL(int)
+ jsimd_can_idct_islow (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -605,8 +495,6 @@
+ GLOBAL(int)
+ jsimd_can_idct_ifast (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -630,8 +518,6 @@
+ GLOBAL(int)
+ jsimd_can_idct_float (void)
+ {
+-  init_simd();
+-
+   return 0;
+ }
+ 
+diff -uNr -x .bzr libjpeg-turbo-1.1.90+svn702/simd/jsimd_i386.c libjpeg-turbo.now/simd/jsimd_i386.c
+--- libjpeg-turbo-1.1.90+svn702/simd/jsimd_i386.c	2011-02-18 20:51:10.000000000 +0000
++++ libjpeg-turbo.now/simd/jsimd_i386.c	2011-09-22 18:25:08.000000000 +0000
+@@ -36,7 +36,7 @@
+  *
+  * FIXME: This code is racy under a multi-threaded environment.
+  */
+-LOCAL(void)
++GLOBAL(void)
+ init_simd (void)
+ {
+   char *env = NULL;
+@@ -64,8 +64,6 @@
+ GLOBAL(int)
+ jsimd_can_rgb_ycc (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (BITS_IN_JSAMPLE != 8)
+     return 0;
+@@ -86,8 +84,6 @@
+ GLOBAL(int)
+ jsimd_can_rgb_gray (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (BITS_IN_JSAMPLE != 8)
+     return 0;
+@@ -108,8 +104,6 @@
+ GLOBAL(int)
+ jsimd_can_ycc_rgb (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (BITS_IN_JSAMPLE != 8)
+     return 0;
+@@ -277,8 +271,6 @@
+ GLOBAL(int)
+ jsimd_can_h2v2_downsample (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (BITS_IN_JSAMPLE != 8)
+     return 0;
+@@ -296,8 +288,6 @@
+ GLOBAL(int)
+ jsimd_can_h2v1_downsample (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (BITS_IN_JSAMPLE != 8)
+     return 0;
+@@ -343,8 +333,6 @@
+ GLOBAL(int)
+ jsimd_can_h2v2_upsample (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (BITS_IN_JSAMPLE != 8)
+     return 0;
+@@ -362,8 +350,6 @@
+ GLOBAL(int)
+ jsimd_can_h2v1_upsample (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (BITS_IN_JSAMPLE != 8)
+     return 0;
+@@ -409,8 +395,6 @@
+ GLOBAL(int)
+ jsimd_can_h2v2_fancy_upsample (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (BITS_IN_JSAMPLE != 8)
+     return 0;
+@@ -429,8 +413,6 @@
+ GLOBAL(int)
+ jsimd_can_h2v1_fancy_upsample (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (BITS_IN_JSAMPLE != 8)
+     return 0;
+@@ -479,8 +461,6 @@
+ GLOBAL(int)
+ jsimd_can_h2v2_merged_upsample (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (BITS_IN_JSAMPLE != 8)
+     return 0;
+@@ -499,8 +479,6 @@
+ GLOBAL(int)
+ jsimd_can_h2v1_merged_upsample (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (BITS_IN_JSAMPLE != 8)
+     return 0;
+@@ -619,8 +597,6 @@
+ GLOBAL(int)
+ jsimd_can_convsamp (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -642,8 +618,6 @@
+ GLOBAL(int)
+ jsimd_can_convsamp_float (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -689,8 +663,6 @@
+ GLOBAL(int)
+ jsimd_can_fdct_islow (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -708,8 +680,6 @@
+ GLOBAL(int)
+ jsimd_can_fdct_ifast (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -727,8 +697,6 @@
+ GLOBAL(int)
+ jsimd_can_fdct_float (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -773,8 +741,6 @@
+ GLOBAL(int)
+ jsimd_can_quantize (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -794,8 +760,6 @@
+ GLOBAL(int)
+ jsimd_can_quantize_float (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -839,8 +803,6 @@
+ GLOBAL(int)
+ jsimd_can_idct_2x2 (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -864,8 +826,6 @@
+ GLOBAL(int)
+ jsimd_can_idct_4x4 (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -911,8 +871,6 @@
+ GLOBAL(int)
+ jsimd_can_idct_islow (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -936,8 +894,6 @@
+ GLOBAL(int)
+ jsimd_can_idct_ifast (void)
+ {
+-  init_simd();
+-
+   /* The code is optimised for these values only */
+   if (DCTSIZE != 8)
+     return 0;
+@@ -963,8 +919,6 @@
+ GLOBAL(int)
+ jsimd_can_idct_float (void)
+ {
+-  init_simd();
+-
+   if (DCTSIZE != 8)
+     return 0;
+   if (sizeof(JCOEF) != 2)
diff --git a/debian/patches/add-copying-for-debian-dir.patch b/debian/patches/add-copying-for-debian-dir.patch
new file mode 100644
index 0000000..f8d33e3
--- /dev/null
+++ b/debian/patches/add-copying-for-debian-dir.patch
@@ -0,0 +1,172 @@
+Index: libjpeg-turbo-1.1.90+svn722/COPYING
+===================================================================
+--- /dev/null	1970-01-01 00:00:00.000000000 +0000
++++ libjpeg-turbo-1.1.90+svn722/COPYING	2011-12-01 10:44:50.909771234 -0600
+@@ -0,0 +1,167 @@
++This LGPL License ONLY applies to the contents of the Debian directory and specifically excludes the contents of the debian/patches directory.
++
++Version 2.1, February 1999
++
++Copyright (C) 1991, 1999 Free Software Foundation, Inc. 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
++
++[This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.]
++
++Preamble
++The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users.
++
++This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below.
++
++When we speak of free software, we are referring to freedom of use, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things.
++
++To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it.
++
++For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights.
++
++We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library.
++
++To protect each distributor, we want to make it very clear that there is no warranty for the free library. Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others.
++
++Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder. Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license.
++
++Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. We use this license for certain libraries in order to permit linking those libraries into non-free programs.
++
++When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library.
++
++We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances.
++
++For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License.
++
++In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system.
++
++Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library.
++
++The precise terms and conditions for copying, distribution and modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run.
++
++TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
++0. This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you".
++
++A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables.
++
++The "Library", below, refers to any such software library or work which has been distributed under these terms. A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. (Hereinafter, translation is included without limitation in the term "modification".)
++
++"Source code" for a work means the preferred form of the work for making modifications to it. For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library.
++
++Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). Whether that is true depends on what the Library does and what the program that uses the Library does.
++
++1. You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library.
++
++You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee.
++
++2. You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions:
++
++a) The modified work must itself be a software library.
++
++b) You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change.
++
++c) You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License.
++
++d) If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful.
++
++(For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.)
++
++These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it.
++
++Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library.
++
++In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License.
++
++3. You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices.
++
++Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy.
++
++This option is useful when you wish to copy part of the code of the Library into a program that is not a library.
++
++4. You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange.
++
++If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code.
++
++5. A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License.
++
++However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables.
++
++When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. The threshold for this to be true is not precisely defined by law.
++
++If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.)
++
++Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself.
++
++6. As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications.
++
++You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. Also, you must do one of these things:
++
++a) Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. (It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.)
++
++b) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with.
++
++c) Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution.
++
++d) If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place.
++
++e) Verify that the user has already received a copy of these materials or that you have already sent this user a copy.
++
++For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable.
++
++It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute.
++
++7. You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things:
++
++a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above.
++
++b) Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work.
++
++8. You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance.
++
++9. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it.
++
++10. Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License.
++
++11. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library.
++
++If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances.
++
++It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice.
++
++This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License.
++
++12. If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License.
++
++13. The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns.
++
++Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation.
++
++14. If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally.
++
++NO WARRANTY
++
++15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
++
++16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
++
++END OF TERMS AND CONDITIONS
++How to Apply These Terms to Your New Libraries
++If you develop a new library, and you want it to be of the greatest possible use to the public, we recommend making it free software that everyone can redistribute and change. You can do so by permitting redistribution under these terms (or, alternatively, under the terms of the ordinary General Public License).
++To apply these terms, attach the following notices to the library. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found.
++
++<one line to give the library's name and an idea of what it does.> Copyright (C) <year> <name of author>
++
++This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version.
++
++This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
++
++You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++
++Also add information on how to contact you by electronic and paper mail.
++
++You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the library, if necessary. Here is a sample; alter the names:
++
++Yoyodyne, Inc., hereby disclaims all copyright interest in the library `Frob' (a library for tweaking knobs) written by James Random Hacker.
++
++signature of Ty Coon, 1 April 1990
++Ty Coon, President of Vice
++
++That's all there is to it!
diff --git a/debian/patches/debian-changes-1.1.1-1inaro2 b/debian/patches/debian-changes-1.1.1-1inaro2
deleted file mode 100644
index 0f136d4..0000000
--- a/debian/patches/debian-changes-1.1.1-1inaro2
+++ /dev/null
@@ -1,3508 +0,0 @@
-Description: Upstream changes introduced in version 1.1.1-1inaro2
- This patch has been created by dpkg-source during the package build.
- Here's the last changelog entry, hopefully it gives details on why
- those changes were made:
- .
- libjpeg-turbo (1.1.1-1inaro2) natty; urgency=low
- .
-   * release
-   * add timestamp code to cjpeg.c
-   * default cjpeg.c and djpeg.c timestamp code to off
- .
- The person named in the Author field signed this changelog entry.
-Author: Tom Gall <tom.gall@linaro.org>
-
----
-The information above should follow the Patch Tagging Guidelines, please
-checkout http://dep.debian.net/deps/dep3/ to learn about the format. Here
-are templates for supplementary fields that you might want to add:
-
-Origin: <vendor|upstream|other>, <url of original patch>
-Bug: <url in upstream bugtracker>
-Bug-Debian: http://bugs.debian.org/<bugnumber>
-Bug-Ubuntu: https://launchpad.net/bugs/<bugnumber>
-Forwarded: <no|not-needed|url proving that it has been forwarded>
-Reviewed-By: <name and email of someone who approved the patch>
-Last-Update: <YYYY-MM-DD>
-
---- libjpeg-turbo-1.1.1.orig/jdcolor.c
-+++ libjpeg-turbo-1.1.1/jdcolor.c
-@@ -159,6 +159,106 @@ ycc_rgb_convert (j_decompress_ptr cinfo,
-   }
- }
- 
-+/*
-+ * Convert CMYK to RGB: R = C*K/255, G = M*K/255, B = Y*K/255 (double math, truncated to JSAMPLE).
-+ */
-+METHODDEF(void)
-+cmyk_rgb_convert (j_decompress_ptr cinfo,
-+		 JSAMPIMAGE input_buf, JDIMENSION input_row,
-+		 JSAMPARRAY output_buf, int num_rows)
-+{
-+  double c, m, y, k;
-+  register JSAMPROW outptr;
-+  register JSAMPROW inptr0, inptr1, inptr2, inptr3;
-+  register JDIMENSION col;
-+
-+  JDIMENSION num_cols = cinfo->output_width;
-+
-+  while (--num_rows >= 0) {
-+    inptr0 = input_buf[0][input_row];
-+    inptr1 = input_buf[1][input_row];
-+    inptr2 = input_buf[2][input_row];
-+    inptr3 = input_buf[3][input_row];
-+    input_row++;
-+    outptr = *output_buf++;
-+    for (col = 0; col < num_cols; col++) {
-+	c = (double) GETJSAMPLE(inptr0[col]);
-+	m = (double) GETJSAMPLE(inptr1[col]);
-+	y = (double) GETJSAMPLE(inptr2[col]);
-+	k = (double) GETJSAMPLE(inptr3[col]);
-+
-+    outptr[RGB_RED] =   (JSAMPLE)(c*k/255);
-+    outptr[RGB_GREEN] = (JSAMPLE)(m*k/255);
-+    outptr[RGB_BLUE] =  (JSAMPLE)(y*k/255);
-+    outptr += RGB_PIXELSIZE;
-+    }
-+  }
-+}
-+
-+/*
-+ * Convert YCCK to RGB.
-+ *
-+ * Y/Cb/Cr are converted to inverted RGB (i.e. C, M, Y) through the
-+ * range-limit table and the Cr/Cb lookup tables held in cinfo->cconvert;
-+ * each of C, M, Y is then scaled by K / 255 to give R, G, B.
-+ * Reads 4 input planes and writes num_rows rows of RGB_PIXELSIZE-wide pixels.
-+ */
-+METHODDEF(void)
-+ycck_rgb_convert (j_decompress_ptr cinfo,
-+		   JSAMPIMAGE input_buf, JDIMENSION input_row,
-+		   JSAMPARRAY output_buf, int num_rows)
-+{
-+  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-+  double cyan, magenta, yellow, black;
-+  register int y, cb, cr;
-+  register JSAMPROW outptr;
-+  register JSAMPROW inptr0, inptr1, inptr2, inptr3;
-+  register JDIMENSION col;
-+  JDIMENSION num_cols = cinfo->output_width;
-+
-+  /* copy these pointers into registers if possible */
-+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-+  register int * Crrtab = cconvert->Cr_r_tab;
-+  register int * Cbbtab = cconvert->Cb_b_tab;
-+  register INT32 * Crgtab = cconvert->Cr_g_tab;
-+  register INT32 * Cbgtab = cconvert->Cb_g_tab;
-+  SHIFT_TEMPS
-+
-+  while (--num_rows >= 0) {
-+    inptr0 = input_buf[0][input_row];
-+    inptr1 = input_buf[1][input_row];
-+    inptr2 = input_buf[2][input_row];
-+    inptr3 = input_buf[3][input_row];
-+    input_row++;
-+    outptr = *output_buf++;
-+    for (col = 0; col < num_cols; col++) {
-+      /* Read the YCCK pixel; K is used as-is (no colour conversion). */
-+      y     = GETJSAMPLE(inptr0[col]);
-+      cb    = GETJSAMPLE(inptr1[col]);
-+      cr    = GETJSAMPLE(inptr2[col]);
-+      black = (double) GETJSAMPLE(inptr3[col]);
-+
-+      /* Convert YCC to C, M, Y.  Range-limiting is essential due to noise
-+       * introduced by DCT losses.  The results stay in locals: an output
-+       * pixel holds only RGB_PIXELSIZE samples, so using outptr[0..3] as
-+       * CMYK scratch space would store K past a 3-sample pixel (and
-+       * potentially past the end of the row for the last column).
-+       */
-+      cyan    = (double) GETJSAMPLE(range_limit[MAXJSAMPLE - (y + Crrtab[cr])]);
-+      magenta = (double) GETJSAMPLE(range_limit[MAXJSAMPLE - (y +
-+			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS)))]);
-+      yellow  = (double) GETJSAMPLE(range_limit[MAXJSAMPLE - (y + Cbbtab[cb])]);
-+
-+      /* Convert CMYK to RGB (same formula as cmyk_rgb_convert). */
-+      outptr[RGB_RED]   = (JSAMPLE)(cyan*black/255);
-+      outptr[RGB_GREEN] = (JSAMPLE)(magenta*black/255);
-+      outptr[RGB_BLUE]  = (JSAMPLE)(yellow*black/255);
-+
-+      outptr += RGB_PIXELSIZE;
-+    }
-+  }
-+}
-+
- 
- /**************** Cases other than YCbCr -> RGB **************/
- 
-@@ -377,6 +477,11 @@ jinit_color_deconverter (j_decompress_pt
-         cconvert->pub.color_convert = ycc_rgb_convert;
-         build_ycc_rgb_table(cinfo);
-       }
-+    } else if (cinfo->jpeg_color_space == JCS_CMYK) {
-+      cconvert->pub.color_convert = cmyk_rgb_convert;
-+    } else if (cinfo->jpeg_color_space == JCS_YCCK) {
-+      cconvert->pub.color_convert = ycck_rgb_convert;
-+      build_ycc_rgb_table(cinfo);
-     } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
-       cconvert->pub.color_convert = gray_rgb_convert;
-     } else if (cinfo->jpeg_color_space == cinfo->out_color_space &&
---- libjpeg-turbo-1.1.1.orig/cjpeg.c
-+++ libjpeg-turbo-1.1.1/cjpeg.c
-@@ -40,6 +40,28 @@
- #endif
- 
- 
-+
-+#ifdef PROFILE_ENCODING
-+#include <time.h>
-+/* Encode profiling: process CPU time around compression, reported in ms. */
-+#define  TIMER_DEFINE_VARS  struct timespec starttime, endtime;
-+#define  TIMER_GETDIFF_MS() (int)( (endtime.tv_sec - starttime.tv_sec)*1000 + (endtime.tv_nsec - starttime.tv_nsec)/1000000)
-+#define  TIMER_START do { clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &starttime); } while (0)
-+#define  TIMER_STOP do { clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &endtime); } while (0)
-+#define  TIMER_PRINT(...) fprintf(stderr, __VA_ARGS__)
-+/* TIMER_GETDIFF_MS() yields int so it matches the "%d" in the TIMER_PRINT call. */
-+#else
-+/* Profiling disabled: the timer macros expand to no-ops. */
-+#define TIMER_DEFINE_VARS do {} while (0)
-+#define TIMER_GETDIFF_MS() 0
-+#define TIMER_START do {} while (0)
-+#define TIMER_STOP do {} while (0)
-+#define TIMER_PRINT(...) do {} while (0)
-+
-+#endif
-+
-+
-+
- /* Create the add-on message string table. */
- 
- #define JMESSAGE(code,string)	string ,
-@@ -575,6 +597,10 @@ main (int argc, char **argv)
-   /* Specify data destination for compression */
-   jpeg_stdio_dest(&cinfo, output_file);
- 
-+
-+  TIMER_DEFINE_VARS;
-+  TIMER_START;
-+
-   /* Start compressor */
-   jpeg_start_compress(&cinfo, TRUE);
- 
-@@ -589,6 +615,10 @@ main (int argc, char **argv)
-   jpeg_finish_compress(&cinfo);
-   jpeg_destroy_compress(&cinfo);
- 
-+
-+  TIMER_STOP;
-+  TIMER_PRINT("Encoding took %d ms\n", TIMER_GETDIFF_MS());
-+
-   /* Close files, if we opened them */
-   if (input_file != stdin)
-     fclose(input_file);
---- libjpeg-turbo-1.1.1.orig/configure.ac
-+++ libjpeg-turbo-1.1.1/configure.ac
-@@ -19,6 +19,7 @@ AC_PROG_CC
- AC_PROG_INSTALL
- AC_PROG_LIBTOOL
- AC_PROG_LN_S
-+AM_PROG_AS
- 
- # Check whether compiler supports pointers to undefined structures
- AC_MSG_CHECKING(whether compiler supports pointers to undefined structures)
-@@ -164,7 +165,7 @@ if test "x$VERSION_SCRIPT_FLAG" = "x"; t
-   VERSION_SCRIPT=no
-   AC_MSG_RESULT(no)
- fi
--LDFLAGS="$SAVED_LDFLAGS"
-+LDFLAGS="$SAVED_LDFLAGS"; LIBS="$LIBS -lrt"  # link librt (clock_gettime) via LIBS, not LDFLAGS
- 
- AC_MSG_CHECKING([whether to use version script when building libjpeg-turbo])
- AC_MSG_RESULT($VERSION_SCRIPT)
-@@ -213,6 +214,10 @@ if test "x${with_simd}" != "xno"; then
-       AC_PROG_NASM
-       simd_arch=i386
-     ;;
-+    arm*)
-+      AC_MSG_RESULT([yes (ARM_NEON)])
-+      simd_arch=arm_neon
-+   ;;
-     *)
-       AC_MSG_RESULT([no ("$host_cpu")])
-       AC_MSG_WARN([SIMD support not available for this CPU.  Performance will suffer.])
-@@ -228,6 +233,7 @@ fi
- AM_CONDITIONAL([WITH_SIMD], [test "x$with_simd" != "xno"])
- AM_CONDITIONAL([SIMD_I386], [test "x$simd_arch" = "xi386"])
- AM_CONDITIONAL([SIMD_X86_64], [test "x$simd_arch" = "xx86_64"])
-+AM_CONDITIONAL([SIMD_ARM_NEON], [test "x$simd_arch" = "xarm_neon"])
- AM_CONDITIONAL([X86_64], [test "x$host_cpu" = "xx86_64" -o "x$host_cpu" = "xamd64"])
- 
- case "$host_cpu" in
-@@ -239,6 +245,10 @@ case "$host_cpu" in
-     RPMARCH=i386
-     DEBARCH=i386
-     ;;
-+  armv7l)
-+    RPMARCH=armel
-+    DEBARCH=armel
-+    ;;
- esac
- 
- AC_SUBST(RPMARCH)
---- libjpeg-turbo-1.1.1.orig/djpeg.c
-+++ libjpeg-turbo-1.1.1/djpeg.c
-@@ -50,6 +50,24 @@ static const char * const cdjpeg_message
-   NULL
- };
- 
-+#ifdef PROFILE_DECODING
-+#include <time.h>
-+/* Decode profiling: process CPU time around the scanline loop, reported in ms. */
-+#define  TIMER_DEFINE_VARS  struct timespec starttime, endtime;
-+#define  TIMER_GETDIFF_MS() (int)( (endtime.tv_sec - starttime.tv_sec)*1000 + (endtime.tv_nsec - starttime.tv_nsec)/1000000)
-+#define  TIMER_START do { clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &starttime); } while (0)
-+#define  TIMER_STOP do { clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &endtime); } while (0)
-+#define  TIMER_PRINT(...) fprintf(stderr, __VA_ARGS__)
-+/* TIMER_GETDIFF_MS() yields int so it matches the "%d" in the TIMER_PRINT call. */
-+#else
-+/* Profiling disabled: the timer macros expand to no-ops. */
-+#define TIMER_DEFINE_VARS do {} while (0)
-+#define TIMER_GETDIFF_MS() 0
-+#define TIMER_START do {} while (0)
-+#define TIMER_STOP do {} while (0)
-+#define TIMER_PRINT(...) do {} while (0)
-+
-+#endif
- 
- /*
-  * This list defines the known output image formats
-@@ -539,6 +557,11 @@ main (int argc, char **argv)
-   /* Adjust default decompression parameters by re-parsing the options */
-   file_index = parse_switches(&cinfo, argc, argv, 0, TRUE);
- 
-+  if ((cinfo.jpeg_color_space == JCS_CMYK) ||
-+      (cinfo.jpeg_color_space == JCS_YCCK)) {
-+    cinfo.out_color_space = JCS_RGB;
-+  }
-+
-   /* Initialize the output module now to let it override any crucial
-    * option settings (for instance, GIF wants to force color quantization).
-    */
-@@ -583,6 +606,9 @@ main (int argc, char **argv)
-   /* Write output file header */
-   (*dest_mgr->start_output) (&cinfo, dest_mgr);
- 
-+  TIMER_DEFINE_VARS;
-+  TIMER_START;
-+
-   /* Process data */
-   while (cinfo.output_scanline < cinfo.output_height) {
-     num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
-@@ -590,6 +616,9 @@ main (int argc, char **argv)
-     (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
-   }
- 
-+  TIMER_STOP;
-+  TIMER_PRINT ("Decoding took %d ms\n", TIMER_GETDIFF_MS());
-+
- #ifdef PROGRESS_REPORT
-   /* Hack: count final pass as done in case finish_output does an extra pass.
-    * The library won't have updated completed_passes.
---- /dev/null
-+++ libjpeg-turbo-1.1.1/simd/jsimdcfg.inc
-@@ -0,0 +1,69 @@
-+;
-+; Automatically generated include file from jsimdcfg.inc.h
-+;
-+;
-+; -- jpeglib.h
-+;
-+%define DCTSIZE 8
-+%define DCTSIZE2 64
-+;
-+; -- jmorecfg.h
-+;
-+%define RGB_RED 0
-+%define RGB_GREEN 1
-+%define RGB_BLUE 2
-+%define RGB_PIXELSIZE 3
-+; Representation of a single sample (pixel element value).
-+; On this SIMD implementation, this must be 'unsigned char'.
-+;
-+%define JSAMPLE byte ; unsigned char
-+%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
-+%define CENTERJSAMPLE 128
-+; Representation of a DCT frequency coefficient.
-+; On this SIMD implementation, this must be 'short'.
-+;
-+%define JCOEF word ; short
-+%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
-+; Datatype used for image dimensions.
-+; On this SIMD implementation, this must be 'unsigned int'.
-+;
-+%define JDIMENSION dword ; unsigned int
-+%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
-+%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
-+%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
-+%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
-+%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
-+%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
-+%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
-+%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
-+%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
-+;
-+; -- jdct.h
-+;
-+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
-+; the DCT is to be performed in-place in that buffer.
-+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
-+;
-+%define DCTELEM word ; short
-+%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
-+%define FAST_FLOAT FP32 ; float
-+%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT)
-+; To maximize parallelism, the integer multiplier types are changed to short.
-+;
-+%define ISLOW_MULT_TYPE word ; must be short
-+%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
-+%define IFAST_MULT_TYPE word ; must be short
-+%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
-+%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
-+%define FLOAT_MULT_TYPE FP32 ; must be float
-+%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
-+;
-+; -- jsimd.h
-+;
-+%define JSIMD_NONE 0x00
-+%define JSIMD_MMX 0x01
-+%define JSIMD_3DNOW 0x02
-+%define JSIMD_SSE 0x04
-+%define JSIMD_SSE2 0x08
-+; Short forms of external names for systems with brain-damaged linkers.
-+;
---- /dev/null
-+++ libjpeg-turbo-1.1.1/simd/jdcolor-armv7.s
-@@ -0,0 +1,1741 @@
-+/*------------------------------------------------------------------------
-+* jdcolor-armv7.s
-+*
-+*  Copyright (c) 2010, Code Aurora Forum. All rights reserved.
-+*
-+*  Redistribution and use in source and binary forms, with or without
-+*  modification, are permitted provided that the following conditions are
-+*  met:
-+*      * Redistributions of source code must retain the above copyright
-+*        notice, this list of conditions and the following disclaimer.
-+*      * Redistributions in binary form must reproduce the above
-+*        copyright notice, this list of conditions and the following
-+*        disclaimer in the documentation and/or other materials provided
-+*        with the distribution.
-+*      * Neither the name of Code Aurora Forum, Inc. nor the names of its
-+*        contributors may be used to endorse or promote products derived
-+*        from this software without specific prior written permission.
-+*
-+*  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
-+*  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+*  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
-+*  ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-+*  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+*  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+*  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+*  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-+*  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-+*  OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
-+*  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+*--------------------------------------------------------------------------
-+
-+*--------------------------------------------------------------------------
-+*                         FUNCTION LIST
-+*--------------------------------------------------------------------------
-+*
-+* - yvup2rgb565_venum
-+* - yyvup2rgb565_venum
-+* - yvup2bgr888_venum
-+* - yyvup2bgr888_venum
-+* - yvup2abgr8888_venum
-+* - yyvup2abgr8888_venum
-+*
-+*--------------------------------------------------------------------------
-+*/
-+
-+    .section yvu_plain_to_rgb565, "x"  @ AREA
-+    .text                              @ |.text|, CODE, READONLY
-+    .align 2
-+    .code  32                          @ CODE32
-+
-+/*-----------------------------------------------------------------------------
-+ *   ARM register aliases used by the conversion routines
-+ * ---------------------------------------------------------------------------- */
-+p_y       .req r0
-+p_cr      .req r1
-+p_cb      .req r2
-+p_rgb     .req r3
-+p_bgr     .req r3
-+length    .req r12
-+
-+    .global yvup2rgb565_venum
-+    .global yyvup2rgb565_venum
-+    .global yvup2bgr888_venum
-+    .global yyvup2bgr888_venum
-+    .global yvup2abgr8888_venum
-+    .global yyvup2abgr8888_venum
-+
-+@ Q8 fixed-point (scaled by 256) coefficients of the YVU -> RGB conversion matrix
-+.equ COEFF_Y,          256             @ contribution of Y
-+.equ COEFF_V_RED,      359             @ contribution of V for red
-+.equ COEFF_U_GREEN,    -88             @ contribution of U for green
-+.equ COEFF_V_GREEN,   -183             @ contribution of V for green
-+.equ COEFF_U_BLUE,     454             @ contribution of U for blue
-+
-+@ Clamping constants 0x0 and 0xFF
-+.equ COEFF_0,          0
-+.equ COEFF_255,        255
-+
-+@ Bias terms for red, green and blue: chroma coefficient * -128, plus 128 (half of 256) for rounding
-+.equ COEFF_BIAS_R,   -45824            @ Red   bias =     -359*128 + 128
-+.equ COEFF_BIAS_G,    34816            @ Green bias = (88+183)*128 + 128
-+.equ COEFF_BIAS_B,   -57984            @ Blue  bias =     -454*128 + 128
-+
-+
-+/*--------------------------------------------------------------------------
-+* FUNCTION     : yvup2rgb565_venum
-+*--------------------------------------------------------------------------
-+* DESCRIPTION  : Perform YVU planar to RGB565 conversion.
-+*--------------------------------------------------------------------------
-+* C PROTOTYPE  : void yvup2rgb565_venum(uint8_t  *p_y,
-+*                                 uint8_t  *p_cr,
-+*                                 uint8_t  *p_cb,
-+*                                 uint8_t  *p_rgb565,
-+*                                 uint32_t  length)
-+*--------------------------------------------------------------------------
-+* REG INPUT    : R0: uint8_t  *p_y
-+*                      pointer to the input Y Line
-+*                R1: uint8_t  *p_cr
-+*                      pointer to the input Cr Line
-+*                R2: uint8_t  *p_cb
-+*                      pointer to the input Cb Line
-+*                R3: uint8_t  *p_rgb565
-+*                      pointer to the output RGB Line
-+*                (length is not a register input; it is loaded into R12
-+*                      from the stack, see STACK ARG)
-+*--------------------------------------------------------------------------
-+* STACK ARG    : uint32_t  length - width of Line (5th argument)
-+*--------------------------------------------------------------------------
-+* REG OUTPUT   : None
-+*--------------------------------------------------------------------------
-+* MEM INPUT    : p_y      - a line of Y pixels
-+*                p_cr     - a line of Cr pixels
-+*                p_cb     - a line of Cb pixels
-+*                length   - the width of the input line
-+*--------------------------------------------------------------------------
-+* MEM OUTPUT   : p_rgb565 - the converted rgb pixels
-+*--------------------------------------------------------------------------
-+* REG AFFECTED : ARM:  R0-R4, R12
-+*                NEON: Q0-Q15
-+*--------------------------------------------------------------------------
-+* STACK USAGE  : 4 bytes (LR is pushed on entry and popped into PC on return)
-+*--------------------------------------------------------------------------
-+* CYCLES       : none
-+*
-+*--------------------------------------------------------------------------
-+* NOTES        :
-+*--------------------------------------------------------------------------
-+*/
-+.type yvup2rgb565_venum, %function
-+yvup2rgb565_venum:
-+    /*-------------------------------------------------------------------------
-+     *  Store stack registers
-+     * ------------------------------------------------------------------------ */
-+    STMFD SP!, {LR}
-+
-+    PLD [R0, R3]                       @ preload luma line; NOTE(review): address is R0+R3 (p_y + p_rgb) - confirm intended
-+
-+    ADR   R12, constants
-+
-+    VLD1.S16  {D6, D7}, [R12]!         @ D6, D7: 359 |  -88 | -183 | 454 | 256 | 0 | 255 | 0
-+    VLD1.S32  {D30, D31}, [R12]        @ Q15   :  -45824    |    34816   |  -57984 |     X
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load the 5th parameter via stack
-+     *  R0 ~ R3 are used to pass the first 4 parameters, the 5th and above
-+     *  parameters are passed via stack
-+     * ------------------------------------------------------------------------ */
-+    LDR R12, [SP, #4]                  @ LR is the only one that has been pushed
-+                                       @ into stack, increment SP by 4 to
-+                                       @ get the parameter.
-+                                       @ LDMIB SP, {R12} is an equivalent
-+                                       @ instruction in this case, where only
-+                                       @ one register was pushed into stack.
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load clamping parameters to duplicate vector elements
-+     * ------------------------------------------------------------------------ */
-+    VDUP.S16  Q4,  D7[1]               @ Q4:  0  |  0  |  0  |  0  |  0  |  0  |  0  |  0
-+    VDUP.S16  Q5,  D7[2]               @ Q5: 255 | 255 | 255 | 255 | 255 | 255 | 255 | 255
-+
-+    /*-------------------------------------------------------------------------
-+     *  Read bias
-+     * ------------------------------------------------------------------------ */
-+    VDUP.S32  Q0,   D30[0]             @ Q0:  -45824 | -45824 | -45824 | -45824
-+    VDUP.S32  Q1,   D30[1]             @ Q1:   34816 |  34816 |  34816 |  34816
-+    VDUP.S32  Q2,   D31[0]             @ Q2:  -57984 | -57984 | -57984 | -57984
-+
-+
-+    /*-------------------------------------------------------------------------
-+     *  The main loop
-+     * ------------------------------------------------------------------------ */
-+loop_yvup2rgb565:
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load input from Y, V and U
-+     *  D12  : Y0  Y1  Y2  Y3  Y4  Y5  Y6  Y7
-+     *  D14  : V0  V1  V2  V3  V4  V5  V6  V7
-+     *  D15  : U0  U1  U2  U3  U4  U5  U6  U7
-+     * ------------------------------------------------------------------------ */
-+    VLD1.U8  {D12},  [p_y]!            @ Load 8 Y  elements (uint8) to D12
-+    VLD1.U8  {D14},  [p_cr]!           @ Load 8 Cr elements (uint8) to D14
-+    VLD1.U8  {D15},  [p_cb]!           @ Load 8 Cb elements (uint8) to D15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Expand uint8 value to uint16
-+     *  D18, D19: Y0 Y1 Y2 Y3 Y4 Y5 Y6 Y7
-+     *  D20, D21: V0 V1 V2 V3 V4 V5 V6 V7
-+     *  D22, D23: U0 U1 U2 U3 U4 U5 U6 U7
-+     * ------------------------------------------------------------------------ */
-+    VMOVL.U8 Q9,  D12
-+    VMOVL.U8 Q10, D14
-+    VMOVL.U8 Q11, D15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Multiply contribution from chrominance, results are in 32-bit
-+     * ------------------------------------------------------------------------ */
-+    VMULL.S16  Q12, D20, D6[0]         @ Q12:  359*(V0,V1,V2,V3)     Red
-+    VMULL.S16  Q13, D22, D6[1]         @ Q13:  -88*(U0,U1,U2,U3)     Green
-+    VMLAL.S16  Q13, D20, D6[2]         @ Q13:  -88*(U0,U1,U2,U3) - 183*(V0,V1,V2,V3)
-+    VMULL.S16  Q14, D22, D6[3]         @ Q14:  454*(U0,U1,U2,U3)     Blue
-+
-+    /*-------------------------------------------------------------------------
-+     *  Add bias
-+     * ------------------------------------------------------------------------ */
-+    VADD.S32  Q12, Q0                  @ Q12 add Red   bias -45824
-+    VADD.S32  Q13, Q1                  @ Q13 add Green bias  34816
-+    VADD.S32  Q14, Q2                  @ Q14 add Blue  bias -57984
-+
-+    /*-------------------------------------------------------------------------
-+     *  Calculate Red, Green, Blue
-+     * ------------------------------------------------------------------------ */
-+    VMLAL.S16  Q12, D18, D7[0]         @ Q12: R0, R1, R2, R3 in 32-bit Q8 format
-+    VMLAL.S16  Q13, D18, D7[0]         @ Q13: G0, G1, G2, G3 in 32-bit Q8 format
-+    VMLAL.S16  Q14, D18, D7[0]         @ Q14: B0, B1, B2, B3 in 32-bit Q8 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Right shift eight bits (VSHRN truncates; rounding comes from the +128 in the bias)
-+     * ------------------------------------------------------------------------ */
-+    VSHRN.S32   D18 , Q12, #8          @ D18: R0, R1, R2, R3 in 16-bit Q0 format
-+    VSHRN.S32   D20 , Q13, #8          @ D20: G0, G1, G2, G3 in 16-bit Q0 format
-+    VSHRN.S32   D22,  Q14, #8          @ D22: B0, B1, B2, B3 in 16-bit Q0 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Done with the first 4 elements, continue on the next 4 elements
-+     * ------------------------------------------------------------------------ */
-+
-+    /*-------------------------------------------------------------------------
-+     *  Multiply contribution from chrominance, results are in 32-bit
-+     * ------------------------------------------------------------------------ */
-+    VMULL.S16  Q12, D21, D6[0]         @ Q12:  359*(V4,V5,V6,V7)     Red
-+    VMULL.S16  Q13, D23, D6[1]         @ Q13:  -88*(U4,U5,U6,U7)     Green
-+    VMLAL.S16  Q13, D21, D6[2]         @ Q13:  -88*(U4,U5,U6,U7) - 183*(V4,V5,V6,V7)
-+    VMULL.S16  Q14, D23, D6[3]         @ Q14:  454*(U4,U5,U6,U7)     Blue
-+
-+    /*-------------------------------------------------------------------------
-+     *  Add bias
-+     * ------------------------------------------------------------------------ */
-+    VADD.S32  Q12, Q0                  @ Q12 add Red   bias -45824
-+    VADD.S32  Q13, Q1                  @ Q13 add Green bias  34816
-+    VADD.S32  Q14, Q2                  @ Q14 add Blue  bias -57984
-+
-+    /*-------------------------------------------------------------------------
-+     *  Calculate Red, Green, Blue
-+     * ------------------------------------------------------------------------ */
-+    VMLAL.S16  Q12, D19, D7[0]         @ Q12: R4, R5, R6, R7 in 32-bit Q8 format
-+    VMLAL.S16  Q13, D19, D7[0]         @ Q13: G4, G5, G6, G7 in 32-bit Q8 format
-+    VMLAL.S16  Q14, D19, D7[0]         @ Q14: B4, B5, B6, B7 in 32-bit Q8 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Right shift eight bits (VSHRN truncates; rounding comes from the +128 in the bias)
-+     * ------------------------------------------------------------------------ */
-+    VSHRN.S32   D19 , Q12, #8          @ D19: R4, R5, R6, R7 in 16-bit Q0 format
-+    VSHRN.S32   D21 , Q13, #8          @ D21: G4, G5, G6, G7 in 16-bit Q0 format
-+    VSHRN.S32   D23,  Q14, #8          @ D23: B4, B5, B6, B7 in 16-bit Q0 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Clamp the value to be within [0~255]
-+     * ------------------------------------------------------------------------ */
-+    VMAX.S16  Q9, Q9, Q4               @ if Q9 <   0, Q9 =   0
-+    VMIN.S16  Q9, Q9, Q5               @ if Q9 > 255, Q9 = 255
-+    VQMOVUN.S16  D28, Q9               @ store Red to D28, narrow the value from int16 to uint8
-+
-+    VMAX.S16  Q10, Q10, Q4             @ if Q10 <   0, Q10 =   0
-+    VMIN.S16  Q10, Q10, Q5             @ if Q10 > 255, Q10 = 255
-+    VQMOVUN.S16   D27, Q10             @ store Green to D27, narrow the value from int16 to uint8
-+
-+    VMAX.S16  Q11, Q11, Q4             @ if Q11 <   0, Q11 =   0
-+    VMIN.S16  Q11, Q11, Q5             @ if Q11 > 255, Q11 = 255
-+    VQMOVUN.S16   D26, Q11             @ store Blue to D26, narrow the value from int16 to uint8.
-+
-+    /*-------------------------------------------------------------------------
-+     *  D27:  3 bits of Green + 5 bits of Blue
-+     *  D28:  5 bits of Red   + 3 bits of Green
-+     * ------------------------------------------------------------------------ */
-+    VSRI.8   D28, D27, #5              @ right shift G by 5 and insert to R
-+    VSHL.U8  D27, D27, #3              @ left  shift G by 3
-+    VSRI.8   D27, D26, #3              @ right shift B by 3 and insert to G
-+
-+    SUBS length, length, #8            @ check if the length is less than 8
-+
-+    BMI  trailing_yvup2rgb565          @ jump to trailing processing if remaining length is less than 8
-+
-+    VST2.U8  {D27, D28}, [p_rgb]!      @ interleaved store of the two RGB565 bytes per pixel (D27, then D28)
-+                                       @ Blue at LSB
-+
-+    BHI loop_yvup2rgb565               @ loop if more than 8 pixels left
-+
-+    BEQ  end_yvup2rgb565               @ done if exactly 8 pixel processed in the loop
-+
-+
-+trailing_yvup2rgb565:
-+    /*-------------------------------------------------------------------------
-+     *  There are 1 ~ 7 pixels left in the trailing part and length holds
-+     *  (pixels left - 8).  Adding 7 turns it into (pixels left - 1), i.e. 0 ~ 6:
-+     *  eg: 1 pixel left in the trailing part, so 1-8+7 = 0.
-+     *  One pixel is then stored unconditionally, since at least 1 pixel is
-+     *  left; length is 0 once the pixel just stored was the last one.
-+     * ------------------------------------------------------------------------ */
-+    ADDS length, length, #7            @ there are 7 or less in the trailing part
-+
-+    VST2.U8 {D27[0], D28[0]}, [p_rgb]! @ at least 1 pixel left in the trailing part
-+    BEQ  end_yvup2rgb565               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST2.U8 {D27[1], D28[1]}, [p_rgb]! @ store one more pixel
-+    BEQ  end_yvup2rgb565               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST2.U8 {D27[2], D28[2]}, [p_rgb]! @ store one more pixel
-+    BEQ  end_yvup2rgb565               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST2.U8 {D27[3], D28[3]}, [p_rgb]! @ store one more pixel
-+    BEQ  end_yvup2rgb565               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST2.U8 {D27[4], D28[4]}, [p_rgb]! @ store one more pixel
-+    BEQ  end_yvup2rgb565               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST2.U8 {D27[5], D28[5]}, [p_rgb]! @ store one more pixel
-+    BEQ  end_yvup2rgb565               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST2.U8 {D27[6], D28[6]}, [p_rgb]! @ store the 7th (last possible) trailing pixel
-+
-+end_yvup2rgb565:
-+    LDMFD SP!, {PC}
-+
-+                                       @ end of yvup2rgb565
-+
-+
-+/*--------------------------------------------------------------------------
-+* FUNCTION     : yyvup2rgb565_venum
-+*--------------------------------------------------------------------------
-+* DESCRIPTION  : Perform YYVU planar to RGB565 conversion.
-+*--------------------------------------------------------------------------
-+* C PROTOTYPE  : void yyvup2rgb565_venum(uint8_t  *p_y,
-+*                                 uint8_t  *p_cr,
-+*                                 uint8_t  *p_cb,
-+*                                 uint8_t  *p_rgb565,
-+*                                 uint32_t  length)
-+*--------------------------------------------------------------------------
-+* REG INPUT    : R0: uint8_t  *p_y
-+*                      pointer to the input Y Line
-+*                R1: uint8_t  *p_cr
-+*                      pointer to the input Cr Line
-+*                R2: uint8_t  *p_cb
-+*                      pointer to the input Cb Line
-+*                R3: uint8_t  *p_rgb565
-+*                      pointer to the output RGB Line
-+*                (length is not a register input; it is loaded into R12
-+*                      from the stack, see STACK ARG)
-+*--------------------------------------------------------------------------
-+* STACK ARG    : uint32_t  length - width of Line (5th argument)
-+*--------------------------------------------------------------------------
-+* REG OUTPUT   : None
-+*--------------------------------------------------------------------------
-+* MEM INPUT    : p_y      - a line of Y pixels
-+*                p_cr     - a line of Cr pixels
-+*                p_cb     - a line of Cb pixels
-+*                length   - the width of the input line
-+*--------------------------------------------------------------------------
-+* MEM OUTPUT   : p_rgb565 - the converted rgb pixels
-+*--------------------------------------------------------------------------
-+* REG AFFECTED : ARM:  R0-R4, R12
-+*                NEON: Q0-Q15
-+*--------------------------------------------------------------------------
-+* STACK USAGE  : none
-+*--------------------------------------------------------------------------
-+* CYCLES       : none
-+*
-+*--------------------------------------------------------------------------
-+* NOTES        :
-+*--------------------------------------------------------------------------
-+*/
-+.type yyvup2rgb565_venum, %function
-+yyvup2rgb565_venum:
-+    /*-------------------------------------------------------------------------
-+     *  Store stack registers
-+     * ------------------------------------------------------------------------ */
-+    STMFD SP!, {LR}
-+
-+    PLD [R0, R3]                       @ preload luma line
-+
-+    ADR   R12, constants
-+
-+    VLD1.S16  {D6, D7}, [R12]!         @ D6, D7: 359 |  -88 | -183 | 454 | 256 | 0 | 255 | 0
-+    VLD1.S32  {D30, D31}, [R12]        @ Q15   :  -45824    |    34816   |  -57984 |     X
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load the 5th parameter via stack
-+     *  R0 ~ R3 are used to pass the first 4 parameters, the 5th and above
-+     *  parameters are passed via stack
-+     * ------------------------------------------------------------------------ */
-+    LDR R12, [SP, #4]                  @ LR is the only one that has been pushed
-+                                       @ into stack, increment SP by 4 to
-+                                       @ get the parameter.
-+                                       @ LDMIB SP, {R12} is an equivalent
-+                                       @ instruction in this case, where only
-+                                       @ one register was pushed into stack.
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load clamping parameters to duplicate vector elements
-+     * ------------------------------------------------------------------------ */
-+    VDUP.S16  Q4,  D7[1]               @ Q4:  0  |  0  |  0  |  0  |  0  |  0  |  0  |  0
-+    VDUP.S16  Q5,  D7[2]               @ Q5: 255 | 255 | 255 | 255 | 255 | 255 | 255 | 255
-+
-+    /*-------------------------------------------------------------------------
-+     *  Read bias
-+     * ------------------------------------------------------------------------ */
-+    VDUP.S32  Q0,   D30[0]             @ Q0:  -45824 | -45824 | -45824 | -45824
-+    VDUP.S32  Q1,   D30[1]             @ Q1:   34816 |  34816 |  34816 |  34816
-+    VDUP.S32  Q2,   D31[0]             @ Q2:  -70688 | -70688 | -70688 | -70688
-+
-+
-+    /*-------------------------------------------------------------------------
-+     *  The main loop
-+     * ------------------------------------------------------------------------ */
-+loop_yyvup2rgb565:
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load input from Y, V and U
-+     *  D12, D13: Y0 Y2 Y4 Y6 Y8 Y10 Y12 Y14, Y1 Y3 Y5 Y7 Y9 Y11 Y13 Y15
-+     *  D14     : V0 V1 V2 V3 V4 V5  V6  V7
-+     *  D15     : U0 U1 U2 U3 U4 U5  U6  U7
-+     * ------------------------------------------------------------------------ */
-+    VLD2.U8  {D12,D13}, [p_y]!         @ Load 16 Luma elements (uint8) to D12, D13
-+    VLD1.U8  {D14},     [p_cr]!        @ Load 8 Cr elements (uint8) to D14
-+    VLD1.U8  {D15},     [p_cb]!        @ Load 8 Cb elements (uint8) to D15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Expand uint8 value to uint16
-+     *  D24, D25: Y0 Y2 Y4 Y6 Y8 Y10 Y12 Y14
-+     *  D26, D27: Y1 Y3 Y5 Y7 Y9 Y11 Y13 Y15
-+     *  D28, D29: V0 V1 V2 V3 V4 V5  V6  V7
-+     *  D30, D31: U0 U1 U2 U3 U4 U5  U6  U7
-+     * ------------------------------------------------------------------------ */
-+    VMOVL.U8 Q12, D12
-+    VMOVL.U8 Q13, D13
-+    VMOVL.U8 Q14, D14
-+    VMOVL.U8 Q15, D15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Multiply contribution from chrominance, results are in 32-bit
-+     * ------------------------------------------------------------------------ */
-+    VMULL.S16  Q6, D28, D6[0]          @ Q6:  359*(V0,V1,V2,V3)     Red
-+    VMULL.S16  Q7, D30, D6[1]          @ Q7:  -88*(U0,U1,U2,U3)     Green
-+    VMLAL.S16  Q7, D28, D6[2]          @ q7:  -88*(U0,U1,U2,U3) - 183*(V0,V1,V2,V3)
-+    VMULL.S16  Q8, D30, D6[3]          @ q8:  454*(U0,U1,U2,U3)     Blue
-+
-+    /*-------------------------------------------------------------------------
-+     *  Add bias
-+     * ------------------------------------------------------------------------ */
-+    VADD.S32  Q6, Q0                   @ Q6 add Red   bias -45824
-+    VADD.S32  Q7, Q1                   @ Q7 add Green bias  34816
-+    VADD.S32  Q8, Q2                   @ Q8 add Blue  bias -57984
-+
-+    /*-------------------------------------------------------------------------
-+     *  Calculate Red, Green, Blue
-+     * ------------------------------------------------------------------------ */
-+    VMOV.S32   Q9, Q6
-+    VMLAL.S16  Q6, D24, D7[0]          @ Q6: R0, R2, R4, R6 in 32-bit Q8 format
-+    VMLAL.S16  Q9, D26, D7[0]          @ Q9: R1, R3, R5, R7 in 32-bit Q8 format
-+
-+    VMOV.S32   Q10, Q7
-+    VMLAL.S16  Q7,  D24, D7[0]         @ Q7:  G0, G2, G4, G6 in 32-bit Q8 format
-+    VMLAL.S16  Q10, D26, D7[0]         @ Q10: G1, G3, G5, G7 in 32-bit Q8 format
-+
-+    VMOV.S32   Q11, Q8
-+    VMLAL.S16  Q8,  D24, D7[0]         @ Q8:  B0, B2, B4, B6 in 32-bit Q8 format
-+    VMLAL.S16  Q11, D26, D7[0]         @ Q11: B1, B3, B5, B7 in 32-bit Q8 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Right shift eight bits with rounding
-+     * ------------------------------------------------------------------------ */
-+    VSHRN.S32   D12, Q6,  #8           @ D12: R0 R2 R4 R6 in 16-bit Q0 format
-+    VSHRN.S32   D13, Q9,  #8           @ D13: R1 R3 R5 R7 in 16-bit Q0 format
-+    VZIP.16     D12, D13               @ Q6 : R0 R1 R2 R3 R4 R5 R6 R7
-+
-+    VSHRN.S32   D18, Q7,  #8           @ D18: G0 G2 G4 G6 in 16-bit Q0 format
-+    VSHRN.S32   D19, Q10, #8           @ D19: G1 G3 G5 G7 in 16-bit Q0 format
-+    VZIP.16     D18, D19               @ Q9 : G0 G1 G2 G3 G4 G5 G6 G7
-+
-+    VSHRN.S32   D20, Q8,  #8           @ D20: B0 B2 B4 B6 in 16-bit Q0 format
-+    VSHRN.S32   D21, Q11, #8           @ D21: B1 B3 B5 B7 in 16-bit Q0 format
-+    VZIP.16     D20, D21               @ Q10: B0 B1 B2 B3 B4 B5 B6 B7
-+
-+    /*-------------------------------------------------------------------------
-+     *  Clamp the value to be within [0~255]
-+     * ------------------------------------------------------------------------ */
-+    VMAX.S16  Q6, Q6, Q4               @ if Q6 <   0, Q6 =   0
-+    VMIN.S16  Q6, Q6, Q5               @ if Q6 > 255, Q6 = 255
-+    VQMOVUN.S16  D23, Q6               @ store Red to D23, narrow the value from int16 to int8
-+
-+    VMAX.S16  Q9, Q9, Q4               @ if Q9 <   0, Q9 =   0
-+    VMIN.S16  Q9, Q9, Q5               @ if Q9 > 255, Q9 = 255
-+    VQMOVUN.S16  D22, Q9               @ store Green to D22, narrow the value from int16 to int8
-+
-+    VMAX.S16  Q10, Q10, Q4             @ if Q10 <   0, Q10 =   0
-+    VMIN.S16  Q10, Q10, Q5             @ if Q10 > 255, Q10 = 255
-+    VQMOVUN.S16   D21, Q10             @ store Blue to D21, narrow the value from int16 to int8
-+
-+    /*-------------------------------------------------------------------------
-+     *  D22:  3 bits of Green + 5 bits of Blue
-+     *  D23:  5 bits of Red   + 3 bits of Green
-+     * ------------------------------------------------------------------------ */
-+    VSRI.8   D23, D22, #5              @ right shift G by 5 and insert to R
-+    VSHL.U8  D22, D22, #3              @ left shift G by 3
-+    VSRI.8   D22, D21, #3              @ right shift B by 3 and insert to G
-+
-+    SUBS length, length, #8            @ check if the length is less than 8
-+
-+    BMI  trailing_yyvup2rgb565         @ jump to trailing processing if remaining length is less than 8
-+
-+    VST2.U8  {D22,D23}, [p_rgb]!       @ vector store Red, Green, Blue to destination
-+                                       @ Blue at LSB
-+
-+    BEQ  end_yyvup2rgb565              @ done if exactly 8 pixel processed in the loop
-+
-+
-+    /*-------------------------------------------------------------------------
-+     *  Done with the first 8 elements, continue on the next 8 elements
-+     * ------------------------------------------------------------------------ */
-+
-+    /*-------------------------------------------------------------------------
-+     *  Multiply contribution from chrominance, results are in 32-bit
-+     * ------------------------------------------------------------------------ */
-+    VMULL.S16  Q6, D29, D6[0]          @ Q6: 359*(V4,V5,V6,V7)       Red
-+    VMULL.S16  Q7, D31, D6[1]          @ Q7: -88*(U4,U5,U6,U7)      Green
-+    VMLAL.S16  Q7, D29, D6[2]          @ Q7: -88*(U4,U5,U6,U7) - 183*(V4,V5,V6,V7)
-+    VMULL.S16  Q8, D31, D6[3]          @ Q8: 454*(U4,U5,U6,U7)       Blue
-+
-+    /*-------------------------------------------------------------------------
-+     *  Add bias
-+     * ------------------------------------------------------------------------ */
-+    VADD.S32  Q6, Q0                   @ Q6 add Red   bias -45824
-+    VADD.S32  Q7, Q1                   @ Q7 add Green bias  34816
-+    VADD.S32  Q8, Q2                   @ Q8 add Blue  bias -57984
-+
-+    /*-------------------------------------------------------------------------
-+     *  Calculate Red, Green, Blue
-+     * ------------------------------------------------------------------------ */
-+    VMOV.S32   Q9, Q6
-+    VMLAL.S16  Q6, D25, D7[0]          @ Q6: R8 R10 R12 R14 in 32-bit Q8 format
-+    VMLAL.S16  Q9, D27, D7[0]          @ Q9: R9 R11 R13 R15 in 32-bit Q8 format
-+
-+    VMOV.S32   Q10, Q7
-+    VMLAL.S16  Q7,  D25, D7[0]         @ Q7: G0, G2, G4, G6 in 32-bit Q8 format
-+    VMLAL.S16  Q10, D27, D7[0]         @ Q10 : G1, G3, G5, G7 in 32-bit Q8 format
-+
-+    VMOV.S32   Q11, Q8
-+    VMLAL.S16  Q8,  D25, D7[0]         @ Q8: B0, B2, B4, B6 in 32-bit Q8 format
-+    VMLAL.S16  Q11, D27, D7[0]         @ Q11 : B1, B3, B5, B7 in 32-bit Q8 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Right shift eight bits with rounding
-+     * ------------------------------------------------------------------------ */
-+    VSHRN.S32   D12, Q6,  #8           @ D12: R8 R10 R12 R14 in 16-bit Q0 format
-+    VSHRN.S32   D13, Q9,  #8           @ D13: R9 R11 R13 R15 in 16-bit Q0 format
-+    VZIP.16     D12, D13               @ Q6: R8 R9 R10 R11 R12 R13 R14 R15
-+
-+    VSHRN.S32   D18, Q7,  #8           @ D18: G8 G10 G12 G14 in 16-bit Q0 format
-+    VSHRN.S32   D19, Q10, #8           @ D19: G9 G11 G13 G15 in 16-bit Q0 format
-+    VZIP.16     D18, D19               @ Q9:  G8 G9 G10 G11 G12 G13 G14 G15
-+
-+    VSHRN.S32   D20, Q8,  #8           @ D20: B8 B10 B12 B14 in 16-bit Q0 format
-+    VSHRN.S32   D21, Q11, #8           @ D21: B9 B11 B13 B15 in 16-bit Q0 format
-+    VZIP.16     D20, D21               @ Q10: B8 B9 B10 B11 B12 B13 B14 B15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Clamp the value to be within [0~255]
-+     * ------------------------------------------------------------------------ */
-+    VMAX.S16  Q6, Q6, Q4               @ if Q6 <   0, Q6 =   0
-+    VMIN.S16  Q6, Q6, Q5               @ if Q6 > 255, Q6 = 255
-+    VQMOVUN.S16  D23, Q6               @ store Red to D23, narrow the value from int16 to int8
-+
-+    VMAX.S16  Q9, Q9, Q4               @ if Q9 <   0, Q9 =   0
-+    VMIN.S16  Q9, Q9, Q5               @ if Q9 > 255, Q9 = 255
-+    VQMOVUN.S16  D22, Q9               @ store Green to D22, narrow the value from int16 to int8
-+
-+    VMAX.S16  Q10, Q10, Q4             @ if Q10 <   0, Q10 =   0
-+    VMIN.S16  Q10, Q10, Q5             @ if Q10 > 255, Q10 = 255
-+    VQMOVUN.S16   D21, Q10             @ store Blue to D21, narrow the value from int16 to int8
-+
-+    /*-------------------------------------------------------------------------
-+     *  D22:  3 bits of Green + 5 bits of Blue
-+     *  D23:  5 bits of Red   + 3 bits of Green
-+     * ------------------------------------------------------------------------ */
-+    VSRI.8   D23, D22, #5              @ right shift G by 5 and insert to R
-+    VSHL.U8  D22, D22, #3              @ left shift G by 3
-+    VSRI.8   D22, D21, #3              @ right shift B by 3 and insert to G
-+
-+    SUBS length, length, #8            @ check if the length is less than 8
-+
-+    BMI  trailing_yyvup2rgb565         @ jump to trailing processing if remaining length is less than 8
-+
-+    VST2.U8  {D22,D23}, [p_rgb]!       @ vector store Red, Green, Blue to destination
-+                                       @ Blue at LSB
-+
-+    BHI loop_yyvup2rgb565              @ loop if more than 8 pixels left
-+
-+    BEQ  end_yyvup2rgb565              @ done if exactly 8 pixel processed in the loop
-+
-+
-+trailing_yyvup2rgb565:
-+    /*-------------------------------------------------------------------------
-+     *  There are from 1 ~ 7 pixels left in the trailing part.
-+     *  First adding 7 to the length so the length would be from 0 ~ 6.
-+     *  eg: 1 pixel left in the trailing part, so 1-8+7 = 0.
-+     *  Then save 1 pixel unconditionally since at least 1 pixels left in the
-+     *  trailing part.
-+     * ------------------------------------------------------------------------ */
-+    ADDS length, length, #7            @ there are 7 or less in the trailing part
-+
-+    VST2.U8 {D22[0],D23[0]}, [p_rgb]!  @ at least 1 pixel left in the trailing part
-+    BEQ end_yyvup2rgb565               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST2.U8 {D22[1],D23[1]}, [p_rgb]!  @ store one more pixel
-+    BEQ end_yyvup2rgb565               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST2.U8 {D22[2],D23[2]}, [p_rgb]!  @ store one more pixel
-+    BEQ end_yyvup2rgb565               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST2.U8 {D22[3],D23[3]}, [p_rgb]!  @ store one more pixel
-+    BEQ end_yyvup2rgb565               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST2.U8 {D22[4],D23[4]}, [p_rgb]!  @ store one more pixel
-+    BEQ end_yyvup2rgb565               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST2.U8 {D22[5],D23[5]}, [p_rgb]!  @ store one more pixel
-+    BEQ end_yyvup2rgb565               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST2.U8 {D22[6],D23[6]}, [p_rgb]!  @ store one more pixel
-+
-+end_yyvup2rgb565:
-+    LDMFD SP!, {PC}
-+
-+                                       @ end of yyvup2rgb565
-+
-+constants:
-+    .hword (COEFF_V_RED),  (COEFF_U_GREEN), (COEFF_V_GREEN), (COEFF_U_BLUE) @   359  | -88   |  -183  | 454
-+    .hword (COEFF_Y),      (COEFF_0),       (COEFF_255)    , (COEFF_0)      @   256  |   0   |   255  |  0
-+    .word  (COEFF_BIAS_R), (COEFF_BIAS_G),  (COEFF_BIAS_B)                  @ -45824 | 34816 | -57984 |  X (only 3 words exist; the 16-byte VLD1 of the biases reads 4 bytes past this table)
-+
-+/*--------------------------------------------------------------------------
-+* FUNCTION     : yvup2bgr888_venum
-+*--------------------------------------------------------------------------
-+* DESCRIPTION  : Perform YVU planar to BGR888 conversion; one Cr/Cb sample is consumed per Y sample.
-+*--------------------------------------------------------------------------
-+* C PROTOTYPE  : void yvup2bgr888_venum(uint8_t  *p_y,
-+*                                 uint8_t  *p_cr,
-+*                                 uint8_t  *p_cb,
-+*                                 uint8_t  *p_bgr888,
-+*                                 uint32_t  length)
-+*--------------------------------------------------------------------------
-+* REG INPUT    : R0: uint8_t  *p_y
-+*                      pointer to the input Y Line
-+*                R1: uint8_t  *p_cr
-+*                      pointer to the input Cr Line
-+*                R2: uint8_t  *p_cb
-+*                      pointer to the input Cb Line
-+*                R3: uint8_t  *p_bgr888
-+*                      pointer to the output BGR Line
-+*                R12: not an input register; the code loads length into it
-+*                      from the stack (see STACK ARG)
-+*--------------------------------------------------------------------------
-+* STACK ARG    : uint32_t length - width of Line, read from [SP, #4] after LR is pushed
-+*--------------------------------------------------------------------------
-+* REG OUTPUT   : None
-+*--------------------------------------------------------------------------
-+* MEM INPUT    : p_y      - a line of Y pixels
-+*                p_cr     - a line of Cr pixels
-+*                p_cb     - a line of Cb pixels
-+*                length   - the width of the input line
-+*--------------------------------------------------------------------------
-+* MEM OUTPUT   : p_bgr888 - the converted bgr pixels
-+*--------------------------------------------------------------------------
-+* REG AFFECTED : ARM:  R0-R4, R12
-+*                NEON: Q0-Q15 (Q4-Q7 are overwritten and not saved or restored here)
-+*--------------------------------------------------------------------------
-+* STACK USAGE  : 4 bytes (LR pushed on entry, popped into PC on exit)
-+*--------------------------------------------------------------------------
-+* CYCLES       : none
-+*
-+*--------------------------------------------------------------------------
-+* NOTES        : NOTE(review): length == 0 is not special-cased; the trailing path would then store 7 pixels.
-+*--------------------------------------------------------------------------
-+*/
-+.type yvup2bgr888_venum, %function
-+yvup2bgr888_venum:
-+
-+    /*-------------------------------------------------------------------------
-+     *  Save LR only. NOTE(review): Q4-Q7 are clobbered below; AAPCS lists D8-D15 as callee-saved - confirm callers
-+     * ------------------------------------------------------------------------ */
-+    STMFD SP!, {LR}
-+
-+    PLD [R0, R3]                      @ preload hint at address R0 + R3; NOTE(review): R3 is p_bgr888 - confirm
-+
-+    ADR   R12, constants
-+
-+    VLD1.S16  {D6, D7}, [R12]!        @ D6, D7: 359 | -88 | -183 | 454 | 256 | 0 | 255 | 0
-+    VLD1.S32  {D30, D31}, [R12]       @ Q15   :  -45824   |    34816   |  -57984 |     X
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load the 5th parameter via stack
-+     *  R0 ~ R3 are used to pass the first 4 parameters, the 5th and above
-+     *  parameters are passed via stack
-+     * ------------------------------------------------------------------------ */
-+    LDR R12, [SP, #4]                 @ LR is the only one that has been pushed
-+                                      @ into stack, increment SP by 4 to
-+                                      @ get the parameter.
-+                                      @ LDMIB SP, {R12} is an equivalent
-+                                      @ instruction in this case, where only
-+                                      @ one register was pushed into stack.
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load clamping parameters to duplicate vector elements
-+     * ------------------------------------------------------------------------ */
-+    VDUP.S16  Q4,  D7[1]              @ Q4:  0  |  0  |  0  |  0  |  0  |  0  |  0  |  0
-+    VDUP.S16  Q5,  D7[2]              @ Q5: 255 | 255 | 255 | 255 | 255 | 255 | 255 | 255
-+
-+    /*-------------------------------------------------------------------------
-+     *  Read bias
-+     * ------------------------------------------------------------------------ */
-+    VDUP.S32  Q0,   D30[0]            @ Q0:  -45824 | -45824 | -45824 | -45824
-+    VDUP.S32  Q1,   D30[1]            @ Q1:   34816 |  34816 |  34816 |  34816
-+    VDUP.S32  Q2,   D31[0]            @ Q2:  -57984 | -57984 | -57984 | -57984
-+
-+
-+    /*-------------------------------------------------------------------------
-+     *  The main loop
-+     * ------------------------------------------------------------------------ */
-+loop_yvup2bgr888:
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load input from Y, V and U (always 8 bytes of each, even for a short tail)
-+     *  D12  : Y0  Y1  Y2  Y3  Y4  Y5  Y6  Y7
-+     *  D14  : V0  V1  V2  V3  V4  V5  V6  V7
-+     *  D15  : U0  U1  U2  U3  U4  U5  U6  U7
-+     * ------------------------------------------------------------------------ */
-+    VLD1.U8  {D12},  [p_y]!           @ Load 8 Luma elements (uint8) to D12
-+    VLD1.U8  {D14},  [p_cr]!          @ Load 8 Cr elements (uint8) to D14
-+    VLD1.U8  {D15},  [p_cb]!          @ Load 8 Cb elements (uint8) to D15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Expand uint8 value to uint16
-+     *  D18, D19: Y0 Y1 Y2 Y3 Y4 Y5 Y6 Y7
-+     *  D20, D21: V0 V1 V2 V3 V4 V5 V6 V7
-+     *  D22, D23: U0 U1 U2 U3 U4 U5 U6 U7
-+     * ------------------------------------------------------------------------ */
-+    VMOVL.U8 Q9,  D12
-+    VMOVL.U8 Q10, D14
-+    VMOVL.U8 Q11, D15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Multiply contribution from chrominance, results are in 32-bit
-+     * ------------------------------------------------------------------------ */
-+    VMULL.S16  Q12, D20, D6[0]        @ Q12:  359*(V0,V1,V2,V3)     Red
-+    VMULL.S16  Q13, D22, D6[1]        @ Q13:  -88*(U0,U1,U2,U3)     Green
-+    VMLAL.S16  Q13, D20, D6[2]        @ Q13:  -88*(U0,U1,U2,U3) - 183*(V0,V1,V2,V3)
-+    VMULL.S16  Q14, D22, D6[3]        @ Q14:  454*(U0,U1,U2,U3)     Blue
-+
-+    /*-------------------------------------------------------------------------
-+     *  Add bias
-+     * ------------------------------------------------------------------------ */
-+    VADD.S32  Q12, Q0                 @ Q12 add Red   bias -45824
-+    VADD.S32  Q13, Q1                 @ Q13 add Green bias  34816
-+    VADD.S32  Q14, Q2                 @ Q14 add Blue  bias -57984
-+
-+    /*-------------------------------------------------------------------------
-+     *  Calculate Red, Green, Blue
-+     * ------------------------------------------------------------------------ */
-+    VMLAL.S16  Q12, D18, D7[0]        @ Q12: R0, R1, R2, R3 in 32-bit Q8 format
-+    VMLAL.S16  Q13, D18, D7[0]        @ Q13: G0, G1, G2, G3 in 32-bit Q8 format
-+    VMLAL.S16  Q14, D18, D7[0]        @ Q14: B0, B1, B2, B3 in 32-bit Q8 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Narrowing right shift by eight bits (VSHRN truncates; the documented bias values already include +128)
-+     * ------------------------------------------------------------------------ */
-+    VSHRN.S32   D18 , Q12, #8         @ D18: R0, R1, R2, R3 in 16-bit Q0 format
-+    VSHRN.S32   D20 , Q13, #8         @ D20: G0, G1, G2, G3 in 16-bit Q0 format
-+    VSHRN.S32   D22,  Q14, #8         @ D22: B0, B1, B2, B3 in 16-bit Q0 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Done with the first 4 elements, continue on the next 4 elements
-+     * ------------------------------------------------------------------------ */
-+
-+    /*-------------------------------------------------------------------------
-+     *  Multiply contribution from chrominance, results are in 32-bit
-+     * ------------------------------------------------------------------------ */
-+    VMULL.S16  Q12, D21, D6[0]        @ Q12:  359*(V4,V5,V6,V7)     Red
-+    VMULL.S16  Q13, D23, D6[1]        @ Q13:  -88*(U4,U5,U6,U7)     Green
-+    VMLAL.S16  Q13, D21, D6[2]        @ Q13:  -88*(U4,U5,U6,U7) - 183*(V4,V5,V6,V7)
-+    VMULL.S16  Q14, D23, D6[3]        @ Q14:  454*(U4,U5,U6,U7)     Blue
-+
-+    /*-------------------------------------------------------------------------
-+     *  Add bias
-+     * ------------------------------------------------------------------------ */
-+    VADD.S32  Q12, Q0                 @ Q12 add Red   bias -45824
-+    VADD.S32  Q13, Q1                 @ Q13 add Green bias  34816
-+    VADD.S32  Q14, Q2                 @ Q14 add Blue  bias -57984
-+
-+    /*-------------------------------------------------------------------------
-+     *  Calculate Red, Green, Blue
-+     * ------------------------------------------------------------------------ */
-+    VMLAL.S16  Q12, D19, D7[0]        @ Q12: R4, R5, R6, R7 in 32-bit Q8 format
-+    VMLAL.S16  Q13, D19, D7[0]        @ Q13: G4, G5, G6, G7 in 32-bit Q8 format
-+    VMLAL.S16  Q14, D19, D7[0]        @ Q14: B4, B5, B6, B7 in 32-bit Q8 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Narrowing right shift by eight bits (VSHRN truncates; the documented bias values already include +128)
-+     * ------------------------------------------------------------------------ */
-+    VSHRN.S32   D19 , Q12, #8         @ D19: R4, R5, R6, R7 in 16-bit Q0 format
-+    VSHRN.S32   D21 , Q13, #8         @ D21: G4, G5, G6, G7 in 16-bit Q0 format
-+    VSHRN.S32   D23,  Q14, #8         @ D23: B4, B5, B6, B7 in 16-bit Q0 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Clamp the value to be within [0~255]
-+     * ------------------------------------------------------------------------ */
-+    VMAX.S16  Q11, Q11, Q4            @ if Q11 <   0, Q11 =   0
-+    VMIN.S16  Q11, Q11, Q5            @ if Q11 > 255, Q11 = 255
-+    VQMOVUN.S16   D28, Q11            @ store Blue to D28, narrow the value from int16 to uint8
-+
-+    VMAX.S16  Q10, Q10, Q4            @ if Q10 <   0, Q10 =   0
-+    VMIN.S16  Q10, Q10, Q5            @ if Q10 > 255, Q10 = 255
-+    VQMOVUN.S16   D27, Q10            @ store Green to D27, narrow the value from int16 to uint8
-+
-+    VMAX.S16    Q9, Q9, Q4            @ if Q9 <   0, Q9 =   0
-+    VMIN.S16    Q9, Q9, Q5            @ if Q9 > 255, Q9 = 255
-+    VQMOVUN.S16    D26, Q9            @ store Red to D26, narrow the value from int16 to uint8.
-+
-+    SUBS length, length, #8           @ check if the length is less than 8
-+
-+    BMI  trailing_yvup2bgr888         @ jump to trailing processing if remaining length is less than 8
-+
-+    VST3.U8  {D26,D27,D28}, [p_bgr]!  @ store 8 pixels, 3 interleaved bytes each, in the
-+                                      @ order D26 (Red), D27 (Green), D28 (Blue)
-+
-+    BHI loop_yvup2bgr888              @ loop if more than 8 pixels left
-+
-+    BEQ  end_yvup2bgr888              @ done if exactly 8 pixel processed in the loop
-+
-+
-+trailing_yvup2bgr888:
-+    /*-------------------------------------------------------------------------
-+     *  There are 1 ~ 7 pixels left in the trailing part.
-+     *  Adding 7 to the length first maps it to the range 0 ~ 6,
-+     *  eg: 1 pixel left in the trailing part, so 1-8+7 = 0.
-+     *  Then store 1 pixel unconditionally since at least 1 pixel is left in the
-+     *  trailing part.
-+     * ------------------------------------------------------------------------ */
-+    ADDS length, length, #7           @ there are 7 or less in the trailing part
-+
-+    VST3.U8 {D26[0], D27[0], D28[0]}, [p_bgr]! @ at least 1 pixel left in the trailing part
-+    BEQ  end_yvup2bgr888                       @ done if 0 pixel left
-+
-+    SUBS length, length, #1           @ update length counter
-+    VST3.U8 {D26[1], D27[1], D28[1]}, [p_bgr]!  @ store one more pixel
-+    BEQ  end_yvup2bgr888                        @ done if 0 pixel left
-+
-+    SUBS length, length, #1           @ update length counter
-+    VST3.U8 {D26[2], D27[2], D28[2]}, [p_bgr]!  @ store one more pixel
-+    BEQ  end_yvup2bgr888                        @ done if 0 pixel left
-+
-+    SUBS length, length, #1           @ update length counter
-+    VST3.U8 {D26[3], D27[3], D28[3]}, [p_bgr]!  @ store one more pixel
-+    BEQ  end_yvup2bgr888                        @ done if 0 pixel left
-+
-+    SUBS length, length, #1           @ update length counter
-+    VST3.U8 {D26[4], D27[4], D28[4]}, [p_bgr]!  @ store one more pixel
-+    BEQ  end_yvup2bgr888                        @ done if 0 pixel left
-+
-+    SUBS length, length, #1           @ update length counter
-+    VST3.U8 {D26[5], D27[5], D28[5]}, [p_bgr]!  @ store one more pixel
-+    BEQ  end_yvup2bgr888                        @ done if 0 pixel left
-+
-+    SUBS length, length, #1           @ update length counter
-+    VST3.U8 {D26[6], D27[6], D28[6]}, [p_bgr]!  @ store one more pixel
-+
-+end_yvup2bgr888:
-+    LDMFD SP!, {PC}
-+
-+                                      @ end of yvup2bgr888
-+
-+
-+/*-------------------------------------------------------------------------
-+* FUNCTION     : yyvup2bgr888_venum
-+*--------------------------------------------------------------------------
-+* DESCRIPTION  : Perform YYVU planar to BGR888 conversion.
-+*--------------------------------------------------------------------------
-+* C PROTOTYPE  : void yyvup2bgr888_venum(uint8_t  *p_y,
-+*                                 uint8_t  *p_cr,
-+*                                 uint8_t  *p_cb,
-+*                                 uint8_t  *p_bgr888,
-+*                                 uint32_t  length)
-+*--------------------------------------------------------------------------
-+* REG INPUT    : R0: uint8_t  *p_y
-+*                      pointer to the input Y Line
-+*                R1: uint8_t  *p_cr
-+*                      pointer to the input Cr Line
-+*                R2: uint8_t  *p_cb
-+*                      pointer to the input Cb Line
-+*                R3: uint8_t  *p_bgr888
-+*                      pointer to the output BGR Line
-+*                R12: uint32_t  length
-+*                      width of Line
-+*--------------------------------------------------------------------------
-+* STACK ARG    : None
-+*--------------------------------------------------------------------------
-+* REG OUTPUT   : None
-+*--------------------------------------------------------------------------
-+* MEM INPUT    : p_y      - a line of Y pixels
-+*                p_cr     - a line of Cr pixels
-+*                p_cb     - a line of Cb pixels
-+*                length   - the width of the input line
-+*--------------------------------------------------------------------------
-+* MEM OUTPUT   : p_bgr888 - the converted bgr pixels
-+*--------------------------------------------------------------------------
-+* REG AFFECTED : ARM:  R0-R4, R12
-+*                NEON: Q0-Q15
-+*--------------------------------------------------------------------------
-+* STACK USAGE  : none
-+*--------------------------------------------------------------------------
-+* CYCLES       : none
-+*
-+*--------------------------------------------------------------------------
-+* NOTES        :
-+*--------------------------------------------------------------------------
-+*/
-+.type yyvup2bgr888_venum, %function
-+yyvup2bgr888_venum:
-+    /*-------------------------------------------------------------------------
-+     *  Each pass of the main loop loads 16 Y, 8 Cr and 8 Cb bytes and emits up
-+     *  to 16 three-byte pixels; every chroma sample is shared by two
-+     *  neighbouring luma samples.
-+     *
-+     *  NOTE(review): Q4-Q7 (D8-D15) are overwritten below while only LR is
-+     *  saved. The ARM procedure call standard treats D8-D15 as callee-saved;
-+     *  confirm that every caller tolerates the clobber.
-+     * ------------------------------------------------------------------------ */
-+
-+    /*-------------------------------------------------------------------------
-+     *  Save the return address on the stack
-+     * ------------------------------------------------------------------------ */
-+    STMFD SP!, {LR}
-+
-+    PLD [R0, R3]                       @ cache preload hint for address R0 + R3
-+                                       @ NOTE(review): R3 is the 4th argument, not
-+                                       @ an offset; a luma-line preload would be
-+                                       @ PLD [R0] - confirm what was intended.
-+
-+    ADR   R12, constants               @ R12 -> coefficient/bias table (defined outside this function)
-+
-+    VLD1.S16  {D6, D7}, [R12]!         @ D6, D7: 359 | -88 | -183 | 454 | 256 | 0 | 255 | 0
-+    VLD1.S32  {D30, D31}, [R12]        @ Q15   :  -45824   |    34816   |  -57984 |     X
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load the 5th parameter via stack
-+     *  R0 ~ R3 are used to pass the first 4 parameters, the 5th and above
-+     *  parameters are passed via stack
-+     * ------------------------------------------------------------------------ */
-+    LDR R12, [SP, #4]                  @ LR is the only register that has been
-+                                       @ pushed, so the 5th parameter sits at
-+                                       @ SP + 4. This overwrites the table
-+                                       @ pointer, which is no longer needed.
-+                                       @ LDMIB SP, {R12} is an equivalent
-+                                       @ instruction in this case, where only
-+                                       @ one register was pushed into stack.
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load clamping parameters to duplicate vector elements
-+     * ------------------------------------------------------------------------ */
-+    VDUP.S16  Q4,  D7[1]               @ Q4:  0  |  0  |  0  |  0  |  0  |  0  |  0  |  0
-+    VDUP.S16  Q5,  D7[2]               @ Q5: 255 | 255 | 255 | 255 | 255 | 255 | 255 | 255
-+
-+    /*-------------------------------------------------------------------------
-+     *  Read bias
-+     *  Must happen before the loop: Q15 (D30, D31) is reused for U below.
-+     * ------------------------------------------------------------------------ */
-+    VDUP.S32  Q0,   D30[0]             @ Q0:  -45824 | -45824 | -45824 | -45824
-+    VDUP.S32  Q1,   D30[1]             @ Q1:   34816 |  34816 |  34816 |  34816
-+    VDUP.S32  Q2,   D31[0]             @ Q2:  -57984 | -57984 | -57984 | -57984
-+
-+
-+    /*-------------------------------------------------------------------------
-+     *  The main loop
-+     * ------------------------------------------------------------------------ */
-+loop_yyvup2bgr888:
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load input from Y, V and U
-+     *  VLD2 de-interleaves the luma bytes into even and odd samples.
-+     *  D12  : Y0  Y2  Y4  Y6  Y8  Y10 Y12 Y14
-+     *  D13  : Y1  Y3  Y5  Y7  Y9  Y11 Y13 Y15
-+     *  D14  : V0  V1  V2  V3  V4  V5  V6  V7
-+     *  D15  : U0  U1  U2  U3  U4  U5  U6  U7
-+     * ------------------------------------------------------------------------ */
-+    VLD2.U8  {D12,D13}, [p_y]!         @ Load 16 Luma elements (uint8) to D12, D13
-+    VLD1.U8  {D14},  [p_cr]!           @ Load 8 Cr elements (uint8) to D14
-+    VLD1.U8  {D15},  [p_cb]!           @ Load 8 Cb elements (uint8) to D15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Expand uint8 value to uint16
-+     *  D24, D25: Y0 Y2 Y4 Y6 Y8 Y10 Y12 Y14
-+     *  D26, D27: Y1 Y3 Y5 Y7 Y9 Y11 Y13 Y15
-+     *  D28, D29: V0 V1 V2 V3 V4 V5  V6  V7
-+     *  D30, D31: U0 U1 U2 U3 U4 U5  U6  U7
-+     * ------------------------------------------------------------------------ */
-+    VMOVL.U8 Q12, D12
-+    VMOVL.U8 Q13, D13
-+    VMOVL.U8 Q14, D14
-+    VMOVL.U8 Q15, D15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Multiply contribution from chrominance, results are in 32-bit
-+     * ------------------------------------------------------------------------ */
-+    VMULL.S16  Q6, D28, D6[0]          @ Q6:  359*(V0,V1,V2,V3)     Red
-+    VMULL.S16  Q7, D30, D6[1]          @ Q7: -88*(U0,U1,U2,U3)     Green
-+    VMLAL.S16  Q7, D28, D6[2]          @ Q7: -88*(U0,U1,U2,U3) - 183*(V0,V1,V2,V3)
-+    VMULL.S16  Q8, D30, D6[3]          @ Q8:  454*(U0,U1,U2,U3)     Blue
-+
-+    /*-------------------------------------------------------------------------
-+     *  Add bias
-+     * ------------------------------------------------------------------------ */
-+    VADD.S32  Q6, Q0                   @ Q6 add Red   bias -45824
-+    VADD.S32  Q7, Q1                   @ Q7 add Green bias  34816
-+    VADD.S32  Q8, Q2                   @ Q8 add Blue  bias -57984
-+
-+    /*-------------------------------------------------------------------------
-+     *  Calculate Red, Green, Blue
-+     *  The chroma + bias term is copied first so that the even and the odd
-+     *  luma sample of each pair both accumulate 256*Y (D7[0]) onto it.
-+     * ------------------------------------------------------------------------ */
-+    VMOV.S32   Q9, Q6                  @ Q9: copy of the Red term for the odd pixels
-+    VMLAL.S16  Q6, D24, D7[0]          @ Q6: R0, R2, R4, R6 in 32-bit Q8 format
-+    VMLAL.S16  Q9, D26, D7[0]          @ Q9: R1, R3, R5, R7 in 32-bit Q8 format
-+
-+    VMOV.S32   Q10, Q7                 @ Q10: copy of the Green term for the odd pixels
-+    VMLAL.S16  Q7,  D24, D7[0]         @ Q7:  G0, G2, G4, G6 in 32-bit Q8 format
-+    VMLAL.S16  Q10, D26, D7[0]         @ Q10: G1, G3, G5, G7 in 32-bit Q8 format
-+
-+    VMOV.S32   Q11, Q8                 @ Q11: copy of the Blue term for the odd pixels
-+    VMLAL.S16  Q8,  D24, D7[0]         @ Q8:  B0, B2, B4, B6 in 32-bit Q8 format
-+    VMLAL.S16  Q11, D26, D7[0]         @ Q11: B1, B3, B5, B7 in 32-bit Q8 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Right shift eight bits and narrow to 16-bit
-+     *  VSHRN itself truncates; the rounding term appears to be folded into
-+     *  the bias constants (e.g. -45824 = -359*128 + 128).
-+     *  VZIP then re-interleaves the even and odd results into pixel order.
-+     * ------------------------------------------------------------------------ */
-+    VSHRN.S32   D12, Q6,  #8           @ D12: R0 R2 R4 R6 in 16-bit Q0 format
-+    VSHRN.S32   D13, Q9,  #8           @ D13: R1 R3 R5 R7 in 16-bit Q0 format
-+    VZIP.16     D12, D13               @ Q6 : R0 R1 R2 R3 R4 R5 R6 R7
-+
-+    VSHRN.S32   D18, Q7,  #8           @ D18: G0 G2 G4 G6 in 16-bit Q0 format
-+    VSHRN.S32   D19, Q10, #8           @ D19: G1 G3 G5 G7 in 16-bit Q0 format
-+    VZIP.16     D18, D19               @ Q9 : G0 G1 G2 G3 G4 G5 G6 G7
-+
-+    VSHRN.S32   D20, Q8,  #8           @ D20: B0 B2 B4 B6 in 16-bit Q0 format
-+    VSHRN.S32   D21, Q11, #8           @ D21: B1 B3 B5 B7 in 16-bit Q0 format
-+    VZIP.16     D20, D21               @ Q10: B0 B1 B2 B3 B4 B5 B6 B7
-+
-+    /*-------------------------------------------------------------------------
-+     *  Clamp the value to be within [0~255]
-+     *  Blue is narrowed first because D21 (upper half of Q10) is reused for
-+     *  Red at the end of this group.
-+     * ------------------------------------------------------------------------ */
-+    VMAX.S16  Q10, Q10, Q4             @ if Q10 <   0, Q10 =   0
-+    VMIN.S16  Q10, Q10, Q5             @ if Q10 > 255, Q10 = 255
-+    VQMOVUN.S16   D23, Q10             @ store Blue to D23, narrow the value from int16 to uint8
-+
-+    VMAX.S16  Q9, Q9, Q4               @ if Q9 <   0, Q9 =   0
-+    VMIN.S16  Q9, Q9, Q5               @ if Q9 > 255, Q9 = 255
-+    VQMOVUN.S16  D22, Q9               @ store Green to D22, narrow the value from int16 to uint8
-+
-+    VMAX.S16  Q6, Q6, Q4               @ if Q6 <   0, Q6 =   0
-+    VMIN.S16  Q6, Q6, Q5               @ if Q6 > 255, Q6 = 255
-+    VQMOVUN.S16  D21, Q6               @ store Red to D21, narrow the value from int16 to uint8
-+
-+    SUBS length, length, #8            @ check if the length is less than 8
-+
-+    BMI  trailing_yyvup2bgr888         @ jump to trailing processing if remaining length is less than 8
-+
-+    VST3.U8  {D21,D22,D23}, [p_bgr]!   @ interleaved store of 8 pixels: Red (D21),
-+                                       @ Green (D22), Blue (D23); Red at the
-+                                       @ lowest address of each pixel
-+
-+    BEQ  end_yyvup2bgr888              @ done if exactly 8 pixel processed in the loop
-+                                       @ (flags are still those of the SUBS above)
-+
-+    /*-------------------------------------------------------------------------
-+     *  Done with the first 8 elements, continue on the next 8 elements
-+     *  (upper halves of the widened registers: D25, D27, D29, D31)
-+     * ------------------------------------------------------------------------ */
-+
-+    /*-------------------------------------------------------------------------
-+     *  Multiply contribution from chrominance, results are in 32-bit
-+     * ------------------------------------------------------------------------ */
-+    VMULL.S16  Q6, D29, D6[0]          @ Q6: 359*(V4,V5,V6,V7)       Red
-+    VMULL.S16  Q7, D31, D6[1]          @ Q7: -88*(U4,U5,U6,U7)      Green
-+    VMLAL.S16  Q7, D29, D6[2]          @ Q7: -88*(U4,U5,U6,U7) - 183*(V4,V5,V6,V7)
-+    VMULL.S16  Q8, D31, D6[3]          @ Q8: 454*(U4,U5,U6,U7)       Blue
-+
-+    /*-------------------------------------------------------------------------
-+     *  Add bias
-+     * ------------------------------------------------------------------------ */
-+    VADD.S32  Q6, Q0                   @ Q6 add Red   bias -45824
-+    VADD.S32  Q7, Q1                   @ Q7 add Green bias  34816
-+    VADD.S32  Q8, Q2                   @ Q8 add Blue  bias -57984
-+
-+    /*-------------------------------------------------------------------------
-+     *  Calculate Red, Green, Blue
-+     * ------------------------------------------------------------------------ */
-+    VMOV.S32   Q9, Q6                  @ Q9: copy of the Red term for the odd pixels
-+    VMLAL.S16  Q6, D25, D7[0]          @ Q6: R8 R10 R12 R14 in 32-bit Q8 format
-+    VMLAL.S16  Q9, D27, D7[0]          @ Q9: R9 R11 R13 R15 in 32-bit Q8 format
-+
-+    VMOV.S32   Q10, Q7                 @ Q10: copy of the Green term for the odd pixels
-+    VMLAL.S16  Q7,  D25, D7[0]         @ Q7:  G8 G10 G12 G14 in 32-bit Q8 format
-+    VMLAL.S16  Q10, D27, D7[0]         @ Q10: G9 G11 G13 G15 in 32-bit Q8 format
-+
-+    VMOV.S32   Q11, Q8                 @ Q11: copy of the Blue term for the odd pixels
-+    VMLAL.S16  Q8,  D25, D7[0]         @ Q8:  B8 B10 B12 B14 in 32-bit Q8 format
-+    VMLAL.S16  Q11, D27, D7[0]         @ Q11: B9 B11 B13 B15 in 32-bit Q8 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Right shift eight bits and narrow to 16-bit (truncating shift; the
-+     *  rounding term appears to be part of the bias), then re-interleave
-+     * ------------------------------------------------------------------------ */
-+    VSHRN.S32   D12, Q6,  #8           @ D12: R8 R10 R12 R14 in 16-bit Q0 format
-+    VSHRN.S32   D13, Q9,  #8           @ D13: R9 R11 R13 R15 in 16-bit Q0 format
-+    VZIP.16     D12, D13               @ Q6: R8 R9 R10 R11 R12 R13 R14 R15
-+
-+    VSHRN.S32   D18, Q7,  #8           @ D18: G8 G10 G12 G14 in 16-bit Q0 format
-+    VSHRN.S32   D19, Q10, #8           @ D19: G9 G11 G13 G15 in 16-bit Q0 format
-+    VZIP.16     D18, D19               @ Q9:  G8 G9 G10 G11 G12 G13 G14 G15
-+
-+    VSHRN.S32   D20, Q8,  #8           @ D20: B8 B10 B12 B14 in 16-bit Q0 format
-+    VSHRN.S32   D21, Q11, #8           @ D21: B9 B11 B13 B15 in 16-bit Q0 format
-+    VZIP.16     D20, D21               @ Q10: B8 B9 B10 B11 B12 B13 B14 B15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Clamp the value to be within [0~255]
-+     * ------------------------------------------------------------------------ */
-+    VMAX.S16  Q10, Q10, Q4             @ if Q10 <   0, Q10 =   0
-+    VMIN.S16  Q10, Q10, Q5             @ if Q10 > 255, Q10 = 255
-+    VQMOVUN.S16   D23, Q10             @ store Blue to D23, narrow the value from int16 to uint8
-+
-+    VMAX.S16  Q9, Q9, Q4               @ if Q9 <   0, Q9 =   0
-+    VMIN.S16  Q9, Q9, Q5               @ if Q9 > 255, Q9 = 255
-+    VQMOVUN.S16  D22, Q9               @ store Green to D22, narrow the value from int16 to uint8
-+
-+    VMAX.S16  Q6, Q6, Q4               @ if Q6 <   0, Q6 =   0
-+    VMIN.S16  Q6, Q6, Q5               @ if Q6 > 255, Q6 = 255
-+    VQMOVUN.S16  D21, Q6               @ store Red to D21, narrow the value from int16 to uint8
-+
-+
-+    SUBS length, length, #8            @ check if the length is less than 8
-+
-+    BMI  trailing_yyvup2bgr888         @ jump to trailing processing if remaining length is less than 8
-+
-+    VST3.U8  {D21,D22,D23}, [p_bgr]!   @ interleaved store of 8 pixels: Red (D21),
-+                                       @ Green (D22), Blue (D23); Red at the
-+                                       @ lowest address of each pixel
-+
-+    BHI loop_yyvup2bgr888              @ loop if there are still pixels left
-+
-+    BEQ  end_yyvup2bgr888              @ done if exactly 8 pixel processed in the loop
-+
-+
-+trailing_yyvup2bgr888:
-+    /*-------------------------------------------------------------------------
-+     *  There are from 1 ~ 7 pixels left in the trailing part.
-+     *  First adding 7 to the length so the length would be from 0 ~ 6.
-+     *  eg: 1 pixel left in the trailing part, so 1-8+7 = 0.
-+     *  Then save 1 pixel unconditionally since at least 1 pixels left in the
-+     *  trailing part.
-+     *  The stores do not change the flags, so each BEQ tests the preceding
-+     *  ADDS/SUBS result.
-+     *
-+     *  NOTE(review): a length of 0 also lands here (0 - 8 is negative); the
-+     *  counter then never reaches zero and all seven stores run. Confirm that
-+     *  callers never pass 0.
-+     * ------------------------------------------------------------------------ */
-+    ADDS length, length, #7            @ there are 7 or less in the trailing part
-+
-+    VST3.U8 {D21[0],D22[0],D23[0]}, [p_bgr]! @ at least 1 pixel left in the trailing part
-+    BEQ end_yyvup2bgr888               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST3.U8 {D21[1],D22[1],D23[1]}, [p_bgr]!  @ store one more pixel
-+    BEQ end_yyvup2bgr888               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST3.U8 {D21[2],D22[2],D23[2]}, [p_bgr]!  @ store one more pixel
-+    BEQ end_yyvup2bgr888               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST3.U8 {D21[3],D22[3],D23[3]}, [p_bgr]!  @ store one more pixel
-+    BEQ end_yyvup2bgr888               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST3.U8 {D21[4],D22[4],D23[4]}, [p_bgr]!  @ store one more pixel
-+    BEQ end_yyvup2bgr888               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST3.U8 {D21[5],D22[5],D23[5]}, [p_bgr]!  @ store one more pixel
-+    BEQ end_yyvup2bgr888               @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter (result is not tested afterwards)
-+    VST3.U8 {D21[6],D22[6],D23[6]}, [p_bgr]!  @ store the 7th and last trailing pixel
-+
-+end_yyvup2bgr888:
-+    LDMFD SP!, {PC}                    @ pop the saved LR straight into PC (return)
-+
-+                                       @ end of yyvup2bgr888
-+
-+/*--------------------------------------------------------------------------
-+* FUNCTION     : yvup2abgr8888_venum
-+*--------------------------------------------------------------------------
-+* DESCRIPTION  : Perform YVU planar to ABGR8888 conversion.
-+*                One Cr and one Cb sample is consumed per Y sample; each
-+*                output pixel is four bytes.
-+*--------------------------------------------------------------------------
-+* C PROTOTYPE  : void yvup2abgr8888_venum(uint8_t  *p_y,
-+*                                 uint8_t  *p_cr,
-+*                                 uint8_t  *p_cb,
-+*                                 uint8_t  *p_abgr8888,
-+*                                 uint32_t  length)
-+*--------------------------------------------------------------------------
-+* REG INPUT    : R0: uint8_t  *p_y
-+*                      pointer to the input Y Line
-+*                R1: uint8_t  *p_cr
-+*                      pointer to the input Cr Line
-+*                R2: uint8_t  *p_cb
-+*                      pointer to the input Cb Line
-+*                R3: uint8_t  *p_abgr8888
-+*                      pointer to the output ABGR Line
-+*--------------------------------------------------------------------------
-+* STACK ARG    : uint32_t  length
-+*                      width of Line; 5th argument, loaded from the stack
-+*                      into R12 by the function itself
-+*--------------------------------------------------------------------------
-+* REG OUTPUT   : None
-+*--------------------------------------------------------------------------
-+* MEM INPUT    : p_y      - a line of Y pixels
-+*                p_cr     - a line of Cr pixels
-+*                p_cb     - a line of Cb pixels
-+*                length   - the width of the input line
-+*--------------------------------------------------------------------------
-+* MEM OUTPUT   : p_abgr8888 - the converted ABGR pixels
-+*--------------------------------------------------------------------------
-+* REG AFFECTED : ARM:  R0-R4, R12
-+*                NEON: Q0-Q15
-+*--------------------------------------------------------------------------
-+* STACK USAGE  : 4 bytes (LR is pushed on entry and popped into PC on return)
-+*--------------------------------------------------------------------------
-+* CYCLES       : none
-+*
-+*--------------------------------------------------------------------------
-+* NOTES        : NOTE(review): Q4-Q7 (D8-D15) are overwritten while only LR
-+*                is saved. The ARM procedure call standard treats D8-D15 as
-+*                callee-saved; confirm that the C callers tolerate this.
-+*                NOTE(review): a length of 0 is not handled; it takes the
-+*                trailing path and seven pixels are stored. Confirm that
-+*                callers never pass 0.
-+*--------------------------------------------------------------------------
-+*/
-+.type yvup2abgr8888_venum, %function
-+yvup2abgr8888_venum:
-+    /*-------------------------------------------------------------------------
-+     *  Save the return address on the stack
-+     * ------------------------------------------------------------------------ */
-+    STMFD SP!, {LR}
-+
-+    PLD [R0, R3]                       @ cache preload hint for address R0 + R3
-+                                       @ NOTE(review): R3 is the output pointer,
-+                                       @ not an offset; a luma-line preload would
-+                                       @ be PLD [R0] - confirm what was intended.
-+
-+    ADR   R12, constants               @ R12 -> coefficient/bias table (defined outside this function)
-+
-+    VLD1.S16  {D6, D7}, [R12]!         @ D6, D7: 359 |  -88 | -183 | 454 | 256 | 0 | 255 | 0
-+    VLD1.S32  {D30, D31}, [R12]        @ Q15   :  -45824    |    34816   |  -57984 |     X
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load the 5th parameter via stack
-+     *  R0 ~ R3 are used to pass the first 4 parameters, the 5th and above
-+     *  parameters are passed via stack
-+     * ------------------------------------------------------------------------ */
-+    LDR R12, [SP, #4]                  @ LR is the only register that has been
-+                                       @ pushed, so the 5th parameter sits at
-+                                       @ SP + 4. This overwrites the table
-+                                       @ pointer, which is no longer needed.
-+                                       @ LDMIB SP, {R12} is an equivalent
-+                                       @ instruction in this case, where only
-+                                       @ one register was pushed into stack.
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load clamping parameters to duplicate vector elements
-+     * ------------------------------------------------------------------------ */
-+    VDUP.S16  Q4,  D7[1]               @ Q4:  0  |  0  |  0  |  0  |  0  |  0  |  0  |  0
-+    VDUP.S16  Q5,  D7[2]               @ Q5: 255 | 255 | 255 | 255 | 255 | 255 | 255 | 255
-+
-+    /*-------------------------------------------------------------------------
-+     *  Read bias
-+     * ------------------------------------------------------------------------ */
-+    VDUP.S32  Q0,   D30[0]             @ Q0:  -45824 | -45824 | -45824 | -45824
-+    VDUP.S32  Q1,   D30[1]             @ Q1:   34816 |  34816 |  34816 |  34816
-+    VDUP.S32  Q2,   D31[0]             @ Q2:  -57984 | -57984 | -57984 | -57984
-+
-+
-+    /*-------------------------------------------------------------------------
-+     *  The main loop
-+     * ------------------------------------------------------------------------ */
-+loop_yvup2abgr:
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load input from Y, V and U
-+     *  D12  : Y0  Y1  Y2  Y3  Y4  Y5  Y6  Y7
-+     *  D14  : V0  V1  V2  V3  V4  V5  V6  V7
-+     *  D15  : U0  U1  U2  U3  U4  U5  U6  U7
-+     * ------------------------------------------------------------------------ */
-+    VLD1.U8  {D12},  [p_y]!            @ Load 8 Luma elements (uint8) to D12
-+    VLD1.U8  {D14},  [p_cr]!           @ Load 8 Cr elements (uint8) to D14
-+    VLD1.U8  {D15},  [p_cb]!           @ Load 8 Cb elements (uint8) to D15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Expand uint8 value to uint16
-+     *  D18, D19: Y0 Y1 Y2 Y3 Y4 Y5 Y6 Y7
-+     *  D20, D21: V0 V1 V2 V3 V4 V5 V6 V7
-+     *  D22, D23: U0 U1 U2 U3 U4 U5 U6 U7
-+     * ------------------------------------------------------------------------ */
-+    VMOVL.U8 Q9,  D12
-+    VMOVL.U8 Q10, D14
-+    VMOVL.U8 Q11, D15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Multiply contribution from chrominance, results are in 32-bit
-+     * ------------------------------------------------------------------------ */
-+    VMULL.S16  Q12, D20, D6[0]         @ Q12:  359*(V0,V1,V2,V3)     Red
-+    VMULL.S16  Q13, D22, D6[1]         @ Q13:  -88*(U0,U1,U2,U3)     Green
-+    VMLAL.S16  Q13, D20, D6[2]         @ Q13:  -88*(U0,U1,U2,U3) - 183*(V0,V1,V2,V3)
-+    VMULL.S16  Q14, D22, D6[3]         @ Q14:  454*(U0,U1,U2,U3)     Blue
-+
-+    /*-------------------------------------------------------------------------
-+     *  Add bias
-+     * ------------------------------------------------------------------------ */
-+    VADD.S32  Q12, Q0                  @ Q12 add Red   bias -45824
-+    VADD.S32  Q13, Q1                  @ Q13 add Green bias  34816
-+    VADD.S32  Q14, Q2                  @ Q14 add Blue  bias -57984
-+
-+    /*-------------------------------------------------------------------------
-+     *  Calculate Red, Green, Blue by accumulating 256*Y (D7[0])
-+     * ------------------------------------------------------------------------ */
-+    VMLAL.S16  Q12, D18, D7[0]         @ Q12: R0, R1, R2, R3 in 32-bit Q8 format
-+    VMLAL.S16  Q13, D18, D7[0]         @ Q13: G0, G1, G2, G3 in 32-bit Q8 format
-+    VMLAL.S16  Q14, D18, D7[0]         @ Q14: B0, B1, B2, B3 in 32-bit Q8 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Right shift eight bits and narrow to 16-bit
-+     *  VSHRN itself truncates; the rounding term appears to be folded into
-+     *  the bias constants (e.g. -45824 = -359*128 + 128).
-+     *  The results overwrite the low halves of the Y, V and U registers,
-+     *  which have already been consumed above.
-+     * ------------------------------------------------------------------------ */
-+    VSHRN.S32   D18 , Q12, #8          @ D18: R0, R1, R2, R3 in 16-bit Q0 format
-+    VSHRN.S32   D20 , Q13, #8          @ D20: G0, G1, G2, G3 in 16-bit Q0 format
-+    VSHRN.S32   D22,  Q14, #8          @ D22: B0, B1, B2, B3 in 16-bit Q0 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Done with the first 4 elements, continue on the next 4 elements
-+     * ------------------------------------------------------------------------ */
-+
-+    /*-------------------------------------------------------------------------
-+     *  Multiply contribution from chrominance, results are in 32-bit
-+     * ------------------------------------------------------------------------ */
-+    VMULL.S16  Q12, D21, D6[0]         @ Q12:  359*(V4,V5,V6,V7)     Red
-+    VMULL.S16  Q13, D23, D6[1]         @ Q13: -88*(U4,U5,U6,U7)     Green
-+    VMLAL.S16  Q13, D21, D6[2]         @ Q13: -88*(U4,U5,U6,U7) - 183*(V4,V5,V6,V7)
-+    VMULL.S16  Q14, D23, D6[3]         @ Q14:  454*(U4,U5,U6,U7)     Blue
-+
-+    /*-------------------------------------------------------------------------
-+     *  Add bias
-+     * ------------------------------------------------------------------------ */
-+    VADD.S32  Q12, Q0                  @ Q12 add Red   bias -45824
-+    VADD.S32  Q13, Q1                  @ Q13 add Green bias  34816
-+    VADD.S32  Q14, Q2                  @ Q14 add Blue  bias -57984
-+
-+    /*-------------------------------------------------------------------------
-+     *  Calculate Red, Green, Blue by accumulating 256*Y (D7[0])
-+     * ------------------------------------------------------------------------ */
-+    VMLAL.S16  Q12, D19, D7[0]         @ Q12: R4, R5, R6, R7 in 32-bit Q8 format
-+    VMLAL.S16  Q13, D19, D7[0]         @ Q13: G4, G5, G6, G7 in 32-bit Q8 format
-+    VMLAL.S16  Q14, D19, D7[0]         @ Q14: B4, B5, B6, B7 in 32-bit Q8 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Right shift eight bits and narrow to 16-bit (truncating shift)
-+     *  Afterwards Q9 = R0..R7, Q10 = G0..G7, Q11 = B0..B7.
-+     * ------------------------------------------------------------------------ */
-+    VSHRN.S32   D19 , Q12, #8          @ D19: R4, R5, R6, R7 in 16-bit Q0 format
-+    VSHRN.S32   D21 , Q13, #8          @ D21: G4, G5, G6, G7 in 16-bit Q0 format
-+    VSHRN.S32   D23,  Q14, #8          @ D23: B4, B5, B6, B7 in 16-bit Q0 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Clamp the value to be within [0~255]
-+     * ------------------------------------------------------------------------ */
-+    VMAX.S16  Q11, Q11, Q4             @ if Q11 <   0, Q11 =   0
-+    VMIN.S16  Q11, Q11, Q5             @ if Q11 > 255, Q11 = 255
-+    VQMOVUN.S16   D28, Q11             @ store Blue to D28, narrow the value from int16 to uint8
-+
-+    VMAX.S16  Q10, Q10, Q4             @ if Q10 <   0, Q10 =   0
-+    VMIN.S16  Q10, Q10, Q5             @ if Q10 > 255, Q10 = 255
-+    VQMOVUN.S16   D27, Q10             @ store Green to D27, narrow the value from int16 to uint8
-+
-+    VMAX.S16    Q9, Q9, Q4             @ if Q9 <   0, Q9 =   0
-+    VMIN.S16    Q9, Q9, Q5             @ if Q9 > 255, Q9 = 255
-+    VQMOVUN.S16    D26, Q9             @ store Red to D26, narrow the value from int16 to uint8
-+
-+    /*-------------------------------------------------------------------------
-+     *  Alpha channel: constant 0xFF, stored as the 4th byte of each pixel.
-+     *  D29 is rebuilt on every pass because Q14 (D28, D29) serves as scratch
-+     *  for the Blue computation above.
-+     * ------------------------------------------------------------------------ */
-+    VMOVN.I16  D29, Q5                 @ D29:  255 | 255 | 255 | 255 | 255 | 255 | 255 | 255
-+
-+    SUBS length, length, #8            @ check if the length is less than 8
-+
-+    BMI  trailing_yvup2abgr            @ jump to trailing processing if remaining length is less than 8
-+
-+    VST4.U8  {D26,D27,D28,D29}, [p_bgr]!   @ interleaved store of 8 pixels: Red (D26),
-+                                       @ Green (D27), Blue (D28), 0xFF (D29);
-+                                       @ Red at the lowest address of each pixel
-+
-+    BHI loop_yvup2abgr                 @ loop if there are still pixels left
-+
-+    BEQ  end_yvup2abgr                 @ done if exactly 8 pixel processed in the loop
-+
-+
-+trailing_yvup2abgr:
-+    /*-------------------------------------------------------------------------
-+     *  There are from 1 ~ 7 pixels left in the trailing part.
-+     *  First adding 7 to the length so the length would be from 0 ~ 6.
-+     *  eg: 1 pixel left in the trailing part, so 1-8+7 = 0.
-+     *  Then save 1 pixel unconditionally since at least 1 pixels left in the
-+     *  trailing part.
-+     *  The stores do not change the flags, so each BEQ tests the preceding
-+     *  ADDS/SUBS result.
-+     * ------------------------------------------------------------------------ */
-+    ADDS length, length, #7            @ there are 7 or less in the trailing part
-+
-+    VST4.U8 {D26[0], D27[0], D28[0], D29[0]}, [p_bgr]! @ at least 1 pixel left in the trailing part
-+    BEQ  end_yvup2abgr                 @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST4.U8 {D26[1], D27[1], D28[1], D29[1]}, [p_bgr]!  @ store one more pixel
-+    BEQ  end_yvup2abgr                 @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST4.U8 {D26[2], D27[2], D28[2], D29[2]}, [p_bgr]!  @ store one more pixel
-+    BEQ  end_yvup2abgr                 @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST4.U8 {D26[3], D27[3], D28[3], D29[3]}, [p_bgr]!  @ store one more pixel
-+    BEQ  end_yvup2abgr                 @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST4.U8 {D26[4], D27[4], D28[4], D29[4]}, [p_bgr]!  @ store one more pixel
-+    BEQ  end_yvup2abgr                 @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST4.U8 {D26[5], D27[5], D28[5], D29[5]}, [p_bgr]!  @ store one more pixel
-+    BEQ  end_yvup2abgr                 @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter (result is not tested afterwards)
-+    VST4.U8 {D26[6], D27[6], D28[6], D29[6]}, [p_bgr]! @ store the 7th and last trailing pixel
-+
-+end_yvup2abgr:
-+    LDMFD SP!, {PC}                    @ pop the saved LR straight into PC (return)
-+                                       @ end of yvup2abgr
-+
-+/*--------------------------------------------------------------------------
-+* FUNCTION     : yyvup2abgr8888_venum
-+*--------------------------------------------------------------------------
-+* DESCRIPTION  : Perform YYVU planar to ABGR8888 conversion.
-+*--------------------------------------------------------------------------
-+* C PROTOTYPE  : void yyvup2abgr8888_venum(uint8_t  *p_y,
-+*                                 uint8_t  *p_cr,
-+*                                 uint8_t  *p_cb,
-+*                                 uint8_t  *p_abgr8888,
-+*                                 uint32_t  length)
-+*--------------------------------------------------------------------------
-+* REG INPUT    : R0: uint8_t  *p_y
-+*                      pointer to the input Y Line
-+*                R1: uint8_t  *p_cr
-+*                      pointer to the input Cr Line
-+*                R2: uint8_t  *p_cb
-+*                      pointer to the input Cb Line
-+*                R3: uint8_t  *p_abgr8888
-+*                      pointer to the output ABGR Line
-+*                R12: uint32_t  length
-+*                      width of Line
-+*--------------------------------------------------------------------------
-+* STACK ARG    : length (5th argument; the function loads it from the stack into R12)
-+*--------------------------------------------------------------------------
-+* REG OUTPUT   : None
-+*--------------------------------------------------------------------------
-+* MEM INPUT    : p_y      - a line of Y pixels
-+*                p_cr     - a line of Cr pixels
-+*                p_cb     - a line of Cb pixels
-+*                length   - the width of the input line
-+*--------------------------------------------------------------------------
-+* MEM OUTPUT   : p_abgr8888 - the converted ABGR pixels
-+*--------------------------------------------------------------------------
-+* REG AFFECTED : ARM:  R0-R4, R12
-+*                NEON: Q0-Q15
-+*--------------------------------------------------------------------------
-+* STACK USAGE  : none
-+*--------------------------------------------------------------------------
-+* CYCLES       : none
-+*
-+*--------------------------------------------------------------------------
-+* NOTES        :
-+*--------------------------------------------------------------------------
-+*/
-+.type yyvup2abgr8888_venum, %function
-+yyvup2abgr8888_venum:
-+    /*-------------------------------------------------------------------------
-+     *  Store stack registers
-+     * ------------------------------------------------------------------------ */
-+    STMFD SP!, {LR}
-+
-+    PLD [R0, R3]                       @ preload luma line
-+
-+    ADR   R12, constants
-+
-+    VLD1.S16  {D6, D7}, [R12]!         @ D6, D7: 359 |  -88 | -183 | 454 | 256 | 0 | 255 | 0
-+    VLD1.S32  {D30, D31}, [R12]        @ Q15   :  -45824    |    34816   |  -57984 |     X
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load the 5th parameter via stack
-+     *  R0 ~ R3 are used to pass the first 4 parameters, the 5th and above
-+     *  parameters are passed via stack
-+     * ------------------------------------------------------------------------ */
-+    LDR R12, [SP, #4]                  @ LR is the only one that has been pushed
-+                                       @ into stack, increment SP by 4 to
-+                                       @ get the parameter.
-+                                       @ LDMIB SP, {R12} is an equivalent
-+                                       @ instruction in this case, where only
-+                                       @ one register was pushed into stack.
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load clamping parameters to duplicate vector elements
-+     * ------------------------------------------------------------------------ */
-+    VDUP.S16  Q4,  D7[1]               @ Q4:  0  |  0  |  0  |  0  |  0  |  0  |  0  |  0
-+    VDUP.S16  Q5,  D7[2]               @ Q5: 255 | 255 | 255 | 255 | 255 | 255 | 255 | 255
-+
-+    /*-------------------------------------------------------------------------
-+     *  Read bias
-+     * ------------------------------------------------------------------------ */
-+    VDUP.S32  Q0,   D30[0]             @ Q0:  -45824 | -45824 | -45824 | -45824
-+    VDUP.S32  Q1,   D30[1]             @ Q1:   34816 |  34816 |  34816 |  34816
-+    VDUP.S32  Q2,   D31[0]             @ Q2:  -70688 | -70688 | -70688 | -70688
-+
-+
-+    /*-------------------------------------------------------------------------
-+     *  The main loop
-+     * ------------------------------------------------------------------------ */
-+loop_yyvup2abgr:
-+
-+    /*-------------------------------------------------------------------------
-+     *  Load input from Y, V and U
-+     *  D12, D13: Y0 Y2 Y4 Y6 Y8 Y10 Y12 Y14, Y1 Y3 Y5 Y7 Y9 Y11 Y13 Y15
-+     *  D14  : V0  V1  V2  V3  V4  V5  V6  V7
-+     *  D15  : U0  U1  U2  U3  U4  U5  U6  U7
-+     * ------------------------------------------------------------------------ */
-+    VLD2.U8  {D12,D13}, [p_y]!         @ Load 16 Luma elements (uint8) to D12, D13
-+    VLD1.U8  {D14},  [p_cr]!           @ Load 8 Cr elements (uint8) to D14
-+    VLD1.U8  {D15},  [p_cb]!           @ Load 8 Cb elements (uint8) to D15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Expand uint8 value to uint16
-+     *  D24, D25: Y0 Y2 Y4 Y6 Y8 Y10 Y12 Y14
-+     *  D26, D27: Y1 Y3 Y5 Y7 Y9 Y11 Y13 Y15
-+     *  D28, D29: V0 V1 V2 V3 V4 V5  V6  V7
-+     *  D30, D31: U0 U1 U2 U3 U4 U5  U6  U7
-+     * ------------------------------------------------------------------------ */
-+    VMOVL.U8 Q12, D12
-+    VMOVL.U8 Q13, D13
-+    VMOVL.U8 Q14, D14
-+    VMOVL.U8 Q15, D15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Multiply contribution from chrominance, results are in 32-bit
-+     * ------------------------------------------------------------------------ */
-+    VMULL.S16  Q6, D28, D6[0]          @ Q6:  359*(V0,V1,V2,V3)     Red
-+    VMULL.S16  Q7, D30, D6[1]          @ Q7: -88*(U0,U1,U2,U3)     Green
-+    VMLAL.S16  Q7, D28, D6[2]          @ Q7: -88*(U0,U1,U2,U3) - 183*(V0,V1,V2,V3)
-+    VMULL.S16  Q8, D30, D6[3]          @ Q8:  454*(U0,U1,U2,U3)     Blue
-+
-+    /*-------------------------------------------------------------------------
-+     *  Add bias
-+     * ------------------------------------------------------------------------ */
-+    VADD.S32  Q6, Q0                   @ Q6 add Red   bias -45824
-+    VADD.S32  Q7, Q1                   @ Q7 add Green bias  34816
-+    VADD.S32  Q8, Q2                   @ Q8 add Blue  bias -57984
-+
-+    /*-------------------------------------------------------------------------
-+     *  Calculate Red, Green, Blue
-+     * ------------------------------------------------------------------------ */
-+    VMOV.S32   Q9, Q6
-+    VMLAL.S16  Q6, D24, D7[0]          @ Q6: R0, R2, R4, R6 in 32-bit Q8 format
-+    VMLAL.S16  Q9, D26, D7[0]          @ Q9: R1, R3, R5, R7 in 32-bit Q8 format
-+
-+    VMOV.S32   Q10, Q7
-+    VMLAL.S16  Q7,  D24, D7[0]         @ Q7:  G0, G2, G4, G6 in 32-bit Q8 format
-+    VMLAL.S16  Q10, D26, D7[0]         @ Q10: G1, G3, G5, G7 in 32-bit Q8 format
-+
-+    VMOV.S32   Q11, Q8
-+    VMLAL.S16  Q8,  D24, D7[0]         @ Q8:  B0, B2, B4, B6 in 32-bit Q8 format
-+    VMLAL.S16  Q11, D26, D7[0]         @ Q11: B1, B3, B5, B7 in 32-bit Q8 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Right shift eight bits with rounding
-+     * ------------------------------------------------------------------------ */
-+    VSHRN.S32   D12, Q6,  #8           @ D12: R0 R2 R4 R6 in 16-bit Q0 format
-+    VSHRN.S32   D13, Q9,  #8           @ D13: R1 R3 R5 R7 in 16-bit Q0 format
-+    VZIP.16     D12, D13               @ Q6 : R0 R1 R2 R3 R4 R5 R6 R7
-+
-+    VSHRN.S32   D18, Q7,  #8           @ D18: G0 G2 G4 G6 in 16-bit Q0 format
-+    VSHRN.S32   D19, Q10, #8           @ D19: G1 G3 G5 G7 in 16-bit Q0 format
-+    VZIP.16     D18, D19               @ Q9 : G0 G1 G2 G3 G4 G5 G6 G7
-+
-+    VSHRN.S32   D20, Q8,  #8           @ D20: B0 B2 B4 B6 in 16-bit Q0 format
-+    VSHRN.S32   D21, Q11, #8           @ D21: B1 B3 B5 B7 in 16-bit Q0 format
-+    VZIP.16     D20, D21               @ Q10: B0 B1 B2 B3 B4 B5 B6 B7
-+
-+    /*-------------------------------------------------------------------------
-+     *  Clamp the value to be within [0~255]
-+     * ------------------------------------------------------------------------ */
-+    VMAX.S16  Q10, Q10, Q4             @ if Q10 <   0, Q10 =   0
-+    VMIN.S16  Q10, Q10, Q5             @ if Q10 > 255, Q10 = 255
-+    VQMOVUN.S16   D23, Q10             @ store Blue to D23, narrow the value from int16 to int8
-+
-+    VMAX.S16  Q9, Q9, Q4               @ if Q9 <   0, Q9 =   0
-+    VMIN.S16  Q9, Q9, Q5               @ if Q9 > 255, Q9 = 255
-+    VQMOVUN.S16  D22, Q9               @ store Green to D22, narrow the value from int16 to int8
-+
-+    VMAX.S16  Q6, Q6, Q4               @ if Q6 <   0, Q6 =   0
-+    VMIN.S16  Q6, Q6, Q5               @ if Q6 > 255, Q6 = 255
-+    VQMOVUN.S16  D21, Q6               @ store Red to D21, narrow the value from int16 to int8
-+
-+    /*-------------------------------------------------------------------------
-+     *  abgr format with leading 0xFF byte
-+     * ------------------------------------------------------------------------ */
-+    VMOVN.I16  D24, Q5                 @ D24:  255 | 255 | 255 | 255 | 255 | 255 | 255 | 255
-+
-+    SUBS length, length, #8            @ check if the length is less than 8
-+
-+    BMI  trailing_yyvup2abgr           @ jump to trailing processing if remaining length is less than 8
-+
-+    VST4.U8  {D21,D22,D23,D24}, [p_bgr]!   @ interleaved store of Red, Green, Blue, 0xFF for 8 pixels
-+                                       @ Red at lowest address (LSB), 0xFF at highest
-+
-+    BEQ  end_yyvup2abgr                @ done if exactly 8 pixel processed in the loop
-+
-+
-+    /*-------------------------------------------------------------------------
-+     *  Done with the first 8 elements, continue on the next 8 elements
-+     * ------------------------------------------------------------------------ */
-+
-+    /*-------------------------------------------------------------------------
-+     *  Multiply contribution from chrominance, results are in 32-bit
-+     * ------------------------------------------------------------------------ */
-+    VMULL.S16  Q6, D29, D6[0]          @ Q6: 359*(V4,V5,V6,V7)       Red
-+    VMULL.S16  Q7, D31, D6[1]          @ Q7: -88*(U4,U5,U6,U7)      Green
-+    VMLAL.S16  Q7, D29, D6[2]          @ Q7: -88*(U4,U5,U6,U7) - 183*(V4,V5,V6,V7)
-+    VMULL.S16  Q8, D31, D6[3]          @ Q8: 454*(U4,U5,U6,U7)       Blue
-+
-+    /*-------------------------------------------------------------------------
-+     *  Add bias
-+     * ------------------------------------------------------------------------ */
-+    VADD.S32  Q6, Q0                   @ Q6 add Red   bias -45824
-+    VADD.S32  Q7, Q1                   @ Q7 add Green bias  34816
-+    VADD.S32  Q8, Q2                   @ Q8 add Blue  bias -57984
-+
-+    /*-------------------------------------------------------------------------
-+     *  Calculate Red, Green, Blue
-+     * ------------------------------------------------------------------------ */
-+    VMOV.S32   Q9, Q6
-+    VMLAL.S16  Q6, D25, D7[0]          @ Q6: R8 R10 R12 R14 in 32-bit Q8 format
-+    VMLAL.S16  Q9, D27, D7[0]          @ Q9: R9 R11 R13 R15 in 32-bit Q8 format
-+
-+    VMOV.S32   Q10, Q7
-+    VMLAL.S16  Q7,  D25, D7[0]         @ Q7:  G8 G10 G12 G14 in 32-bit Q8 format
-+    VMLAL.S16  Q10, D27, D7[0]         @ Q10: G9 G11 G13 G15 in 32-bit Q8 format
-+
-+    VMOV.S32   Q11, Q8
-+    VMLAL.S16  Q8,  D25, D7[0]         @ Q8:  B8 B10 B12 B14 in 32-bit Q8 format
-+    VMLAL.S16  Q11, D27, D7[0]         @ Q11: B9 B11 B13 B15 in 32-bit Q8 format
-+
-+    /*-------------------------------------------------------------------------
-+     *  Right shift eight bits with rounding
-+     * ------------------------------------------------------------------------ */
-+    VSHRN.S32   D12, Q6,  #8           @ D12: R8 R10 R12 R14 in 16-bit Q0 format
-+    VSHRN.S32   D13, Q9,  #8           @ D13: R9 R11 R13 R15 in 16-bit Q0 format
-+    VZIP.16     D12, D13               @ Q6: R8 R9 R10 R11 R12 R13 R14 R15
-+
-+    VSHRN.S32   D18, Q7,  #8           @ D18: G8 G10 G12 G14 in 16-bit Q0 format
-+    VSHRN.S32   D19, Q10, #8           @ D19: G9 G11 G13 G15 in 16-bit Q0 format
-+    VZIP.16     D18, D19               @ Q9:  G8 G9 G10 G11 G12 G13 G14 G15
-+
-+    VSHRN.S32   D20, Q8,  #8           @ D20: B8 B10 B12 B14 in 16-bit Q0 format
-+    VSHRN.S32   D21, Q11, #8           @ D21: B9 B11 B13 B15 in 16-bit Q0 format
-+    VZIP.16     D20, D21               @ Q10: B8 B9 B10 B11 B12 B13 B14 B15
-+
-+    /*-------------------------------------------------------------------------
-+     *  Clamp the value to be within [0~255]
-+     * ------------------------------------------------------------------------ */
-+    VMAX.S16  Q10, Q10, Q4             @ if Q10 <   0, Q10 =   0
-+    VMIN.S16  Q10, Q10, Q5             @ if Q10 > 255, Q10 = 255
-+    VQMOVUN.S16   D23, Q10             @ store Blue to D23, narrow the value from int16 to int8
-+
-+    VMAX.S16  Q9, Q9, Q4               @ if Q9 <   0, Q9 =   0
-+    VMIN.S16  Q9, Q9, Q5               @ if Q9 > 255, Q9 = 255
-+    VQMOVUN.S16  D22, Q9               @ store Green to D22, narrow the value from int16 to int8
-+
-+    VMAX.S16  Q6, Q6, Q4               @ if Q6 <   0, Q6 =   0
-+    VMIN.S16  Q6, Q6, Q5               @ if Q6 > 255, Q6 = 255
-+    VQMOVUN.S16  D21, Q6               @ store Red to D21, narrow the value from int16 to int8
-+
-+    /*-------------------------------------------------------------------------
-+     *  abgr format with leading 0xFF byte
-+     * ------------------------------------------------------------------------ */
-+    VMOVN.I16  D24, Q5                 @ D24:  255 | 255 | 255 | 255 | 255 | 255 | 255 | 255
-+
-+    SUBS length, length, #8            @ check if the length is less than 8
-+
-+    BMI  trailing_yyvup2abgr           @ jump to trailing processing if remaining length is less than 8
-+
-+    VST4.U8  {D21,D22,D23,D24}, [p_bgr]!   @ interleaved store of Red, Green, Blue, 0xFF for 8 pixels
-+                                       @ Red at lowest address (LSB), 0xFF at highest
-+
-+    BHI loop_yyvup2abgr                @ loop if more than 8 pixels left
-+
-+    BEQ  end_yyvup2abgr                @ done if exactly 8 pixel processed in the loop
-+
-+
-+trailing_yyvup2abgr:
-+    /*-------------------------------------------------------------------------
-+     *  Between 1 and 7 pixels are left in the trailing part.
-+     *  length is negative here (pixels left - 8), so first add 7 to bring it
-+     *  into the range 0 ~ 6, e.g. 1 pixel left gives 1-8+7 = 0.
-+     *  Then store 1 pixel unconditionally, since at least 1 pixel is left in
-+     *  the trailing part.
-+     * ------------------------------------------------------------------------ */
-+    ADDS length, length, #7            @ there are 7 or less in the trailing part
-+
-+    VST4.U8 {D21[0],D22[0],D23[0],D24[0]}, [p_bgr]! @ at least 1 pixel left in the trailing part
-+    BEQ end_yyvup2abgr                 @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST4.U8 {D21[1],D22[1],D23[1],D24[1]}, [p_bgr]!  @ store one more pixel
-+    BEQ end_yyvup2abgr                 @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST4.U8 {D21[2],D22[2],D23[2],D24[2]}, [p_bgr]!  @ store one more pixel
-+    BEQ end_yyvup2abgr                 @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST4.U8 {D21[3],D22[3],D23[3],D24[3]}, [p_bgr]!  @ store one more pixel
-+    BEQ end_yyvup2abgr                 @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST4.U8 {D21[4],D22[4],D23[4],D24[4]}, [p_bgr]!  @ store one more pixel
-+    BEQ end_yyvup2abgr                 @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST4.U8 {D21[5],D22[5],D23[5],D24[5]}, [p_bgr]!  @ store one more pixel
-+    BEQ end_yyvup2abgr                 @ done if 0 pixel left
-+
-+    SUBS length, length, #1            @ update length counter
-+    VST4.U8 {D21[6],D22[6],D23[6],D24[6]}, [p_bgr]!  @ store one more pixel
-+
-+end_yyvup2abgr:
-+    LDMFD SP!, {PC}
-+                                       @ end of yyvup2abgr
-+
-+.end
---- libjpeg-turbo-1.1.1.orig/simd/Makefile.am
-+++ libjpeg-turbo-1.1.1/simd/Makefile.am
-@@ -6,6 +6,20 @@ EXTRA_DIST = nasm_lt.sh jcclrmmx.asm jcc
- 	jdmrgmmx.asm jdmrgss2.asm jcclrss2-64.asm jdclrss2-64.asm \
- 	jdmrgss2-64.asm CMakeLists.txt
- 
-+if SIMD_ARM_NEON
-+
-+AM_CFLAGS = -march=armv7-a -mfpu=neon
-+AM_CCASFLAGS = -march=armv7-a -mfpu=neon
-+
-+libsimd_la_SOURCES = jsimd_arm_neon.c \
-+                     jdcolor-armv7.s \
-+                     jdidct-armv7.s
-+
-+jdcolor-armv7.lo: jdcolor-armv7.s
-+jdidct-armv7.lo: jdidct-armv7.s
-+
-+endif
-+
- if SIMD_X86_64
- 
- libsimd_la_SOURCES = jsimd_x86_64.c \
-@@ -21,6 +35,10 @@ libsimd_la_SOURCES = jsimd_x86_64.c \
- jccolss2-64.lo: jcclrss2-64.asm
- jdcolss2-64.lo: jdclrss2-64.asm
- jdmerss2-64.lo: jdmrgss2-64.asm
-+
-+.asm.lo:
-+	$(LIBTOOL) --mode=compile --tag NASM $(srcdir)/nasm_lt.sh $(NASM) $(NAFLAGS) -I$(srcdir) $< -o $@
-+
- endif
- 
- if SIMD_I386
-@@ -47,12 +65,14 @@ jdcolmmx.lo: jdclrmmx.asm
- jdcolss2.lo: jdclrss2.asm
- jdmermmx.lo: jdmrgmmx.asm
- jdmerss2.lo: jdmrgss2.asm
-+
-+.asm.lo:
-+	$(LIBTOOL) --mode=compile --tag NASM $(srcdir)/nasm_lt.sh $(NASM) $(NAFLAGS) -I$(srcdir) $< -o $@
-+
- endif
- 
- AM_CPPFLAGS = -I$(top_srcdir) 
- 
--.asm.lo:
--	$(LIBTOOL) --mode=compile --tag NASM $(srcdir)/nasm_lt.sh $(NASM) $(NAFLAGS) -I$(srcdir) $< -o $@
- 
- jsimdcfg.inc: $(srcdir)/jsimdcfg.inc.h ../jpeglib.h ../jconfig.h ../jmorecfg.h
- 	$(CPP) -I$(top_builddir) -I$(top_builddir)/simd $(srcdir)/jsimdcfg.inc.h | $(EGREP) "^[\;%]|^\ %" | sed 's%_cpp_protection_%%' | sed 's@% define@%define@g' > $@
---- /dev/null
-+++ libjpeg-turbo-1.1.1/simd/jsimd_arm_neon.c
-@@ -0,0 +1,564 @@
-+/*
-+ * jsimd_arm_neon.c
-+ *
-+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-+ * Copyright 2009 D. R. Commander
-+ * Copyright 2011 Mandeep Kumar <mandeep.kumar@linaro.org> 
-+ * 
-+ * Based on the x86 SIMD extension for IJG JPEG library,
-+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
-+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
-+ *
-+ * This file contains ARM NEON optimized routines.
-+ */
-+
-+#define JPEG_INTERNALS
-+#include "../jinclude.h"
-+#include "../jpeglib.h"
-+#include "../jsimd.h"
-+#include "../jdct.h"
-+#include "../jsimddct.h"
-+
-+
-+/* Private subobject: the layout jsimd_ycc_rgb_convert() casts cinfo->cconvert to */
-+
-+typedef struct {
-+  struct jpeg_color_deconverter pub; /* public fields; first member of the struct */
-+
-+  /* Private state for YCC->RGB conversion (none of these tables is read in this file) */
-+  int * Cr_r_tab;		/* => table for Cr to R conversion */
-+  int * Cb_b_tab;		/* => table for Cb to B conversion */
-+  INT32 * Cr_g_tab;		/* => table for Cr to G conversion */
-+  INT32 * Cb_g_tab;		/* => table for Cb to G conversion */
-+} my_color_deconverter;
-+
-+typedef my_color_deconverter * my_cconvert_ptr;  /* pointer alias used for the cast */
-+
-+
-+#define DEQUANTIZE(coef,quantval)  ((coef) * ((INT16)(quantval)))  /* coef times table entry narrowed to INT16; both args parenthesized so expression arguments are cast as a whole */
-+
-+/* IDCT routines */
-+EXTERN (void) idct_1x1_venum (INT16 * coeffPtr, INT16 * samplePtr, INT32 stride);
-+EXTERN (void) idct_2x2_venum (INT16 * coeffPtr, INT16 * samplePtr, INT32 stride);
-+EXTERN (void) idct_4x4_venum (INT16 * coeffPtr, INT16 * samplePtr, INT32 stride);
-+EXTERN (void) idct_8x8_venum (INT16 * coeffPtr, INT16 * samplePtr, INT32 stride);
-+
-+/* Color conversion routines */
-+EXTERN (void) yvup2rgb565_venum (UINT8 *pLumaLine,
-+                UINT8 *pCrLine,
-+                UINT8 *pCbLine,
-+                UINT8 *pRGB565Line,
-+                JDIMENSION nLineWidth);
-+EXTERN (void) yyvup2rgb565_venum (UINT8 * pLumaLine,
-+                UINT8 *pCrLine,
-+                UINT8 *pCbLine,
-+                UINT8 * pRGB565Line,
-+                JDIMENSION nLineWidth);
-+EXTERN (void) yvup2bgr888_venum (UINT8 * pLumaLine,
-+                UINT8 *pCrLine,
-+                UINT8 *pCbLine,
-+                UINT8 * pBGR888Line,
-+                JDIMENSION nLineWidth);
-+EXTERN (void) yyvup2bgr888_venum (UINT8 * pLumaLine,
-+                UINT8 *pCrLine,
-+                UINT8 *pCbLine,
-+                UINT8 * pBGR888Line,
-+                JDIMENSION nLineWidth);
-+EXTERN (void) yvup2abgr8888_venum (UINT8 * pLumaLine,
-+                UINT8 *pCrLine,
-+                UINT8 *pCbLine,
-+                UINT8 * pABGR888Line,
-+                JDIMENSION nLineWidth);
-+EXTERN (void) yyvup2abgr8888_venum (UINT8 * pLumaLine,
-+                UINT8 *pCrLine,
-+                UINT8 *pCbLine,
-+                UINT8 * pABGR888Line,
-+                JDIMENSION nLineWidth);
-+
-+
-+GLOBAL(int)
-+jsimd_can_rgb_ycc (void)
-+{
-+  return 0;  /* no NEON path: jsimd_rgb_ycc_convert() is an empty stub */
-+}
-+
-+GLOBAL(int)
-+jsimd_can_ycc_rgb (void)
-+{
-+  return 1;  /* jsimd_ycc_rgb_convert(); NOTE(review): unconditional, no CPU or pixel-format check here */
-+}
-+
-+GLOBAL(int)
-+jsimd_can_idct_islow (void)
-+{
-+  return 1;  /* jsimd_idct_islow() runs idct_8x8_venum */
-+}
-+
-+GLOBAL(int)
-+jsimd_can_idct_ifast (void)
-+{
-+  return 1;  /* jsimd_idct_ifast() runs idct_8x8_venum */
-+}
-+
-+GLOBAL(int)
-+jsimd_can_idct_float (void)
-+{
-+  return 0;  /* no NEON path: jsimd_idct_float() is an empty stub */
-+}
-+
-+GLOBAL(int)
-+jsimd_can_h2v2_downsample (void)
-+{
-+  return 0;  /* no NEON path: jsimd_h2v2_downsample() is an empty stub */
-+}
-+
-+GLOBAL(int)
-+jsimd_can_h2v1_downsample (void)
-+{
-+  return 0;  /* no NEON path: jsimd_h2v1_downsample() is an empty stub */
-+}
-+GLOBAL(int)
-+jsimd_can_h2v2_upsample (void)
-+{
-+  return 0;  /* no NEON path: jsimd_h2v2_upsample() is an empty stub */
-+}
-+
-+GLOBAL(int)
-+jsimd_can_h2v1_upsample (void)
-+{
-+  return 0;  /* no NEON path: jsimd_h2v1_upsample() is an empty stub */
-+}
-+GLOBAL(int)
-+jsimd_can_h2v2_fancy_upsample (void)
-+{
-+  return 0;  /* no NEON path: jsimd_h2v2_fancy_upsample() is an empty stub */
-+}
-+
-+GLOBAL(int)
-+jsimd_can_h2v1_fancy_upsample (void)
-+{
-+  return 0;  /* no NEON path: jsimd_h2v1_fancy_upsample() is an empty stub */
-+}
-+GLOBAL(int)
-+jsimd_can_h2v2_merged_upsample (void)
-+{
-+  return 0;  /* no NEON path: jsimd_h2v2_merged_upsample() is an empty stub */
-+}
-+
-+GLOBAL(int)
-+jsimd_can_h2v1_merged_upsample (void)
-+{
-+  return 0;  /* no NEON path: jsimd_h2v1_merged_upsample() is an empty stub */
-+}
-+GLOBAL(int)
-+jsimd_can_convsamp (void)
-+{
-+  return 0;  /* no NEON path: jsimd_convsamp() is an empty stub */
-+}
-+
-+GLOBAL(int)
-+jsimd_can_convsamp_float (void)
-+{
-+  return 0;  /* no NEON path: jsimd_convsamp_float() is an empty stub */
-+}
-+GLOBAL(int)
-+jsimd_can_fdct_islow (void)
-+{
-+  return 0;  /* no NEON path: jsimd_fdct_islow() is an empty stub */
-+}
-+
-+GLOBAL(int)
-+jsimd_can_fdct_ifast (void)
-+{
-+  return 0;  /* no NEON path: jsimd_fdct_ifast() is an empty stub */
-+}
-+
-+GLOBAL(int)
-+jsimd_can_fdct_float (void)
-+{
-+  return 0;  /* no NEON path: jsimd_fdct_float() is an empty stub */
-+}
-+GLOBAL(int)
-+jsimd_can_quantize (void)
-+{
-+  return 0;  /* no NEON path: jsimd_quantize() is an empty stub */
-+}
-+
-+GLOBAL(int)
-+jsimd_can_quantize_float (void)
-+{
-+  return 0;  /* no NEON path: jsimd_quantize_float() is an empty stub */
-+}
-+GLOBAL(int)
-+jsimd_can_idct_2x2 (void)
-+{
-+  return 1;  /* jsimd_idct_2x2() runs idct_2x2_venum */
-+}
-+
-+GLOBAL(int)
-+jsimd_can_idct_4x4 (void)
-+{
-+  return 1;  /* jsimd_idct_4x4() runs idct_4x4_venum */
-+}
-+
-+
-+
-+
-+/* Function Implementation */
-+
-+GLOBAL(void)
-+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-+                       JDIMENSION output_row, int num_rows)
-+{  /* empty stub: no NEON RGB->YCC path; jsimd_can_rgb_ycc() returns 0 */
-+}
-+
-+GLOBAL(void)
-+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-+                       JSAMPARRAY output_buf, int num_rows)
-+{  /* Convert num_rows rows of planar Y/Cb/Cr samples, one yvup2bgr888_venum call per row */
-+  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;  /* NOTE(review): never read below */
-+  JSAMPROW inptr0, inptr1, inptr2;  /* row pointers into input_buf components 0, 1, 2 */
-+  JSAMPROW outptr;
-+  JDIMENSION row;
-+
-+  for (row = 0; row < (JDIMENSION)num_rows; row++)
-+  {
-+    inptr0     = input_buf[0][input_row];
-+    inptr1     = input_buf[1][input_row];
-+    inptr2     = input_buf[2][input_row];
-+
-+    input_row++;
-+    outptr = *output_buf++;  /* take the next output row and advance */
-+    /* NOTE(review): always the bgr888 routine; cinfo->out_color_space is not consulted here */
-+    yvup2bgr888_venum((UINT8*) inptr0,
-+                      (UINT8*) inptr2,  /* component 2 goes to the pCrLine parameter */
-+                      (UINT8*) inptr1,  /* component 1 goes to the pCbLine parameter */
-+                      (UINT8*) outptr,
-+                      cinfo->output_width);
-+  }
-+}
-+
-+
-+
-+GLOBAL(void)
-+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
-+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-+{  /* empty stub; jsimd_can_h2v2_downsample() returns 0 */
-+}
-+
-+GLOBAL(void)
-+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
-+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-+{  /* empty stub; jsimd_can_h2v1_downsample() returns 0 */
-+}
-+
-+
-+GLOBAL(void)
-+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-+                     jpeg_component_info * compptr, 
-+                     JSAMPARRAY input_data,
-+                     JSAMPARRAY * output_data_ptr)
-+{  /* empty stub; jsimd_can_h2v2_upsample() returns 0 */
-+}
-+
-+GLOBAL(void)
-+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-+                     jpeg_component_info * compptr, 
-+                     JSAMPARRAY input_data,
-+                     JSAMPARRAY * output_data_ptr)
-+{  /* empty stub; jsimd_can_h2v1_upsample() returns 0 */
-+}
-+
-+
-+GLOBAL(void)
-+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-+                           jpeg_component_info * compptr, 
-+                           JSAMPARRAY input_data,
-+                           JSAMPARRAY * output_data_ptr)
-+{  /* empty stub; jsimd_can_h2v2_fancy_upsample() returns 0 */
-+}
-+
-+GLOBAL(void)
-+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-+                           jpeg_component_info * compptr, 
-+                           JSAMPARRAY input_data,
-+                           JSAMPARRAY * output_data_ptr)
-+{  /* empty stub; jsimd_can_h2v1_fancy_upsample() returns 0 */
-+}
-+
-+
-+GLOBAL(void)
-+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-+                            JSAMPIMAGE input_buf,
-+                            JDIMENSION in_row_group_ctr,
-+                            JSAMPARRAY output_buf)
-+{  /* empty stub; jsimd_can_h2v2_merged_upsample() returns 0 */
-+}
-+
-+GLOBAL(void)
-+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-+                            JSAMPIMAGE input_buf,
-+                            JDIMENSION in_row_group_ctr,
-+                            JSAMPARRAY output_buf)
-+{  /* empty stub; jsimd_can_h2v1_merged_upsample() returns 0 */
-+}
-+
-+
-+GLOBAL(void)
-+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-+                DCTELEM * workspace)
-+{  /* empty stub; jsimd_can_convsamp() returns 0 */
-+}
-+
-+GLOBAL(void)
-+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-+                      FAST_FLOAT * workspace)
-+{  /* empty stub; jsimd_can_convsamp_float() returns 0 */
-+}
-+
-+
-+GLOBAL(void)
-+jsimd_fdct_islow (DCTELEM * data)
-+{  /* empty stub; jsimd_can_fdct_islow() returns 0 */
-+}
-+
-+GLOBAL(void)
-+jsimd_fdct_ifast (DCTELEM * data)
-+{  /* empty stub; jsimd_can_fdct_ifast() returns 0 */
-+}
-+
-+GLOBAL(void)
-+jsimd_fdct_float (FAST_FLOAT * data)
-+{  /* empty stub; jsimd_can_fdct_float() returns 0 */
-+}
-+
-+
-+GLOBAL(void)
-+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
-+                DCTELEM * workspace)
-+{  /* empty stub; jsimd_can_quantize() returns 0 */
-+}
-+
-+GLOBAL(void)
-+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-+                      FAST_FLOAT * workspace)
-+{  /* empty stub; jsimd_can_quantize_float() returns 0 */
-+}
-+
-+
-+GLOBAL(void)
-+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-+                JDIMENSION output_col)
-+{  /* Reduced 2x2 IDCT: dequantize coef_block in place, run idct_2x2_venum, copy 2x2 samples out */
-+  ISLOW_MULT_TYPE * quantptr;
-+  JSAMPROW outptr;
-+
-+  /* Note: Must allocate 8x2 even though only 2x2 is used because
-+   * IDCT function expects stride of 8. Stride input to function is ignored.
-+   * There is also a hw limitation requiring input size to be 8x2.
-+   */
-+  INT16    idct_out[DCTSIZE * (DCTSIZE>>2)];  /* 16-bit IDCT output, copied to output_buf below */
-+  INT16*   idctptr;
-+  JCOEFPTR coefptr;
-+  int ctr;
-+
-+  coefptr  = coef_block;
-+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
-+
-+  /* Dequantize rows 0-1 of the first two columns, writing back to coef_block */
-+  for (ctr = (DCTSIZE>>2); ctr > 0; ctr--) {
-+    coefptr[0]         = DEQUANTIZE(coefptr[0]        , quantptr[0]        );
-+    coefptr[DCTSIZE*1] = DEQUANTIZE(coefptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-+
-+    /* advance pointers to next column */
-+    quantptr++;
-+    coefptr++;
-+  }
-+
-+  idct_2x2_venum((INT16*)coef_block,
-+                 (INT16*)idct_out,
-+                  DCTSIZE * sizeof(INT16));  /* stride argument, in bytes */
-+
-+  idctptr = idct_out;
-+  for (ctr = 0; ctr < (DCTSIZE>>2); ctr++) {
-+    outptr = output_buf[ctr] + output_col;
-+
-+    /* plain 16-bit to 8-bit assignment; NOTE(review): presumably idct_2x2_venum already range-limits - confirm */
-+    outptr[0] = idctptr[0];
-+    outptr[1] = idctptr[1];
-+
-+    /* IDCT function assumes stride of 8 units */
-+    idctptr += (DCTSIZE);    /* advance pointers to next row */
-+  }
-+}
-+
-+GLOBAL(void)
-+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-+                JDIMENSION output_col)
-+{  /* Reduced 4x4 IDCT: dequantize coef_block in place, run idct_4x4_venum, copy 4x4 samples out */
-+  ISLOW_MULT_TYPE * quantptr;
-+  JSAMPROW outptr;
-+
-+  /* Note: Must allocate 8x4 even though only 4x4 is used because
-+   * IDCT function expects stride of 8. Stride input to function is ignored.
-+   */
-+  INT16    idct_out[DCTSIZE * (DCTSIZE>>1)];  /* 16-bit IDCT output, copied to output_buf below */
-+  INT16*   idctptr;
-+  JCOEFPTR coefptr;
-+  int ctr;
-+
-+  coefptr  = coef_block;
-+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
-+
-+  /* Dequantize rows 0-3 of the first four columns, writing back to coef_block */
-+  for (ctr = (DCTSIZE>>1); ctr > 0; ctr--) {
-+    coefptr[0]         = DEQUANTIZE(coefptr[0]        , quantptr[0]        );
-+    coefptr[DCTSIZE*1] = DEQUANTIZE(coefptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-+    coefptr[DCTSIZE*2] = DEQUANTIZE(coefptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-+    coefptr[DCTSIZE*3] = DEQUANTIZE(coefptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-+
-+    /* advance pointers to next column */
-+    quantptr++;
-+    coefptr++;
-+  }
-+
-+  idct_4x4_venum((INT16*)coef_block,
-+                 (INT16*)idct_out,
-+                  DCTSIZE * sizeof(INT16));  /* stride argument, in bytes */
-+
-+  idctptr = idct_out;
-+  for (ctr = 0; ctr < (DCTSIZE>>1); ctr++) {
-+    outptr = output_buf[ctr] + output_col;
-+
-+    /* plain 16-bit to 8-bit assignment; NOTE(review): presumably idct_4x4_venum already range-limits - confirm */
-+    outptr[0] = idctptr[0];
-+    outptr[1] = idctptr[1];
-+    outptr[2] = idctptr[2];
-+    outptr[3] = idctptr[3];
-+    /* IDCT function assumes stride of 8 units */
-+    idctptr += (DCTSIZE);    /* advance pointers to next row */
-+  }
-+}
-+
-+
-+GLOBAL(void)
-+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-+                JDIMENSION output_col)
-+{  /* Full 8x8 IDCT: dequantize coef_block in place, run idct_8x8_venum, copy 8x8 samples out */
-+  ISLOW_MULT_TYPE * quantptr;
-+  JCOEFPTR coefptr;
-+  int ctr;
-+
-+  /* idct_out temp buffer is needed because output_buf sample allocation is 8 bits,
-+   * while IDCT output expects 16 bits.
-+   */
-+  INT16 idct_out[DCTSIZE2];  /* 16-bit IDCT output, copied to output_buf below */
-+  JSAMPROW outptr;
-+  INT16*  idctptr;
-+
-+  coefptr  = coef_block;
-+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
-+
-+  /* Dequantize all eight rows of each column, writing back to coef_block */
-+  for (ctr = DCTSIZE; ctr > 0; ctr--) {
-+    coefptr[0]         = DEQUANTIZE(coefptr[0]        , quantptr[0]        );
-+    coefptr[DCTSIZE*1] = DEQUANTIZE(coefptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-+    coefptr[DCTSIZE*2] = DEQUANTIZE(coefptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-+    coefptr[DCTSIZE*3] = DEQUANTIZE(coefptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-+    coefptr[DCTSIZE*4] = DEQUANTIZE(coefptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-+    coefptr[DCTSIZE*5] = DEQUANTIZE(coefptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-+    coefptr[DCTSIZE*6] = DEQUANTIZE(coefptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
-+    coefptr[DCTSIZE*7] = DEQUANTIZE(coefptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-+
-+    /* advance pointers to next column */
-+    quantptr++;
-+    coefptr++;
-+  }
-+
-+  idct_8x8_venum((INT16*)coef_block,
-+                 (INT16*)idct_out,
-+                 DCTSIZE * sizeof(INT16));  /* stride argument, in bytes */
-+
-+  idctptr = idct_out;
-+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
-+    outptr = output_buf[ctr] + output_col;
-+    /* outptr sample size is 1 byte while idctptr sample size is 2 bytes */
-+    outptr[0] = idctptr[0];
-+    outptr[1] = idctptr[1];
-+    outptr[2] = idctptr[2];
-+    outptr[3] = idctptr[3];
-+    outptr[4] = idctptr[4];
-+    outptr[5] = idctptr[5];
-+    outptr[6] = idctptr[6];
-+    outptr[7] = idctptr[7];
-+    idctptr  += DCTSIZE;      /* advance pointers to next row */
-+  }
-+}
-+
-+GLOBAL(void)
-+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-+                JDIMENSION output_col)
-+{  /* Full 8x8 IDCT; body is identical to jsimd_idct_islow() (same table cast, same idct_8x8_venum) */
-+  ISLOW_MULT_TYPE * quantptr;  /* NOTE(review): ISLOW type used on the ifast path - confirm table type and scaling match */
-+  JCOEFPTR coefptr;
-+  int ctr;
-+
-+  /* idct_out temp buffer is needed because output_buf sample allocation is 8 bits,
-+   * while IDCT output expects 16 bits.
-+   */
-+  INT16 idct_out[DCTSIZE2];  /* 16-bit IDCT output, copied to output_buf below */
-+  JSAMPROW outptr;
-+  INT16*  idctptr;
-+
-+  coefptr  = coef_block;
-+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
-+
-+  /* Dequantize all eight rows of each column, writing back to coef_block */
-+  for (ctr = DCTSIZE; ctr > 0; ctr--) {
-+    coefptr[0]         = DEQUANTIZE(coefptr[0]        , quantptr[0]        );
-+    coefptr[DCTSIZE*1] = DEQUANTIZE(coefptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-+    coefptr[DCTSIZE*2] = DEQUANTIZE(coefptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-+    coefptr[DCTSIZE*3] = DEQUANTIZE(coefptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-+    coefptr[DCTSIZE*4] = DEQUANTIZE(coefptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-+    coefptr[DCTSIZE*5] = DEQUANTIZE(coefptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-+    coefptr[DCTSIZE*6] = DEQUANTIZE(coefptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
-+    coefptr[DCTSIZE*7] = DEQUANTIZE(coefptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-+
-+    /* advance pointers to next column */
-+    quantptr++;
-+    coefptr++;
-+  }
-+
-+  idct_8x8_venum((INT16*)coef_block,
-+                 (INT16*)idct_out,
-+                 DCTSIZE * sizeof(INT16));  /* stride argument, in bytes */
-+
-+  idctptr = idct_out;
-+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
-+    outptr = output_buf[ctr] + output_col;
-+    /* outptr sample size is 1 byte while idctptr sample size is 2 bytes */
-+    outptr[0] = idctptr[0];
-+    outptr[1] = idctptr[1];
-+    outptr[2] = idctptr[2];
-+    outptr[3] = idctptr[3];
-+    outptr[4] = idctptr[4];
-+    outptr[5] = idctptr[5];
-+    outptr[6] = idctptr[6];
-+    outptr[7] = idctptr[7];
-+    idctptr  += DCTSIZE;      /* advance pointers to next row */
-+  }
-+}
-+
-+GLOBAL(void)
-+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-+                JDIMENSION output_col)
-+{  /* empty stub; jsimd_can_idct_float() returns 0 */
-+}
-+
---- /dev/null
-+++ libjpeg-turbo-1.1.1/simd/jdidct-armv7.s
-@@ -0,0 +1,762 @@
-+/*=========================================================================
-+* jdidct-armv7.s
-+*
-+*  Copyright (c) 2010, Code Aurora Forum. All rights reserved.
-+*
-+*  Redistribution and use in source and binary forms, with or without
-+*  modification, are permitted provided that the following conditions are
-+*  met:
-+*      * Redistributions of source code must retain the above copyright
-+*        notice, this list of conditions and the following disclaimer.
-+*      * Redistributions in binary form must reproduce the above
-+*        copyright notice, this list of conditions and the following
-+*        disclaimer in the documentation and/or other materials provided
-+*        with the distribution.
-+*      * Neither the name of Code Aurora Forum, Inc. nor the names of its
-+*        contributors may be used to endorse or promote products derived
-+*        from this software without specific prior written permission.
-+*
-+*  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
-+*  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+*  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
-+*  ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-+*  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+*  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+*  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+*  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-+*  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-+*  OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
-+*  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+*==========================================================================
-+
-+*==========================================================================
-+*                         FUNCTION LIST
-+*--------------------------------------------------------------------------
-+* - idct_1x1_venum
-+* - idct_2x2_venum
-+* - idct_4x4_venum
-+* - idct_8x8_venum
-+*
-+*==========================================================================
-+*/
-+
-+@==========================================================================
-+@ MACRO DEFINITION
-+@==========================================================================
-+    .macro Transpose8x8
-+        @==================================================================
-+        @ Transpose an 8 x 8 x 16 bit matrix in place
-+        @ Input: q8 to q15 (eight 16-bit elements per q register)
-+        @ Output: q8 to q15
-+        @ Registers used: q8 to q15
-+        @ Assumptions: 8 x 8 x 16 bit data
-+        @==================================================================
-+
-+        vswp d17, d24                  @ swap high half of q8  with low half of q12
-+        vswp d23, d30                  @ swap high half of q11 with low half of q15
-+        vswp d21, d28                  @ swap high half of q10 with low half of q14
-+        vswp d19, d26                  @ swap high half of q9  with low half of q13
-+
-+        vtrn.32 q8,  q10               @ transpose at 32-bit (element pair) granularity
-+        vtrn.32 q9,  q11
-+        vtrn.32 q12, q14
-+        vtrn.32 q13, q15
-+
-+        vtrn.16 q8,  q9                @ transpose at 16-bit (single element) granularity
-+        vtrn.16 q10, q11
-+        vtrn.16 q12, q13
-+        vtrn.16 q14, q15
-+    .endm
-+
-+    .macro IDCT1D
-+        @==================================================================
-+        @ One dimensional 64 element inverse DCT
-+        @ Input: q8 to q15 loaded with data
-+        @        q0 loaded with constants
-+        @ Output: q8 to q15
-+        @ Registers used: q0, q2 to q15 (q2 to q7 are overwritten as scratch)
-+        @ Assumptions: 16 bit data, first elements in least significant
-+        @ halfwords
-+        @==================================================================
-+
-+        @1st stage
-+        vqrdmulh.s16 q4,  q15, d0[2]   @q4 = a1*vx7
-+        vqrdmulh.s16 q5,  q9,  d0[2]   @q5 = a1*vx1
-+        vqrdmulh.s16 q6,  q13, d0[3]   @q6 = a2*vx5
-+        vqrdmulh.s16 q7,  q11, d1[1]   @q7 = ma2*vx3
-+        vqrdmulh.s16 q2,  q14, d0[1]   @q2 = a0*vx6
-+        vqrdmulh.s16 q3,  q10, d0[1]   @q3 = a0*vx2
-+        vqadd.s16   q9,  q4,  q9       @q9 = t1 = a1*vx7 + vx1
-+        vqsub.s16   q5,  q5,  q15      @q5 = t8 = a1*vx1 - vx7
-+        vqadd.s16   q15, q6,  q11      @q15 = t7 = a2*vx5 + vx3
-+        vqadd.s16   q11, q7,  q13      @q11 = t3 = ma2*vx3 + vx5
-+
-+        @2nd stage
-+        vqadd.s16   q13, q8,  q12      @q13 = t5 = vx0 + vx4
-+        vqsub.s16   q8,  q8,  q12      @q8 = t0 = vx0 - vx4
-+        vqadd.s16   q10, q2,  q10      @q10 = t2 = a0*vx6 + vx2
-+        vqsub.s16   q12, q3,  q14      @q12 = t4 = a0*vx2 - vx6
-+        vqadd.s16   q14, q5,  q11      @q14 = t6 = t8 + t3
-+        vqsub.s16   q11, q5,  q11      @q11 = t3 = t8 - t3
-+        vqsub.s16   q5,  q9,  q15      @q5 = t8 = t1 - t7
-+        vqadd.s16   q9,  q9,  q15      @q9 = t1 = t1 + t7
-+
-+        @3rd stage
-+        vqadd.s16   q15, q13, q10      @q15 = t7 = t5 + t2
-+        vqsub.s16   q10, q13, q10      @q10 = t2 = t5 - t2
-+        vqadd.s16   q13, q8,  q12      @q13 = t5 = t0 + t4
-+        vqsub.s16   q7,  q8,  q12      @q7 = t0 = t0 - t4
-+        vqsub.s16   q12, q5,  q11      @q12 = t4 = t8 - t3
-+        vqadd.s16   q11, q5,  q11      @q11 = t3 = t8 + t3
-+
-+        @4th stage
-+        vqadd.s16   q8,  q15, q9       @q8 = vy0 = t7 + t1
-+        vqsub.s16   q15, q15, q9       @q15 = vy7 = t7 - t1
-+        vqrdmulh.s16 q6,  q12, d0[0]   @q6 = c4*t4
-+        vqrdmulh.s16 q4,  q11, d0[0]   @q4 = c4*t3
-+        vqsub.s16   q12, q10, q14      @q12 = vy4 = t2 - t6
-+        vqadd.s16   q11, q10, q14      @q11 = vy3 = t2 + t6
-+        vqadd.s16   q10, q7,  q6       @q10 = vy2 = t0 + c4*t4
-+        vqsub.s16   q14, q13, q4       @q14 = vy6 = t5 - c4*t3
-+        vqadd.s16   q9,  q13, q4       @q9 = vy1 = t5 + c4*t3
-+        vqsub.s16   q13, q7,  q6       @q13 = vy5 = t0 - c4*t4
-+    .endm
-+
-+    .macro PART1
-+        @==================================================================
-+        @ Load input data from memory and shift
-+        @==================================================================
-+        vld1.16   {d16, d17},[r0]!     @q8 =row0
-+        vqshl.s16  q8,  q8,  #4        @Input data too big?!!
-+                                       @Maximum MPEG input is 2047/-2048.
-+        vld1.16   {d18, d19},[r0]!     @q9 =row1
-+        vqshl.s16  q9,  q9,  #4        @Shift 1 instead of 4
-+
-+        vld1.16   {d20, d21},[r0]!     @q10=row2
-+        vqshl.s16  q10, q10, #4
-+
-+        vld1.16   {d22, d23},[r0]!     @q11=row3
-+        vqshl.s16  q11, q11, #4
-+
-+        vld1.16   {d24, d25},[r0]!     @q12=row4
-+        vqshl.s16  q12, q12, #4
-+
-+        vld1.16   {d26, d27},[r0]!     @q13=row5
-+        vqshl.s16  q13, q13, #4
-+        vld1.16   {d28, d29},[r0]!     @q14=row6
-+        vqshl.s16  q14, q14, #4
-+        vld1.16   {d30, d31},[r0]!     @q15=row7
-+        vqshl.s16  q15, q15, #4
-+
-+        @==================================================================
-+        @ refresh the constants that were clobbered last time through IDCT1D
-+        @==================================================================
-+        vld1.16   {d4, d5},[r7]        @q2 =constants[2]
-+        vld1.16   {d6, d7},[r8]        @q3 =constants[3]
-+        vld1.16   {d8, d9},[r9]        @q4 =constants[4]
-+    .endm
-+
-+    .macro PART2
-+        @==================================================================
-+        @ Prescale the input
-+        @==================================================================
-+        vqrdmulh.s16 q12, q12, q1      @q12=row4 * constants[1] = vx4
-+        vqrdmulh.s16 q15, q15, q2      @q15=row7 * constants[2] = vx7
-+        vqrdmulh.s16 q9,  q9,  q2      @q9 =row1 * constants[2] = vx1
-+        vqrdmulh.s16 q13, q13, q4      @q13=row5 * constants[4] = vx5
-+        vqrdmulh.s16 q11, q11, q4      @q11=row3 * constants[4] = vx3
-+        vqrdmulh.s16 q14, q14, q3      @q14=row6 * constants[3] = vx6
-+        vqrdmulh.s16 q10, q10, q3      @q10=row2 * constants[3] = vx2
-+        vqrdmulh.s16 q8,  q8,  q1      @q8 =row0 * constants[1] = vx0
-+
-+        @==================================================================
-+        @ At this point, the input 8x8 x 16 bit coefficients are
-+        @ transposed, prescaled, and loaded in q8 to q15
-+        @ q0 loaded with scalar constants
-+        @ Perform 1D IDCT
-+        @==================================================================
-+        IDCT1D                         @perform 1d idct
-+
-+        @==================================================================
-+        @ Transpose the intermediate results to get ready for vertical
-+        @ transformation
-+        @==================================================================
-+        vswp d17, d24                  @q8, q12
-+        vswp d23, d30                  @q11, q15
-+        vswp d21, d28                  @q10, q14
-+        vswp d19, d26                  @q9, q13
-+
-+        @==================================================================
-+        @ Load the bias
-+        @==================================================================
-+        vdup.32 q4, d1[1]              @a cycle is saved by loading
-+                                       @the bias at this point
-+
-+        @==================================================================
-+        @ Finish the transposition
-+        @==================================================================
-+        vtrn.32 q8,  q10
-+        vtrn.32 q9,  q11
-+        vtrn.32 q12, q14
-+        vtrn.32 q13, q15
-+        vtrn.16 q8,  q9
-+        vtrn.16 q10, q11
-+        vtrn.16 q12, q13
-+        vtrn.16 q14, q15
-+
-+        @==================================================================
-+        @ Add bias
-+        @==================================================================
-+        vqadd.s16 q8, q8, q4
-+
-+        @==================================================================
-+        @ IDCT 2nd half
-+        @==================================================================
-+        IDCT1D                         @perform 1d dct
-+
-+        @==================================================================
-+        @ Scale and clamp the output to correct range and save to memory
-+        @ 1. scale to 8bits by right shift 6
-+        @ 2. clamp output to [0, 255] by min/max
-+        @ 3. use multiple store. Each store will save one row of output.
-+        @    The st queue size is 4, so do no more than 4 str in sequence.
-+        @==================================================================
-+        ldr       r5, =constants+5*16  @constants[5],
-+        vld1.16   d10, [r5]            @load clamping parameters
-+        vdup.s16  q6,  d10[0]          @q6=[0000000000000000]
-+        vdup.s16  q7,  d10[1]          @q7=[FFFFFFFFFFFFFFFF]
-+
-+        @Save the results
-+        vshr.s16  q8,  q8,  #6         @q8 = vy0
-+        vmax.s16  q8,  q8,  q6         @clamp >0
-+        vmin.s16  q8,  q8,  q7         @clamp <255
-+
-+        vshr.s16  q9,  q9,  #6         @q9 = vy1
-+        vmax.s16  q9,  q9,  q6         @clamp >0
-+        vmin.s16  q9,  q9,  q7         @clamp <255
-+
-+        vshr.s16  q10, q10, #6         @q10 = vy2
-+        vmax.s16  q10, q10, q6         @clamp >0
-+        vmin.s16  q10, q10, q7         @clamp <255
-+
-+        vshr.s16  q11, q11, #6         @q11 = vy3
-+        vmax.s16  q11, q11, q6         @clamp >0
-+        vmin.s16  q11, q11, q7         @clamp <255
-+
-+        vst1.16  {d16, d17},[r1],r2    @q8 =row0
-+        vst1.16  {d18, d19},[r1],r2    @q9 =row1
-+        vst1.16  {d20, d21},[r1],r2    @q10=row2
-+        vst1.16  {d22, d23},[r1],r2    @q11=row3
-+
-+        vshr.s16  q12, q12, #6         @q12 = vy4
-+        vmax.s16  q12, q12, q6         @clamp >0
-+        vmin.s16  q12, q12, q7         @clamp <255
-+
-+        vshr.s16  q13, q13, #6         @q13 = vy5
-+        vmax.s16  q13, q13, q6         @clamp >0
-+        vmin.s16  q13, q13, q7         @clamp <255
-+
-+        vshr.s16  q14, q14, #6         @q14 = vy6
-+        vmax.s16  q14, q14, q6         @clamp >0
-+        vmin.s16  q14, q14, q7         @clamp <255
-+
-+        vshr.s16  q15, q15, #6         @q15 = vy7
-+        vmax.s16  q15, q15, q6         @clamp >0
-+        vmin.s16  q15, q15, q7         @clamp <255
-+
-+        vst1.16  {d24, d25},[r1],r2    @q12=row4
-+        vst1.16  {d26, d27},[r1],r2    @q13=row5
-+        vst1.16  {d28, d29},[r1],r2    @q14=row6
-+        vst1.16  {d30, d31},[r1]       @q15=row7
-+    .endm
-+
-+    .macro BIG_BODY_TRANSPOSE_INPUT
-+        @==================================================================
-+        @ Main body of idct
-+        @==================================================================
-+        PART1
-+        Transpose8x8
-+        PART2
-+    .endm
-+
-+    .macro IDCT_ENTRY
-+        @==================================================================
-+        @ Load the locations of the constants
-+        @==================================================================
-+        ldr  r5,  =constants+0*16      @constants[0]
-+        ldr  r6,  =constants+1*16      @constants[1]
-+        ldr  r7,  =constants+2*16      @constants[2]
-+        ldr  r8,  =constants+3*16      @constants[3]
-+        ldr  r9,  =constants+4*16      @constants[4]
-+
-+        @==================================================================
-+        @ Load the coefficients
-+        @ only some of the coefficients are loaded here due to register constraints
-+        @==================================================================
-+        vld1.16   {d0, d1},[r5]        @q0 =constants[0] (scalars)
-+        vld1.16   {d2, d3},[r6]        @q1 =constants[1]
-+    .endm
-+@==========================================================================
-+@ END of MACRO DEFINITION
-+@==========================================================================
-+
-+
-+    .section idct_func, "x"            @ ARE
-+    .text                              @ idct_func, CODE, READONLY
-+    .align 2
-+    .code 32                           @ CODE32
-+
-+@==========================================================================
-+@ Main Routine
-+@==========================================================================
-+
-+    .global idct_1x1_venum
-+    .global idct_2x2_venum
-+    .global idct_4x4_venum
-+    .global idct_8x8_venum
-+
-+@==========================================================================
-+@ FUNCTION     : idct_1x1_venum
-+@--------------------------------------------------------------------------
-+@ DESCRIPTION  : ARM optimization of one 1x1 block iDCT
-+@--------------------------------------------------------------------------
-+@ C PROTOTYPE  : void idct_1x1_venum(int16 * input,
-+@                                    int16 * output,
-+@                                    int32 stride)
-+@--------------------------------------------------------------------------
-+@ REG INPUT    : R0 pointer to input (int16)
-+@                R1 pointer to output (int16)
-+@                R2 block stride
-+@--------------------------------------------------------------------------
-+@ STACK ARG    : None
-+@--------------------------------------------------------------------------
-+@ MEM INPUT    : None
-+@--------------------------------------------------------------------------
-+@ REG OUTPUT   : None
-+@--------------------------------------------------------------------------
-+@ MEM OUTPUT   : None
-+@--------------------------------------------------------------------------
-+@ REG AFFECTED : R0 - R3
-+@--------------------------------------------------------------------------
-+@ STACK USAGE  : none
-+@--------------------------------------------------------------------------
-+@ CYCLES       : 17 cycles
-+@--------------------------------------------------------------------------
-+@ NOTES        :
-+@ This idct_1x1_venum code was developed with ARM instruction set.
-+@
-+@ ARM REGISTER ALLOCATION
-+@ =========================================================================
-+@ r0  : pointer to input data
-+@ r1  : pointer to output area
-+@ r2  : stride in the output buffer
-+@==========================================================================
-+.type idct_1x1_venum, %function
-+idct_1x1_venum:
-+
-+    ldrsh   r3, [r0]                   @ Load signed half word (int16)
-+    ldr     r2, =1028                  @ 1028 = 4 + (128 << 3)
-+                                       @ 4 for rounding, 128 for offset
-+    add     r2, r3, r2                 @ r2 = input + rounding + scaled offset
-+    asrs    r2, r2, #3                 @ Divide by 8, and set status bit
-+    movmi   r2, #0                     @ Clamp negative results to 0
-+    cmp     r2, #255
-+    movgt   r2, #255                   @ Clamp results above 255 to 255
-+    str     r2, [r1]                   @ Save output; NOTE(review): str writes 32 bits but output is documented as int16 - confirm
-+    bx      lr                         @ Return to caller
-+
-+                                       @ end of idct_1x1_venum
-+
-+
-+@==========================================================================
-+@ FUNCTION     : idct_2x2_venum
-+@--------------------------------------------------------------------------
-+@ DESCRIPTION  : VeNum optimization of one 2x2 block iDCT
-+@--------------------------------------------------------------------------
-+@ C PROTOTYPE  : void idct_2x2_venum(int16 * input,
-+@                                    int16 * output,
-+@                                    int32 stride)
-+@--------------------------------------------------------------------------
-+@ REG INPUT    : R0 pointer to input (int16)
-+@                R1 pointer to output (int16)
-+@                R2 block stride
-+@--------------------------------------------------------------------------
-+@ STACK ARG    : None
-+@--------------------------------------------------------------------------
-+@ MEM INPUT    : None
-+@--------------------------------------------------------------------------
-+@ REG OUTPUT   : None
-+@--------------------------------------------------------------------------
-+@ MEM OUTPUT   : None
-+@--------------------------------------------------------------------------
-+@ REG AFFECTED : R0 - R2
-+@--------------------------------------------------------------------------
-+@ STACK USAGE  : none
-+@--------------------------------------------------------------------------
-+@ CYCLES       : 27 cycles
-+@--------------------------------------------------------------------------
-+@ NOTES        : Output buffer must be an 8x8 16-bit buffer
-+@
-+@ ARM REGISTER ALLOCATION
-+@ ==========================================
-+@ r0  : pointer to input data
-+@ r1  : pointer to output area
-+@ r2  : stride in the output buffer
-+@ -------------------------------------------
-+@
-+@ VENUM REGISTER ALLOCATION
-+@ =================================================
-+@ q0     : output x0 - x3
-+@ q1     : not used
-+@ q2     : not used
-+@ q3     : not used
-+@ q4     : not used
-+@ q5     : not used
-+@ q6     : not used
-+@ q7     : not used
-+@ q8     : input y0 - y3
-+@ q9     : intermediate value
-+@ q10    : intermediate value
-+@ q11    : offset value
-+@ q12    : lower clamp value (0)
-+@ q13    : upper clamp value (255)
-+@ q14    : not used
-+@ q15    : not used
-+@==========================================================================
-+.type idct_2x2_venum, %function
-+idct_2x2_venum:
-+
-+    vld4.32    {d16, d17, d18, d19}, [r0]
-+                                       @  d16: y0 | y1 | y2 | y3  (LSB | MSB)
-+
-+    vtrn.32    d16, d17                @  d16: y0 | y1 | X | X
-+                                       @  d17: y2 | y3 | X | X
-+
-+    vqadd.s16  d18, d16, d17           @ d18: y0+y2 | y1+y3 | X | X   q: saturated
-+    vqsub.s16  d19, d16, d17           @ d19: y0-y2 | y1-y3 | X | X   q: saturated
-+
-+    vtrn.16    d18, d19                @ d18: y0+y2 | y0-y2 | X | X
-+                                       @ d19: y1+y3 | y1-y3 | X | X
-+
-+    vqadd.s16  d20, d18, d19           @ d20: (y0+y2)+(y1+y3) | (y0-y2)+(y1-y3)
-+                                       @       x0 | x2 | X | X
-+    vqsub.s16  d21, d18, d19           @ d21: (y0+y2)-(y1+y3) | (y0-y2)-(y1-y3)
-+                                       @       x1 | x3 | X | X
-+
-+    vtrn.16    d20, d21                @ d20:  x0 | x1 | X | X
-+                                       @ d21:  x2 | x3 | X | X
-+
-+    vrshr.s16  q10, q10, #3               @ Divide by 8
-+
-+    vmov.i16   q11, #128               @ q11 = 128|128|128|128|128|128|128|128
-+    vqadd.s16  q0, q10, q11            @ Add offset to make output in [0,255]
-+
-+    vmov.i16   q12, #0                   @ q12 = [0000000000000000]
-+    vmov.i16   q13, #255               @ q13 = [FFFFFFFFFFFFFFFF] (hex)
-+
-+    vmax.s16   q0, q0, q12             @ Clamp > 0
-+    vmin.s16   q0, q0, q13             @ Clamp < 255
-+
-+    vstr       d0, [r1]                @ Store  x0 | x1 | X | X
-+                                       @ Potential out of boundary issue
-+    add        r1, r1, r2              @ Add the offset to the output pointer
-+    vstr       d1, [r1]                @ Store  x2 | x3 | X | X
-+                                       @ Potential out of boundary issue
-+    bx         lr                      @ Return to caller
-+
-+                                       @ end of idct_2x2_venum
-+
-+
-+@==========================================================================
-+@ FUNCTION     : idct_4x4_venum
-+@--------------------------------------------------------------------------
-+@ DESCRIPTION  : VeNum optimization of one 4x4 block iDCT
-+@--------------------------------------------------------------------------
-+@ C PROTOTYPE  : void idct_4x4_venum(int16 * input,
-+@                                    int16 * output,
-+@                                    int32 stride)
-+@--------------------------------------------------------------------------
-+@ REG INPUT    : R0 pointer to input (int16)
-+@                R1 pointer to output (int16)
-+@                R2 block stride
-+@--------------------------------------------------------------------------
-+@ STACK ARG    : None
-+@--------------------------------------------------------------------------
-+@ MEM INPUT    : None
-+@--------------------------------------------------------------------------
-+@ REG OUTPUT   : None
-+@--------------------------------------------------------------------------
-+@ MEM OUTPUT   : None
-+@--------------------------------------------------------------------------
-+@ REG AFFECTED : R0 - R3, R12
-+@--------------------------------------------------------------------------
-+@ STACK USAGE  : none
-+@--------------------------------------------------------------------------
-+@ CYCLES       : 56 cycles
-+@--------------------------------------------------------------------------
-+@ NOTES        :
-+@
-+@ ARM REGISTER ALLOCATION
-+@ ==========================================
-+@ r0  : pointer to input data
-+@ r1  : pointer to output area
-+@ r2  : stride in the output buffer
-+@ r3  : pointer to the coefficient set
-+@ r12 : pointer to the coefficient set
-+@ -------------------------------------------
-+@
-+@ VENUM REGISTER ALLOCATION
-+@ =================================================
-+@ q0     : coefficients[0]
-+@ q1     : coefficients[1]
-+@ q2     : coefficients[2]
-+@ q3     : coefficients[3]
-+@ q4     : not used
-+@ q5     : not used
-+@ q6     : not used
-+@ q7     : not used
-+@ q8     : input y0 - y7
-+@ q9     : input y8 - y15
-+@ q10    : intermediate value
-+@ q11    : intermediate value
-+@ q12    : intermediate value
-+@ q13    : intermediate value
-+@ q14    : intermediate value
-+@ q15    : intermediate value
-+@==========================================================================
-+.type idct_4x4_venum, %function
-+idct_4x4_venum:
-+
-+        @ Load the locations of the first 2 sets of coefficients
-+        ldr  r3,   =coefficient+0*16   @ coefficient[0]
-+        ldr  r12,  =coefficient+1*16   @ coefficient[1]
-+
-+        @ Load the first 2 sets of coefficients
-+        vld1.16  {d0, d1},[r3]         @ q0 = C4 | C2 | C4 | C6 | C4 | C2 | C4 | C6
-+        vld1.16  {d2, d3},[r12]        @ q1 = C4 | C6 | C4 | C2 | C4 | C6 | C4 | C2
-+
-+        @ Load the locations of the second 2 sets of coefficients
-+        ldr  r3,   =coefficient+2*16   @ coefficient[2]
-+        ldr  r12,  =coefficient+3*16   @ coefficient[3]
-+
-+        @ Load the second 2 sets of coefficients
-+        vld1.16  {d4, d5},[r3]         @ q2 = C4 | C4 | C4 | C4 | C2 | C2 | C2 | C2
-+        vld1.16  {d6, d7},[r12]        @ q3 = C4 | C4 | C4 | C4 | C6 | C6 | C6 | C6
-+
-+        @ Load the input values
-+        vld1.16  {d16}, [r0], r2       @ d16:   y0  | y1  | y2  | y3  (LSB | MSB)
-+        vld1.16  {d17}, [r0], r2       @ d17:   y4  | y5  | y6  | y7  (LSB | MSB)
-+        vld1.16  {d18}, [r0], r2       @ d18:   y8  | y9  | y10 | y11 (LSB | MSB)
-+        vld1.16  {d19}, [r0], r2       @ d19:   y12 | y13 | y14 | y15 (LSB | MSB)
-+
-+        @ Apply iDCT Horizontally
-+
-+        @ q8: y0 |y1 |y2 |y3 |y4 |y5 |y6 |y7
-+        @ q9: y8 |y9 |y10|y11|y12|y13|y14|y15
-+
-+        @======================================================================
-+        @ vqrdmulh doubles the product and keeps the high 16 bits of the result,
-+        @ which is equivalent to a right shift by 15 bits.
-+        @ Since the coefficients are in Q15 format, that right shift by 15
-+        @ cancels the Q15 scaling, so the final result is in Q0 format
-+        @
-+        @ vqrdmulh will also round the result
-+        @======================================================================
-+
-+        vqrdmulh.s16  q10, q8, q0      @ q10: C4*y0  | C2*y1  | C4*y2  | C6*y3  | C4*y4  | C2*y5  | C4*y6  | C6*y7
-+        vqrdmulh.s16  q11, q8, q1      @ q11: C4*y0  | C6*y1  | C4*y2  | C2*y3  | C4*y4  | C6*y5  | C4*y6  | C2*y7
-+
-+        vqrdmulh.s16  q12, q9, q0      @ q12: C4*y8  | C2*y9  | C4*y10 | C6*y11 | C4*y12 | C2*y13 | C4*y14 | C6*y15
-+        vqrdmulh.s16  q13, q9, q1      @ q13: C4*y8  | C6*y9  | C4*y10 | C2*y11 | C4*y12 | C6*y13 | C4*y14 | C2*y15
-+
-+        vtrn.32       q10, q12         @ q10: C4*y0  | C2*y1  | C4*y8  | C2*y9  | C4*y4  | C2*y5  | C4*y12 | C2*y13
-+                                       @ q12: C4*y2  | C6*y3  | C4*y10 | C6*y11 | C4*y6  | C6*y7  | C4*y14 | C6*y15
-+
-+        vtrn.32       q11, q13         @ q11: C4*y0  | C6*y1  | C4*y8  | C6*y9  | C4*y4  | C6*y5  | C4*y12 | C6*y13
-+                                       @ q13: C4*y2  | C2*y3  | C4*y10 | C2*y11 | C4*y6  | C2*y7  | C4*y14 | C2*y15
-+
-+        vqadd.s16     q14, q10, q12    @ q14: C4*y0 + C4*y2 | C2*y1 + C6*y3 | C4*y8 + C4*y10 | C2*y9 + C6*y11 | C4*y4 + C4*y6 | C2*y5 + C6*y7 | C4*y12 + C4*y14 | C2*y13 + C6*y15
-+                                       @       S0 | S2 | S8 | S10 | S4 | S6 | S12 | S14
-+
-+        vqsub.s16     q15, q11, q13    @ q15: C4*y0 - C4*y2 | C6*y1 - C2*y3 | C4*y8 - C4*y10 | C6*y9 - C2*y11 | C4*y4 - C4*y6 | C6*y5 - C2*y7 | C4*y12 - C4*y14 | C6*y13 - C2*y15
-+                                       @       S1 | S3 | S9 | S11 | S5 | S7 | S13 | S15
-+
-+        vtrn.16       q14, q15         @ q14: S0 | S1 | S8  | S9  | S4 | S5 | S12 | S13
-+                                       @ q15: S2 | S3 | S10 | S11 | S6 | S7 | S14 | S15
-+
-+        vqadd.s16     q8, q14, q15     @ q8:  Z0 | Z1 | Z8  | Z9  | Z4 | Z5 | Z12 | Z13
-+        vqsub.s16     q9, q14, q15     @ q9:  Z3 | Z2 | Z11 | Z10 | Z7 | Z6 | Z15 | Z14
-+        vrev32.16     q9, q9           @ q9:  Z2 | Z3 | Z10 | Z11 | Z6 | Z7 | Z14 | Z15
-+
-+
-+        @ Apply iDCT Vertically
-+
-+        vtrn.32       q8, q9           @ q8:  Z0 | Z1 | Z2  | Z3  | Z4  | Z5  | Z6  | Z7
-+                                       @ q9:  Z8 | Z9 | Z10 | Z11 | Z12 | Z13 | Z14 | Z15
-+
-+
-+        vqrdmulh.s16  q10, q8, q2      @ q10: C4*Z0 | C4*Z1 | C4*Z2 | C4*Z3 | C2*Z4 | C2*Z5 | C2*Z6 | C2*Z7
-+        vqrdmulh.s16  q11, q8, q3      @ q11: C4*Z0 | C4*Z1 | C4*Z2 | C4*Z3 | C6*Z4 | C6*Z5 | C6*Z6 | C6*Z7
-+
-+        vqrdmulh.s16  q12, q9, q2      @ q12: C4*Z8 | C4*Z9 | C4*Z10 | C4*Z11 | C2*Z12 | C2*Z13 | C2*Z14 | C2*Z15
-+        vqrdmulh.s16  q13, q9, q3      @ q13: C4*Z8 | C4*Z9 | C4*Z10 | C4*Z11 | C6*Z12 | C6*Z13 | C6*Z14 | C6*Z15
-+
-+        vqadd.s16     q14, q10, q13    @ q14: C4*Z0+C4*Z8 | C4*Z1+C4*Z9 | C4*Z2+C4*Z10 | C4*Z3+C4*Z11 | C2*Z4+C6*Z12 | C2*Z5+C6*Z13 | C2*Z6+C6*Z14 | C2*Z7+C6*Z15
-+                                       @      s0 | s4 | s8 | s12 | s2 | s6 | s10 | s14
-+
-+        vqsub.s16     q15, q11, q12    @ q15: C4*Z0-C4*Z8 | C4*Z1-C4*Z9 | C4*Z2-C4*Z10 | C4*Z3-C4*Z11 | C6*Z4-C2*Z12 | C6*Z5-C2*Z13 | C6*Z6-C2*Z14 | C6*Z7-C2*Z15
-+                                       @      s1 | s5 | s9 | s13 | s3 | s7 | s11 | s15
-+
-+        vswp          d29, d30         @ q14: s0 | s4 | s8  | s12 | s1 | s5 | s9  | s13
-+                                       @ q15: s2 | s6 | s10 | s14 | s3 | s7 | s11 | s15
-+
-+        vqadd.s16     q8, q14, q15     @ q8:  x0 | x4 | x8  | x12 | x1 | x5 | x9 | x13
-+        vqsub.s16     q9, q14, q15     @ q9:  x3 | x7 | x11 | x15 | x2 | x6 | x10 | x14
-+
-+        vmov.i16      q10, #0           @ q10 = 0 in every 16-bit lane (lower clamp)
-+        vmov.i16      q11, #255        @ q11 = 255 (0x00FF) in every 16-bit lane (upper clamp)
-+
-+        vmov.i16      q0, #128         @ q0 = 128|128|128|128|128|128|128|128
-+
-+        vqadd.s16     q8, q8, q0       @ Add the offset
-+        vqadd.s16     q9, q9, q0       @ Add the offset
-+
-+        vmax.s16      q8, q8, q10      @ clamp > 0
-+        vmin.s16      q8, q8, q11      @ clamp < 255
-+
-+        vmax.s16      q9, q9, q10      @ clamp > 0
-+        vmin.s16      q9, q9, q11      @ clamp < 255
-+
-+        vst1.16       {d16}, [r1], r2  @  d16:   x0 | x1  | x2  | x3  (LSB | MSB)
-+        vst1.16       {d17}, [r1], r2  @  d17:   x4 | x5  | x6  | x7  (LSB | MSB)
-+        vst1.16       {d19}, [r1], r2  @  d19:   x8 | x9  | x10 | x11 (LSB | MSB); high half of q9 is stored third
-+        vst1.16       {d18}, [r1], r2  @  d18:   x12| x13 | x14 | x15 (LSB | MSB); low half of q9 is stored last
-+
-+        bx         lr                  @ Return to caller
-+
-+                                       @ end of idct_4x4_venum
-+
-+@==========================================================================
-+@ FUNCTION     : idct_8x8_venum
-+@--------------------------------------------------------------------------
-+@ DESCRIPTION  : VeNum optimization of one 8x8 block iDCT
-+@--------------------------------------------------------------------------
-+@ C PROTOTYPE  : void idct_8x8_venum(int16 * input,
-+@                                    int16 * output,
-+@                                    int32 stride)
-+@--------------------------------------------------------------------------
-+@ REG INPUT    : R0 pointer to input (int16)
-+@                R1 pointer to output (int16)
-+@                R2 block stride
-+@--------------------------------------------------------------------------
-+@ STACK ARG    : None
-+@--------------------------------------------------------------------------
-+@ MEM INPUT    : None
-+@--------------------------------------------------------------------------
-+@ REG OUTPUT   : None
-+@--------------------------------------------------------------------------
-+@ MEM OUTPUT   : None
-+@--------------------------------------------------------------------------
-+@ REG AFFECTED : R0 - R9
-+@--------------------------------------------------------------------------
-+@ STACK USAGE  : 84 bytes (r5-r9 and d8-d15 are pushed on entry, popped on exit)
-+@--------------------------------------------------------------------------
-+@ CYCLES       : 177 cycles
-+@--------------------------------------------------------------------------
-+@ NOTES        :
-+@
-+@ It was tested to be IEEE 1180 compliant. Since IEEE 1180 compliance is more stringent
-+@ than MPEG-4 compliance, this version is also MPEG-4 compliant.
-+@
-+@ CODE STRUCTURE:
-+@ (i)   Macros for transposing an 8x8 matrix and for configuring the VFP unit are defined.
-+@ (ii)  Macro for IDCT in one dimension is defined as four stages
-+@ (iii) The two dimensional code begins
-+@ (iv)  constants are defined in the area DataArea
-+@
-+@ PROGRAM FLOW:
-+@
-+@ The VFP is configured
-+@ The parameters to IDCT are loaded
-+@ the coefficients are loaded
-+@ loop:
-+@    decrement loop counter
-+@    The first input Matrix is loaded and pre-scaled
-+@    The input is prescaled using the constants
-+@    IDCT is performed in one dimension on the 8 columns
-+@    The matrix is transposed
-+@    A bias is loaded and added to the matrix
-+@    IDCT is performed in one dimension on the 8 rows
-+@    The matrix is post-scaled
-+@    The matrix is saved
-+@    test loop counter and loop if greater than zero
-+@ stop
-+@
-+@
-+@ ARM REGISTER ALLOCATION
-+@ ==========================================
-+@ r0 : pointer to input data
-+@ r1 : pointer to output area
-+@ r2 : stride in the output buffer
-+@ r3 :
-+@ r4 :
-+@ r5 : pointer to constants[0] [5]
-+@ r6 : pointer to constants[1]
-+@ r7 : pointer to constants[2]
-+@ r8 : pointer to constants[3]
-+@ r9 : pointer to constants[4]
-+@ -------------------------------------------
-+@
-+@ VENUM REGISTER ALLOCATION
-+@ =================================================
-+@ q0     : constants[0]
-+@ q1     : constants[1]
-+@ q2     : constants[2], IDCT1D in-place scratch
-+@ q3     : constants[3], IDCT1D in-place scratch
-+@ q4     : constants[4], IDCT1D in-place scratch, and bias compensation
-+@ q5     :               IDCT1D in-place scratch
-+@ q6     :               IDCT1D in-place scratch
-+@ q7     :               IDCT1D in-place scratch
-+@ q8     : Matrix[0]     IDCT1D in-place scratch
-+@ q9     : Matrix[1]     IDCT1D in-place scratch
-+@ q10    : Matrix[2]     IDCT1D in-place scratch
-+@ q11    : Matrix[3]     IDCT1D in-place scratch
-+@ q12    : Matrix[4]     IDCT1D in-place scratch
-+@ q13    : Matrix[5]     IDCT1D in-place scratch
-+@ q14    : Matrix[6]     IDCT1D in-place scratch
-+@ q15    : Matrix[7]     IDCT1D in-place scratch
-+@==========================================================================
-+.type idct_8x8_venum, %function
-+idct_8x8_venum:
-+
-+        push {r5-r9}
-+        vpush {d8-d15}
-+        IDCT_ENTRY
-+        BIG_BODY_TRANSPOSE_INPUT
-+        vpop {d8-d15}
-+        pop  {r5-r9}
-+        bx   lr
-+                                       @ end of idct_8x8_venum
-+
-+@==========================================================================
-+@ Constants Definition AREA: define idct kernel, bias
-+@==========================================================================
-+    .section ro_data_area              @ AREA  RODataArea
-+    .data                              @ DATA, READONLY
-+    .align 5                           @ ALIGN=5
-+
-+constants:
-+        .hword  23170, 13573, 6518,  21895, -23170, -21895, 8223,  8224
-+        .hword  16384, 22725, 21407, 19266, 16384,  19266,  21407, 22725
-+        .hword  22725, 31521, 29692, 26722, 22725,  26722,  29692, 31521
-+        .hword  21407, 29692, 27969, 25172, 21407,  25172,  27969, 29692
-+        .hword  19266, 26722, 25172, 22654, 19266,  22654,  25172, 26722
-+        .hword      0,   255,     0,     0
-+
-+coefficient:                           @ These are the coefficients used by the 4x4 iDCT in Q15 format
-+        .hword 11585, 15137,  11585,  6270, 11585, 15137,  11585,  6270  @ C4, C2, C4, C6, C4, C2, C4, C6 /2
-+        .hword 11585,  6270,  11585, 15137, 11585,  6270,  11585, 15137  @ C4, C6, C4, C2, C4, C6, C4, C2 /2
-+        .hword 11585, 11585,  11585, 11585, 15137, 15137,  15137, 15137  @ C4, C4, C4, C4, C2, C2, C2, C2 /2
-+        .hword 11585, 11585,  11585, 11585,  6270,  6270,   6270,  6270  @ C4, C4, C4, C4, C6, C6, C6, C6 /2
-+
-+.end
diff --git a/debian/patches/series b/debian/patches/series
index d41f795..d229217 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1 +1,3 @@
-debian-changes-1.1.1-1inaro2
+FixLibraryStartup.patch
+tjunittest.patch
+add-copying-for-debian-dir.patch
diff --git a/debian/patches/tjunittest.patch b/debian/patches/tjunittest.patch
new file mode 100644
index 0000000..ff13aa4
--- /dev/null
+++ b/debian/patches/tjunittest.patch
@@ -0,0 +1,14 @@
+Index: libjpeg-turbo-1.1.90+svn733/Makefile.am
+===================================================================
+--- libjpeg-turbo-1.1.90+svn733.orig/Makefile.am	2011-12-20 19:33:20.000000000 +0000
++++ libjpeg-turbo-1.1.90+svn733/Makefile.am	2011-12-20 19:35:26.358583864 +0000
+@@ -72,8 +72,7 @@
+ 
+ endif
+ 
+-bin_PROGRAMS = cjpeg djpeg jpegtran rdjpgcom wrjpgcom tjbench
+-noinst_PROGRAMS = tjunittest jcstest
++bin_PROGRAMS = cjpeg djpeg jpegtran rdjpgcom wrjpgcom tjbench tjunittest jcstest
+ 
+ tjbench_SOURCES = tjbench.c bmp.h bmp.c tjutil.h tjutil.c rdbmp.c rdppm.c \
+ 	wrbmp.c wrppm.c
diff --git a/debian/rules b/debian/rules
index af70163..783a3d5 100755
--- a/debian/rules
+++ b/debian/rules
@@ -9,13 +9,77 @@
 # Uncomment this to turn on verbose mode.
 #export DH_VERBOSE=1
 
+# for architecture dependent variables and changelog vars
+vafilt = $(subst $(2)=,,$(filter $(2)=%,$(1)))
+
+DPKG_VARS               := $(shell dpkg-architecture)
+DEB_BUILD_GNU_TYPE	?= $(call vafilt,$(DPKG_VARS),DEB_BUILD_GNU_TYPE)
+DEB_HOST_GNU_TYPE	?= $(call vafilt,$(DPKG_VARS),DEB_HOST_GNU_TYPE)
+DEB_HOST_MULTIARCH	?= $(call vafilt,$(DPKG_VARS),DEB_HOST_MULTIARCH)
+
+CFLAGS ?= $(shell dpkg-buildflags --get CFLAGS)
+ifneq (,$(filter noopt,$(DEB_BUILD_OPTIONS)))
+  CFLAGS += -Wall
+endif # noopt
+
+ifeq ($(DEB_HOST_GNU_TYPE),$(DEB_BUILD_GNU_TYPE))
+CC = gcc
+else
+CC = $(DEB_HOST_GNU_TYPE)-gcc
+endif
+
 %:
 	dh $@  --with autoreconf
 
+override_dh_auto_configure:
+	dh_auto_configure -v -- --with-jpeg8 CFLAGS="$(CFLAGS)"
+
+override_dh_auto_build:
+	dh_auto_build -v
+	$(MAKE) -C debian/extra CC=$(CC) CFLAGS="$(CFLAGS) -Wall"
+
+override_dh_install:
+	$(MAKE) -C debian/extra install prefix=/usr DESTDIR=$(CURDIR)/debian/tmp
+	dh_install
+	: # swap debian/tmp jconfig.h for a multiarch copy without the HAVE_{LOCALE,STDDEF,STDLIB}_H defines
+	rm -f debian/tmp/usr/include/jconfig.h
+	mkdir -p debian/tmp/usr/include/$(DEB_HOST_MULTIARCH)
+	sed -e "s/#\(undef\|define\) HAVE_\(LOCALE\|\(STD\(DEF\|LIB\)\)\)_H 1//g" \
+		jconfig.h > debian/tmp/usr/include/$(DEB_HOST_MULTIARCH)/jconfig.h
+
 override_dh_installchangelogs:
-	dh_installchangelogs change.log
+	dh_installchangelogs -plibjpeg-turbo8-dev change.log
+	dh_installchangelogs -Nlibjpeg-turbo8-dev
+
+override_dh_strip:
+	dh_strip -plibjpeg-turbo8 --dbg-package=libjpeg-turbo8-dbg
+	dh_strip -Nlibjpeg-turbo8 -Nlibjpeg-turbo8-dbg
+
+override_dh_makeshlibs:
+	: # keep the original libjpeg symbol version
+	dh_makeshlibs -- -v8c
+	: # ... and mark libjpeg-turbo only symbols with a different version
+	sed -i "$$(for i in $$(cat debian/libjpeg-turbo-only.symbols); do echo "/^ $$i/s/8c$$/8c-2ubuntu5~/;"; done)" debian/libjpeg-turbo8/DEBIAN/symbols
 
-override_dh_test:
 override_dh_auto_test:
 
+override_dh_auto_clean:
+	dh_auto_clean
+	$(MAKE) -C debian/extra clean
+	rm -f simd/jsimdcfg.inc
+
+deb_source := $(shell dpkg-parsechangelog | sed -n 's/^Source: //p')
+svn_rev := $(shell dpkg-parsechangelog | sed -rne 's,^Version: .*[+~]svn([0-9]+).*,\1,p')
+upstream_version := $(shell dpkg-parsechangelog | sed -rne 's,^Version: ([^-]+).*,\1,p')
+
+# Rebuild the .orig tarball from the upstream svn revision parsed out of the changelog version.
+.PHONY: get-orig-source
+get-orig-source:
+	rm -rf $(deb_source)-$(upstream_version).orig $(deb_source)_$(upstream_version).orig.tar.gz
+	svn -q export -r $(svn_rev) https://libjpeg-turbo.svn.sourceforge.net/svnroot/libjpeg-turbo/trunk \
+		$(deb_source)-$(upstream_version).orig
+	GZIP=--best tar -cz --owner root --group root --mode a+rX \
+	     -f $(deb_source)_$(upstream_version).orig.tar.gz \
+	     $(deb_source)-$(upstream_version).orig
+	rm -r $(deb_source)-$(upstream_version).orig
 
diff --git a/debian/source.lintian-overrides b/debian/source.lintian-overrides
new file mode 100644
index 0000000..4c9ca41
--- /dev/null
+++ b/debian/source.lintian-overrides
@@ -0,0 +1 @@
+package-needs-versioned-debhelper-build-depends 9
diff --git a/debian/watch b/debian/watch
new file mode 100644
index 0000000..20f80e6
--- /dev/null
+++ b/debian/watch
@@ -0,0 +1,2 @@
+version=3
+http://sf.net/libjpeg-turbo/libjpeg-turbo-(.+)\.tar\.gz
author	Tom Gall <tom.gall@linaro.org>	2011-12-23 10:42:23 -0600
committer	Tom Gall <tom.gall@linaro.org>	2011-12-23 10:42:23 -0600
commit	d519a0ef385e8d74fce083497630ffb4e9ba0adc (patch)
tree	1b145eb57dd8a40f710ff23ec591703b24e3aa0c
parent	f1a90e77f305fc679c9fcbd4c5a11deb48f92c29 (diff)