| Index: media-libs/libpng/files/libpng-1.2.44-neon-optimized.patch
|
| diff --git a/media-libs/libpng/files/libpng-1.2.44-neon-optimized.patch b/media-libs/libpng/files/libpng-1.2.44-neon-optimized.patch
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..7ab0d5b8ff1d3ac515085fcb02a1c12f270deaff
|
| --- /dev/null
|
| +++ b/media-libs/libpng/files/libpng-1.2.44-neon-optimized.patch
|
| @@ -0,0 +1,1334 @@
|
| +diff --git a/Makefile.in b/Makefile.in
|
| +index b9c41f0..8472c8a 100644
|
| +--- a/Makefile.in
|
| ++++ b/Makefile.in
|
| +@@ -99,7 +99,8 @@ am__objects_1 = libpng_la-png.lo libpng_la-pngset.lo \
|
| + libpng_la-pngread.lo libpng_la-pngrio.lo libpng_la-pngwio.lo \
|
| + libpng_la-pngwrite.lo libpng_la-pngrtran.lo \
|
| + libpng_la-pngwtran.lo libpng_la-pngmem.lo \
|
| +- libpng_la-pngerror.lo libpng_la-pngpread.lo
|
| ++ libpng_la-pngerror.lo libpng_la-pngpread.lo \
|
| ++ libpng_la-png_read_filter_row_neon.lo
|
| + am_libpng_la_OBJECTS = $(am__objects_1)
|
| + libpng_la_OBJECTS = $(am_libpng_la_OBJECTS)
|
| + libpng_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
|
| +@@ -113,7 +114,7 @@ am_libpng12_la_OBJECTS = libpng12_la-png.lo libpng12_la-pngset.lo \
|
| + libpng12_la-pngwio.lo libpng12_la-pngwrite.lo \
|
| + libpng12_la-pngrtran.lo libpng12_la-pngwtran.lo \
|
| + libpng12_la-pngmem.lo libpng12_la-pngerror.lo \
|
| +- libpng12_la-pngpread.lo
|
| ++ libpng12_la-pngpread.lo libpng12_la-png_read_filter_row_neon.lo
|
| + libpng12_la_OBJECTS = $(am_libpng12_la_OBJECTS)
|
| + libpng12_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
|
| + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
|
| +@@ -136,9 +137,9 @@ LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
|
| + --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
|
| + $(LDFLAGS) -o $@
|
| + SOURCES = $(libpng_la_SOURCES) $(libpng12_la_SOURCES) \
|
| +- $(pngtest_SOURCES)
|
| ++ $(pngtest_SOURCES) $(pngasm_SOURCES)
|
| + DIST_SOURCES = $(libpng_la_SOURCES) $(libpng12_la_SOURCES) \
|
| +- $(pngtest_SOURCES)
|
| ++ $(pngtest_SOURCES) $(pngasm_SOURCES)
|
| + man3dir = $(mandir)/man3
|
| + man5dir = $(mandir)/man5
|
| + NROFF = nroff
|
| +@@ -307,6 +308,8 @@ dist_man_MANS = libpng.3 libpngpf.3 png.5
|
| + EXTRA_SCRIPTS = libpng-config libpng12-config
|
| + bin_SCRIPTS = @binconfigs@
|
| +
|
| ++pngasm_SOURCES = png_read_filter_row_neon.S
|
| ++
|
| + # rules to build libpng, only build the old library on request
|
| + lib_LTLIBRARIES = libpng12.la @compatlib@
|
| + EXTRA_LTLIBRARIES = libpng.la
|
| +@@ -363,7 +366,7 @@ all: config.h
|
| + $(MAKE) $(AM_MAKEFLAGS) all-am
|
| +
|
| + .SUFFIXES:
|
| +-.SUFFIXES: .c .lo .o .obj
|
| ++.SUFFIXES: .c .S .lo .o .obj
|
| + am--refresh:
|
| + @:
|
| + $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
|
| +@@ -537,6 +540,7 @@ distclean-compile:
|
| + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngwrite.Plo@am__quote@
|
| + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngwtran.Plo@am__quote@
|
| + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngwutil.Plo@am__quote@
|
| ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-png_read_filter_row_neon.Plo@am__quote@
|
| + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pngtest.Po@am__quote@
|
| +
|
| + .c.o:
|
| +@@ -553,6 +557,13 @@ distclean-compile:
|
| + @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
| + @am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'`
|
| +
|
| ++.S.o:
|
| ++@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
| ++@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
| ++@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
| ++@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
| ++@am__fastdepCC_FALSE@ $(COMPILE) -c $<
|
| ++
|
| + .c.lo:
|
| + @am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
| + @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
|
| +@@ -560,6 +571,14 @@ distclean-compile:
|
| + @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
| + @am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $<
|
| +
|
| ++.S.lo:
|
| ++@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
| ++@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
|
| ++@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
|
| ++@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
| ++@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $<
|
| ++
|
| ++
|
| + libpng_la-png.lo: png.c
|
| + @am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng_la-png.lo -MD -MP -MF $(DEPDIR)/libpng_la-png.Tpo -c -o libpng_la-png.lo `test -f 'png.c' || echo '$(srcdir)/'`png.c
|
| + @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libpng_la-png.Tpo $(DEPDIR)/libpng_la-png.Plo
|
| +@@ -665,6 +684,16 @@ libpng_la-pngpread.lo: pngpread.c
|
| + @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
| + @am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng_la-pngpread.lo `test -f 'pngpread.c' || echo '$(srcdir)/'`pngpread.c
|
| +
|
| ++
|
| ++
|
| ++libpng_la-png_read_filter_row_neon.lo: png_read_filter_row_neon.S
|
| ++@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng_la-png_read_filter_row_neon.lo -MD -MP -MF $(DEPDIR)/libpng_la-png_read_filter_row_neon.Tpo -c -o libpng_la-png_read_filter_row_neon.lo `test -f 'png_read_filter_row_neon.S' || echo '$(srcdir)/'`png_read_filter_row_neon.S
|
| ++@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libpng_la-png_read_filter_row_neon.Tpo $(DEPDIR)/libpng_la-png_read_filter_row_neon.Plo
|
| ++@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='png_read_filter_row_neon.S' object='libpng_la-png_read_filter_row_neon.lo' libtool=yes @AMDEPBACKSLASH@
|
| ++@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
| ++@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng_la-png_read_filter_row_neon.lo `test -f 'png_read_filter_row_neon.S' || echo '$(srcdir)/'`png_read_filter_row_neon.S
|
| ++
|
| ++
|
| + libpng12_la-png.lo: png.c
|
| + @am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng12_la-png.lo -MD -MP -MF $(DEPDIR)/libpng12_la-png.Tpo -c -o libpng12_la-png.lo `test -f 'png.c' || echo '$(srcdir)/'`png.c
|
| + @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libpng12_la-png.Tpo $(DEPDIR)/libpng12_la-png.Plo
|
| +@@ -770,6 +799,15 @@ libpng12_la-pngpread.lo: pngpread.c
|
| + @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
| + @am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng12_la-pngpread.lo `test -f 'pngpread.c' || echo '$(srcdir)/'`pngpread.c
|
| +
|
| ++
|
| ++libpng12_la-png_read_filter_row_neon.lo: png_read_filter_row_neon.S
|
| ++@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng12_la-png_read_filter_row_neon.lo -MD -MP -MF $(DEPDIR)/libpng12_la-png_read_filter_row_neon.Tpo -c -o libpng12_la-png_read_filter_row_neon.lo `test -f 'png_read_filter_row_neon.S' || echo '$(srcdir)/'`png_read_filter_row_neon.S
|
| ++@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libpng12_la-png_read_filter_row_neon.Tpo $(DEPDIR)/libpng12_la-png_read_filter_row_neon.Plo
|
| ++@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='png_read_filter_row_neon' object='libpng12_la-png_read_filter_row_neon.lo' libtool=yes @AMDEPBACKSLASH@
|
| ++@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
| ++@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng12_la-png_read_filter_row_neon.lo `test -f 'png_read_filter_row_neon.S' || echo '$(srcdir)/'`png_read_filter_row_neon.S
|
| ++
|
| ++
|
| + mostlyclean-libtool:
|
| + -rm -f *.lo
|
| +
|
| +diff --git a/png_read_filter_row_neon.S b/png_read_filter_row_neon.S
|
| +new file mode 100755
|
| +index 0000000..77ec7bd
|
| +--- /dev/null
|
| ++++ b/png_read_filter_row_neon.S
|
| +@@ -0,0 +1,1172 @@
|
| ++#if defined(__ARM_NEON__)
|
| ++#; Copyright (c) 2010, Code Aurora Forum. All rights reserved.
|
| ++#;
|
| ++#; Redistribution and use in source and binary forms, with or without
|
| ++#; modification, are permitted provided that the following conditions are
|
| ++#; met:
|
| ++#; * Redistributions of source code must retain the above copyright
|
| ++#; notice, this list of conditions and the following disclaimer.
|
| ++#; * Redistributions in binary form must reproduce the above
|
| ++#; copyright notice, this list of conditions and the following
|
| ++#; disclaimer in the documentation and/or other materials provided
|
| ++#; with the distribution.
|
| ++#; * Neither the name of Code Aurora Forum, Inc. nor the names of its
|
| ++#; contributors may be used to endorse or promote products derived
|
| ++#; from this software without specific prior written permission.
|
| ++#;
|
| ++#; THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
|
| ++#; WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
| ++#; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
|
| ++#; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
|
| ++#; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
| ++#; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
| ++#; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
| ++#; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
| ++#; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
| ++#; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
| ++#; IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| ++
|
| ++#;==============================================================================
|
| ++
|
| ++ .code 32 @; Code is ARM ISA
|
| ++#;==============================================================================
|
| ++
|
| ++ .global png_read_filter_row_neon
|
| ++
|
| ++#;==============================================================================
|
| ++#; INPUTS: r0 rowbytes: number of bytes in current row
|
| ++#; r1 pixel_depth: number of bits per pixel
|
| ++#; r2 row: pointer to start of current row
|
| ++#; r3 prev_row: pointer to start of previous row
|
| ++#; [sp,#0] filter: filter type
|
| ++#;
|
| ++#; NOTE: Don't touch r5-r11
|
| ++#;==============================================================================
|
| ++.balign 32
|
| ++.type png_read_filter_row_neon, %function
|
| ++png_read_filter_row_neon:
|
| ++
|
| ++ ldr r12,[sp,#0]
|
| ++
|
| ++ cmp r12,#0
|
| ++ beq DONE
|
| ++
|
| ++ cmp r12,#1
|
| ++ beq sub_filter
|
| ++
|
| ++ cmp r12,#2
|
| ++ beq up_filter
|
| ++
|
| ++ cmp r12,#3
|
| ++ beq avg_filter
|
| ++
|
| ++ cmp r12,#4
|
| ++ beq paeth_filter
|
| ++
|
| ++ b DONE
|
| ++
|
| ++ #;; ---------------
|
| ++ #;; SUB filter type
|
| ++ #;; ---------------
|
| ++
|
| ++
|
| ++sub_filter:
|
| ++
|
| ++ stmdb sp!, {r4}
|
| ++
|
| ++ add r1,r1,#7 @; bpp = bytes per pixel
|
| ++ lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3
|
| ++ mov r12,r1
|
| ++
|
| ++ #;; r0 = rowbytes
|
| ++ #;; r1 = loop counter = bpp (initially)
|
| ++ #;; r2 = row pointer
|
| ++ #;; r12 = bpp = loop/pointer increment value
|
| ++
|
| ++ cmp r1,r0
|
| ++ beq sub_filter_exit @; exit if bpp == rowbytes
|
| ++
|
| ++ cmp r12,#1
|
| ++ beq sub_filter_1bpp
|
| ++
|
| ++ cmp r12,#2
|
| ++ beq sub_filter_2bpp
|
| ++
|
| ++ cmp r12,#3
|
| ++ beq sub_filter_3bpp
|
| ++
|
| ++ cmp r12,#4
|
| ++ beq sub_filter_4bpp
|
| ++
|
| ++ cmp r12,#6
|
| ++ beq sub_filter_6bpp
|
| ++
|
| ++ cmp r12,#8
|
| ++ beq sub_filter_8bpp
|
| ++
|
| ++sub_filter_exit:
|
| ++ b sub_filter_DONE @; return
|
| ++
|
| ++
|
| ++sub_filter_1bpp:
|
| ++
|
| ++ #;; ----------------------------
|
| ++ #;; SUB filter, 1 byte per pixel
|
| ++ #;; ----------------------------
|
| ++
|
| ++ lsrs r4,r0,#4 @; r1 = floor(rowbytes/4)
|
| ++ @; = iteration count for loop16
|
| ++ beq sub_filter_1bpp_16bytes_done
|
| ++
|
| ++ vmov.i8 d21, #0
|
| ++ vld1.8 {d16,d17}, [r2] @; load 16 pixels
|
| ++ @; d16 = a b c d e f g h
|
| ++ @; d17 = i j k l m n o p
|
| ++
|
| ++ mov r1, #0
|
| ++sub_filter_1bpp_16bytes:
|
| ++
|
| ++
|
| ++
|
| ++
|
| ++ vshl.i64 d18, d16, #8 @; d18 = 0 a b c d e f g
|
| ++ vadd.i8 d18, d16, d18 @; d18 = a a+b b+c c+d d+e e+f f+g g+h
|
| ++
|
| ++ vshl.i64 d18, d18, #8 @; d18 = 0 a a+b b+c c+d d+e e+f f+g
|
| ++ vadd.i8 d18, d16, d18 @; d18 = a a+b a+b+c b+c+d c+d+e d+e+f e+f+g f+g+h
|
| ++
|
| ++ vshl.i64 d18, d18, #8 @; shift add continuously to propage the sum of previous
|
| ++ vadd.i8 d18, d16, d18 @; and current pixels
|
| ++
|
| ++ vshl.i64 d18, d18, #8
|
| ++ vadd.i8 d18, d16, d18
|
| ++
|
| ++ vshl.i64 d18, d18, #8
|
| ++ vadd.i8 d18, d16, d18
|
| ++
|
| ++ vshl.i64 d18, d18, #8
|
| ++ vadd.i8 d18, d16, d18
|
| ++
|
| ++ vshl.i64 d18, d18, #8
|
| ++ vadd.i8 d18, d16, d18 @; maximum data size for shift is 64 bits i.e. doubleword.
|
| ++ @; after computing thh value of all the pixels in the double word
|
| ++ @; extract the last computed value which will be used by
|
| ++ @; the next set of pixels (i.e next doubleword)
|
| ++ vext.8 d22, d18, d21, #7 @; extract the updated value of d18[7] i.e a+b+c+d+e+f+h
|
| ++ vadd.i8 d17, d17, d22 @; d17 = a+b+c+d+e+f+g+h+i j k l m n o p
|
| ++
|
| ++ vshl.i64 d19, d17, #8 @; continue shift-add as the first half
|
| ++ vadd.i8 d19, d17, d19
|
| ++
|
| ++ vshl.i64 d19, d19, #8
|
| ++ vadd.i8 d19, d17, d19
|
| ++
|
| ++ vshl.i64 d19, d19, #8
|
| ++ vadd.i8 d19, d17, d19
|
| ++
|
| ++ vshl.i64 d19, d19, #8
|
| ++ vadd.i8 d19, d17, d19
|
| ++
|
| ++ vshl.i64 d19, d19, #8
|
| ++ vadd.i8 d19, d17, d19
|
| ++
|
| ++ vshl.i64 d19, d19, #8
|
| ++ vadd.i8 d19, d17, d19
|
| ++
|
| ++ vshl.i64 d19, d19, #8
|
| ++ vadd.i8 d19, d17, d19
|
| ++
|
| ++ vst1.8 {d18,d19},[r2]! @; store the result back
|
| ++
|
| ++ add r1, r1, #16 @; add 16 to the loop counter(no of bytes completed)
|
| ++ subs r4,r4,#1 @; decrement iteration count
|
| ++ beq sub_filter_1bpp_16bytes_adjust
|
| ++
|
| ++
|
| ++ vext.8 d22, d19, d21, #7 @; more iterations to go
|
| ++ @; extract the last computed value
|
| ++ vld1.8 {d16,d17}, [r2] @; load the next 16 bytes
|
| ++ vadd.i8 d16, d16, d22 @; set up the input by adding the previous pixel
|
| ++ @; value to the input
|
| ++ b sub_filter_1bpp_16bytes
|
| ++
|
| ++sub_filter_1bpp_16bytes_adjust:
|
| ++
|
| ++ cmp r1, r0 @; no more pixels left .. exit
|
| ++ sub r2, r2, #1 @; more pixels remaining
|
| ++ @; r2 points to the current pixel adjust it
|
| ++ @; so that it points to the prev pixel for the below loop
|
| ++ beq sub_filter_DONE
|
| ++
|
| ++sub_filter_1bpp_16bytes_done:
|
| ++
|
| ++
|
| ++ vld1.8 {d0[0]},[r2]! @; load 1 byte (1 pixel) into D0[0]
|
| ++ @; increment row pointer
|
| ++sub_filter_1bpp_loop:
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0 @;
|
| ++
|
| ++ vld1.8 {d2[0]},[r2] @; load 1 byte (current pixel) into D2[0]
|
| ++
|
| ++ vadd.i8 d0,d0,d2 @; vector add 1 byte of previous pixel with
|
| ++ @; 1 byte of current pixel
|
| ++ vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel) back
|
| ++ @; into row pointer location and increment
|
| ++ @; row pointer
|
| ++
|
| ++ bne sub_filter_1bpp_loop @; loop back until loop counter == rowbytes
|
| ++
|
| ++ b sub_filter_DONE @; return
|
| ++
|
| ++ #;; -----------------------------
|
| ++ #;; SUB filter, 2 bytes per pixel
|
| ++ #;; -----------------------------
|
| ++sub_filter_2bpp:
|
| ++
|
| ++ lsrs r4,r0,#4 @; r1 = floor(rowbytes/4)
|
| ++ @; = iteration count for loop16
|
| ++ beq sub_filter_2bpp_16bytes_done
|
| ++
|
| ++ vmov.i8 d21, #0
|
| ++ vld1.8 {d16,d17}, [r2] @; load 16 bytes to q8
|
| ++ @; d16 = a b c d e f g h
|
| ++ @; d17 = i j k l m n o p
|
| ++ mov r1, #0
|
| ++sub_filter_2bpp_16bytes:
|
| ++
|
| ++ vshl.i64 d18, d16, #16 @; each pixel is 2bytes .. shift by 16 bits to get previous pixel
|
| ++ vadd.i8 d18, d16, d18 @; add to the current pixel
|
| ++
|
| ++ vshl.i64 d18, d18, #16 @; shift-add to propagate the computed sum as the case for 1bpp
|
| ++ vadd.i8 d18, d16, d18
|
| ++
|
| ++ vshl.i64 d18, d18, #16
|
| ++ vadd.i8 d18, d16, d18
|
| ++
|
| ++
|
| ++ vext.8 d22, d18, d21, #6 @; extract the last computed value (i.e. last 2 bytes)
|
| ++ vadd.i8 d17, d17, d22 @; add the last computed pixel to the input
|
| ++
|
| ++ vshl.i64 d19, d17, #16
|
| ++ vadd.i8 d19, d17, d19
|
| ++
|
| ++ vshl.i64 d19, d19, #16
|
| ++ vadd.i8 d19, d17, d19
|
| ++
|
| ++ vshl.i64 d19, d19, #16
|
| ++ vadd.i8 d19, d17, d19
|
| ++
|
| ++
|
| ++ vst1.8 {d18,d19},[r2]! @; store the result back
|
| ++
|
| ++
|
| ++ add r1, r1, #16 @; add 16 to the loop counter(no of bytes completed)
|
| ++ subs r4,r4,#1 @; decrement iteration count
|
| ++ beq sub_filter_2bpp_16bytes_adjust
|
| ++
|
| ++
|
| ++ vext.8 d22, d19, d21, #6 @; extract the last computed value
|
| ++ @; add the last computed pixel to the input
|
| ++ vld1.8 {d16,d17}, [r2]
|
| ++ vadd.i8 d16, d16, d22
|
| ++
|
| ++ b sub_filter_2bpp_16bytes
|
| ++
|
| ++
|
| ++sub_filter_2bpp_16bytes_adjust:
|
| ++
|
| ++ cmp r1, r0 @; no more pixels left .. exit
|
| ++ sub r2, r2, #2 @; more pixels remaining
|
| ++ @; r2 points to the current pixel adjust it
|
| ++ @; so that it points to the prev pixel for the below loop
|
| ++ beq sub_filter_DONE
|
| ++
|
| ++sub_filter_2bpp_16bytes_done:
|
| ++
|
| ++ vld1.16 {d0[0]},[r2]! @; load 2 bytes (1 pixel) into D0[0]
|
| ++ @; increment row pointer
|
| ++sub_filter_2bpp_loop:
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0 @;
|
| ++
|
| ++ vld1.16 {d2[0]},[r2] @; load 2 bytes (current pixel) into D2[0]
|
| ++ vadd.i8 d0,d0,d2 @; vector add 2 bytes of previous pixel with
|
| ++ @; 2 bytes of current pixel
|
| ++ vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel) back
|
| ++ @; into row pointer location and increment
|
| ++ @; row pointer
|
| ++
|
| ++ bne sub_filter_2bpp_loop @; loop back until loop counter == rowbytes
|
| ++ @
|
| ++ b sub_filter_DONE @ ; return
|
| ++
|
| ++ #;; -----------------------------
|
| ++ #;; SUB filter, 3 bytes per pixel
|
| ++ #;; -----------------------------
|
| ++sub_filter_3bpp:
|
| ++ vld1.32 {d0[0]},[r2], r12 @; load 4 bytes (1 pixel + 1 extra byte) into D0[0]
|
| ++ @; increment row pointer by bpp
|
| ++sub_filter_3bpp_loop:
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0 @;
|
| ++
|
| ++ vld1.32 {d2[0]},[r2] @; load 4 bytes (current pixel + 1 extra byte) into D2[0]
|
| ++ vadd.i8 d0,d0,d2 @; vector add 3 bytes of previous pixel with
|
| ++ @; 3 bytes of current pixel
|
| ++ vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel) back
|
| ++ @; into row pointer location and increment
|
| ++ @; row pointer
|
| ++ vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel) back
|
| ++ @; into row pointer location and increment
|
| ++ @; row pointer
|
| ++
|
| ++ bne sub_filter_3bpp_loop @; loop back until loop counter == rowbytes
|
| ++
|
| ++ b sub_filter_DONE @; return
|
| ++
|
| ++ #;; -----------------------------
|
| ++ #;; SUB filter, 4 bytes per pixel
|
| ++ #;; -----------------------------
|
| ++sub_filter_4bpp:
|
| ++ vld1.32 {d0[0]},[r2]! @; load 4 bytes (1 pixel) into D0[0]
|
| ++ @; increment row pointer
|
| ++sub_filter_4bpp_loop: @
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0 @;
|
| ++
|
| ++
|
| ++ vld1.32 {d2[0]},[r2] @; load 4 bytes (current pixel) into D2[0]
|
| ++ vadd.i8 d0,d0,d2 @; vector add 4 bytes of previous pixel with
|
| ++ @; 4 bytes of current pixel
|
| ++ vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel) back
|
| ++ @; into row pointer location and increment
|
| ++ @; row pointer
|
| ++
|
| ++ bne sub_filter_4bpp_loop @; loop back until loop counter == rowbytes
|
| ++
|
| ++ b sub_filter_DONE @; return
|
| ++
|
| ++ #;; -----------------------------
|
| ++ #;; SUB filter, 6 bytes per pixel
|
| ++ #;; -----------------------------
|
| ++sub_filter_6bpp:
|
| ++ vld1.8 {d0},[r2],r12 @; load 8 bytes (1 pixel + 2 extra bytes) into D0
|
| ++ @; increment row pointer by bpp
|
| ++sub_filter_6bpp_loop: @
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0 @;
|
| ++
|
| ++ vld1.8 {d2},[r2] @; load 8 bytes (1 pixel + 2 extra bytes) into D2
|
| ++ vadd.i8 d0,d0,d2 @; vector add 6 bytes of previous pixel with
|
| ++ @; 6 bytes of current pixel
|
| ++ vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel) back
|
| ++ @; into row pointer location and increment
|
| ++ @; row pointer
|
| ++ vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel) back
|
| ++ @; into row pointer location and increment
|
| ++ @; row pointer
|
| ++
|
| ++ bne sub_filter_6bpp_loop @; loop back until loop counter == rowbytes
|
| ++
|
| ++ b sub_filter_DONE @; return
|
| ++
|
| ++ #;; -----------------------------
|
| ++ #;; SUB filter, 8 bytes per pixel
|
| ++ #;; -----------------------------
|
| ++sub_filter_8bpp:
|
| ++ vld1.8 {d0},[r2]! @; load 8 bytes (1 pixel) into D0
|
| ++ @; increment row pointer
|
| ++sub_filter_8bpp_loop: @
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0 @;
|
| ++ vld1.8 {d2},[r2] @; load 8 bytes (current pixel) into D2
|
| ++ vadd.i8 d0,d0,d2 @; vector add 8 bytes of previous pixel with
|
| ++ @; 8 bytes of current pixel
|
| ++ vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel) back
|
| ++ @; into row pointer location and increment
|
| ++ @; row pointer
|
| ++
|
| ++
|
| ++ bne sub_filter_8bpp_loop @; loop back until loop counter == rowbytes
|
| ++ @
|
| ++ b sub_filter_DONE @ ; return
|
| ++
|
| ++sub_filter_DONE:
|
| ++
|
| ++ ldmia sp!, {r4}
|
| ++ bx r14
|
| ++
|
| ++ #;; --------------
|
| ++ #;; UP filter type
|
| ++ #;; --------------
|
| ++up_filter:
|
| ++
|
| ++ #;; r0 = rowbytes
|
| ++ #;; r1 = pixel_depth (not required for UP filter type)
|
| ++ #;; r2 = row pointer
|
| ++ #;; r3 = previous row pointer
|
| ++
|
| ++
|
| ++ lsrs r1,r0,#5 @; r1 = floor(rowbytes/32)
|
| ++ @; = iteration count for loop32
|
| ++ beq up_filter_32bytes_proc_done
|
| ++
|
| ++
|
| ++up_filter_32bytes_proc:
|
| ++
|
| ++
|
| ++ mov r12, r2
|
| ++
|
| ++ vld1.8 {q0},[r3]! @; load 32 bytes from previous
|
| ++ vld1.8 {q2},[r3]! @; row and increment pointer
|
| ++ @
|
| ++ @
|
| ++ vld1.8 {q1},[r12]! @; load 32 bytes from current row
|
| ++ vld1.8 {q3},[r12]! @
|
| ++ @
|
| ++ @
|
| ++ @
|
| ++ vadd.i8 q0,q0,q1 @; vector add of 16 bytes
|
| ++ vadd.i8 q2,q2,q3 @
|
| ++ @
|
| ++ @
|
| ++ @
|
| ++ vst1.8 {q0},[r2]! @; store 32 bytes to current row
|
| ++ vst1.8 {q2},[r2]! @
|
| ++ @; and increment pointer
|
| ++ sub r0,r0,#32 @; subtract 32 from rowbytes
|
| ++ subs r1,r1,#1 @; decrement iteration count
|
| ++ bne up_filter_32bytes_proc
|
| ++
|
| ++
|
| ++
|
| ++up_filter_32bytes_proc_done:
|
| ++
|
| ++ lsrs r1,r0,#4 @; r1 = floor(rowbytes/16)
|
| ++ @; = iteration count for loop16
|
| ++ beq up_filter_16bytes_proc_done
|
| ++
|
| ++up_filter_16bytes_proc:
|
| ++
|
| ++ vld1.8 {q0},[r3]! @; load 16 bytes from previous
|
| ++ @; row and increment pointer
|
| ++ vld1.8 {q1},[r2] @; load 16 bytes from current row
|
| ++ vadd.i8 q0,q0,q1 @; vector add of 16 bytes
|
| ++ vst1.8 {q0},[r2]! @; store 16 bytes to current row
|
| ++ @; and increment pointer
|
| ++ sub r0,r0,#16 @; subtract 16 from rowbytes
|
| ++ subs r1,r1,#1 @; decrement iteration count
|
| ++ bne up_filter_16bytes_proc
|
| ++
|
| ++up_filter_16bytes_proc_done:
|
| ++
|
| ++ lsrs r1,r0,#3 @; r1 = floor(rowbytes/8)
|
| ++ beq up_filter_8bytes_proc_done
|
| ++
|
| ++up_filter_8bytes_proc:
|
| ++
|
| ++ vld1.8 {d0},[r3]! @; load 8 bytes from previous
|
| ++ @; row and increment pointer
|
| ++ vld1.8 {d2},[r2] @; load 8 bytes from current row
|
| ++ vadd.i8 d0,d0,d2 @; vector add 8 bytes
|
| ++ vst1.8 {d0},[r2]! @; store 8 bytes to current row
|
| ++ @; and increment pointer
|
| ++ sub r0,r0,#8 @; subtract 8 from rowbytes
|
| ++
|
| ++up_filter_8bytes_proc_done:
|
| ++
|
| ++ lsrs r1,r0,#2 @; r1 = floor(rowbytes/4)
|
| ++ beq up_filter_4bytes_proc_done
|
| ++
|
| ++up_filter_4bytes_proc:
|
| ++
|
| ++ vld1.32 {d0[0]},[r3]! @; load 4 bytes from previous row
|
| ++ @; and increment pointer
|
| ++ vld1.32 {d2[0]},[r2] @; load 4 bytes from current row
|
| ++ vadd.i8 d0,d0,d2 @; vector add 4 bytes
|
| ++ vst1.32 {d0[0]},[r2]! @; store 4 bytes to current row
|
| ++ @; and increment pointer
|
| ++ sub r0,r0,#4 @; subtract 4 from rowbytes
|
| ++
|
| ++up_filter_4bytes_proc_done:
|
| ++
|
| ++ lsrs r1,r0,#1 @; r1 = floor(rowbytes/2)
|
| ++ beq up_filter_2bytes_proc_done
|
| ++
|
| ++up_filter_2bytes_proc:
|
| ++
|
| ++ vld1.16 {d0[0]},[r3]! @; load 2 bytes from previous row
|
| ++ @; and increment pointer
|
| ++ vld1.16 {d2[0]},[r2] @; load 2 bytes from current row
|
| ++ vadd.i8 d0,d0,d2 @; vector add 2 bytes
|
| ++ vst1.16 {d0[0]},[r2]! @; store 2 bytes to current row
|
| ++ @; and increment pointer
|
| ++ sub r0,r0,#2 @; subtract 2 from rowbytes
|
| ++
|
| ++up_filter_2bytes_proc_done:
|
| ++
|
| ++ cmp r0,#0
|
| ++ beq up_filter_1byte_proc_done
|
| ++
|
| ++up_filter_1byte_proc:
|
| ++
|
| ++ vld1.8 {d0[0]},[r3]! @; load 1 byte from previous row
|
| ++ @; and increment pointer
|
| ++ vld1.8 {d2[0]},[r2] @; load 1 byte from current row
|
| ++ vadd.i8 d0,d0,d2 @; vector add 1 byte
|
| ++ vst1.8 {d0[0]},[r2]! @; store 1 byte to current row
|
| ++ @; and increment pointer
|
| ++up_filter_1byte_proc_done:
|
| ++
|
| ++ b DONE
|
| ++
|
| ++ #;; ---------------
|
| ++ #;; AVG filter type
|
| ++ #;; ---------------
|
| ++avg_filter:
|
| ++
|
| ++ add r1,r1,#7 @; bpp = byptes per pixel
|
| ++ lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3
|
| ++ mov r12,r1
|
| ++
|
| ++ #;; r0 = rowbytes
|
| ++ #;; r1 = loop counter = bpp (initially)
|
| ++ #;; r2 = row pointer
|
| ++ #;; r3 = previous row pointer
|
| ++ #;; r12 = bpp = loop/pointer increment value
|
| ++
|
| ++ cmp r12,#1
|
| ++ beq avg_filter_1bpp
|
| ++
|
| ++ cmp r12,#2
|
| ++ beq avg_filter_2bpp
|
| ++
|
| ++ cmp r12,#3
|
| ++ beq avg_filter_3bpp
|
| ++
|
| ++ cmp r12,#4
|
| ++ beq avg_filter_4bpp
|
| ++
|
| ++ cmp r12,#6
|
| ++ beq avg_filter_6bpp
|
| ++
|
| ++ cmp r12,#8
|
| ++ beq avg_filter_8bpp
|
| ++
|
| ++avg_filter_exit:
|
| ++ b DONE @; return
|
| ++
|
| ++ #;; ----------------------------
|
| ++ #;; AVG filter, 1 byte per pixel
|
| ++ #;; ----------------------------
|
| ++avg_filter_1bpp:
|
| ++
|
| ++ cmp r1,r0
|
| ++
|
| ++ vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr
|
| ++ @; row into d0[0]
|
| ++ vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from prev
|
| ++ @; row into d1[0]
|
| ++ @; increment prev row pointer
|
| ++ vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
|
| ++ @; to pixel x
|
| ++ vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ @; updated pixel x is now pixel a
|
| ++ beq DONE
|
| ++
|
| ++avg_filter_1bpp_loop:
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0
|
| ++
|
| ++
|
| ++ vld1.8 {d2[0]},[r2] @; load 1 byte (pixel x) from curr
|
| ++ @; row into d2[0]
|
| ++ vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from prev
|
| ++ @; row into d1[0]
|
| ++ vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
|
| ++ vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2
|
| ++ vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
|
| ++ vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ bne avg_filter_1bpp_loop
|
| ++
|
| ++ b DONE @; exit loop when
|
| ++ @; loop counter == rowbytes
|
| ++ #;; -----------------------------
|
| ++ #;; AVG filter, 2 bytes per pixel
|
| ++ #;; -----------------------------
|
| ++avg_filter_2bpp:
|
| ++
|
| ++ cmp r1,r0
|
| ++
|
| ++ vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr
|
| ++ @; row into d0[0]
|
| ++ vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev
|
| ++ @; row into d1[0]
|
| ++ @; increment prev row pointer
|
| ++ vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
|
| ++ @; to pixel x
|
| ++ vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ @; updated pixel x is now pixel a
|
| ++ beq DONE
|
| ++
|
| ++avg_filter_2bpp_loop:
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0
|
| ++
|
| ++
|
| ++ vld1.16 {d2[0]},[r2] @; load 2 bytes (pixel x) from curr
|
| ++ @; row into d2[0]
|
| ++ vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev
|
| ++ @; row into d1[0]
|
| ++ vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
|
| ++ vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2
|
| ++ vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
|
| ++ vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++
|
| ++ bne avg_filter_2bpp_loop
|
| ++
|
| ++ b DONE @; exit loop when
|
| ++ @; loop counter == rowbytes
|
| ++
|
| ++ #;; -----------------------------
|
| ++ #;; AVG filter, 3 bytes per pixel
|
| ++ #;; -----------------------------
|
| ++avg_filter_3bpp:
|
| ++
|
| ++ cmp r1,r0
|
| ++
|
| ++ vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra
|
| ++ @; byte) from curr row into d0[0]
|
| ++ vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra
|
| ++ @; byte) from prev row into d1[0]
|
| ++ @; increment prev row pointer
|
| ++ vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
|
| ++ @; to pixel x
|
| ++ vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ @; updated pixel x is now pixel a
|
| ++ beq DONE
|
| ++
|
| ++avg_filter_3bpp_loop:
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0
|
| ++
|
| ++ vld1.32 {d2[0]},[r2] @; load 4 bytes (pixel x + 1 extra
|
| ++ @; byte) from curr row into d2[0]
|
| ++ vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra
|
| ++ @; byte) from prev row into d1[0]
|
| ++ vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
|
| ++ vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2
|
| ++ vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
|
| ++ vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++
|
| ++ bne avg_filter_3bpp_loop
|
| ++
|
| ++ b DONE @; exit loop when
|
| ++ @; loop counter == rowbytes
|
| ++ #;; -----------------------------
|
| ++ #;; AVG filter, 4 bytes per pixel
|
| ++ #;; -----------------------------
|
| ++avg_filter_4bpp:
|
| ++
|
| ++ cmp r1,r0
|
| ++
|
| ++ vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr
|
| ++ @; row into d0[0]
|
| ++ vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev
|
| ++ @; row into d1[0]
|
| ++ @; increment prev row pointer
|
| ++ vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
|
| ++ @; to pixel x
|
| ++ vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ @; updated pixel x is now pixel a
|
| ++ beq DONE
|
| ++
|
| ++avg_filter_4bpp_loop:
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0
|
| ++
|
| ++
|
| ++ vld1.32 {d2[0]},[r2] @; load 4 bytes (pixel x) from curr
|
| ++ @; row into d2[0]
|
| ++ vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev
|
| ++ @; row into d1[0]
|
| ++ vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
|
| ++ vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2
|
| ++ vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
|
| ++ vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ bne avg_filter_4bpp_loop
|
| ++
|
| ++ b DONE @; exit loop when
|
| ++ @; loop counter == rowbytes
|
| ++ #;; -----------------------------
|
| ++ #;; AVG filter, 6 bytes per pixel
|
| ++ #;; -----------------------------
|
| ++avg_filter_6bpp:
|
| ++
|
| ++ cmp r1,r0
|
| ++
|
| ++ vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra
|
| ++ @; bytes) from curr row into d0
|
| ++ vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra
|
| ++ @; bytes) from prev row into d1
|
| ++ @; increment prev row pointer
|
| ++ vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
|
| ++ @; to pixel x
|
| ++ vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ @; updated pixel x is now pixel a
|
| ++ vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ @; updated pixel x is now pixel a
|
| ++ beq DONE
|
| ++
|
| ++avg_filter_6bpp_loop:
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0
|
| ++
|
| ++
|
| ++ vld1.8 {d2},[r2] @; load 8 bytes (pixel x + 2 extra
|
| ++ @; bytes) from curr row into d2
|
| ++ vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra
|
| ++ @; bytes) from prev row into d1
|
| ++ vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
|
| ++ vshrn.i16 d1,q2,#1 @; d1 = (a + b)/2
|
| ++ vadd.i8 d0,d2,d1 @; d0 = x + ((a + b)/2)
|
| ++ vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ bne avg_filter_6bpp_loop
|
| ++
|
| ++ b DONE @; exit loop when
|
| ++ @; loop counter == rowbytes
|
| ++ #;; -----------------------------
|
| ++ #;; AVG filter, 8 bytes per pixel
|
| ++ #;; -----------------------------
|
| ++avg_filter_8bpp:
|
| ++
|
| ++ cmp r1,r0
|
| ++
|
| ++ vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr
|
| ++ @; row into d0
|
| ++ vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev
|
| ++ @; row into d1
|
| ++ @; increment prev row pointer
|
| ++ vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
|
| ++ @; to pixel x
|
| ++ vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ @; updated pixel x is now pixel a
|
| ++ beq DONE
|
| ++avg_filter_8bpp_loop:
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0
|
| ++
|
| ++
|
| ++ vld1.8 {d2},[r2] @; load 8 bytes (pixel x) from curr
|
| ++ @; row into d2
|
| ++ vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev
|
| ++ @; row into d1
|
| ++ vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
|
| ++ vshrn.i16 d1,q2,#1 @; d1 = (a + b)/2
|
| ++ vadd.i8 d0,d2,d1 @; d0 = x + ((a + b)/2)
|
| ++ vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ bne avg_filter_8bpp_loop
|
| ++
|
| ++ b DONE @; exit loop when
|
| ++ @; loop counter == rowbytes
|
| ++ #;; -----------------
|
| ++ #;; PAETH filter type
|
| ++ #;; -----------------
|
| ++paeth_filter:
|
| ++
|
| ++ VPUSH {q4-q7}
|
| ++ add r1,r1,#7 @; bpp = bytes per pixel
|
| ++ lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3
|
| ++ mov r12,r1
|
| ++
|
| ++ #;; r0 = rowbytes
|
| ++ #;; r1 = loop counter = bpp (initially)
|
| ++ #;; r2 = row pointer
|
| ++ #;; r3 = previous row pointer
|
| ++ #;; r12 = bpp = loop/pointer increment value
|
| ++
|
| ++
|
| ++ cmp r12,#1
|
| ++ beq paeth_filter_1bpp
|
| ++
|
| ++ cmp r12,#2
|
| ++ beq paeth_filter_2bpp
|
| ++
|
| ++ cmp r12,#3
|
| ++ beq paeth_filter_3bpp
|
| ++
|
| ++ cmp r12,#4
|
| ++ beq paeth_filter_4bpp
|
| ++
|
| ++ cmp r12,#6
|
| ++ beq paeth_filter_6bpp
|
| ++
|
| ++ cmp r12,#8
|
| ++ beq paeth_filter_8bpp
|
| ++
|
| ++paeth_filter_exit:
|
| ++ b paeth_filter_DONE @; return
|
| ++
|
| ++ #;; ------------------------------
|
| ++ #;; PAETH filter, 1 byte per pixel
|
| ++ #;; ------------------------------
|
| ++paeth_filter_1bpp:
|
| ++
|
| ++ cmp r1, r0
|
| ++
|
| ++ vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr
|
| ++ @; row into d0[0]
|
| ++ vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from prev
|
| ++ @; row into d1[0]
|
| ++ @; increment prev row pointer
|
| ++ vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
|
| ++ vst1.8 {d2[0]},[r2]! @; store 1 byte (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++
|
| ++ beq paeth_filter_DONE
|
| ++
|
| ++paeth_filter_1bpp_loop:
|
| ++ add r1,r1,r12 @; increment curr row pointer
|
| ++ cmp r1,r0
|
| ++
|
| ++
|
| ++ #;; d1[0] = c (b in the previous loop iteration)
|
| ++ #;; d2[0] = a (x in the previous loop iteration)
|
| ++ vld1.8 {d3[0]},[r3]! @; load 1 byte (pixel b) from prev
|
| ++ @; row into d3[0]
|
| ++ vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr
|
| ++ @; row into d0[0]
|
| ++ vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
|
| ++ vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
|
| ++ vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
|
| ++ vaddl.u8 q5,d2,d3 @; q5 = a + b
|
| ++ vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
|
| ++
|
| ++ vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
|
| ++ vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
|
| ++ vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
|
| ++ vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
|
| ++ vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
|
| ++ vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
|
| ++ @
|
| ++ vand d2,d2,d10 @; d2 = a where 1, 0 where 0
|
| ++ vbsl d14,d3,d1 @; d14 = b where 1, c where 0
|
| ++ vmvn d10,d10 @; invert d10
|
| ++ vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
|
| ++ vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
|
| ++ vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
|
| ++ vmov d1,d3 @; d1 = b (c for next iteration)
|
| ++ vst1.8 {d2[0]},[r2]! @; store 1 byte (updated pixel x)
|
| ++
|
| ++
|
| ++ bne paeth_filter_1bpp_loop
|
| ++
|
| ++ b paeth_filter_DONE @; exit loop when
|
| ++ @; loop counter == rowbytes
|
| ++ #;; -------------------------------
|
| ++ #;; PAETH filter, 2 bytes per pixel
|
| ++ #;; -------------------------------
|
| ++paeth_filter_2bpp:
|
| ++
|
| ++ cmp r1, r0
|
| ++
|
| ++ vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr
|
| ++ @; row into d0[0]
|
| ++ vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev
|
| ++ @; row into d1[0]
|
| ++ @; increment prev row pointer
|
| ++ vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
|
| ++ vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ beq paeth_filter_DONE
|
| ++
|
| ++paeth_filter_2bpp_loop:
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0
|
| ++
|
| ++ #;; d1[0] = c (b in the previous loop iteration)
|
| ++ #;; d2[0] = a (x in the previous loop iteration)
|
| ++ vld1.16 {d3[0]},[r3]! @; load 2 bytes (pixel b) from prev
|
| ++ @; row into d3[0]
|
| ++ vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr
|
| ++ @; row into d0[0]
|
| ++ vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
|
| ++ vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
|
| ++ vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
|
| ++ vaddl.u8 q5,d2,d3 @; q5 = a + b
|
| ++ vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
|
| ++
|
| ++ vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
|
| ++ vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
|
| ++ vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
|
| ++ vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
|
| ++ vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
|
| ++ vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
|
| ++
|
| ++ vand d2,d2,d10 @; d2 = a where 1, 0 where 0
|
| ++ vbsl d14,d3,d1 @; d14 = b where 1, c where 0
|
| ++ vmvn d10,d10 @; invert d10
|
| ++ vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
|
| ++ vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
|
| ++ vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
|
| ++ vmov d1,d3 @; d1 = b (c for next iteration)
|
| ++ vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ bne paeth_filter_2bpp_loop
|
| ++
|
| ++ b paeth_filter_DONE @; exit loop when
|
| ++ @; loop counter == rowbytes
|
| ++ #;; -------------------------------
|
| ++ #;; PAETH filter, 3 bytes per pixel
|
| ++ #;; -------------------------------
|
| ++paeth_filter_3bpp:
|
| ++
|
| ++ cmp r1, r0
|
| ++
|
| ++ vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra
|
| ++ @; byte) from curr row into d0[0]
|
| ++ vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra
|
| ++ @; byte) from prev row into d1[0]
|
| ++ @; increment prev row pointer
|
| ++ vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
|
| ++ vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ vst1.8 {d2[2]},[r2]! @; store 1 byte (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ beq paeth_filter_DONE
|
| ++
|
| ++paeth_filter_3bpp_loop:
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0
|
| ++
|
| ++
|
| ++ #;; d1[0] = c (b in the previous loop iteration)
|
| ++ #;; d2[0] = a (x in the previous loop iteration)
|
| ++ vld1.32 {d3[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra
|
| ++ @; byte) from prev row into d3[0]
|
| ++ vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra
|
| ++ @; byte) from curr row into d0[0]
|
| ++ vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
|
| ++ vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
|
| ++ vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
|
| ++ vaddl.u8 q5,d2,d3 @; q5 = a + b
|
| ++ vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
|
| ++ @
|
| ++ vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
|
| ++ vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
|
| ++ vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
|
| ++ vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
|
| ++ vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
|
| ++ vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
|
| ++ @
|
| ++ vand d2,d2,d10 @; d2 = a where 1, 0 where 0
|
| ++ vbsl d14,d3,d1 @; d14 = b where 1, c where 0
|
| ++ vmvn d10,d10 @; invert d10
|
| ++ vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
|
| ++ vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
|
| ++ vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
|
| ++ vmov d1,d3 @; d1 = b (c for next iteration)
|
| ++ vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ vst1.8 {d2[2]},[r2]! @; store 1 byte (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ bne paeth_filter_3bpp_loop
|
| ++
|
| ++ b paeth_filter_DONE @; exit loop when
|
| ++ @; loop counter == rowbytes
|
| ++ #;; -------------------------------
|
| ++ #;; PAETH filter, 4 bytes per pixel
|
| ++ #;; -------------------------------
|
| ++paeth_filter_4bpp:
|
| ++
|
| ++ cmp r1, r0
|
| ++
|
| ++ vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr
|
| ++ @; row into d0[0]
|
| ++ vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev
|
| ++ @; row into d1[0]
|
| ++ @; increment prev row pointer
|
| ++ vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
|
| ++ vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ beq paeth_filter_DONE
|
| ++
|
| ++paeth_filter_4bpp_loop:
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0
|
| ++
|
| ++
|
| ++ #;; d1[0] = c (b in the previous loop iteration)
|
| ++ #;; d2[0] = a (x in the previous loop iteration)
|
| ++ vld1.32 {d3[0]},[r3]! @; load 4 bytes (pixel b) from prev
|
| ++ @; row into d3[0]
|
| ++ vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr
|
| ++ @; row into d0[0]
|
| ++ vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
|
| ++ vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
|
| ++ vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
|
| ++ vaddl.u8 q5,d2,d3 @; q5 = a + b
|
| ++ vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
|
| ++ @
|
| ++ vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
|
| ++ vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
|
| ++ vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
|
| ++ vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
|
| ++ vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
|
| ++ vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
|
| ++ @
|
| ++ vand d2,d2,d10 @; d2 = a where 1, 0 where 0
|
| ++ vbsl d14,d3,d1 @; d14 = b where 1, c where 0
|
| ++ vmvn d10,d10 @; invert d10
|
| ++ vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
|
| ++ vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
|
| ++ vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
|
| ++ vmov d1,d3 @; d1 = b (c for next iteration)
|
| ++ vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ bne paeth_filter_4bpp_loop
|
| ++
|
| ++ b paeth_filter_DONE @; exit loop when
|
| ++ @; loop counter == rowbytes
|
| ++ #;; -------------------------------
|
| ++ #;; PAETH filter, 6 bytes per pixel
|
| ++ #;; -------------------------------
|
| ++paeth_filter_6bpp:
|
| ++ cmp r1, r0
|
| ++
|
| ++ vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra
|
| ++ @; bytes) from curr row into d0
|
| ++ vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra
|
| ++ @; bytes) from prev row into d1
|
| ++ @; increment prev row pointer
|
| ++ vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
|
| ++ vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ vst1.16 {d2[2]},[r2]! @; store 2 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ beq paeth_filter_DONE
|
| ++
|
| ++paeth_filter_6bpp_loop:
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0
|
| ++
|
| ++
|
| ++ #;; d1[0] = c (b in the previous loop iteration)
|
| ++ #;; d2[0] = a (x in the previous loop iteration)
|
| ++ vld1.8 {d3},[r3],r12 @; load 8 bytes (pixel b + 2 extra
|
| ++ @; bytes) from prev row into d3
|
| ++ vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra
|
| ++ @; bytes) from curr row into d0
|
| ++ vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
|
| ++ vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
|
| ++ vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
|
| ++ vaddl.u8 q5,d2,d3 @; q5 = a + b
|
| ++ vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
|
| ++
|
| ++ vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
|
| ++ vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
|
| ++ vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
|
| ++ vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
|
| ++ vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
|
| ++ vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
|
| ++
|
| ++ vand d2,d2,d10 @; d2 = a where 1, 0 where 0
|
| ++ vbsl d14,d3,d1 @; d14 = b where 1, c where 0
|
| ++ vmvn d10,d10 @; invert d10
|
| ++ vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
|
| ++ vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
|
| ++ vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
|
| ++ vmov d1,d3 @; d1 = b (c for next iteration)
|
| ++ vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ vst1.16 {d2[2]},[r2]! @; store 2 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ bne paeth_filter_6bpp_loop
|
| ++
|
| ++ b paeth_filter_DONE @; exit loop when
|
| ++ @; loop counter == rowbytes
|
| ++ #;; -------------------------------
|
| ++ #;; PAETH filter, 8 bytes per pixel
|
| ++ #;; -------------------------------
|
| ++paeth_filter_8bpp:
|
| ++ cmp r1, r0
|
| ++
|
| ++ vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr
|
| ++ @; row into d0
|
| ++ vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev
|
| ++ @; row into d1
|
| ++ @; increment prev row pointer
|
| ++ vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
|
| ++ vst1.8 {d2},[r2]! @; store 8 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ beq paeth_filter_DONE
|
| ++
|
| ++paeth_filter_8bpp_loop:
|
| ++ add r1,r1,r12 @; loop counter += bpp
|
| ++ cmp r1,r0
|
| ++
|
| ++
|
| ++ #;; d1[0] = c (b in the previous loop iteration)
|
| ++ #;; d2[0] = a (x in the previous loop iteration)
|
| ++ vld1.8 {d3},[r3]! @; load 8 bytes (pixel b) from prev
|
| ++ @; row into d3
|
| ++ vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr
|
| ++ @; row into d0
|
| ++ vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
|
| ++ vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
|
| ++ vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
|
| ++ vaddl.u8 q5,d2,d3 @; q5 = a + b
|
| ++ vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
|
| ++ @
|
| ++ vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
|
| ++ vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
|
| ++ vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
|
| ++ vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
|
| ++ vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
|
| ++ vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
|
| ++ @
|
| ++ vand d2,d2,d10 @; d2 = a where 1, 0 where 0
|
| ++ vbsl d14,d3,d1 @; d14 = b where 1, c where 0
|
| ++ vmvn d10,d10 @; invert d10
|
| ++ vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
|
| ++ vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
|
| ++ vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
|
| ++ vmov d1,d3 @; d1 = b (c for next iteration)
|
| ++ vst1.8 {d2},[r2]! @; store 8 bytes (updated pixel x)
|
| ++ @; increment curr row pointer
|
| ++ bne paeth_filter_8bpp_loop
|
| ++
|
| ++ b paeth_filter_DONE @; exit loop when
|
| ++ @; loop counter == rowbytes
|
| ++paeth_filter_DONE:
|
| ++
|
| ++ VPOP {q4-q7}
|
| ++ bx r14
|
| ++
|
| ++DONE:
|
| ++ bx r14
|
| ++
|
| ++
|
| ++.size png_read_filter_row_neon, .-png_read_filter_row_neon
|
| ++ .END
|
| ++#endif
|
| +diff --git a/pngrutil.c b/pngrutil.c
|
| +index 1e2db31..adfffb2 100755
|
| +--- a/pngrutil.c
|
| ++++ b/pngrutil.c
|
| +@@ -23,6 +23,10 @@
|
| + # define WIN32_WCE_OLD
|
| + #endif
|
| +
|
| ++#if defined(__ARM_NEON__)
|
| ++extern void png_read_filter_row_neon(png_uint_32 rowbytes, png_byte pixel_depth, png_bytep row, png_bytep prev_row, int filter);
|
| ++#endif
|
| ++
|
| + #ifdef PNG_FLOATING_POINT_SUPPORTED
|
| + # ifdef WIN32_WCE_OLD
|
| + /* The strtod() function is not supported on WindowsCE */
|
| +@@ -2928,6 +2932,9 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep row,
|
| + {
|
| + png_debug(1, "in png_read_filter_row");
|
| + png_debug2(2, "row = %lu, filter = %d", png_ptr->row_number, filter);
|
| ++#if defined(__ARM_NEON__)
|
| ++ png_read_filter_row_neon(row_info->rowbytes, row_info->pixel_depth, row, prev_row, filter);
|
| ++#else
|
| + switch (filter)
|
| + {
|
| + case PNG_FILTER_VALUE_NONE:
|
| +@@ -3043,6 +3050,7 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep row,
|
| + *row = 0;
|
| + break;
|
| + }
|
| ++#endif /* #if defined(__ARM_NEON__) */
|
| + }
|
| +
|
| + #ifdef PNG_SEQUENTIAL_READ_SUPPORTED
|
|
|