Index: sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch |
diff --git a/sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch b/sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch |
new file mode 100644 |
index 0000000000000000000000000000000000000000..3e7698e96bfd4a6400a15e4142391b7cc973d7b3 |
--- /dev/null |
+++ b/sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch |
@@ -0,0 +1,865 @@ |
+commit df426dcc3179a3647a695c0fde66e1b3616fa6c0 |
+Author: John Alayari <jalayari@codeaurora.org> |
+Date: Thu Nov 18 16:24:17 2010 -0800 |
+ |
+ Committing the PNG and ZLIB NEON optimization files to the local git repository. |
+ |
+diff --git a/Makefile.in b/Makefile.in |
+index 7da5a85..276c531 100644 |
+--- a/Makefile.in |
++++ b/Makefile.in |
+@@ -51,7 +51,7 @@ OBJS = adler32.o compress.o crc32.o gzio.o uncompr.o deflate.o trees.o \ |
+ |
+ PIC_OBJS = $(OBJS:%.o=%.lo) |
+ |
+-OBJA = |
++OBJA = inflate_fast_copy_neon.o adler32_DO16_loop_neon.o |
+ # to use the asm code: make OBJA=match.o |
+ |
+ TEST_OBJS = example.o minigzip.o |
+@@ -82,8 +82,12 @@ match.o: match.S |
+ %.lo: %.c |
+ $(CC) $(CFLAGS) -DPIC -fPIC -c $< -o $@ |
+ |
+-$(SHAREDLIBV): $(PIC_OBJS) |
+- $(LDSHARED) -o $@ $(PIC_OBJS) -lc $(LDFLAGS) |
++%.o: %.S |
++ $(CC) $(CFLAGS) -DPIC -fPIC -c $< -o $@ |
++ |
++ |
++$(SHAREDLIBV): $(PIC_OBJS) $(OBJA) |
++ $(LDSHARED) -o $@ $(PIC_OBJS) $(OBJA) -lc $(LDFLAGS) |
+ rm -f $(SHAREDLIB) $(SHAREDLIBM) |
+ ln -s $@ $(SHAREDLIB) |
+ ln -s $@ $(SHAREDLIBM) |
+diff --git a/adler32.c b/adler32.c |
+index 007ba26..a256e88 100644 |
+--- a/adler32.c |
++++ b/adler32.c |
+@@ -1,5 +1,6 @@ |
+ /* adler32.c -- compute the Adler-32 checksum of a data stream |
+ * Copyright (C) 1995-2004 Mark Adler |
++ * Copyright (c) 2010, Code Aurora Forum. All rights reserved. |
+ * For conditions of distribution and use, see copyright notice in zlib.h |
+ */ |
+ |
+@@ -18,6 +19,10 @@ |
+ #define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); |
+ #define DO16(buf) DO8(buf,0); DO8(buf,8); |
+ |
++#if defined(__ARM_NEON__) |
++extern void adler32_DO16_loop_neon(unsigned char **, unsigned long *, unsigned long *, int); |
++#endif |
++ |
+ /* use NO_DIVIDE if your processor does not do division in hardware */ |
+ #ifdef NO_DIVIDE |
+ # define MOD(a) \ |
+@@ -96,17 +101,25 @@ uLong ZEXPORT adler32(adler, buf, len) |
+ /* do length NMAX blocks -- requires just one modulo operation */ |
+ while (len >= NMAX) { |
+ len -= NMAX; |
++#if defined(__ARM_NEON__) |
++ adler32_DO16_loop_neon(&buf, &adler, &sum2, NMAX); |
++#else |
+ n = NMAX / 16; /* NMAX is divisible by 16 */ |
+ do { |
+ DO16(buf); /* 16 sums unrolled */ |
+ buf += 16; |
+ } while (--n); |
++#endif |
+ MOD(adler); |
+ MOD(sum2); |
+ } |
+ |
+ /* do remaining bytes (less than NMAX, still just one modulo) */ |
+ if (len) { /* avoid modulos if none remaining */ |
++ |
++#if defined(__ARM_NEON__) |
++ adler32_DO16_loop_neon(&buf, &adler, &sum2, len); |
++#else |
+ while (len >= 16) { |
+ len -= 16; |
+ DO16(buf); |
+@@ -116,6 +129,7 @@ uLong ZEXPORT adler32(adler, buf, len) |
+ adler += *buf++; |
+ sum2 += adler; |
+ } |
++#endif |
+ MOD(adler); |
+ MOD(sum2); |
+ } |
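Note: the helper called above has a small contract: absorb `len` bytes into the running adler/sum2 pair without applying the modulo (the caller still performs MOD afterwards) and advance the buffer through the pointer-to-pointer argument. A plain-C sketch of that contract, illustrative only (adler32_do16_loop_ref is a hypothetical name; the real implementation is the NEON assembly file added below):

    /* Scalar reference for the semantics expected of
     * adler32_DO16_loop_neon(): accumulate len bytes into adler/sum2
     * (no modulo -- the caller applies MOD) and advance the buffer. */
    static void adler32_do16_loop_ref(unsigned char **buf,
                                      unsigned long *adler,
                                      unsigned long *sum2,
                                      int len)
    {
        unsigned char *p = *buf;
        while (len-- > 0) {
            *adler += *p++;    /* adler' = adler + buf[i] */
            *sum2  += *adler;  /* sum2'  = sum2 + adler'  */
        }
        *buf = p;              /* caller sees the advanced pointer */
    }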
+diff --git a/adler32_DO16_loop_neon.S b/adler32_DO16_loop_neon.S |
+new file mode 100755 |
+index 0000000..1ba5147 |
+--- /dev/null |
++++ b/adler32_DO16_loop_neon.S |
+@@ -0,0 +1,195 @@ |
++# |
++# Copyright (c) 2010, Code Aurora Forum. All rights reserved. |
++# |
++# Redistribution and use in source and binary forms, with or without |
++# modification, are permitted provided that the following conditions |
++# are met: |
++# * Redistributions of source code must retain the above copyright |
++# notice, this list of conditions and the following disclaimer. |
++# * Redistributions in binary form must reproduce the above |
++# copyright notice, this list of conditions and the following |
++# disclaimer in the documentation and/or other materials provided |
++# with the distribution. |
++# * Neither the name of Code Aurora Forum, Inc. nor the names of its |
++# contributors may be used to endorse or promote products derived |
++# from this software without specific prior written permission. |
++# |
++# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED |
++# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
++# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT |
++# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS |
++# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
++# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
++# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
++# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
++# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
++# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN |
++# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
++# |
++#if defined(__ARM_NEON__) |
++#====================================================================== |
++ |
++#====================================================================== |
++# Code Section |
++ |
++ .code 32 @ Code is ARM ISA |
++ |
++ .global adler32_DO16_loop_neon |
++ |
++#====================================================================== |
++# Function Name(s) : adler32_DO16_loop_neon |
++# |
++# Function Parameters |
++# r0 = pointer to buf |
++# r1 = pointer to adler |
++# r2 = pointer to sum |
++# r3 = len |
++# |
++# Register Usage |
++# q0, q1 = input data |
++# d18,d24 = adler |
++# d16,d28 = sum |
++# d30 = multiplication factor |
++# |
++#====================================================================== |
++# |
++# algorithm: |
++# |
++# while (length >= loop_counter) |
++# do |
++# length = length - loop_counter |
++# adler = adler_begin |
++# for i = 0 to loop_counter-1 |
++# adler = adler + input[i] |
++# sum2 = sum2_begin |
++# for i = 0 to loop_counter-1 |
++# sum2 = sum2 + (loop_counter-i)*input[i] |
++# sum2 = sum2 + (adler_begin*loop_counter) |
++# adler_begin = adler |
++# sum2_begin = sum2 |
++# end |
++# end |
++# end |
++# |
++# Here loop counter holds values of 16, 8 and 1 to compute |
++# adler and sum for 16 bytes, 8 bytes and 1 byte at a time |
++# adler_begin and sum2_begin are used to hold the values |
++# of adler and sum2 from previous iterations. |
++# |
++#====================================================================== |
++.balign 32 |
++.type adler32_DO16_loop_neon, %function |
++ |
++adler32_DO16_loop_neon: |
++ stmdb sp!,{r4-r7} |
++ ldr r7,[r0] |
++ vld1.32 {d18[0]},[r1] @load the input adler |
++ vld1.32 {d16[0]},[r2] @load the input sum |
++ ldr r5,=FACTOR16 @load the multiplication |
++ @factors for data elements |
++ vld1.8 {d20,d21},[r5] @load the multiplication |
++ @factor for adler. |
++ lsrs r4,r3,#4 @Calculate the number of |
++ @16 byte iterations |
++ beq adler32_DO16_loop_16bytes_done |
++ mov r6, #16 |
++ vmov.32 d30[0],r6 |
++ |
++adler32_DO16_loop_16bytes: |
++ vld1.8 {d0,d1},[r7]! @load buf[0]..buf[15] |
++ vpaddl.u8 d6, d0 @pair wise add to reduce |
++ @8 elements to 4 and extend. |
++ vpaddl.u8 d7,d1 @pair wise add to reduce 8 |
++ @elements to 4 and extend. |
++ vpadd.u16 d24,d6,d7 @pair wise add (i.e. no |
++ @need to extend 16 bits |
++ @sufficient to hold the sum). |
++ vpaddl.u16 d24,d24 @pair wise add to reduce 4 |
++ @elements to 2 and extend. |
++ vpaddl.u32 d24,d24 @pair wise add to get the |
++ @adler of 16 inputs no need |
++ @to extend .. but only vpaddl |
++ @adds pair wise on one |
++ @doubleword. |
++ vadd.u32 d24,d18,d24 @adler'=adler+adler_of_16_inputs |
++ vmull.u8 q13,d20,d0 @sum'=mul_fac_for_inputs[0...7] |
++ @ * buf[0..7]. |
++ vmlal.u8 q13,d21,d1 @sum'=sum'+ mul_fac_for_inputs |
++ @[8...15] * buf[8..15]. |
++ vpadd.u16 d28,d26,d27 @pair wise add the doublewords |
++ vpaddl.u16 d28,d28 @pair wise add to reduce 4 |
++ @elements to 2 and extend. |
++ vpaddl.u32 d28,d28 @pair wise add |
++ vadd.u32 d28,d16,d28 @sum' = sum + sum' |
++ vmla.u32 d28,d18,d30 @sum' = sum' + (adler* |
++ @mul_fac_for_adler). |
++ vmov.u32 d18,d24 @save adler for next iteration. |
++ vmov.u32 d16,d28 @save sum for next iteration. |
++ sub r3,r3,#16 |
++ subs r4,r4,#1 |
++ bne adler32_DO16_loop_16bytes |
++ |
++adler32_DO16_loop_16bytes_done: |
++ lsrs r4, r3, #3 @find if there are at least 8 bytes |
++ beq adler32_DO16_loop_8bytes_done |
++adler32_DO16_loop_8bytes: |
++ vld1.8 {d0},[r7]! @load buf[0]..buf[7] |
++ vpaddl.u8 d24,d0 @pair wise add to |
++ @reduce 8 elements to 4 |
++ vpaddl.u16 d24,d24 @pair wise add to reduce |
++ @4 elements to 2 |
++ vpaddl.u32 d24,d24 @pair wise add to get the |
++ @adler for 8 inputs |
++ vadd.u32 d24,d18,d24 @adler' = adler + |
++ @adler_for_8_inputs. |
++ vmull.u8 q13,d21,d0 @sum' = mul_fac_for_inputs[0..7] |
++ @ * buf[0..7] |
++ vpadd.u16 d28,d26,d27 @pair wise add to reduce 8 |
++ @elements to 4 |
++ vpaddl.u16 d28,d28 @pair wise add to reduce 4 |
++ @elements to 2. |
++ vpaddl.u32 d28,d28 @pair wise add |
++ vadd.u32 d28,d16,d28 @sum' = sum + sum' |
++ vshl.u32 d30,d18,#3 @adler" = adler * 8 |
++ vadd.u32 d28,d28,d30 @sum' = sum' + adler" |
++ vmov.u32 d18,d24 @save adler for next iteration |
++ vmov.u32 d16,d28 @save sum for next iteration |
++ sub r3,r3,#8 |
++ |
++adler32_DO16_loop_8bytes_done: |
++ cmp r3, #0 @find if there are remaining bytes |
++ beq DONE @profiling showed that a loop |
++ @computing 4 or 2 bytes at a time |
++ @is less efficient than byte-by-byte |
++ @computation. |
++ vmov.u64 d3, #0 |
++ |
++adler32_DO16_loop_remaining: |
++ vld1.8 {d3[0]}, [r7]! @load 1 byte of input |
++ subs r3,r3,#1 |
++ vadd.u32 d24,d3,d18 @adler' = adler + *buf |
++ vadd.u32 d28,d24,d16 @sum' = sum + adler' |
++ vmov.u32 d18,d24 |
++ vmov.u32 d16,d28 |
++ bne adler32_DO16_loop_remaining |
++ |
++DONE: |
++ vst1.32 {d24[0]},[r1] |
++ vst1.32 {d28[0]},[r2] |
++ str r7, [r0] |
++ ldmia sp!, {r4-r7} |
++ bx lr |
++ |
++.size adler32_DO16_loop_neon, .-adler32_DO16_loop_neon |
++ |
++.balign 16 |
++#====================================================================== |
++#FACTOR16 provides the multiplication factors for the inputs for 16 |
++#byte loops. The second half (i.e. 8 to 1) has the multiplication |
++#factors for 8 byte loops. |
++#====================================================================== |
++ |
++FACTOR16: |
++ .byte 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 |
++ |
++#endif |
++ .END |
++ |
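Note: the FACTOR16 table above encodes the closed-form update the NEON code relies on: absorbing n bytes in one step grows sum2 by n*adler plus a weighted sum of the inputs with weights n, n-1, ..., 1, which is exactly what n applications of the byte-by-byte recurrence produce. A small scalar check of that identity (illustrative only; blocked_step and bytewise_step are hypothetical names, not part of the patch):

    #include <assert.h>
    #include <stddef.h>

    /* Blocked update used by the NEON code (vmull/vmlal with FACTOR16,
     * vmla by the block length, pairwise-add tree for adler). */
    static void blocked_step(const unsigned char *buf, size_t n,
                             unsigned long *adler, unsigned long *sum2)
    {
        unsigned long a = 0, w = 0;
        size_t i;
        for (i = 0; i < n; i++) {
            a += buf[i];
            w += (unsigned long)(n - i) * buf[i];
        }
        *sum2 += (unsigned long)n * *adler + w;
        *adler += a;
    }

    /* Byte-by-byte recurrence, as in zlib's DO16 macros. */
    static void bytewise_step(const unsigned char *buf, size_t n,
                              unsigned long *adler, unsigned long *sum2)
    {
        size_t i;
        for (i = 0; i < n; i++) {
            *adler += buf[i];
            *sum2  += *adler;
        }
    }

    int main(void)
    {
        unsigned char buf[16] =
            {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
        unsigned long a1 = 1, s1 = 0, a2 = 1, s2 = 0;
        blocked_step(buf, 16, &a1, &s1);
        bytewise_step(buf, 16, &a2, &s2);
        assert(a1 == a2 && s1 == s2);   /* both yield the same pair */
        return 0;
    }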
+diff --git a/inffast.c b/inffast.c |
+index bbee92e..943cb92 100644 |
+--- a/inffast.c |
++++ b/inffast.c |
+@@ -8,6 +8,10 @@ |
+ #include "inflate.h" |
+ #include "inffast.h" |
+ |
++#if defined(__ARM_NEON__) |
++extern void inflate_fast_copy_neon(unsigned len, unsigned char **out, unsigned char *from); |
++#endif |
++ |
+ #ifndef ASMINF |
+ |
+ /* Allow machine dependent optimization for post-increment or pre-increment. |
+@@ -231,6 +235,9 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ |
+ from = out - dist; /* rest from output */ |
+ } |
+ } |
++#if defined(__ARM_NEON__) |
++ inflate_fast_copy_neon(len, &out, from); |
++#else |
+ while (len > 2) { |
+ PUP(out) = PUP(from); |
+ PUP(out) = PUP(from); |
+@@ -242,9 +249,13 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ |
+ if (len > 1) |
+ PUP(out) = PUP(from); |
+ } |
++#endif |
+ } |
+ else { |
+ from = out - dist; /* copy direct from output */ |
++#if defined(__ARM_NEON__) |
++ inflate_fast_copy_neon(len, &out, from); |
++#else |
+ do { /* minimum length is three */ |
+ PUP(out) = PUP(from); |
+ PUP(out) = PUP(from); |
+@@ -256,6 +267,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ |
+ if (len > 1) |
+ PUP(out) = PUP(from); |
+ } |
++#endif |
+ } |
+ } |
+ else if ((op & 64) == 0) { /* 2nd level distance code */ |
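Note: both call sites above replace zlib's PUP() copy loops, so the helper must behave exactly like them: a forward, pre-increment copy that tolerates overlap between `from` and `out` (overlap is how LZ77 repeats runs), with the advanced `out` written back through the pointer-to-pointer. A scalar sketch of that contract (illustrative only; inflate_fast_copy_ref is a hypothetical name, and the real routine, added below, dispatches on the gap |out - from|):

    /* Scalar reference for inflate_fast_copy_neon()'s contract.
     * Pre-increment, matching PUP() when POSTINC is not defined;
     * copying in increasing address order preserves the semantics of
     * overlapping LZ77 copies (distance < length). */
    static void inflate_fast_copy_ref(unsigned len, unsigned char **out,
                                      unsigned char *from)
    {
        unsigned char *o = *out;
        while (len--)
            *++o = *++from;
        *out = o;
    }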
+diff --git a/inflate_fast_copy_neon.S b/inflate_fast_copy_neon.S |
+new file mode 100755 |
+index 0000000..ec1e4ab |
+--- /dev/null |
++++ b/inflate_fast_copy_neon.S |
+@@ -0,0 +1,521 @@ |
++#; Copyright (c) 2010, Code Aurora Forum. All rights reserved. |
++#; |
++#; Redistribution and use in source and binary forms, with or without |
++#; modification, are permitted provided that the following conditions are |
++#; met: |
++#; * Redistributions of source code must retain the above copyright |
++#; notice, this list of conditions and the following disclaimer. |
++#; * Redistributions in binary form must reproduce the above |
++#; copyright notice, this list of conditions and the following |
++#; disclaimer in the documentation and/or other materials provided |
++#; with the distribution. |
++#; * Neither the name of Code Aurora Forum, Inc. nor the names of its |
++#; contributors may be used to endorse or promote products derived |
++#; from this software without specific prior written permission. |
++#; |
++#; THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED |
++#; WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
++#; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT |
++#; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS |
++#; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
++#; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
++#; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
++#; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
++#; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
++#; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN |
++#; IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
++#; |
++#; |
++#if defined(__ARM_NEON__) |
++#;============================================================================ |
++#; Code Section |
++ .code 32 @; Code is ARM ISA |
++#;============================================================================ |
++ |
++ .global inflate_fast_copy_neon |
++ |
++ |
++#;============================================================================ |
++#; INPUTS: r0 len: number of bytes to transfer |
++#; r1 **out: pointer to pointer to ``out'' buffer |
++#; r2 *from: pointer to ``from'' buffer |
++#; OUTPUTS: r1 **out: pointer to pointer to ``out'' buffer |
++#;============================================================================ |
++.balign 32 |
++.type inflate_fast_copy_neon, %function |
++inflate_fast_copy_neon: |
++ push {r4-r11} @; push r4-r11 onto stack |
++ |
++ cmp r0,#16 @; |
++ bge inflate_fast_copy_vectorized |
++ |
++ #;; transfer bytes one by one |
++ #;; only if len < 16 bytes |
++inflate_fast_copy_default: |
++ |
++ cmp r0,#0 |
++ beq inflate_fast_copy_exit |
++ |
++ ldr r3,[r1,#0] @; r3 = pointer to out |
++ |
++inflate_fast_copy_default_loop: |
++ |
++ ldrb r12,[r2,#1]! @; r12 = *(++from) |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3,#1]! @; *(++out) = r12 |
++ |
++ bne inflate_fast_copy_default_loop |
++ |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; vectorized copy routines |
++ #;; only if len >= 16 bytes |
++inflate_fast_copy_vectorized: |
++ |
++ ldr r3,[r1,#0] @; r3 = pointer to out |
++ @; DON'T TOUCH r1 UNTIL FINAL |
++ @; UPDATE OF r1 WITH ADDRESS OF r3 |
++ cmp r3,r2 @ |
++ sublt r4,r2,r3 @ |
++ subge r4,r3,r2 @;r4 = gap = |out-from| |
++ |
++ cmp r4,#0 |
++ beq inflate_fast_copy_exit |
++ |
++ cmp r4,#1 |
++ beq inflate_fast_copy_gap1b_proc |
++ |
++ cmp r4,#2 |
++ beq inflate_fast_copy_gap2b_proc |
++ |
++ cmp r4,#3 |
++ beq inflate_fast_copy_gap3b_proc |
++ |
++ cmp r4,#4 |
++ beq inflate_fast_copy_gap4b_proc |
++ |
++ cmp r4,#8 |
++ blt inflate_fast_copy_gap5to7b_proc |
++ beq inflate_fast_copy_gap8b_proc |
++ |
++ cmp r4,#16 |
++ blt inflate_fast_copy_gap9to15b_proc |
++ bge inflate_fast_copy_gap16b_proc |
++ |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is 1 byte |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap1b_proc: |
++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ ldrb r12,[r2,#1]! @; r12 = *(++from) |
++ vdup.8 q0, r12 @; duplicate r12 16 times in q0 |
++ @ |
++ lsrs r4,r0,#4 @; r4 = floor(len/16) |
++ @; = iteration count for loop16 |
++ beq inflate_fast_copy_gap1b_proc_16bytes_loop_done |
++ |
++inflate_fast_copy_gap1b_proc_16bytes_loop: |
++ |
++ vst1.8 {q0},[r3]! @; store 16 bytes in out and |
++ @; increment out pointer |
++ sub r0,r0,#16 @; subtract 16 from len |
++ subs r4,r4,#1 @; decrement iteration count |
++ bne inflate_fast_copy_gap1b_proc_16bytes_loop |
++ |
++inflate_fast_copy_gap1b_proc_16bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap1b_proc_lastfewbytes_loop: |
++ |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ subs r0,r0,#1 @; len-- |
++ bne inflate_fast_copy_gap1b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is 2 bytes |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap2b_proc: |
++ |
++ add r2,r2,#1 @; from++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ vld1.16 {d0[0]},[r2] @; load 2 bytes into d0[0] |
++ vdup.16 q0,d0[0] @; duplicate those 2 bytes 8 times |
++ @; to fill up q0 |
++ @ |
++ lsrs r4,r0,#4 @; r4 = floor(len/16) |
++ @; = iteration count for loop16 |
++ beq inflate_fast_copy_gap2b_proc_16bytes_loop_done |
++ |
++inflate_fast_copy_gap2b_proc_16bytes_loop: |
++ |
++ vst1.8 {q0},[r3]! @; store 16 bytes in out and |
++ @; increment out pointer |
++ sub r0,r0,#16 @; subtract 16 from len |
++ subs r4,r4,#1 @; decrement iteration count |
++ bne inflate_fast_copy_gap2b_proc_16bytes_loop |
++ |
++inflate_fast_copy_gap2b_proc_16bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap2b_proc_lastfewbytes_loop: |
++ |
++ ldrb r12,[r2],#1 @; r12 = *from, from++ |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ @ |
++ bne inflate_fast_copy_gap2b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is 3 bytes |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; r4 = 3 |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap3b_proc: |
++ |
++ add r2,r2,#1 @; from++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ vld1.32 {d0[0]},[r2] @; load 4 bytes into d0[0] |
++ |
++inflate_fast_copy_gap3b_proc_3bytes_loop: |
++ |
++ cmp r0,#3 @; exit loop if len < 3 |
++ blt inflate_fast_copy_gap3b_proc_3bytes_loop_done |
++ |
++ vst1.32 {d0[0]},[r3],r4 @; store 4 bytes in out |
++ @; out+=3 |
++ |
++ sub r0,r0,#3 @; len-=3 |
++ b inflate_fast_copy_gap3b_proc_3bytes_loop |
++ |
++inflate_fast_copy_gap3b_proc_3bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap3b_proc_lastfewbytes_loop: |
++ |
++ ldrb r12,[r2],#1 @; r12 = *from, from++ |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ |
++ bne inflate_fast_copy_gap3b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is 4 bytes |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap4b_proc: |
++ |
++ add r2,r2,#1 @; from++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ vld1.32 {d0[0]},[r2] @; load 4 bytes into d0[0] |
++ vdup.32 q0,d0[0] @; duplicate those 4 bytes 4 times |
++ @; to fill up q0 |
++ @ |
++ lsrs r4,r0,#4 @; r4 = floor(len/16) |
++ @; = iteration count for loop16 |
++ beq inflate_fast_copy_gap4b_proc_16bytes_loop_done |
++ |
++inflate_fast_copy_gap4b_proc_16bytes_loop: |
++ |
++ vst1.32 {q0},[r3]! @; store 16 bytes in out and |
++ @; increment out pointer |
++ sub r0,r0,#16 @; subtract 16 from len |
++ subs r4,r4,#1 @; decrement iteration count |
++ bne inflate_fast_copy_gap4b_proc_16bytes_loop |
++ |
++inflate_fast_copy_gap4b_proc_16bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap4b_proc_lastfewbytes_loop: |
++ |
++ ldrb r12,[r2],#1 @; r12 = *from, from++ |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ |
++ bne inflate_fast_copy_gap4b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is {5-7} bytes |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; r4 = {5-7} |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap5to7b_proc: |
++ |
++ add r2,r2,#1 @; from++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ vld1.8 {d0},[r2] @; load 8 bytes into d0 |
++ |
++inflate_fast_copy_gap5to7b_proc_5to7bytes_loop: |
++ |
++ cmp r0,r4 @; exit loop if len < {5-7} |
++ blt inflate_fast_copy_gap5to7b_proc_5to7bytes_loop_done |
++ |
++ vst1.8 {d0},[r3],r4 @; store 8 bytes in out |
++ @; out+={5-7} |
++ |
++ sub r0,r0,r4 @; len-={5-7} |
++ b inflate_fast_copy_gap5to7b_proc_5to7bytes_loop |
++ |
++inflate_fast_copy_gap5to7b_proc_5to7bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap5to7b_proc_lastfewbytes_loop: |
++ |
++ ldrb r12,[r2],#1 @; r12 = *from, from++ |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ |
++ bne inflate_fast_copy_gap5to7b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is 8 bytes |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap8b_proc: |
++ |
++ add r2,r2,#1 @; from++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ vld1.8 {d0},[r2] @; load 8 bytes into d0 |
++ vmov d1,d0 @; duplicate the 8 bytes to fill up |
++ @; q0 |
++ @ |
++ lsrs r4,r0,#4 @; r4 = floor(len/16) |
++ @; = iteration count for loop16 |
++ beq inflate_fast_copy_gap8b_proc_16bytes_loop_done |
++ |
++inflate_fast_copy_gap8b_proc_16bytes_loop: |
++ |
++ vst1.8 {q0},[r3]! @; store 16 bytes in out and |
++ @; increment out pointer |
++ sub r0,r0,#16 @; subtract 16 from len |
++ subs r4,r4,#1 @; decrement iteration count |
++ bne inflate_fast_copy_gap8b_proc_16bytes_loop |
++ |
++inflate_fast_copy_gap8b_proc_16bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap8b_proc_lastfewbytes_loop: |
++ |
++ ldrb r12,[r2],#1 @; r12 = *from, from++ |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ |
++ bne inflate_fast_copy_gap8b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is {9-15} bytes |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; r4 = {9-15} |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap9to15b_proc: |
++ |
++ add r2,r2,#1 @; from++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ vld1.8 {q0},[r2] @; load 16 bytes into q0 |
++ |
++inflate_fast_copy_gap9to15b_proc_9to15bytes_loop: |
++ |
++ cmp r0, r4 @; exit loop if len < {9-15} |
++ blt inflate_fast_copy_gap9to15b_proc_9to15bytes_loop_done |
++ |
++ vst1.8 {q0},[r3],r4 @; store 16 bytes in out |
++ @; out+={9-15} |
++ |
++ sub r0,r0,r4 @; len-={9-15} |
++ b inflate_fast_copy_gap9to15b_proc_9to15bytes_loop |
++ |
++inflate_fast_copy_gap9to15b_proc_9to15bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap9to15b_proc_lastfewbytes_loop: |
++ |
++ ldrb r12,[r2],#1 @; r12 = *from, from++ |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ |
++ bne inflate_fast_copy_gap9to15b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is 16 bytes or more |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap16b_proc: |
++ |
++ add r2,r2,#1 @; from++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ lsrs r4,r0,#4 @; r4 = floor(len/16) |
++ @; = iteration count for loop16 |
++ beq inflate_fast_copy_gap16b_proc_16bytes_loop_done |
++ |
++inflate_fast_copy_gap16b_proc_16bytes_loop: |
++ |
++ vld1.8 {q0},[r2]! @; load 16 bytes into q0 and |
++ @; increment from pointer |
++ vst1.8 {q0},[r3]! @; store 16 bytes in out and |
++ @; increment out pointer |
++ sub r0,r0,#16 @; subtract 16 from len |
++ subs r4,r4,#1 @; decrement iteration count |
++ bne inflate_fast_copy_gap16b_proc_16bytes_loop |
++ |
++inflate_fast_copy_gap16b_proc_16bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap16b_proc_lastfewbytes_loop: |
++ |
++ ldrb r12,[r2],#1 @; r12 = *from, from++ |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ |
++ bne inflate_fast_copy_gap16b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ |
++inflate_fast_copy_exit: |
++ |
++ pop {r4-r11} @; pop r4-r11 from stack |
++ bx lr @; return |
++ |
++.size inflate_fast_copy_neon, .-inflate_fast_copy_neon |
++ |
++#endif |
++ .END |
++ |
++ |
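Note: a quick way to sanity-check a build carrying this patch (a suggestion, not part of the patch): the NEON paths must leave adler32() bit-identical to the portable code, so comparing zlib's result against a direct implementation of the checksum over random data catches wiring mistakes immediately. adler32_ref below is a hypothetical helper written for this test only:

    #include <stdio.h>
    #include <stdlib.h>
    #include "zlib.h"

    /* Direct Adler-32 over buf[0..len-1], modulo 65521 at every step. */
    static unsigned long adler32_ref(const unsigned char *buf, unsigned len)
    {
        unsigned long a = 1, b = 0;
        unsigned i;
        for (i = 0; i < len; i++) {
            a = (a + buf[i]) % 65521;
            b = (b + a) % 65521;
        }
        return (b << 16) | a;
    }

    int main(void)
    {
        static unsigned char buf[100000];
        unsigned long got, want;
        unsigned i;
        for (i = 0; i < sizeof(buf); i++)
            buf[i] = (unsigned char)rand();
        got  = adler32(adler32(0L, Z_NULL, 0), buf, sizeof(buf));
        want = adler32_ref(buf, sizeof(buf));
        printf("%s (zlib %#lx, reference %#lx)\n",
               got == want ? "OK" : "MISMATCH", got, want);
        return got != want;
    }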