Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(597)

Unified Diff: sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch

Issue 5176006: Applying Neon optimization patch to the ZLIB library. Base URL: http://git.chromium.org/git/portage.git@master
Patch Set: Created 10 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | sys-libs/zlib/zlib-1.2.3-r1.ebuild » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch
diff --git a/sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch b/sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch
new file mode 100644
index 0000000000000000000000000000000000000000..3e7698e96bfd4a6400a15e4142391b7cc973d7b3
--- /dev/null
+++ b/sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch
@@ -0,0 +1,865 @@
+commit df426dcc3179a3647a695c0fde66e1b3616fa6c0
+Author: John Alayari <jalayari@codeaurora.org>
+Date: Thu Nov 18 16:24:17 2010 -0800
+
+ committing the PNG and ZLIB Neon Optimizations files to local git repository.
+
+diff --git a/Makefile.in b/Makefile.in
+index 7da5a85..276c531 100644
+--- a/Makefile.in
++++ b/Makefile.in
+@@ -51,7 +51,7 @@ OBJS = adler32.o compress.o crc32.o gzio.o uncompr.o deflate.o trees.o \
+
+ PIC_OBJS = $(OBJS:%.o=%.lo)
+
+-OBJA =
++OBJA =inflate_fast_copy_neon.o adler32_DO16_loop_neon.o
+ # to use the asm code: make OBJA=match.o
+
+ TEST_OBJS = example.o minigzip.o
+@@ -82,8 +82,12 @@ match.o: match.S
+ %.lo: %.c
+ $(CC) $(CFLAGS) -DPIC -fPIC -c $< -o $@
+
+-$(SHAREDLIBV): $(PIC_OBJS)
+- $(LDSHARED) -o $@ $(PIC_OBJS) -lc $(LDFLAGS)
++%.o: %.S
++ $(CC) $(CFLAGS) -DPIC -fPIC -c $< -o $@
++
++
++$(SHAREDLIBV): $(PIC_OBJS) $(OBJA)
++ $(LDSHARED) -o $@ $(PIC_OBJS) $(OBJA) -lc $(LDFLAGS)
+ rm -f $(SHAREDLIB) $(SHAREDLIBM)
+ ln -s $@ $(SHAREDLIB)
+ ln -s $@ $(SHAREDLIBM)
+diff --git a/adler32.c b/adler32.c
+index 007ba26..a256e88 100644
+--- a/adler32.c
++++ b/adler32.c
+@@ -1,5 +1,6 @@
+ /* adler32.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2004 Mark Adler
++ * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+@@ -18,6 +19,10 @@
+ #define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
+ #define DO16(buf) DO8(buf,0); DO8(buf,8);
+
++#if defined(__ARM_NEON__)
++extern void adler32_DO16_loop_neon(unsigned char** , unsigned long *, unsigned long * , int );
++#endif
++
+ /* use NO_DIVIDE if your processor does not do division in hardware */
+ #ifdef NO_DIVIDE
+ # define MOD(a) \
+@@ -96,17 +101,25 @@ uLong ZEXPORT adler32(adler, buf, len)
+ /* do length NMAX blocks -- requires just one modulo operation */
+ while (len >= NMAX) {
+ len -= NMAX;
++#if defined(__ARM_NEON__)
++ adler32_DO16_loop_neon(&buf, &adler, &sum2, NMAX);
++#else
+ n = NMAX / 16; /* NMAX is divisible by 16 */
+ do {
+ DO16(buf); /* 16 sums unrolled */
+ buf += 16;
+ } while (--n);
++#endif
+ MOD(adler);
+ MOD(sum2);
+ }
+
+ /* do remaining bytes (less than NMAX, still just one modulo) */
+ if (len) { /* avoid modulos if none remaining */
++
++#if defined(__ARM_NEON__)
++ adler32_DO16_loop_neon(&buf, &adler, &sum2, len);
++#else
+ while (len >= 16) {
+ len -= 16;
+ DO16(buf);
+@@ -116,6 +129,7 @@ uLong ZEXPORT adler32(adler, buf, len)
+ adler += *buf++;
+ sum2 += adler;
+ }
++#endif
+ MOD(adler);
+ MOD(sum2);
+ }
+diff --git a/adler32_DO16_loop_neon.S b/adler32_DO16_loop_neon.S
+new file mode 100755
+index 0000000..1ba5147
+--- /dev/null
++++ b/adler32_DO16_loop_neon.S
+@@ -0,0 +1,195 @@
++#
++# Copyright (c) 2010, Code Aurora Forum. All rights reserved.
++#
++# Redistribution and use in source and binary forms, with or without
++# modification, are permitted provided that the following conditions
++# are met:
++# * Redistributions of source code must retain the above copyright
++# notice, this list of conditions and the following disclaimer.
++# * Redistributions in binary form must reproduce the above
++# copyright notice, this list of conditions and the following
++# disclaimer in the documentation and/or other materials provided
++# with the distribution.
++# * Neither the name of Code Aurora Forum, Inc. nor the names of its
++# contributors may be used to endorse or promote products derived
++# from this software without specific prior written permission.
++#
++# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
++# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
++# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
++# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
++# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
++# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
++# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++#
++#if defined(__ARM_NEON__)
++#======================================================================
++
++#======================================================================
++# Code Section
++
++ .code 32 @ Code is ARM ISA
++
++ .global adler32_DO16_loop_neon
++
++#======================================================================
++# Function Name(s) : adler32_DO16_loop_neon
++#
++# Function Parameters
++# r0 = pointer to buf
++# r1 = pointer to adler
++# r2 = pointer to sum
++# r3 = len
++#
++# Register Usage
++# q0, q1 = input data
++# d18,d24 = adler
++# d16,d28 = sum
++# d30 = multiplication factor
++#
++#======================================================================
++#
++# algorithm:
++#
++# while (length >= loop_counter)
++# do
++# length = length - loop_counter
++# for i = 0 to loop_counter
++# adler = adler_begin + input[i]
++# for i = 0 to loop_counter
++# sum2 = sum2_begin+(loop_counter-i)*input[i]
++# sum2 = sum2+(adler_begin*loop_counter)
++# adler_begin = adler
++# sum2_begin = sum2
++# end
++# end
++# end
++#
++# Here loop counter holds values of 16, 8 and 1 to compute
++# adler and sum for 16 bytes, 8 bytes and 1 byte at a time
++# adler_begin and sum2_begin are used to hold the values
++# of adler and sum2 from previous iterations.
++#
++#======================================================================
++.balign 32
++.type adler32_DO16_loop_neon, %function
++
++adler32_DO16_loop_neon:
++ stmdb sp!,{r4-r7}
++ ldr r7,[r0]
++ vld1.32 {d18[0]},[r1] @load the input adler
++ vld1.32 {d16[0]},[r2] @load the input sum
++ ldr r5,=FACTOR16 @load the multiplication
++ @factors for data elements
++ vld1.8 {d20,d21},[r5] @load the multiplication
++ @factor for adler.
++ lsrs r4,r3,#4 @Calculate the number
++ @16 byte iterations
++ beq adler32_DO16_loop_16bytes_done
++ mov r6, #16
++ vmov.32 d30[0],r6
++
++adler32_DO16_loop_16bytes:
++ vld1.8 {d0,d1},[r7]! @load buf[0]..buf[15]
++ vpaddl.u8 d6, d0 @pair wise add to reduce
++ @8 elements to 4 and extend.
++ vpaddl.u8 d7,d1 @pair wise add to reduce 8
++ @elements to 4 and extend.
++ vpadd.u16 d24,d6,d7 @pair wise add (i.e. no
++ @need to extend 16 bits
++ @sufficient to hold the sum).
++ vpaddl.u16 d24,d24 @pair wise add to reduce 4
++ @elements to 2 and extend.
++ vpaddl.u32 d24,d24 @pair wise add to get the
++ @adler of 16 inputs no need
++ @to extend .. but only vpaddl
++ @adds pair wise on one
++ @doubleword.
++ vadd.u32 d24,d18,d24 @adler'=adler+adler_of_16_inputs
++ vmull.u8 q13,d20,d0 @sum'=mul_fac_for_inputs[0...7]
++ @ * buf[0..7].
++ vmlal.u8 q13,d21,d1 @sum'=sum'+ mul_fac_for_inputs
++ @[8...15] * buf[8..15].
++ vpadd.u16 d28,d26,d27 @pair wise add the doublewords
++ vpaddl.u16 d28,d28 @pair wise add to reduce 4
++ @elements to 2 and extend.
++ vpaddl.u32 d28,d28 @pair wise add
++ vadd.u32 d28,d16,d28 @sum' = sum + sum'
++ vmla.u32 d28,d18,d30 @sum' = sum' + (adler*
++ @mul_fac_for_adler).
++ vmov.u32 d18,d24 @save adler for next iteration.
++ vmov.u32 d16,d28 @save sum for next iteration.
++ sub r3,r3,#16
++ subs r4,r4,#1
++ bne adler32_DO16_loop_16bytes
++
++adler32_DO16_loop_16bytes_done:
++ lsrs r4, r3, #3 @find if there are at least 8 bytes
++ beq adler32_DO16_loop_8bytes_done
++adler32_DO16_loop_8bytes:
++ vld1.8 {d0},[r7]! @load buf[0]..buf[7]
++ vpaddl.u8 d24,d0 @pair wise add to
++ @reduce 8 elements to 4
++ vpaddl.u16 d24,d24 @pair wise add to reduce
++ @4 elements to 2
++ vpaddl.u32 d24,d24 @pair wise add to get the
++ @adler for 8 inputs
++ vadd.u32 d24,d18,d24 @adler' = adler +
++ @adler_for_8_inputs.
++ vmull.u8 q13,d21,d0 @sum' = mul_fac_for_inputs[0..7]
++ @ * buf[0..7]
++ vpadd.u16 d28,d26,d27 @pair wise add to reduce 8
++ @elements to 4
++ vpaddl.u16 d28,d28 @pair wise add to reduce 4
++ @elements to 2.
++ vpaddl.u32 d28,d28 @pair wise add
++ vadd.u32 d28,d16,d28 @sum' = sum + sum'
++ vshl.u32 d30,d18,#3 @adler" = adler * 8
++ vadd.u32 d28,d28,d30 @sum' = sum' + adler"
++ vmov.u32 d18,d24 @save adler for next iteration
++ vmov.u32 d16,d28 @save sum for next iteration
++ sub r3,r3,#8
++
++adler32_DO16_loop_8bytes_done:
++ cmp r3, #0 @find if there are remaining bytes
++ beq DONE @after profiling found that a loop
++ @to compute 4 or 2 bytes at a time
++ @is less efficient than a byte by
++ @byte computation.
++ vmov.u64 d3, #0
++
++adler32_DO16_loop_remaining:
++ vld1.8 {d3[0]}, [r7]! @load 1 byte of input
++ subs r3,r3,#1
++ vadd.u32 d24,d3,d18 @adler' = adler + *buf
++ vadd.u32 d28,d24,d16 @sum' = sum + adler'
++ vmov.u32 d18,d24
++ vmov.u32 d16,d28
++ bne adler32_DO16_loop_remaining
++
++DONE:
++ vst1.32 {d24[0]},[r1]
++ vst1.32 {d28[0]},[r2]
++ str r7, [r0]
++ ldmia sp!, {r4-r7}
++ bx lr
++
++.size adler32_DO16_loop_neon, .-adler32_DO16_loop_neon
++
++.balign 16
++#======================================================================
++#FACTOR16 provides the multiplication factors for the inputs for 16
++#byte loops. The second half (i.e. 8 to 1) has the multiplications
++#factors for 8 byte loops.
++#======================================================================
++
++FACTOR16:
++ .byte 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
++
++#endif
++ .END
++
+diff --git a/inffast.c b/inffast.c
+index bbee92e..943cb92 100644
+--- a/inffast.c
++++ b/inffast.c
+@@ -8,6 +8,10 @@
+ #include "inflate.h"
+ #include "inffast.h"
+
++#if defined(__ARM_NEON__)
++extern void inflate_fast_copy_neon(unsigned len, unsigned char **out, unsigned char *from);
++#endif
++
+ #ifndef ASMINF
+
+ /* Allow machine dependent optimization for post-increment or pre-increment.
+@@ -231,6 +235,9 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
+ from = out - dist; /* rest from output */
+ }
+ }
++#if defined(__ARM_NEON__)
++ inflate_fast_copy_neon(len, &out, from);
++#else
+ while (len > 2) {
+ PUP(out) = PUP(from);
+ PUP(out) = PUP(from);
+@@ -242,9 +249,13 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
+ if (len > 1)
+ PUP(out) = PUP(from);
+ }
++#endif
+ }
+ else {
+ from = out - dist; /* copy direct from output */
++#if defined(__ARM_NEON__)
++ inflate_fast_copy_neon(len, &out, from);
++#else
+ do { /* minimum length is three */
+ PUP(out) = PUP(from);
+ PUP(out) = PUP(from);
+@@ -256,6 +267,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
+ if (len > 1)
+ PUP(out) = PUP(from);
+ }
++#endif
+ }
+ }
+ else if ((op & 64) == 0) { /* 2nd level distance code */
+diff --git a/inflate_fast_copy_neon.S b/inflate_fast_copy_neon.S
+new file mode 100755
+index 0000000..ec1e4ab
+--- /dev/null
++++ b/inflate_fast_copy_neon.S
+@@ -0,0 +1,521 @@
++#; Copyright (c) 2010, Code Aurora Forum. All rights reserved.
++#;
++#; Redistribution and use in source and binary forms, with or without
++#; modification, are permitted provided that the following conditions are
++#; met:
++#; * Redistributions of source code must retain the above copyright
++#; notice, this list of conditions and the following disclaimer.
++#; * Redistributions in binary form must reproduce the above
++#; copyright notice, this list of conditions and the following
++#; disclaimer in the documentation and/or other materials provided
++#; with the distribution.
++#; * Neither the name of Code Aurora Forum, Inc. nor the names of its
++#; contributors may be used to endorse or promote products derived
++#; from this software without specific prior written permission.
++#;
++#; THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
++#; WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++#; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
++#; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
++#; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++#; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++#; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++#; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
++#; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
++#; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
++#; IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++#;
++#;
++#if defined(__ARM_NEON__)
++#;============================================================================
++#; Code Section
++ .code 32 @; Code is ARM ISA
++#;============================================================================
++
++ .global inflate_fast_copy_neon
++
++
++#;============================================================================
++#; INPUTS: r0 len: number of bytes to transfer
++#; r1 **out: pointer to pointer to ``out'' buffer
++#; r2 *from: pointer to ``from'' buffer
++#; OUTPUTS: r1 **out: pointer to pointer to ``out'' buffer
++#;============================================================================
++.balign 32
++.type inflate_fast_copy_neon, %function
++inflate_fast_copy_neon:
++ push {r4-r11} @; push r4-r11 onto stack
++
++ cmp r0,#16 @;
++ bge inflate_fast_copy_vectorized
++
++ #;; transfer bytes one by one
++ #;; only if len < 16 bytes
++inflate_fast_copy_default:
++
++ cmp r0,#0
++ beq inflate_fast_copy_exit
++
++ ldr r3,[r1,#0] @; r3 = pointer to out
++
++inflate_fast_copy_default_loop:
++
++ ldrb r12,[r2,#1]! @; r12 = *(++from)
++ subs r0,r0,#1 @; len--
++ strb r12,[r3,#1]! @; *(++out) = r12
++
++ bne inflate_fast_copy_default_loop
++
++ str r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ b inflate_fast_copy_exit
++
++ #;; vectorized copy routines
++ #;; only if len >= 16 bytes
++inflate_fast_copy_vectorized:
++
++ ldr r3,[r1,#0] @; r3 = pointer to out
++ @; DON'T TOUCH r1 UNTIL FINAL
++ @; UPDATE OF r1 WITH ADDRESS OF r3
++ cmp r3,r2 @
++ sublt r4,r2,r3 @
++ subge r4,r3,r2 @;r4 = gap = |out-from|
++
++ cmp r4,#0
++ beq inflate_fast_copy_exit
++
++ cmp r4,#1
++ beq inflate_fast_copy_gap1b_proc
++
++ cmp r4,#2
++ beq inflate_fast_copy_gap2b_proc
++
++ cmp r4,#3
++ beq inflate_fast_copy_gap3b_proc
++
++ cmp r4,#4
++ beq inflate_fast_copy_gap4b_proc
++
++ cmp r4,#8
++ blt inflate_fast_copy_gap5to7b_proc
++ beq inflate_fast_copy_gap8b_proc
++
++ cmp r4,#16
++ blt inflate_fast_copy_gap9to15b_proc
++ bge inflate_fast_copy_gap16b_proc
++
++
++ #;; ------------------------------------------------------------------
++ #;; vectorized copy routine when gap between ``from'' and ``out''
++ #;; buffers is 1 byte
++ #;; INPUTS:
++ #;; r0 = len
++ #;; r2 = pointer to from
++ #;; r3 = pointer to out
++ #;; OUTPUTS:
++ #;; r1 = pointer to pointer to out
++ #;; ------------------------------------------------------------------
++inflate_fast_copy_gap1b_proc:
++
++ add r3,r3,#1 @; out++
++ @
++ ldrb r12,[r2,#1]! @; r12 = *(++from)
++ vdup.8 q0, r12 @; duplicate r12 16 times in q0
++ @
++ lsrs r4,r0,#4 @; r4 = floor(len/16)
++ @; = iteration count for loop16
++ beq inflate_fast_copy_gap1b_proc_16bytes_loop_done
++
++inflate_fast_copy_gap1b_proc_16bytes_loop:
++
++ vst1.8 {q0},[r3]! @; store 16 bytes in out and
++ @; increment out pointer
++ sub r0,r0,#16 @; subtract 16 from len
++ subs r4,r4,#1 @; decrement iteration count
++ bne inflate_fast_copy_gap1b_proc_16bytes_loop
++
++inflate_fast_copy_gap1b_proc_16bytes_loop_done:
++
++ cmp r0,#0
++ subeq r3,r3,#1 @; out--
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ beq inflate_fast_copy_exit
++
++inflate_fast_copy_gap1b_proc_lastfewbytes_loop:
++
++ strb r12,[r3],#1 @; *out = r12, out++
++ subs r0,r0,#1 @; len--
++ bne inflate_fast_copy_gap1b_proc_lastfewbytes_loop
++
++ sub r3,r3,#1 @; out--
++ str r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ b inflate_fast_copy_exit
++
++ #;; ------------------------------------------------------------------
++ #;; vectorized copy routine when gap between ``from'' and ``out''
++ #;; buffers is 2 bytes
++ #;; INPUTS:
++ #;; r0 = len
++ #;; r2 = pointer to from
++ #;; r3 = pointer to out
++ #;; OUTPUTS:
++ #;; r1 = pointer to pointer to out
++ #;; ------------------------------------------------------------------
++inflate_fast_copy_gap2b_proc:
++
++ add r2,r2,#1 @; from++
++ add r3,r3,#1 @; out++
++ @
++ vld1.16 {d0[0]},[r2] @; load 2 bytes into d0[0]
++ vdup.16 q0,d0[0] @; duplicate those 2 bytes 8 times
++ @; to fill up q0
++ @
++ lsrs r4,r0,#4 @; r4 = floor(len/16)
++ @; = iteration count for loop16
++ beq inflate_fast_copy_gap2b_proc_16bytes_loop_done
++
++inflate_fast_copy_gap2b_proc_16bytes_loop:
++
++ vst1.8 {q0},[r3]! @; store 16 bytes in out and
++ @; increment out pointer
++ sub r0,r0,#16 @; subtract 16 from len
++ subs r4,r4,#1 @; decrement iteration count
++ bne inflate_fast_copy_gap2b_proc_16bytes_loop
++
++inflate_fast_copy_gap2b_proc_16bytes_loop_done:
++
++ cmp r0,#0
++ subeq r3,r3,#1 @; out--
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ beq inflate_fast_copy_exit
++
++inflate_fast_copy_gap2b_proc_lastfewbytes_loop:
++
++ ldrb r12,[r2],#1 @; r12 = *from, from++
++ subs r0,r0,#1 @; len--
++ strb r12,[r3],#1 @; *out = r12, out++
++ @
++ bne inflate_fast_copy_gap2b_proc_lastfewbytes_loop
++
++ sub r3,r3,#1 @; out--
++ str r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ b inflate_fast_copy_exit
++
++ #;; ------------------------------------------------------------------
++ #;; vectorized copy routine when gap between ``from'' and ``out''
++ #;; buffers is 3 bytes
++ #;; INPUTS:
++ #;; r0 = len
++ #;; r2 = pointer to from
++ #;; r3 = pointer to out
++ #;; r4 = 3
++ #;; OUTPUTS:
++ #;; r1 = pointer to pointer to out
++ #;; ------------------------------------------------------------------
++inflate_fast_copy_gap3b_proc:
++
++ add r2,r2,#1 @; from++
++ add r3,r3,#1 @; out++
++ @
++ vld1.32 {d0[0]},[r2] @; load 4 bytes into d0[0]
++
++inflate_fast_copy_gap3b_proc_3bytes_loop:
++
++ cmp r0,#3 @; exit loop if len < 3
++ blt inflate_fast_copy_gap3b_proc_3bytes_loop_done
++
++ vst1.32 {d0[0]},[r3],r4 @; store 4 bytes in out
++ @; out+=3
++
++ sub r0,r0,#3 @; len-=3
++ b inflate_fast_copy_gap3b_proc_3bytes_loop
++
++inflate_fast_copy_gap3b_proc_3bytes_loop_done:
++
++ cmp r0,#0
++ subeq r3,r3,#1 @; out--
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ beq inflate_fast_copy_exit
++
++inflate_fast_copy_gap3b_proc_lastfewbytes_loop:
++
++ ldrb r12,[r2],#1 @; r12 = *from, from++
++ subs r0,r0,#1 @; len--
++ strb r12,[r3],#1 @; *out = r12, out++
++
++ bne inflate_fast_copy_gap3b_proc_lastfewbytes_loop
++
++ sub r3,r3,#1 @; out--
++ str r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ b inflate_fast_copy_exit
++
++ #;; ------------------------------------------------------------------
++ #;; vectorized copy routine when gap between ``from'' and ``out''
++ #;; buffers is 4 bytes
++ #;; INPUTS:
++ #;; r0 = len
++ #;; r2 = pointer to from
++ #;; r3 = pointer to out
++ #;; OUTPUTS:
++ #;; r1 = pointer to pointer to out
++ #;; ------------------------------------------------------------------
++inflate_fast_copy_gap4b_proc:
++
++ add r2,r2,#1 @; from++
++ add r3,r3,#1 @; out++
++ @
++ vld1.32 {d0[0]},[r2] @; load 4 bytes into d0[0]
++ vdup.32 q0,d0[0] @; duplicate those 4 bytes 4 times
++ @; to fill up q0
++ @
++ lsrs r4,r0,#4 @; r4 = floor(len/16)
++ @; = iteration count for loop16
++ beq inflate_fast_copy_gap4b_proc_16bytes_loop_done
++
++inflate_fast_copy_gap4b_proc_16bytes_loop:
++
++ vst1.32 {q0},[r3]! @; store 16 bytes in out and
++ @; increment out pointer
++ sub r0,r0,#16 @; subtract 16 from len
++ subs r4,r4,#1 @; decrement iteration count
++ bne inflate_fast_copy_gap4b_proc_16bytes_loop
++
++inflate_fast_copy_gap4b_proc_16bytes_loop_done:
++
++ cmp r0,#0
++ subeq r3,r3,#1 @; out--
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ beq inflate_fast_copy_exit
++
++inflate_fast_copy_gap4b_proc_lastfewbytes_loop:
++
++ ldrb r12,[r2],#1 @; r12 = *from, from++
++ subs r0,r0,#1 @; len--
++ strb r12,[r3],#1 @; *out = r12, out++
++
++ bne inflate_fast_copy_gap4b_proc_lastfewbytes_loop
++
++ sub r3,r3,#1 @; out--
++ str r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ b inflate_fast_copy_exit
++
++ #;; ------------------------------------------------------------------
++ #;; vectorized copy routine when gap between ``from'' and ``out''
++ #;; buffers is {5-7} bytes
++ #;; INPUTS:
++ #;; r0 = len
++ #;; r2 = pointer to from
++ #;; r3 = pointer to out
++ #;; r4 = {5-7}
++ #;; OUTPUTS:
++ #;; r1 = pointer to pointer to out
++ #;; ------------------------------------------------------------------
++inflate_fast_copy_gap5to7b_proc:
++
++ add r2,r2,#1 @; from++
++ add r3,r3,#1 @; out++
++ @
++ vld1.8 {d0},[r2] @; load 8 bytes into d0
++
++inflate_fast_copy_gap5to7b_proc_5to7bytes_loop:
++
++ cmp r0,r4 @; exit loop if len < {5-7}
++ blt inflate_fast_copy_gap5to7b_proc_5to7bytes_loop_done
++
++ vst1.8 {d0},[r3],r4 @; store 8 bytes in out
++ @; out+={5-7}
++
++ sub r0,r0,r4 @; len-={5-7}
++ b inflate_fast_copy_gap5to7b_proc_5to7bytes_loop
++
++inflate_fast_copy_gap5to7b_proc_5to7bytes_loop_done:
++
++ cmp r0,#0
++ subeq r3,r3,#1 @; out--
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ beq inflate_fast_copy_exit
++
++inflate_fast_copy_gap5to7b_proc_lastfewbytes_loop:
++
++ ldrb r12,[r2],#1 @; r12 = *from, from++
++ subs r0,r0,#1 @; len--
++ strb r12,[r3],#1 @; *out = r12, out++
++
++ bne inflate_fast_copy_gap5to7b_proc_lastfewbytes_loop
++
++ sub r3,r3,#1 @; out--
++ str r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ b inflate_fast_copy_exit
++
++ #;; ------------------------------------------------------------------
++ #;; vectorized copy routine when gap between ``from'' and ``out''
++ #;; buffers is 8 bytes
++ #;; INPUTS:
++ #;; r0 = len
++ #;; r2 = pointer to from
++ #;; r3 = pointer to out
++ #;; OUTPUTS:
++ #;; r1 = pointer to pointer to out
++ #;; ------------------------------------------------------------------
++inflate_fast_copy_gap8b_proc:
++
++ add r2,r2,#1 @; from++
++ add r3,r3,#1 @; out++
++ @
++ vld1.8 {d0},[r2] @; load 8 bytes into d0
++ vmov d1,d0 @; duplicate the 8 bytes to fill up
++ @; q0
++ @
++ lsrs r4,r0,#4 @; r4 = floor(len/16)
++ @; = iteration count for loop16
++ beq inflate_fast_copy_gap8b_proc_16bytes_loop_done
++
++inflate_fast_copy_gap8b_proc_16bytes_loop:
++
++ vst1.8 {q0},[r3]! @; store 16 bytes in out and
++ @; increment out pointer
++ sub r0,r0,#16 @; subtract 16 from len
++ subs r4,r4,#1 @; decrement iteration count
++ bne inflate_fast_copy_gap8b_proc_16bytes_loop
++
++inflate_fast_copy_gap8b_proc_16bytes_loop_done:
++
++ cmp r0,#0
++ subeq r3,r3,#1 @; out--
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ beq inflate_fast_copy_exit
++
++inflate_fast_copy_gap8b_proc_lastfewbytes_loop:
++
++ ldrb r12,[r2],#1 @; r12 = *from, from++
++ subs r0,r0,#1 @; len--
++ strb r12,[r3],#1 @; *out = r12, out++
++
++ bne inflate_fast_copy_gap8b_proc_lastfewbytes_loop
++
++ sub r3,r3,#1 @; out--
++ str r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ b inflate_fast_copy_exit
++
++ #;; ------------------------------------------------------------------
++ #;; vectorized copy routine when gap between ``from'' and ``out''
++ #;; buffers is {9-15} bytes
++ #;; INPUTS:
++ #;; r0 = len
++ #;; r2 = pointer to from
++ #;; r3 = pointer to out
++ #;; r4 = {9-15}
++ #;; OUTPUTS:
++ #;; r1 = pointer to pointer to out
++ #;; ------------------------------------------------------------------
++inflate_fast_copy_gap9to15b_proc:
++
++ add r2,r2,#1 @; from++
++ add r3,r3,#1 @; out++
++ @
++ vld1.8 {q0},[r2] @; load 16 bytes into q0
++
++inflate_fast_copy_gap9to15b_proc_9to15bytes_loop:
++
++ cmp r0, r4 @; exit loop if len < {9-15}
++ blt inflate_fast_copy_gap9to15b_proc_9to15bytes_loop_done
++
++ vst1.8 {q0},[r3],r4 @; store 16 bytes in out
++ @; out+={9-15}
++
++ sub r0,r0,r4 @; len-={9-15}
++ b inflate_fast_copy_gap9to15b_proc_9to15bytes_loop
++
++inflate_fast_copy_gap9to15b_proc_9to15bytes_loop_done:
++
++ cmp r0,#0
++ subeq r3,r3,#1 @; out--
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ beq inflate_fast_copy_exit
++
++inflate_fast_copy_gap9to15b_proc_lastfewbytes_loop:
++
++ ldrb r12,[r2],#1 @; r12 = *from, from++
++ subs r0,r0,#1 @; len--
++ strb r12,[r3],#1 @; *out = r12, out++
++
++ bne inflate_fast_copy_gap9to15b_proc_lastfewbytes_loop
++
++ sub r3,r3,#1 @; out--
++ str r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ b inflate_fast_copy_exit
++
++ #;; ------------------------------------------------------------------
++ #;; vectorized copy routine when gap between ``from'' and ``out''
++ #;; buffers is 16 bytes or more
++ #;; INPUTS:
++ #;; r0 = len
++ #;; r2 = pointer to from
++ #;; r3 = pointer to out
++ #;; OUTPUTS:
++ #;; r1 = pointer to pointer to out
++ #;; ------------------------------------------------------------------
++inflate_fast_copy_gap16b_proc:
++
++ add r2,r2,#1 @; from++
++ add r3,r3,#1 @; out++
++ @
++ lsrs r4,r0,#4 @; r4 = floor(len/16)
++ @; = iteration count for loop16
++ beq inflate_fast_copy_gap16b_proc_16bytes_loop_done
++
++inflate_fast_copy_gap16b_proc_16bytes_loop:
++
++ vld1.8 {q0},[r2]! @; load 16 bytes into q0 and
++ @; increment from pointer
++ vst1.8 {q0},[r3]! @; store 16 bytes in out and
++ @; increment out pointer
++ sub r0,r0,#16 @; subtract 16 from len
++ subs r4,r4,#1 @; decrement iteration count
++ bne inflate_fast_copy_gap16b_proc_16bytes_loop
++
++inflate_fast_copy_gap16b_proc_16bytes_loop_done:
++
++ cmp r0,#0
++ subeq r3,r3,#1 @; out--
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++ beq inflate_fast_copy_exit
++
++inflate_fast_copy_gap16b_proc_lastfewbytes_loop:
++
++ ldrb r12,[r2],#1 @; r12 = *from, from++
++ subs r0,r0,#1 @; len--
++ strb r12,[r3],#1 @; *out = r12, out++
++
++ bne inflate_fast_copy_gap16b_proc_lastfewbytes_loop
++
++ sub r3,r3,#1 @; out--
++ str r3,[r1,#0] @; r1 = updated pointer to pointer
++ @; to out
++
++inflate_fast_copy_exit:
++
++ pop {r4-r11} @; pop r4-r11 from stack
++ bx lr @; return
++
++.size inflate_fast_copy_neon, .-inflate_fast_copy_neon
++
++#endif
++ .END
++
++
« no previous file with comments | « no previous file | sys-libs/zlib/zlib-1.2.3-r1.ebuild » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698