Index: sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch |
diff --git a/sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch b/sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch |
new file mode 100644 |
index 0000000000000000000000000000000000000000..3e7698e96bfd4a6400a15e4142391b7cc973d7b3 |
--- /dev/null |
+++ b/sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch |
@@ -0,0 +1,865 @@ |
+commit df426dcc3179a3647a695c0fde66e1b3616fa6c0 |
+Author: John Alayari <jalayari@codeaurora.org> |
+Date: Thu Nov 18 16:24:17 2010 -0800 |
+ |
+ Committing the PNG and ZLIB NEON optimization files to the local git repository. |
+ |
+diff --git a/Makefile.in b/Makefile.in |
+index 7da5a85..276c531 100644 |
+--- a/Makefile.in |
++++ b/Makefile.in |
+@@ -51,7 +51,7 @@ OBJS = adler32.o compress.o crc32.o gzio.o uncompr.o deflate.o trees.o \ |
+ |
+ PIC_OBJS = $(OBJS:%.o=%.lo) |
+ |
+-OBJA = |
++OBJA = inflate_fast_copy_neon.o adler32_DO16_loop_neon.o |
+ # to use the asm code: make OBJA=match.o |
+ |
+ TEST_OBJS = example.o minigzip.o |
+@@ -82,8 +82,12 @@ match.o: match.S |
+ %.lo: %.c |
+ $(CC) $(CFLAGS) -DPIC -fPIC -c $< -o $@ |
+ |
+-$(SHAREDLIBV): $(PIC_OBJS) |
+- $(LDSHARED) -o $@ $(PIC_OBJS) -lc $(LDFLAGS) |
++%.o: %.S |
++ $(CC) $(CFLAGS) -DPIC -fPIC -c $< -o $@ |
++ |
++ |
++$(SHAREDLIBV): $(PIC_OBJS) $(OBJA) |
++ $(LDSHARED) -o $@ $(PIC_OBJS) $(OBJA) -lc $(LDFLAGS) |
+ rm -f $(SHAREDLIB) $(SHAREDLIBM) |
+ ln -s $@ $(SHAREDLIB) |
+ ln -s $@ $(SHAREDLIBM) |
+diff --git a/adler32.c b/adler32.c |
+index 007ba26..a256e88 100644 |
+--- a/adler32.c |
++++ b/adler32.c |
+@@ -1,5 +1,6 @@ |
+ /* adler32.c -- compute the Adler-32 checksum of a data stream |
+ * Copyright (C) 1995-2004 Mark Adler |
++ * Copyright (c) 2010, Code Aurora Forum. All rights reserved. |
+ * For conditions of distribution and use, see copyright notice in zlib.h |
+ */ |
+ |
+@@ -18,6 +19,10 @@ |
+ #define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); |
+ #define DO16(buf) DO8(buf,0); DO8(buf,8); |
+ |
++#if defined(__ARM_NEON__) |
++extern void adler32_DO16_loop_neon(unsigned char **, unsigned long *, unsigned long *, int); |
++#endif |
++ |
+ /* use NO_DIVIDE if your processor does not do division in hardware */ |
+ #ifdef NO_DIVIDE |
+ # define MOD(a) \ |
+@@ -96,17 +101,25 @@ uLong ZEXPORT adler32(adler, buf, len) |
+ /* do length NMAX blocks -- requires just one modulo operation */ |
+ while (len >= NMAX) { |
+ len -= NMAX; |
++#if defined(__ARM_NEON__) |
++ adler32_DO16_loop_neon(&buf, &adler, &sum2, NMAX); |
++#else |
+ n = NMAX / 16; /* NMAX is divisible by 16 */ |
+ do { |
+ DO16(buf); /* 16 sums unrolled */ |
+ buf += 16; |
+ } while (--n); |
++#endif |
+ MOD(adler); |
+ MOD(sum2); |
+ } |
+ |
+ /* do remaining bytes (less than NMAX, still just one modulo) */ |
+ if (len) { /* avoid modulos if none remaining */ |
++ |
++#if defined(__ARM_NEON__) |
++ adler32_DO16_loop_neon(&buf, &adler, &sum2, len); |
++#else |
+ while (len >= 16) { |
+ len -= 16; |
+ DO16(buf); |
+@@ -116,6 +129,7 @@ uLong ZEXPORT adler32(adler, buf, len) |
+ adler += *buf++; |
+ sum2 += adler; |
+ } |
++#endif |
+ MOD(adler); |
+ MOD(sum2); |
+ } |
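Note: the helper called above has a small contract: absorb `len` bytes into the running adler/sum2 pair without applying the modulo (the caller still performs MOD afterwards) and advance the buffer through the pointer-to-pointer argument. A plain-C sketch of that contract, illustrative only (adler32_do16_loop_ref is a hypothetical name; the real implementation is the NEON assembly file added below):

    /* Scalar reference for the semantics expected of
     * adler32_DO16_loop_neon(): accumulate len bytes into adler/sum2
     * (no modulo -- the caller applies MOD) and advance the buffer. */
    static void adler32_do16_loop_ref(unsigned char **buf,
                                      unsigned long *adler,
                                      unsigned long *sum2,
                                      int len)
    {
        unsigned char *p = *buf;
        while (len-- > 0) {
            *adler += *p++;    /* adler' = adler + buf[i] */
            *sum2  += *adler;  /* sum2'  = sum2 + adler'  */
        }
        *buf = p;              /* caller sees the advanced pointer */
    }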
+diff --git a/adler32_DO16_loop_neon.S b/adler32_DO16_loop_neon.S |
+new file mode 100755 |
+index 0000000..1ba5147 |
+--- /dev/null |
++++ b/adler32_DO16_loop_neon.S |
+@@ -0,0 +1,195 @@ |
++# |
++# Copyright (c) 2010, Code Aurora Forum. All rights reserved. |
++# |
++# Redistribution and use in source and binary forms, with or without |
++# modification, are permitted provided that the following conditions |
++# are met: |
++# * Redistributions of source code must retain the above copyright |
++# notice, this list of conditions and the following disclaimer. |
++# * Redistributions in binary form must reproduce the above |
++# copyright notice, this list of conditions and the following |
++# disclaimer in the documentation and/or other materials provided |
++# with the distribution. |
++# * Neither the name of Code Aurora Forum, Inc. nor the names of its |
++# contributors may be used to endorse or promote products derived |
++# from this software without specific prior written permission. |
++# |
++# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED |
++# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
++# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT |
++# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS |
++# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
++# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
++# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
++# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
++# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
++# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN |
++# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
++# |
++#if defined(__ARM_NEON__) |
++#====================================================================== |
++ |
++#====================================================================== |
++# Code Section |
++ |
++ .code 32 @ Code is ARM ISA |
++ |
++ .global adler32_DO16_loop_neon |
++ |
++#====================================================================== |
++# Function Name(s) : adler32_DO16_loop_neon |
++# |
++# Function Parameters |
++# r0 = pointer to buf |
++# r1 = pointer to adler |
++# r2 = pointer to sum |
++# r3 = len |
++# |
++# Register Usage |
++# q0, q1 = input data |
++# d18,d24 = adler |
++# d16,d28 = sum |
++# d30 = multiplication factor |
++# |
++#====================================================================== |
++# |
++# algorithm: |
++# |
++# while (length >= loop_counter) |
++# do |
++# length = length - loop_counter |
++# adler = adler_begin |
++# for i = 0 to loop_counter-1 |
++# adler = adler + input[i] |
++# sum2 = sum2_begin |
++# for i = 0 to loop_counter-1 |
++# sum2 = sum2 + (loop_counter-i)*input[i] |
++# sum2 = sum2 + (adler_begin*loop_counter) |
++# adler_begin = adler |
++# sum2_begin = sum2 |
++# end |
++# end |
++# end |
++# |
++# Here loop counter holds values of 16, 8 and 1 to compute |
++# adler and sum for 16 bytes, 8 bytes and 1 byte at a time |
++# adler_begin and sum2_begin are used to hold the values |
++# of adler and sum2 from previous iterations. |
++# |
++#====================================================================== |
++.balign 32 |
++.type adler32_DO16_loop_neon, %function |
++ |
++adler32_DO16_loop_neon: |
++ stmdb sp!,{r4-r7} |
++ ldr r7,[r0] |
++ vld1.32 {d18[0]},[r1] @load the input adler |
++ vld1.32 {d16[0]},[r2] @load the input sum |
++ ldr r5,=FACTOR16 @load the multiplication |
++ @factors for data elements |
++ vld1.8 {d20,d21},[r5] @load the multiplication |
++ @factor for adler. |
++ lsrs r4,r3,#4 @Calculate the number of |
++ @16 byte iterations |
++ beq adler32_DO16_loop_16bytes_done |
++ mov r6, #16 |
++ vmov.32 d30[0],r6 |
++ |
++adler32_DO16_loop_16bytes: |
++ vld1.8 {d0,d1},[r7]! @load buf[0]..buf[15] |
++ vpaddl.u8 d6, d0 @pair wise add to reduce |
++ @8 elements to 4 and extend. |
++ vpaddl.u8 d7,d1 @pair wise add to reduce 8 |
++ @elements to 4 and extend. |
++ vpadd.u16 d24,d6,d7 @pair wise add (i.e. no |
++ @need to extend 16 bits |
++ @sufficient to hold the sum). |
++ vpaddl.u16 d24,d24 @pair wise add to reduce 4 |
++ @elements to 2 and extend. |
++ vpaddl.u32 d24,d24 @pair wise add to get the |
++ @adler of 16 inputs no need |
++ @to extend .. but only vpaddl |
++ @adds pair wise on one |
++ @doubleword. |
++ vadd.u32 d24,d18,d24 @adler'=adler+adler_of_16_inputs |
++ vmull.u8 q13,d20,d0 @sum'=mul_fac_for_inputs[0...7] |
++ @ * buf[0..7]. |
++ vmlal.u8 q13,d21,d1 @sum'=sum'+ mul_fac_for_inputs |
++ @[8...15] * buf[8..15]. |
++ vpadd.u16 d28,d26,d27 @pair wise add the doublewords |
++ vpaddl.u16 d28,d28 @pair wise add to reduce 4 |
++ @elements to 2 and extend. |
++ vpaddl.u32 d28,d28 @pair wise add |
++ vadd.u32 d28,d16,d28 @sum' = sum + sum' |
++ vmla.u32 d28,d18,d30 @sum' = sum' + (adler* |
++ @mul_fac_for_adler). |
++ vmov.u32 d18,d24 @save adler for next iteration. |
++ vmov.u32 d16,d28 @save sum for next iteration. |
++ sub r3,r3,#16 |
++ subs r4,r4,#1 |
++ bne adler32_DO16_loop_16bytes |
++ |
++adler32_DO16_loop_16bytes_done: |
++ lsrs r4, r3, #3 @find if there are at least 8 bytes |
++ beq adler32_DO16_loop_8bytes_done |
++adler32_DO16_loop_8bytes: |
++ vld1.8 {d0},[r7]! @load buf[0]..buf[7] |
++ vpaddl.u8 d24,d0 @pair wise add to |
++ @reduce 8 elements to 4 |
++ vpaddl.u16 d24,d24 @pair wise add to reduce |
++ @4 elements to 2 |
++ vpaddl.u32 d24,d24 @pair wise add to get the |
++ @adler for 8 inputs |
++ vadd.u32 d24,d18,d24 @adler' = adler + |
++ @adler_for_8_inputs. |
++ vmull.u8 q13,d21,d0 @sum' = mul_fac_for_inputs[0..7] |
++ @ * buf[0..7] |
++ vpadd.u16 d28,d26,d27 @pair wise add to reduce 8 |
++ @elements to 4 |
++ vpaddl.u16 d28,d28 @pair wise add to reduce 4 |
++ @elements to 2. |
++ vpaddl.u32 d28,d28 @pair wise add |
++ vadd.u32 d28,d16,d28 @sum' = sum + sum' |
++ vshl.u32 d30,d18,#3 @adler" = adler * 8 |
++ vadd.u32 d28,d28,d30 @sum' = sum' + adler" |
++ vmov.u32 d18,d24 @save adler for next iteration |
++ vmov.u32 d16,d28 @save sum for next iteration |
++ sub r3,r3,#8 |
++ |
++adler32_DO16_loop_8bytes_done: |
++ cmp r3, #0 @find if there are remaining bytes |
++ beq DONE @profiling showed that a loop |
++ @computing 4 or 2 bytes at a time |
++ @is less efficient than byte-by-byte |
++ @computation. |
++ vmov.u64 d3, #0 |
++ |
++adler32_DO16_loop_remaining: |
++ vld1.8 {d3[0]}, [r7]! @load 1 byte of input |
++ subs r3,r3,#1 |
++ vadd.u32 d24,d3,d18 @adler' = adler + *buf |
++ vadd.u32 d28,d24,d16 @sum' = sum + adler' |
++ vmov.u32 d18,d24 |
++ vmov.u32 d16,d28 |
++ bne adler32_DO16_loop_remaining |
++ |
++DONE: |
++ vst1.32 {d24[0]},[r1] |
++ vst1.32 {d28[0]},[r2] |
++ str r7, [r0] |
++ ldmia sp!, {r4-r7} |
++ bx lr |
++ |
++.size adler32_DO16_loop_neon, .-adler32_DO16_loop_neon |
++ |
++.balign 16 |
++#====================================================================== |
++#FACTOR16 provides the multiplication factors for the inputs for 16 |
++#byte loops. The second half (i.e. 8 to 1) has the multiplication |
++#factors for 8 byte loops. |
++#====================================================================== |
++ |
++FACTOR16: |
++ .byte 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 |
++ |
++#endif |
++ .END |
++ |
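Note: the FACTOR16 table above encodes the closed-form update the NEON code relies on: absorbing n bytes in one step grows sum2 by n*adler plus a weighted sum of the inputs with weights n, n-1, ..., 1, which is exactly what n applications of the byte-by-byte recurrence produce. A small scalar check of that identity (illustrative only; blocked_step and bytewise_step are hypothetical names, not part of the patch):

    #include <assert.h>
    #include <stddef.h>

    /* Blocked update used by the NEON code (vmull/vmlal with FACTOR16,
     * vmla by the block length, pairwise-add tree for adler). */
    static void blocked_step(const unsigned char *buf, size_t n,
                             unsigned long *adler, unsigned long *sum2)
    {
        unsigned long a = 0, w = 0;
        size_t i;
        for (i = 0; i < n; i++) {
            a += buf[i];
            w += (unsigned long)(n - i) * buf[i];
        }
        *sum2 += (unsigned long)n * *adler + w;
        *adler += a;
    }

    /* Byte-by-byte recurrence, as in zlib's DO16 macros. */
    static void bytewise_step(const unsigned char *buf, size_t n,
                              unsigned long *adler, unsigned long *sum2)
    {
        size_t i;
        for (i = 0; i < n; i++) {
            *adler += buf[i];
            *sum2  += *adler;
        }
    }

    int main(void)
    {
        unsigned char buf[16] =
            {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
        unsigned long a1 = 1, s1 = 0, a2 = 1, s2 = 0;
        blocked_step(buf, 16, &a1, &s1);
        bytewise_step(buf, 16, &a2, &s2);
        assert(a1 == a2 && s1 == s2);   /* both yield the same pair */
        return 0;
    }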
+diff --git a/inffast.c b/inffast.c |
+index bbee92e..943cb92 100644 |
+--- a/inffast.c |
++++ b/inffast.c |
+@@ -8,6 +8,10 @@ |
+ #include "inflate.h" |
+ #include "inffast.h" |
+ |
++#if defined(__ARM_NEON__) |
++extern void inflate_fast_copy_neon(unsigned len, unsigned char **out, unsigned char *from); |
++#endif |
++ |
+ #ifndef ASMINF |
+ |
+ /* Allow machine dependent optimization for post-increment or pre-increment. |
+@@ -231,6 +235,9 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ |
+ from = out - dist; /* rest from output */ |
+ } |
+ } |
++#if defined(__ARM_NEON__) |
++ inflate_fast_copy_neon(len, &out, from); |
++#else |
+ while (len > 2) { |
+ PUP(out) = PUP(from); |
+ PUP(out) = PUP(from); |
+@@ -242,9 +249,13 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ |
+ if (len > 1) |
+ PUP(out) = PUP(from); |
+ } |
++#endif |
+ } |
+ else { |
+ from = out - dist; /* copy direct from output */ |
++#if defined(__ARM_NEON__) |
++ inflate_fast_copy_neon(len, &out, from); |
++#else |
+ do { /* minimum length is three */ |
+ PUP(out) = PUP(from); |
+ PUP(out) = PUP(from); |
+@@ -256,6 +267,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ |
+ if (len > 1) |
+ PUP(out) = PUP(from); |
+ } |
++#endif |
+ } |
+ } |
+ else if ((op & 64) == 0) { /* 2nd level distance code */ |
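Note: both call sites above replace zlib's PUP() copy loops, so the helper must behave exactly like them: a forward, pre-increment copy that tolerates overlap between `from` and `out` (overlap is how LZ77 repeats runs), with the advanced `out` written back through the pointer-to-pointer. A scalar sketch of that contract (illustrative only; inflate_fast_copy_ref is a hypothetical name, and the real routine, added below, dispatches on the gap |out - from|):

    /* Scalar reference for inflate_fast_copy_neon()'s contract.
     * Pre-increment, matching PUP() when POSTINC is not defined;
     * copying in increasing address order preserves the semantics of
     * overlapping LZ77 copies (distance < length). */
    static void inflate_fast_copy_ref(unsigned len, unsigned char **out,
                                      unsigned char *from)
    {
        unsigned char *o = *out;
        while (len--)
            *++o = *++from;
        *out = o;
    }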
+diff --git a/inflate_fast_copy_neon.S b/inflate_fast_copy_neon.S |
+new file mode 100755 |
+index 0000000..ec1e4ab |
+--- /dev/null |
++++ b/inflate_fast_copy_neon.S |
+@@ -0,0 +1,521 @@ |
++#; Copyright (c) 2010, Code Aurora Forum. All rights reserved. |
++#; |
++#; Redistribution and use in source and binary forms, with or without |
++#; modification, are permitted provided that the following conditions are |
++#; met: |
++#; * Redistributions of source code must retain the above copyright |
++#; notice, this list of conditions and the following disclaimer. |
++#; * Redistributions in binary form must reproduce the above |
++#; copyright notice, this list of conditions and the following |
++#; disclaimer in the documentation and/or other materials provided |
++#; with the distribution. |
++#; * Neither the name of Code Aurora Forum, Inc. nor the names of its |
++#; contributors may be used to endorse or promote products derived |
++#; from this software without specific prior written permission. |
++#; |
++#; THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED |
++#; WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
++#; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT |
++#; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS |
++#; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
++#; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
++#; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
++#; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
++#; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
++#; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN |
++#; IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
++#; |
++#; |
++#if defined(__ARM_NEON__) |
++#;============================================================================ |
++#; Code Section |
++ .code 32 @; Code is ARM ISA |
++#;============================================================================ |
++ |
++ .global inflate_fast_copy_neon |
++ |
++ |
++#;============================================================================ |
++#; INPUTS: r0 len: number of bytes to transfer |
++#; r1 **out: pointer to pointer to ``out'' buffer |
++#; r2 *from: pointer to ``from'' buffer |
++#; OUTPUTS: r1 **out: pointer to pointer to ``out'' buffer |
++#;============================================================================ |
++.balign 32 |
++.type inflate_fast_copy_neon, %function |
++inflate_fast_copy_neon: |
++ push {r4-r11} @; push r4-r11 onto stack |
++ |
++ cmp r0,#16 @; |
++ bge inflate_fast_copy_vectorized |
++ |
++ #;; transfer bytes one by one |
++ #;; only if len < 16 bytes |
++inflate_fast_copy_default: |
++ |
++ cmp r0,#0 |
++ beq inflate_fast_copy_exit |
++ |
++ ldr r3,[r1,#0] @; r3 = pointer to out |
++ |
++inflate_fast_copy_default_loop: |
++ |
++ ldrb r12,[r2,#1]! @; r12 = *(++from) |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3,#1]! @; *(++out) = r12 |
++ |
++ bne inflate_fast_copy_default_loop |
++ |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; vectorized copy routines |
++ #;; only if len >= 16 bytes |
++inflate_fast_copy_vectorized: |
++ |
++ ldr r3,[r1,#0] @; r3 = pointer to out |
++ @; DON'T TOUCH r1 UNTIL FINAL |
++ @; UPDATE OF r1 WITH ADDRESS OF r3 |
++ cmp r3,r2 @ |
++ sublt r4,r2,r3 @ |
++ subge r4,r3,r2 @;r4 = gap = |out-from| |
++ |
++ cmp r4,#0 |
++ beq inflate_fast_copy_exit |
++ |
++ cmp r4,#1 |
++ beq inflate_fast_copy_gap1b_proc |
++ |
++ cmp r4,#2 |
++ beq inflate_fast_copy_gap2b_proc |
++ |
++ cmp r4,#3 |
++ beq inflate_fast_copy_gap3b_proc |
++ |
++ cmp r4,#4 |
++ beq inflate_fast_copy_gap4b_proc |
++ |
++ cmp r4,#8 |
++ blt inflate_fast_copy_gap5to7b_proc |
++ beq inflate_fast_copy_gap8b_proc |
++ |
++ cmp r4,#16 |
++ blt inflate_fast_copy_gap9to15b_proc |
++ bge inflate_fast_copy_gap16b_proc |
++ |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is 1 byte |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap1b_proc: |
++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ ldrb r12,[r2,#1]! @; r12 = *(++from) |
++ vdup.8 q0, r12 @; duplicate r12 16 times in q0 |
++ @ |
++ lsrs r4,r0,#4 @; r4 = floor(len/16) |
++ @; = iteration count for loop16 |
++ beq inflate_fast_copy_gap1b_proc_16bytes_loop_done |
++ |
++inflate_fast_copy_gap1b_proc_16bytes_loop: |
++ |
++ vst1.8 {q0},[r3]! @; store 16 bytes in out and |
++ @; increment out pointer |
++ sub r0,r0,#16 @; subtract 16 from len |
++ subs r4,r4,#1 @; decrement iteration count |
++ bne inflate_fast_copy_gap1b_proc_16bytes_loop |
++ |
++inflate_fast_copy_gap1b_proc_16bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap1b_proc_lastfewbytes_loop: |
++ |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ subs r0,r0,#1 @; len-- |
++ bne inflate_fast_copy_gap1b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is 2 bytes |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap2b_proc: |
++ |
++ add r2,r2,#1 @; from++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ vld1.16 {d0[0]},[r2] @; load 2 bytes into d0[0] |
++ vdup.16 q0,d0[0] @; duplicate those 2 bytes 8 times |
++ @; to fill up q0 |
++ @ |
++ lsrs r4,r0,#4 @; r4 = floor(len/16) |
++ @; = iteration count for loop16 |
++ beq inflate_fast_copy_gap2b_proc_16bytes_loop_done |
++ |
++inflate_fast_copy_gap2b_proc_16bytes_loop: |
++ |
++ vst1.8 {q0},[r3]! @; store 16 bytes in out and |
++ @; increment out pointer |
++ sub r0,r0,#16 @; subtract 16 from len |
++ subs r4,r4,#1 @; decrement iteration count |
++ bne inflate_fast_copy_gap2b_proc_16bytes_loop |
++ |
++inflate_fast_copy_gap2b_proc_16bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap2b_proc_lastfewbytes_loop: |
++ |
++ ldrb r12,[r2],#1 @; r12 = *from, from++ |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ @ |
++ bne inflate_fast_copy_gap2b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is 3 bytes |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; r4 = 3 |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap3b_proc: |
++ |
++ add r2,r2,#1 @; from++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ vld1.32 {d0[0]},[r2] @; load 4 bytes into d0[0] |
++ |
++inflate_fast_copy_gap3b_proc_3bytes_loop: |
++ |
++ cmp r0,#3 @; exit loop if len < 3 |
++ blt inflate_fast_copy_gap3b_proc_3bytes_loop_done |
++ |
++ vst1.32 {d0[0]},[r3],r4 @; store 4 bytes in out |
++ @; out+=3 |
++ |
++ sub r0,r0,#3 @; len-=3 |
++ b inflate_fast_copy_gap3b_proc_3bytes_loop |
++ |
++inflate_fast_copy_gap3b_proc_3bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap3b_proc_lastfewbytes_loop: |
++ |
++ ldrb r12,[r2],#1 @; r12 = *from, from++ |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ |
++ bne inflate_fast_copy_gap3b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is 4 bytes |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap4b_proc: |
++ |
++ add r2,r2,#1 @; from++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ vld1.32 {d0[0]},[r2] @; load 4 bytes into d0[0] |
++ vdup.32 q0,d0[0] @; duplicate those 4 bytes 4 times |
++ @; to fill up q0 |
++ @ |
++ lsrs r4,r0,#4 @; r4 = floor(len/16) |
++ @; = iteration count for loop16 |
++ beq inflate_fast_copy_gap4b_proc_16bytes_loop_done |
++ |
++inflate_fast_copy_gap4b_proc_16bytes_loop: |
++ |
++ vst1.32 {q0},[r3]! @; store 16 bytes in out and |
++ @; increment out pointer |
++ sub r0,r0,#16 @; subtract 16 from len |
++ subs r4,r4,#1 @; decrement iteration count |
++ bne inflate_fast_copy_gap4b_proc_16bytes_loop |
++ |
++inflate_fast_copy_gap4b_proc_16bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap4b_proc_lastfewbytes_loop: |
++ |
++ ldrb r12,[r2],#1 @; r12 = *from, from++ |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ |
++ bne inflate_fast_copy_gap4b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is {5-7} bytes |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; r4 = {5-7} |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap5to7b_proc: |
++ |
++ add r2,r2,#1 @; from++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ vld1.8 {d0},[r2] @; load 8 bytes into d0 |
++ |
++inflate_fast_copy_gap5to7b_proc_5to7bytes_loop: |
++ |
++ cmp r0,r4 @; exit loop if len < {5-7} |
++ blt inflate_fast_copy_gap5to7b_proc_5to7bytes_loop_done |
++ |
++ vst1.8 {d0},[r3],r4 @; store 8 bytes in out |
++ @; out+={5-7} |
++ |
++ sub r0,r0,r4 @; len-={5-7} |
++ b inflate_fast_copy_gap5to7b_proc_5to7bytes_loop |
++ |
++inflate_fast_copy_gap5to7b_proc_5to7bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap5to7b_proc_lastfewbytes_loop: |
++ |
++ ldrb r12,[r2],#1 @; r12 = *from, from++ |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ |
++ bne inflate_fast_copy_gap5to7b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is 8 bytes |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap8b_proc: |
++ |
++ add r2,r2,#1 @; from++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ vld1.8 {d0},[r2] @; load 8 bytes into d0 |
++ vmov d1,d0 @; duplicate the 8 bytes to fill up |
++ @; q0 |
++ @ |
++ lsrs r4,r0,#4 @; r4 = floor(len/16) |
++ @; = iteration count for loop16 |
++ beq inflate_fast_copy_gap8b_proc_16bytes_loop_done |
++ |
++inflate_fast_copy_gap8b_proc_16bytes_loop: |
++ |
++ vst1.8 {q0},[r3]! @; store 16 bytes in out and |
++ @; increment out pointer |
++ sub r0,r0,#16 @; subtract 16 from len |
++ subs r4,r4,#1 @; decrement iteration count |
++ bne inflate_fast_copy_gap8b_proc_16bytes_loop |
++ |
++inflate_fast_copy_gap8b_proc_16bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap8b_proc_lastfewbytes_loop: |
++ |
++ ldrb r12,[r2],#1 @; r12 = *from, from++ |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ |
++ bne inflate_fast_copy_gap8b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is {9-15} bytes |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; r4 = {9-15} |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap9to15b_proc: |
++ |
++ add r2,r2,#1 @; from++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ vld1.8 {q0},[r2] @; load 16 bytes into q0 |
++ |
++inflate_fast_copy_gap9to15b_proc_9to15bytes_loop: |
++ |
++ cmp r0, r4 @; exit loop if len < {9-15} |
++ blt inflate_fast_copy_gap9to15b_proc_9to15bytes_loop_done |
++ |
++ vst1.8 {q0},[r3],r4 @; store 16 bytes in out |
++ @; out+={9-15} |
++ |
++ sub r0,r0,r4 @; len-={9-15} |
++ b inflate_fast_copy_gap9to15b_proc_9to15bytes_loop |
++ |
++inflate_fast_copy_gap9to15b_proc_9to15bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap9to15b_proc_lastfewbytes_loop: |
++ |
++ ldrb r12,[r2],#1 @; r12 = *from, from++ |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ |
++ bne inflate_fast_copy_gap9to15b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ b inflate_fast_copy_exit |
++ |
++ #;; ------------------------------------------------------------------ |
++ #;; vectorized copy routine when gap between ``from'' and ``out'' |
++ #;; buffers is 16 bytes or more |
++ #;; INPUTS: |
++ #;; r0 = len |
++ #;; r2 = pointer to from |
++ #;; r3 = pointer to out |
++ #;; OUTPUTS: |
++ #;; r1 = pointer to pointer to out |
++ #;; ------------------------------------------------------------------ |
++inflate_fast_copy_gap16b_proc: |
++ |
++ add r2,r2,#1 @; from++ |
++ add r3,r3,#1 @; out++ |
++ @ |
++ lsrs r4,r0,#4 @; r4 = floor(len/16) |
++ @; = iteration count for loop16 |
++ beq inflate_fast_copy_gap16b_proc_16bytes_loop_done |
++ |
++inflate_fast_copy_gap16b_proc_16bytes_loop: |
++ |
++ vld1.8 {q0},[r2]! @; load 16 bytes into q0 and |
++ @; increment from pointer |
++ vst1.8 {q0},[r3]! @; store 16 bytes in out and |
++ @; increment out pointer |
++ sub r0,r0,#16 @; subtract 16 from len |
++ subs r4,r4,#1 @; decrement iteration count |
++ bne inflate_fast_copy_gap16b_proc_16bytes_loop |
++ |
++inflate_fast_copy_gap16b_proc_16bytes_loop_done: |
++ |
++ cmp r0,#0 |
++ subeq r3,r3,#1 @; out-- |
++ streq r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ beq inflate_fast_copy_exit |
++ |
++inflate_fast_copy_gap16b_proc_lastfewbytes_loop: |
++ |
++ ldrb r12,[r2],#1 @; r12 = *from, from++ |
++ subs r0,r0,#1 @; len-- |
++ strb r12,[r3],#1 @; *out = r12, out++ |
++ |
++ bne inflate_fast_copy_gap16b_proc_lastfewbytes_loop |
++ |
++ sub r3,r3,#1 @; out-- |
++ str r3,[r1,#0] @; r1 = updated pointer to pointer |
++ @; to out |
++ |
++inflate_fast_copy_exit: |
++ |
++ pop {r4-r11} @; pop r4-r11 from stack |
++ bx lr @; return |
++ |
++.size inflate_fast_copy_neon, .-inflate_fast_copy_neon |
++ |
++#endif |
++ .END |
++ |
++ |
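Note: a quick way to sanity-check a build carrying this patch (a suggestion, not part of the patch): the NEON paths must leave adler32() bit-identical to the portable code, so comparing zlib's result against a direct implementation of the checksum over random data catches wiring mistakes immediately. adler32_ref below is a hypothetical helper written for this test only:

    #include <stdio.h>
    #include <stdlib.h>
    #include "zlib.h"

    /* Direct Adler-32 over buf[0..len-1], modulo 65521 at every step. */
    static unsigned long adler32_ref(const unsigned char *buf, unsigned len)
    {
        unsigned long a = 1, b = 0;
        unsigned i;
        for (i = 0; i < len; i++) {
            a = (a + buf[i]) % 65521;
            b = (b + a) % 65521;
        }
        return (b << 16) | a;
    }

    int main(void)
    {
        static unsigned char buf[100000];
        unsigned long got, want;
        unsigned i;
        for (i = 0; i < sizeof(buf); i++)
            buf[i] = (unsigned char)rand();
        got  = adler32(adler32(0L, Z_NULL, 0), buf, sizeof(buf));
        want = adler32_ref(buf, sizeof(buf));
        printf("%s (zlib %#lx, reference %#lx)\n",
               got == want ? "OK" : "MISMATCH", got, want);
        return got != want;
    }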