sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch - Issue 5176006: Applying Neon optimization patch to the ZLIB library.

Side by Side Diff: sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch

Issue 5176006: Applying Neon optimization patch to the ZLIB library. Base URL: http://git.chromium.org/git/portage.git@master

Patch Set: Created 10 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 commit df426dcc3179a3647a695c0fde66e1b3616fa6c0

	2 Author: John Alayari <jalayari@codeaurora.org>

	3 Date: Thu Nov 18 16:24:17 2010 -0800

	4

	5 Committing the PNG and ZLIB Neon optimization files to the local git repository.

	6

	7 diff --git a/Makefile.in b/Makefile.in

	8 index 7da5a85..276c531 100644

	9 --- a/Makefile.in

	10 +++ b/Makefile.in

	11 @@ -51,7 +51,7 @@ OBJS = adler32.o compress.o crc32.o gzio.o uncompr.o deflate.o trees.o \

	12

	13 PIC_OBJS = $(OBJS:%.o=%.lo)

	14

	15 -OBJA =

	16 +OBJA =inflate_fast_copy_neon.o adler32_DO16_loop_neon.o

	17 # to use the asm code: make OBJA=match.o

	18

	19 TEST_OBJS = example.o minigzip.o

	20 @@ -82,8 +82,12 @@ match.o: match.S

	21 %.lo: %.c

	22 $(CC) $(CFLAGS) -DPIC -fPIC -c $< -o $@

	23

	24 -$(SHAREDLIBV): $(PIC_OBJS)

	25 - $(LDSHARED) -o $@ $(PIC_OBJS) -lc $(LDFLAGS)

	26 +%.o: %.S

	27 + $(CC) $(CFLAGS) -DPIC -fPIC -c $< -o $@

	28 +

	29 +

	30 +$(SHAREDLIBV): $(PIC_OBJS) $(OBJA)

	31 + $(LDSHARED) -o $@ $(PIC_OBJS) $(OBJA) -lc $(LDFLAGS)

	32 rm -f $(SHAREDLIB) $(SHAREDLIBM)

	33 ln -s $@ $(SHAREDLIB)

	34 ln -s $@ $(SHAREDLIBM)

	35 diff --git a/adler32.c b/adler32.c

	36 index 007ba26..a256e88 100644

	37 --- a/adler32.c

	38 +++ b/adler32.c

	39 @@ -1,5 +1,6 @@

	40 /* adler32.c -- compute the Adler-32 checksum of a data stream

	41 * Copyright (C) 1995-2004 Mark Adler

	42 + * Copyright (c) 2010, Code Aurora Forum. All rights reserved.

	43 * For conditions of distribution and use, see copyright notice in zlib.h

	44 */

	45

	46 @@ -18,6 +19,10 @@

	47 #define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);

	48 #define DO16(buf) DO8(buf,0); DO8(buf,8);

	49

	50 +#if defined(__ARM_NEON__) /* NEON kernel in adler32_DO16_loop_neon.S; it reads and updates *buf, *adler and *sum2 in place */

	51 +extern void adler32_DO16_loop_neon(const unsigned char **buf, unsigned long *adler, unsigned long *sum2, int len);

	52 +#endif

	53 +

	54 /* use NO_DIVIDE if your processor does not do division in hardware */

	55 #ifdef NO_DIVIDE

	56 # define MOD(a) \

	57 @@ -96,17 +101,25 @@ uLong ZEXPORT adler32(adler, buf, len)

	58 /* do length NMAX blocks -- requires just one modulo operation */

	59 while (len >= NMAX) {

	60 len -= NMAX;

	61 +#if defined(__ARM_NEON__)

	62 + adler32_DO16_loop_neon(&buf, &adler, &sum2, NMAX);

	63 +#else

	64 n = NMAX / 16; /* NMAX is divisible by 16 */

	65 do {

	66 DO16(buf); /* 16 sums unrolled */

	67 buf += 16;

	68 } while (--n);

	69 +#endif

	70 MOD(adler);

	71 MOD(sum2);

	72 }

	73

	74 /* do remaining bytes (less than NMAX, still just one modulo) */

	75 if (len) { /* avoid modulos if none remaining */

	76 +

	77 +#if defined(__ARM_NEON__)

	78 + adler32_DO16_loop_neon(&buf, &adler, &sum2, len);

	79 +#else

	80 while (len >= 16) {

	81 len -= 16;

	82 DO16(buf);

	83 @@ -116,6 +129,7 @@ uLong ZEXPORT adler32(adler, buf, len)

	84 adler += *buf++;

	85 sum2 += adler;

	86 }

	87 +#endif

	88 MOD(adler);

	89 MOD(sum2);

	90 }

	91 diff --git a/adler32_DO16_loop_neon.S b/adler32_DO16_loop_neon.S

	92 new file mode 100755

	93 index 0000000..1ba5147

	94 --- /dev/null

	95 +++ b/adler32_DO16_loop_neon.S

	96 @@ -0,0 +1,195 @@

	97 +#

	98 +# Copyright (c) 2010, Code Aurora Forum. All rights reserved.

	99 +#

	100 +# Redistribution and use in source and binary forms, with or without

	101 +# modification, are permitted provided that the following conditions

	102 +# are met:

	103 +# * Redistributions of source code must retain the above copyright

	104 +# notice, this list of conditions and the following disclaimer.

	105 +# * Redistributions in binary form must reproduce the above

	106 +# copyright notice, this list of conditions and the following

	107 +# disclaimer in the documentation and/or other materials provided

	108 +# with the distribution.

	109 +# * Neither the name of Code Aurora Forum, Inc. nor the names of its

	110 +# contributors may be used to endorse or promote products derived

	111 +# from this software without specific prior written permission.

	112 +#

	113 +# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED

	114 +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF

	115 +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT

	116 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS

	117 +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

	118 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

	119 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR

	120 +# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,

	121 +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE

	122 +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN

	123 +# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	124 +#

	125 +#if defined(__ARM_NEON__)

	126 +#======================================================================

	127 +

	128 +#======================================================================

	129 +# Code Section

	130 +

	131 + .code 32 @ Code is ARM ISA

	132 +

	133 + .global adler32_DO16_loop_neon

	134 +

	135 +#======================================================================

	136 +# Function Name(s) : adler32_DO16_loop_neon

	137 +#

	138 +# Function Parameters

	139 +# r0 = pointer to buf

	140 +# r1 = pointer to adler

	141 +# r2 = pointer to sum

	142 +# r3 = len

	143 +#

	144 +# Register Usage

	145 +# q0, q1 = input data

	146 +# d18,d24 = adler

	147 +# d16,d28 = sum

	148 +# d30 = multiplication factor

	149 +#

	150 +#======================================================================

	151 +#

	152 +# algorithm:

	153 +#

	154 +# while (length >= loop_counter)

	155 +# do

	156 +# length = length - loop_counter

	157 +# for i = 0 to loop_counter

	158 +# adler = adler_begin + input[i]

	159 +# for i = 0 to loop_counter

	160 +# sum2 = sum2_begin+(loop_counter-i)*input[i]

	161 +# sum2 = sum2+(adler_begin*loop_counter)

	162 +# adler_begin = adler

	163 +# sum2_begin = sum2

	164 +# end

	165 +# end

	166 +# end

	167 +#

	168 +# Here loop counter holds values of 16, 8 and 1 to compute

	169 +# adler and sum for 16 bytes, 8 bytes and 1 byte at a time

	170 +# adler_begin and sum2_begin are used to hold the values

	171 +# of adler and sum2 from previous iterations.

	172 +#

	173 +#======================================================================

	174 +.balign 32

	175 +.type adler32_DO16_loop_neon, %function

	176 +

	177 +adler32_DO16_loop_neon: @entry: r0=&buf, r1=&adler, r2=&sum2, r3=len

	178 + stmdb sp!,{r4-r7} @save the scratch registers used below

	179 + ldr r7,[r0] @r7 = *buf, the current input pointer

	180 + vld1.32 {d18[0]},[r1] @load the input adler

	181 + vld1.32 {d16[0]},[r2] @load the input sum

	182 + adr r5,FACTOR16 @PC-relative address of the weight

	183 + @table (no absolute relocation).

	184 + vld1.8 {d20,d21},[r5] @d20,d21 = per-byte weights 16..1

	185 + @applied to the data for the sum.

	186 + lsrs r4,r3,#4 @Calculate the number

	187 + @16 byte iterations

	188 + beq adler32_DO16_loop_16bytes_done

	189 + mov r6, #16

	190 + vmov.32 d30[0],r6 @d30[0] = 16, multiplier for adler

	191 +

	192 +adler32_DO16_loop_16bytes:

	193 + vld1.8 {d0,d1},[r7]! @load buf[0]..buf[15]

	194 + vpaddl.u8 d6, d0 @pair wise add to reduce

	195 + @8 elements to 4 and extend.

	196 + vpaddl.u8 d7,d1 @pair wise add to reduce 8

	197 + @elements to 4 and extend.

	198 + vpadd.u16 d24,d6,d7 @pair wise add (i.e. no

	199 + @need to extend 16 bits

	200 + @sufficient to hold the sum).

	201 + vpaddl.u16 d24,d24 @pair wise add to reduce 4

	202 + @elements to 2 and extend.

	203 + vpaddl.u32 d24,d24 @pair wise add to get the

	204 + @adler of 16 inputs no need

	205 + @to extend .. but only vpaddl

	206 + @adds pair wise on one

	207 + @doubleword.

	208 + vadd.u32 d24,d18,d24 @adler'=adler+adler_of_16_inputs

	209 + vmull.u8 q13,d20,d0 @sum'=mul_fac_for_inputs[0...7]

	210 + @ * buf[0..7].

	211 + vmlal.u8 q13,d21,d1 @sum'=sum'+ mul_fac_for_inputs

	212 + @[8...15] * buf[8..15].

	213 + vpadd.u16 d28,d26,d27 @pair wise add the doublewords

	214 + vpaddl.u16 d28,d28 @pair wise add to reduce 4

	215 + @elements to 2 and extend.

	216 + vpaddl.u32 d28,d28 @pair wise add

	217 + vadd.u32 d28,d16,d28 @sum' = sum + sum'

	218 + vmla.u32 d28,d18,d30 @sum' = sum' + (adler*

	219 + @mul_fac_for_adler).

	220 + vmov.u32 d18,d24 @save adler for next iteration.

	221 + vmov.u32 d16,d28 @save sum for next iteration.

	222 + sub r3,r3,#16

	223 + subs r4,r4,#1

	224 + bne adler32_DO16_loop_16bytes

	225 +

	226 +adler32_DO16_loop_16bytes_done:

	227 + lsrs r4, r3, #3 @find if there are atleast 8 bytes

	228 + beq adler32_DO16_loop_8bytes_done

	229 +adler32_DO16_loop_8bytes:

	230 + vld1.8 {d0},[r7]! @load buf[0] .buf[7]

	231 + vpaddl.u8 d24,d0 @pair wise add to

	232 + @reduce 8 elements to 4

	233 + vpaddl.u16 d24,d24 @pair wise add to reduce

	234 + @4 elements to 2

	235 + vpaddl.u32 d24,d24 @pair wise add to get the

	236 + @adler for 8 inputs

	237 + vadd.u32 d24,d18,d24 @adler' = adler +

	238 + @adler_for_8_inputs.

	239 + vmull.u8 q13,d21,d0 @sum' = mul_fac_for_inputs[0..7]

	240 + @ * buf[0..7]

	241 + vpadd.u16 d28,d26,d27 @pair wise add to reduce 8

	242 + @elements to 4

	243 + vpaddl.u16 d28,d28 @pair wise add to reduce 4

	244 + @elements to 2.

	245 + vpaddl.u32 d28,d28 @pair wise add

	246 + vadd.u32 d28,d16,d28 @sum' = sum + sum'

	247 + vshl.u32 d30,d18,#3 @adler" = adler * 8

	248 + vadd.u32 d28,d28,d30 @sum' = sum' + adler"

	249 + vmov.u32 d18,d24 @save adler for next iteration

	250 + vmov.u32 d16,d28 @save sum for next iteration

	251 + sub r3,r3,#8

	252 +

	253 +adler32_DO16_loop_8bytes_done:

	254 + cmp r3, #0 @find if there are remaining bytes

	255 + beq DONE @after profiling found that a loop

	256 + @to compute 4 or 2 bytes at a time

	257 + @is less efficient than a byte by

	258 + @byte computation.

	259 + vmov.u64 d3, #0

	260 +

	261 +adler32_DO16_loop_remaining:

	262 + vld1.8 {d3[0]}, [r7]! @load 1 byte of input

	263 + subs r3,r3,#1

	264 + vadd.u32 d24,d3,d18 @adler' = adler + *buf

	265 + vadd.u32 d28,d24,d16 @sum' = sum + adler'

	266 + vmov.u32 d18,d24

	267 + vmov.u32 d16,d28

	268 + bne adler32_DO16_loop_remaining

	269 +

	270 +DONE: @write results back and return

	271 + vst1.32 {d18[0]},[r1] @*adler = running adler (d18 is valid even when no loop ran)

	272 + vst1.32 {d16[0]},[r2] @*sum2 = running sum (d16 is valid even when no loop ran)

	273 + str r7, [r0] @*buf = advanced input pointer

	274 + ldmia sp!, {r4-r7} @restore the registers saved on entry

	275 + bx lr

	276 +

	277 +.size adler32_DO16_loop_neon, .-adler32_DO16_loop_neon

	278 +

	279 +.balign 16

	280 +#======================================================================

	281 +#FACTOR16 provides the multiplication factors for the inputs for 16

	282 +#byte loops. The second half (i.e. 8 to 1) has the multiplications

	283 +#factors for 8 byte loops.

	284 +#======================================================================

	285 +

	286 +FACTOR16:

	287 + .byte 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1

	288 +

	289 +#endif

	290 + .END

	291 +

	292 diff --git a/inffast.c b/inffast.c

	293 index bbee92e..943cb92 100644

	294 --- a/inffast.c

	295 +++ b/inffast.c

	296 @@ -8,6 +8,10 @@

	297 #include "inflate.h"

	298 #include "inffast.h"

	299

	300 +#if defined(__ARM_NEON__) /* NEON copy in inflate_fast_copy_neon.S; copies len bytes from `from` and stores the advanced pointer back to *out */

	301 +extern void inflate_fast_copy_neon(unsigned len, unsigned char **out, unsigned char *from);

	302 +#endif

	303 +

	304 #ifndef ASMINF

	305

	306 /* Allow machine dependent optimization for post-increment or pre-increment.

	307 @@ -231,6 +235,9 @@ unsigned start; /* inflate()'s starting value for st rm->avail_out */

	308 from = out - dist; /* rest from output */

	309 }

	310 }

	311 +#if defined(__ARM_NEON__)

	312 + inflate_fast_copy_neon(len, &out, from);

	313 +#else

	314 while (len > 2) {

	315 PUP(out) = PUP(from);

	316 PUP(out) = PUP(from);

	317 @@ -242,9 +249,13 @@ unsigned start; /* inflate()'s starting value for s trm->avail_out */

	318 if (len > 1)

	319 PUP(out) = PUP(from);

	320 }

	321 +#endif

	322 }

	323 else {

	324 from = out - dist; /* copy direct from output */

	325 +#if defined(__ARM_NEON__)

	326 + inflate_fast_copy_neon(len, &out, from);

	327 +#else

	328 do { /* minimum length is three */

	329 PUP(out) = PUP(from);

	330 PUP(out) = PUP(from);

	331 @@ -256,6 +267,7 @@ unsigned start; /* inflate()'s starting value for st rm->avail_out */

	332 if (len > 1)

	333 PUP(out) = PUP(from);

	334 }

	335 +#endif

	336 }

	337 }

	338 else if ((op & 64) == 0) { /* 2nd level distance code */

	339 diff --git a/inflate_fast_copy_neon.S b/inflate_fast_copy_neon.S

	340 new file mode 100755

	341 index 0000000..ec1e4ab

	342 --- /dev/null

	343 +++ b/inflate_fast_copy_neon.S

	344 @@ -0,0 +1,521 @@

	345 +#; Copyright (c) 2010, Code Aurora Forum. All rights reserved.

	346 +#;

	347 +#; Redistribution and use in source and binary forms, with or without

	348 +#; modification, are permitted provided that the following conditions are

	349 +#; met:

	350 +#; * Redistributions of source code must retain the above copyright

	351 +#; notice, this list of conditions and the following disclaimer.

	352 +#; * Redistributions in binary form must reproduce the above

	353 +#; copyright notice, this list of conditions and the following

	354 +#; disclaimer in the documentation and/or other materials provided

	355 +#; with the distribution.

	356 +#; * Neither the name of Code Aurora Forum, Inc. nor the names of its

	357 +#; contributors may be used to endorse or promote products derived

	358 +#; from this software without specific prior written permission.

	359 +#;

	360 +#; THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED

	361 +#; WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF

	362 +#; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT

	363 +#; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS

	364 +#; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

	365 +#; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

	366 +#; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR

	367 +#; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,

	368 +#; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE

	369 +#; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN

	370 +#; IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	371 +#;

	372 +#;

	373 +#if defined(__ARM_NEON__)

	374 +#;============================================================================

	375 +#; Code Section

	376 + .code 32 @; Code is ARM ISA

	377 +#;============================================================================

	378 +

	379 + .global inflate_fast_copy_neon

	380 +

	381 +

	382 +#;============================================================================

	383 +#; INPUTS: r0 len: number of bytes to transfer

	384 +#; r1 **out: pointer to pointer to ``out'' buffer

	385 +#; r2 *from: pointer to ``from'' buffer

	386 +#; OUTPUTS: r1 **out: pointer to pointer to ``out'' buffer

	387 +#;============================================================================

	388 +.balign 32

	389 +.type inflate_fast_copy_neon, %function

	390 +inflate_fast_copy_neon:

	391 + push {r4-r11} @; push r4-r11 onto stack

	392 +

	393 + cmp r0,#16 @;

	394 + bge inflate_fast_copy_vectorized

	395 +

	396 + #;; transfer bytes one by one

	397 + #;; only if len < 16 bytes

	398 +inflate_fast_copy_default:

	399 +

	400 + cmp r0,#0

	401 + beq inflate_fast_copy_exit

	402 +

	403 + ldr r3,[r1,#0] @; r3 = pointer to out

	404 +

	405 +inflate_fast_copy_default_loop:

	406 +

	407 + ldrb r12,[r2,#1]! @; r12 = *(++from)

	408 + subs r0,r0,#1 @; len--

	409 + strb r12,[r3,#1]! @; *(++out) = r12

	410 +

	411 + bne inflate_fast_copy_default_loop

	412 +

	413 + str r3,[r1,#0] @; r1 = updated pointer to pointer

	414 + @; to out

	415 + b inflate_fast_copy_exit

	416 +

	417 + #;; vectorized copy routines

	418 + #;; only if len >= 16 bytes

	419 +inflate_fast_copy_vectorized:

	420 +

	421 + ldr r3,[r1,#0] @; r3 = pointer to out

	422 + @; DON'T TOUCH r1 UNTIL FINAL

	423 + @; UPDATE OF r1 WITH ADDRESS OF r3

	424 + cmp r3,r2 @; addresses are compared as unsigned values

	425 + sublo r4,r2,r3 @; out below from: gap = from-out

	426 + subhs r4,r3,r2 @; otherwise gap = out-from, so r4 = |out-from|

	427 +

	428 + cmp r4,#0

	429 + beq inflate_fast_copy_exit

	430 +

	431 + cmp r4,#1

	432 + beq inflate_fast_copy_gap1b_proc

	433 +

	434 + cmp r4,#2

	435 + beq inflate_fast_copy_gap2b_proc

	436 +

	437 + cmp r4,#3

	438 + beq inflate_fast_copy_gap3b_proc

	439 +

	440 + cmp r4,#4

	441 + beq inflate_fast_copy_gap4b_proc

	442 +

	443 + cmp r4,#8

	444 + blo inflate_fast_copy_gap5to7b_proc

	445 + beq inflate_fast_copy_gap8b_proc

	446 +

	447 + cmp r4,#16

	448 + blo inflate_fast_copy_gap9to15b_proc

	449 + bhs inflate_fast_copy_gap16b_proc

	450 +

	451 +

	452 + #;; ------------------------------------------------------------------

	453 + #;; vectorized copy routine when gap between ``from'' and ``out''

	454 + #;; buffers is 1 byte

	455 + #;; INPUTS:

	456 + #;; r0 = len

	457 + #;; r2 = pointer to from

	458 + #;; r3 = pointer to out

	459 + #;; OUTPUTS:

	460 + #;; r1 = pointer to pointer to out

	461 + #;; ------------------------------------------------------------------

	462 +inflate_fast_copy_gap1b_proc:

	463 +

	464 + add r3,r3,#1 @; out++

	465 + @

	466 + ldrb r12,[r2,#1]! @; r12 = *(++from)

	467 + vdup.8 q0, r12 @; duplicate r12 16 times in q0

	468 + @

	469 + lsrs r4,r0,#4 @; r4 = floor(len/16)

	470 + @; = iteration count for loop16

	471 + beq inflate_fast_copy_gap1b_proc_16bytes_loop_done

	472 +

	473 +inflate_fast_copy_gap1b_proc_16bytes_loop:

	474 +

	475 + vst1.8 {q0},[r3]! @; store 16 bytes in out and

	476 + @; increment out pointer

	477 + sub r0,r0,#16 @; subtract 16 from len

	478 + subs r4,r4,#1 @; decrement iteration count

	479 + bne inflate_fast_copy_gap1b_proc_16bytes_loop

	480 +

	481 +inflate_fast_copy_gap1b_proc_16bytes_loop_done:

	482 +

	483 + cmp r0,#0

	484 + subeq r3,r3,#1 @; out--

	485 + streq r3,[r1,#0] @; r1 = updated pointer to pointer

	486 + @; to out

	487 + beq inflate_fast_copy_exit

	488 +

	489 +inflate_fast_copy_gap1b_proc_lastfewbytes_loop:

	490 +

	491 + strb r12,[r3],#1 @; *out = r12, out++

	492 + subs r0,r0,#1 @; len--

	493 + bne inflate_fast_copy_gap1b_proc_lastfewbytes_loop

	494 +

	495 + sub r3,r3,#1 @; out--

	496 + str r3,[r1,#0] @; r1 = updated pointer to pointer

	497 + @; to out

	498 + b inflate_fast_copy_exit

	499 +

	500 + #;; ------------------------------------------------------------------

	501 + #;; vectorized copy routine when gap between ``from'' and ``out''

	502 + #;; buffers is 2 bytes

	503 + #;; INPUTS:

	504 + #;; r0 = len

	505 + #;; r2 = pointer to from

	506 + #;; r3 = pointer to out

	507 + #;; OUTPUTS:

	508 + #;; r1 = pointer to pointer to out

	509 + #;; ------------------------------------------------------------------

	510 +inflate_fast_copy_gap2b_proc:

	511 +

	512 + add r2,r2,#1 @; from++

	513 + add r3,r3,#1 @; out++

	514 + @

	515 + vld1.16 {d0[0]},[r2] @; load 2 bytes into d0[0]

	516 + vdup.16 q0,d0[0] @; duplicate those 2 bytes 8 times

	517 + @; to fill up q0

	518 + @

	519 + lsrs r4,r0,#4 @; r4 = floor(len/16)

	520 + @; = iteration count for loop16

	521 + beq inflate_fast_copy_gap2b_proc_16bytes_loop_done

	522 +

	523 +inflate_fast_copy_gap2b_proc_16bytes_loop:

	524 +

	525 + vst1.8 {q0},[r3]! @; store 16 bytes in out and

	526 + @; increment out pointer

	527 + sub r0,r0,#16 @; subtract 16 from len

	528 + subs r4,r4,#1 @; decrement iteration count

	529 + bne inflate_fast_copy_gap2b_proc_16bytes_loop

	530 +

	531 +inflate_fast_copy_gap2b_proc_16bytes_loop_done:

	532 +

	533 + cmp r0,#0

	534 + subeq r3,r3,#1 @; out--

	535 + streq r3,[r1,#0] @; r1 = updated pointer to pointer

	536 + @; to out

	537 + beq inflate_fast_copy_exit

	538 +

	539 +inflate_fast_copy_gap2b_proc_lastfewbytes_loop:

	540 +

	541 + ldrb r12,[r2],#1 @; r12 = *from, from++

	542 + subs r0,r0,#1 @; len--

	543 + strb r12,[r3],#1 @; *out = r12, out++

	544 + @

	545 + bne inflate_fast_copy_gap2b_proc_lastfewbytes_loop

	546 +

	547 + sub r3,r3,#1 @; out--

	548 + str r3,[r1,#0] @; r1 = updated pointer to pointer

	549 + @; to out

	550 + b inflate_fast_copy_exit

	551 +

	552 + #;; ------------------------------------------------------------------

	553 + #;; vectorized copy routine when gap between ``from'' and ``out''

	554 + #;; buffers is 3 bytes

	555 + #;; INPUTS:

	556 + #;; r0 = len

	557 + #;; r2 = pointer to from

	558 + #;; r3 = pointer to out

	559 + #;; r4 = 3

	560 + #;; OUTPUTS:

	561 + #;; r1 = pointer to pointer to out

	562 + #;; ------------------------------------------------------------------

	563 +inflate_fast_copy_gap3b_proc:

	564 +

	565 + add r2,r2,#1 @; from++

	566 + add r3,r3,#1 @; out++

	567 + @

	568 + vld1.32 {d0[0]},[r2] @; load 4 bytes into d0[0]

	569 +

	570 +inflate_fast_copy_gap3b_proc_3bytes_loop:

	571 +

	572 + cmp r0,#3 @; exit loop if len < 3

	573 + blt inflate_fast_copy_gap3b_proc_3bytes_loop_done

	574 +

	575 + vst1.32 {d0[0]},[r3],r4 @; store 4 bytes in out

	576 + @; out+=3

	577 +

	578 + sub r0,r0,#3 @; len-=3

	579 + b inflate_fast_copy_gap3b_proc_3bytes_loop

	580 +

	581 +inflate_fast_copy_gap3b_proc_3bytes_loop_done:

	582 +

	583 + cmp r0,#0

	584 + subeq r3,r3,#1 @; out--

	585 + streq r3,[r1,#0] @; r1 = updated pointer to pointer

	586 + @; to out

	587 + beq inflate_fast_copy_exit

	588 +

	589 +inflate_fast_copy_gap3b_proc_lastfewbytes_loop:

	590 +

	591 + ldrb r12,[r2],#1 @; r12 = *from, from++

	592 + subs r0,r0,#1 @; len--

	593 + strb r12,[r3],#1 @; *out = r12, out++

	594 +

	595 + bne inflate_fast_copy_gap3b_proc_lastfewbytes_loop

	596 +

	597 + sub r3,r3,#1 @; out--

	598 + str r3,[r1,#0] @; r1 = updated pointer to pointer

	599 + @; to out

	600 + b inflate_fast_copy_exit

	601 +

	602 + #;; ------------------------------------------------------------------

	603 + #;; vectorized copy routine when gap between ``from'' and ``out''

	604 + #;; buffers is 4 bytes

	605 + #;; INPUTS:

	606 + #;; r0 = len

	607 + #;; r2 = pointer to from

	608 + #;; r3 = pointer to out

	609 + #;; OUTPUTS:

	610 + #;; r1 = pointer to pointer to out

	611 + #;; ------------------------------------------------------------------

	612 +inflate_fast_copy_gap4b_proc:

	613 +

	614 + add r2,r2,#1 @; from++

	615 + add r3,r3,#1 @; out++

	616 + @

	617 + vld1.32 {d0[0]},[r2] @; load 4 bytes into d0[0]

	618 + vdup.32 q0,d0[0] @; duplicate those 4 bytes 4 times

	619 + @; to fill up q0

	620 + @

	621 + lsrs r4,r0,#4 @; r4 = floor(len/16)

	622 + @; = iteration count for loop16

	623 + beq inflate_fast_copy_gap4b_proc_16bytes_loop_done

	624 +

	625 +inflate_fast_copy_gap4b_proc_16bytes_loop:

	626 +

	627 + vst1.32 {q0},[r3]! @; store 16 bytes in out and

	628 + @; increment out pointer

	629 + sub r0,r0,#16 @; subtract 16 from len

	630 + subs r4,r4,#1 @; decrement iteration count

	631 + bne inflate_fast_copy_gap4b_proc_16bytes_loop

	632 +

	633 +inflate_fast_copy_gap4b_proc_16bytes_loop_done:

	634 +

	635 + cmp r0,#0

	636 + subeq r3,r3,#1 @; out--

	637 + streq r3,[r1,#0] @; r1 = updated pointer to pointer

	638 + @; to out

	639 + beq inflate_fast_copy_exit

	640 +

	641 +inflate_fast_copy_gap4b_proc_lastfewbytes_loop:

	642 +

	643 + ldrb r12,[r2],#1 @; r12 = *from, from++

	644 + subs r0,r0,#1 @; len--

	645 + strb r12,[r3],#1 @; *out = r12, out++

	646 +

	647 + bne inflate_fast_copy_gap4b_proc_lastfewbytes_loop

	648 +

	649 + sub r3,r3,#1 @; out--

	650 + str r3,[r1,#0] @; r1 = updated pointer to pointer

	651 + @; to out

	652 + b inflate_fast_copy_exit

	653 +

	654 + #;; ------------------------------------------------------------------

	655 + #;; vectorized copy routine when gap between ``from'' and ``out''

	656 + #;; buffers is {5-7} bytes

	657 + #;; INPUTS:

	658 + #;; r0 = len

	659 + #;; r2 = pointer to from

	660 + #;; r3 = pointer to out

	661 + #;; r4 = {5-7}

	662 + #;; OUTPUTS:

	663 + #;; r1 = pointer to pointer to out

	664 + #;; ------------------------------------------------------------------

	665 +inflate_fast_copy_gap5to7b_proc:

	666 +

	667 + add r2,r2,#1 @; from++

	668 + add r3,r3,#1 @; out++

	669 + @

	670 + vld1.8 {d0},[r2] @; load 8 bytes into d0

	671 +

	672 +inflate_fast_copy_gap5to7b_proc_5to7bytes_loop:

	673 +

	674 + cmp r0,r4 @; exit loop if len < {5-7}

	675 + blt inflate_fast_copy_gap5to7b_proc_5to7bytes_loop_done

	676 +

	677 + vst1.8 {d0},[r3],r4 @; store 8 bytes in out

	678 + @; out+={5-7}

	679 +

	680 + sub r0,r0,r4 @; len-={5-7}

	681 + b inflate_fast_copy_gap5to7b_proc_5to7bytes_loop

	682 +

	683 +inflate_fast_copy_gap5to7b_proc_5to7bytes_loop_done:

	684 +

	685 + cmp r0,#0

	686 + subeq r3,r3,#1 @; out--

	687 + streq r3,[r1,#0] @; r1 = updated pointer to pointer

	688 + @; to out

	689 + beq inflate_fast_copy_exit

	690 +

	691 +inflate_fast_copy_gap5to7b_proc_lastfewbytes_loop:

	692 +

	693 + ldrb r12,[r2],#1 @; r12 = *from, from++

	694 + subs r0,r0,#1 @; len--

	695 + strb r12,[r3],#1 @; *out = r12, out++

	696 +

	697 + bne inflate_fast_copy_gap5to7b_proc_lastfewbytes_loop

	698 +

	699 + sub r3,r3,#1 @; out--

	700 + str r3,[r1,#0] @; r1 = updated pointer to pointer

	701 + @; to out

	702 + b inflate_fast_copy_exit

	703 +

	704 + #;; ------------------------------------------------------------------

	705 + #;; vectorized copy routine when gap between ``from'' and ``out''

	706 + #;; buffers is 8 bytes

	707 + #;; INPUTS:

	708 + #;; r0 = len

	709 + #;; r2 = pointer to from

	710 + #;; r3 = pointer to out

	711 + #;; OUTPUTS:

	712 + #;; r1 = pointer to pointer to out

	713 + #;; ------------------------------------------------------------------

	714 +inflate_fast_copy_gap8b_proc:

	715 +

	716 + add r2,r2,#1 @; from++

	717 + add r3,r3,#1 @; out++

	718 + @

	719 + vld1.8 {d0},[r2] @; load 8 bytes into d0

	720 + vmov d1,d0 @; duplicate the 8 bytes to fill up

	721 + @; q0

	722 + @

	723 + lsrs r4,r0,#4 @; r4 = floor(len/16)

	724 + @; = iteration count for loop16

	725 + beq inflate_fast_copy_gap8b_proc_16bytes_loop_done

	726 +

	727 +inflate_fast_copy_gap8b_proc_16bytes_loop:

	728 +

	729 + vst1.8 {q0},[r3]! @; store 16 bytes in out and

	730 + @; increment out pointer

	731 + sub r0,r0,#16 @; subtract 16 from len

	732 + subs r4,r4,#1 @; decrement iteration count

	733 + bne inflate_fast_copy_gap8b_proc_16bytes_loop

	734 +

	735 +inflate_fast_copy_gap8b_proc_16bytes_loop_done:

	736 +

	737 + cmp r0,#0

	738 + subeq r3,r3,#1 @; out--

	739 + streq r3,[r1,#0] @; r1 = updated pointer to pointer

	740 + @; to out

	741 + beq inflate_fast_copy_exit

	742 +

	743 +inflate_fast_copy_gap8b_proc_lastfewbytes_loop:

	744 +

	745 + ldrb r12,[r2],#1 @; r12 = *from, from++

	746 + subs r0,r0,#1 @; len--

	747 + strb r12,[r3],#1 @; *out = r12, out++

	748 +

	749 + bne inflate_fast_copy_gap8b_proc_lastfewbytes_loop

	750 +

	751 + sub r3,r3,#1 @; out--

	752 + str r3,[r1,#0] @; r1 = updated pointer to pointer

	753 + @; to out

	754 + b inflate_fast_copy_exit

	755 +

	756 + #;; ------------------------------------------------------------------

	757 + #;; vectorized copy routine when gap between ``from'' and ``out''

	758 + #;; buffers is {9-15} bytes

	759 + #;; INPUTS:

	760 + #;; r0 = len

	761 + #;; r2 = pointer to from

	762 + #;; r3 = pointer to out

	763 + #;; r4 = {9-15}

	764 + #;; OUTPUTS:

	765 + #;; r1 = pointer to pointer to out

	766 + #;; ------------------------------------------------------------------

	767 +inflate_fast_copy_gap9to15b_proc:

	768 +

	769 + add r2,r2,#1 @; from++

	770 + add r3,r3,#1 @; out++

	771 + @

	772 + vld1.8 {q0},[r2] @; load 16 bytes into q0

	773 +

	774 +inflate_fast_copy_gap9to15b_proc_9to15bytes_loop:

	775 +

	776 + cmp r0, r4 @; exit loop if len < {9-15}

	777 + blt inflate_fast_copy_gap9to15b_proc_9to15bytes_loop_done

	778 +

	779 + vst1.8 {q0},[r3],r4 @; store 16 bytes in out

	780 + @; out+={9-15}

	781 +

	782 + sub r0,r0,r4 @; len-={9-15}

	783 + b inflate_fast_copy_gap9to15b_proc_9to15bytes_loop

	784 +

	785 +inflate_fast_copy_gap9to15b_proc_9to15bytes_loop_done:

	786 +

	787 + cmp r0,#0

	788 + subeq r3,r3,#1 @; out--

	789 + streq r3,[r1,#0] @; r1 = updated pointer to pointer

	790 + @; to out

	791 + beq inflate_fast_copy_exit

	792 +

	793 +inflate_fast_copy_gap9to15b_proc_lastfewbytes_loop:

	794 +

	795 + ldrb r12,[r2],#1 @; r12 = *from, from++

	796 + subs r0,r0,#1 @; len--

	797 + strb r12,[r3],#1 @; *out = r12, out++

	798 +

	799 + bne inflate_fast_copy_gap9to15b_proc_lastfewbytes_loop

	800 +

	801 + sub r3,r3,#1 @; out--

	802 + str r3,[r1,#0] @; r1 = updated pointer to pointer

	803 + @; to out

	804 + b inflate_fast_copy_exit

	805 +

	806 + #;; ------------------------------------------------------------------

	807 + #;; vectorized copy routine when gap between ``from'' and ``out''

	808 + #;; buffers is 16 bytes or more

	809 + #;; INPUTS:

	810 + #;; r0 = len

	811 + #;; r2 = pointer to from

	812 + #;; r3 = pointer to out

	813 + #;; OUTPUTS:

	814 + #;; r1 = pointer to pointer to out

	815 + #;; ------------------------------------------------------------------

	816 +inflate_fast_copy_gap16b_proc:

	817 +

	818 + add r2,r2,#1 @; from++

	819 + add r3,r3,#1 @; out++

	820 + @

	821 + lsrs r4,r0,#4 @; r4 = floor(len/16)

	822 + @; = iteration count for loop16

	823 + beq inflate_fast_copy_gap16b_proc_16bytes_loop_done

	824 +

	825 +inflate_fast_copy_gap16b_proc_16bytes_loop:

	826 +

	827 + vld1.8 {q0},[r2]! @; load 16 bytes into q0 and

	828 + @; increment from pointer

	829 + vst1.8 {q0},[r3]! @; store 16 bytes in out and

	830 + @; increment out pointer

	831 + sub r0,r0,#16 @; subtract 16 from len

	832 + subs r4,r4,#1 @; decrement iteration count

	833 + bne inflate_fast_copy_gap16b_proc_16bytes_loop

	834 +

	835 +inflate_fast_copy_gap16b_proc_16bytes_loop_done:

	836 +

	837 + cmp r0,#0

	838 + subeq r3,r3,#1 @; out--

	839 + streq r3,[r1,#0] @; r1 = updated pointer to pointer

	840 + @; to out

	841 + beq inflate_fast_copy_exit

	842 +

	843 +inflate_fast_copy_gap16b_proc_lastfewbytes_loop:

	844 +

	845 + ldrb r12,[r2],#1 @; r12 = *from, from++

	846 + subs r0,r0,#1 @; len--

	847 + strb r12,[r3],#1 @; *out = r12, out++

	848 +

	849 + bne inflate_fast_copy_gap16b_proc_lastfewbytes_loop

	850 +

	851 + sub r3,r3,#1 @; out--

	852 + str r3,[r1,#0] @; r1 = updated pointer to pointer

	853 + @; to out

	854 +

	855 +inflate_fast_copy_exit:

	856 +

	857 + pop {r4-r11} @; pop r4-r11 from stack

	858 + bx lr @; return

	859 +

	860 +.size inflate_fast_copy_neon, .-inflate_fast_copy_neon

	861 +

	862 +#endif

	863 + .END

	864 +

	865 +

OLD	NEW

« no previous file with comments | « no previous file | sys-libs/zlib/zlib-1.2.3-r1.ebuild » ('j') | no next file with comments »