src/opts/memset16_neon.S - Issue 1075003002: Replace NEON assembly memset16 and memset32 with intrinsic versions.

Unified Diff: src/opts/memset16_neon.S

Issue 1075003002: Replace NEON assembly memset16 and memset32 with intrinsic versions. (Closed) Base URL: https://skia.googlesource.com/skia@master

Patch Set: Created 5 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/opts/memset16_neon.S

diff --git a/src/opts/memset16_neon.S b/src/opts/memset16_neon.S

deleted file mode 100644

index b39832fff163627a532ef34209fb9309717c2e06..0000000000000000000000000000000000000000

--- a/src/opts/memset16_neon.S

+++ /dev/null

@@ -1,143 +0,0 @@

-/***************************************************************************

- *

- * Use of this source code is governed by a BSD-style license that can be

- * found in the LICENSE file.

- ***************************************************************************/

-/***************************************************************************

- Neon memset: Attempts to do a memset with Neon registers if possible,

- Inputs:

- s: The buffer to write to

- c: The integer data to write to the buffer

- n: The size_t count.

- Outputs:

-***************************************************************************/

- .syntax unified

- .code 32

- .fpu neon

- .align 4

- .globl memset16_neon

- .hidden memset16_neon

-memset16_neon:

- cmp r2, #0

- bxeq lr

- /* Keep in mind that r2 -- the count argument -- is for the

- * number of 16-bit items to copy.

- */

- lsl r2, r2, #1

- push {r0}

- /* If we have < 8 bytes, just do a quick loop to handle that */

- cmp r2, #8

- bgt memset_gt4

-memset_smallcopy_loop:

- strh r1, [r0], #2

- subs r2, r2, #2

- bne memset_smallcopy_loop

-memset_smallcopy_done:

- pop {r0}

- bx lr

-memset_gt4:

- /*

- * Duplicate the r1 lowest 16-bits across r1. The idea is to have

- * a register with two 16-bit-values we can copy. We do this by

- * duplicating lowest 16-bits of r1 to upper 16-bits.

- */

- orr r1, r1, r1, lsl #16

- /*

- * If we're copying > 64 bytes, then we may want to get

- * onto a 16-byte boundary to improve speed even more.

- */

- cmp r2, #64

- blt memset_route

- ands r12, r0, #0xf

- beq memset_route

- /*

- * Determine the number of bytes to move forward to get to the 16-byte

- * boundary. Note that this will be a multiple of 4, since we

- * already are word-aligned.

- */

- rsb r12, r12, #16

- sub r2, r2, r12

- lsls r12, r12, #29

- strmi r1, [r0], #4

- strcs r1, [r0], #4

- lsls r12, r12, #2

- strhcs r1, [r0], #2

-memset_route:

- /*

- * Decide where to route for the maximum copy sizes. Note that we

- * build q0 and q1 depending on if we'll need it, so that's

- * interwoven here as well.

- */

- vdup.u32 d0, r1

- cmp r2, #16

- blt memset_8

- vmov d1, d0

- cmp r2, #64

- blt memset_16

- vmov q1, q0

- cmp r2, #128

- blt memset_32

-memset_128:

- mov r12, r2, lsr #7

-memset_128_loop:

- vst1.64 {q0, q1}, [r0]!

- subs r12, r12, #1

- bne memset_128_loop

- ands r2, r2, #0x7f

- beq memset_end

-memset_32:

- movs r12, r2, lsr #5

- beq memset_16

-memset_32_loop:

- subs r12, r12, #1

- vst1.64 {q0, q1}, [r0]!

- bne memset_32_loop

- ands r2, r2, #0x1f

- beq memset_end

-memset_16:

- movs r12, r2, lsr #4

- beq memset_8

-memset_16_loop:

- subs r12, r12, #1

- vst1.32 {q0}, [r0]!

- bne memset_16_loop

- ands r2, r2, #0xf

- beq memset_end

- /*

- * memset_8 isn't a loop, since we try to do our loops at 16

- * bytes and above. We should loop there, then drop down here

- * to finish the <16-byte versions. Same for memset_4 and

- * memset_1.

- */

-memset_8:

- cmp r2, #8

- blt memset_4

- subs r2, r2, #8

- vst1.32 {d0}, [r0]!

-memset_4:

- cmp r2, #4

- blt memset_2

- subs r2, r2, #4

- str r1, [r0], #4

-memset_2:

- cmp r2, #0

- ble memset_end

- strh r1, [r0], #2

-memset_end:

- pop {r0}

- bx lr

- .end

« src/opts/SkUtils_opts_arm_neon.cpp ('K') | « src/opts/SkUtils_opts_arm_neon.cpp ('k') | src/opts/memset32_neon.S » ('j') | no next file with comments »