Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(603)

Unified Diff: src/opts/memset16_neon.S

Issue 1075003002: Replace NEON assembly memset16 and memset32 with intrinsic versions. (Closed) Base URL: https://skia.googlesource.com/skia@master
Patch Set: Created 5 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/opts/memset16_neon.S
diff --git a/src/opts/memset16_neon.S b/src/opts/memset16_neon.S
deleted file mode 100644
index b39832fff163627a532ef34209fb9309717c2e06..0000000000000000000000000000000000000000
--- a/src/opts/memset16_neon.S
+++ /dev/null
@@ -1,143 +0,0 @@
-/***************************************************************************
- * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- ***************************************************************************/
-
-/***************************************************************************
- Neon memset: Attempts to do a memset with Neon registers if possible,
- Inputs:
- s: The buffer to write to
- c: The integer data to write to the buffer
- n: The size_t count.
- Outputs:
-
-***************************************************************************/
-
- .syntax unified
- .code 32
- .fpu neon
- .align 4
- .globl memset16_neon
- .hidden memset16_neon
-
-memset16_neon:
- cmp r2, #0
- bxeq lr
-
- /* Keep in mind that r2 -- the count argument -- is for the
- * number of 16-bit items to copy.
- */
- lsl r2, r2, #1
-
- push {r0}
-
- /* If we have < 8 bytes, just do a quick loop to handle that */
- cmp r2, #8
- bgt memset_gt4
-memset_smallcopy_loop:
- strh r1, [r0], #2
- subs r2, r2, #2
- bne memset_smallcopy_loop
-memset_smallcopy_done:
- pop {r0}
- bx lr
-
-memset_gt4:
- /*
- * Duplicate the r1 lowest 16-bits across r1. The idea is to have
- * a register with two 16-bit-values we can copy. We do this by
- * duplicating lowest 16-bits of r1 to upper 16-bits.
- */
- orr r1, r1, r1, lsl #16
- /*
- * If we're copying > 64 bytes, then we may want to get
- * onto a 16-byte boundary to improve speed even more.
- */
- cmp r2, #64
- blt memset_route
- ands r12, r0, #0xf
- beq memset_route
- /*
- * Determine the number of bytes to move forward to get to the 16-byte
- * boundary. Note that this will be a multiple of 4, since we
- * already are word-aligned.
- */
- rsb r12, r12, #16
- sub r2, r2, r12
- lsls r12, r12, #29
- strmi r1, [r0], #4
- strcs r1, [r0], #4
- strcs r1, [r0], #4
- lsls r12, r12, #2
- strhcs r1, [r0], #2
-memset_route:
- /*
- * Decide where to route for the maximum copy sizes. Note that we
- * build q0 and q1 depending on if we'll need it, so that's
- * interwoven here as well.
- */
- vdup.u32 d0, r1
- cmp r2, #16
- blt memset_8
- vmov d1, d0
- cmp r2, #64
- blt memset_16
- vmov q1, q0
- cmp r2, #128
- blt memset_32
-memset_128:
- mov r12, r2, lsr #7
-memset_128_loop:
- vst1.64 {q0, q1}, [r0]!
- vst1.64 {q0, q1}, [r0]!
- vst1.64 {q0, q1}, [r0]!
- vst1.64 {q0, q1}, [r0]!
- subs r12, r12, #1
- bne memset_128_loop
- ands r2, r2, #0x7f
- beq memset_end
-memset_32:
- movs r12, r2, lsr #5
- beq memset_16
-memset_32_loop:
- subs r12, r12, #1
- vst1.64 {q0, q1}, [r0]!
- bne memset_32_loop
- ands r2, r2, #0x1f
- beq memset_end
-memset_16:
- movs r12, r2, lsr #4
- beq memset_8
-memset_16_loop:
- subs r12, r12, #1
- vst1.32 {q0}, [r0]!
- bne memset_16_loop
- ands r2, r2, #0xf
- beq memset_end
- /*
- * memset_8 isn't a loop, since we try to do our loops at 16
- * bytes and above. We should loop there, then drop down here
- * to finish the <16-byte versions. Same for memset_4 and
- * memset_1.
- */
-memset_8:
- cmp r2, #8
- blt memset_4
- subs r2, r2, #8
- vst1.32 {d0}, [r0]!
-memset_4:
- cmp r2, #4
- blt memset_2
- subs r2, r2, #4
- str r1, [r0], #4
-memset_2:
- cmp r2, #0
- ble memset_end
- strh r1, [r0], #2
-memset_end:
- pop {r0}
- bx lr
-
- .end

Powered by Google App Engine
This is Rietveld 408576698