Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(11)

Unified Diff: source/libvpx/vp8/common/arm/armv6/filter_v6.asm

Issue 7671004: Update libvpx snapshot to v0.9.7-p1 (Cayuga). (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: '' Created 9 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: source/libvpx/vp8/common/arm/armv6/filter_v6.asm
===================================================================
--- source/libvpx/vp8/common/arm/armv6/filter_v6.asm (revision 96967)
+++ source/libvpx/vp8/common/arm/armv6/filter_v6.asm (working copy)
@@ -10,6 +10,8 @@
EXPORT |vp8_filter_block2d_first_pass_armv6|
+ EXPORT |vp8_filter_block2d_first_pass_16x16_armv6|
+ EXPORT |vp8_filter_block2d_first_pass_8x8_armv6|
EXPORT |vp8_filter_block2d_second_pass_armv6|
EXPORT |vp8_filter4_block2d_second_pass_armv6|
EXPORT |vp8_filter_block2d_first_pass_only_armv6|
@@ -40,11 +42,6 @@
add r12, r3, #16 ; square off the output
sub sp, sp, #4
- ;;IF ARCHITECTURE=6
- ;pld [r0, #-2]
- ;;pld [r0, #30]
- ;;ENDIF
-
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
@@ -101,15 +98,10 @@
bne width_loop_1st_6
- ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
- ;;IF ARCHITECTURE=6
- ;pld [r0, r2]
- ;;pld [r0, r9]
- ;;ENDIF
-
ldr r1, [sp] ; load and update dst address
subs r7, r7, #0x10000
add r0, r0, r2 ; move to next input line
+
add r1, r1, #2 ; move over to next column
str r1, [sp]
@@ -120,6 +112,192 @@
ENDP
+; --------------------------
+; 16x16 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_16x16_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r7, [sp, #36] ; output height
+
+ add r4, r2, #18 ; preload next low
+ pld [r0, r4]
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_16_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_16_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_16_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r11, r2, #34 ; adding back block width(=16)
+ pld [r0, r11] ; preload next low
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_16_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+; --------------------------
+; 8x8 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_8x8_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r7, [sp, #36] ; output height
+
+ add r4, r2, #10 ; preload next low
+ pld [r0, r4]
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_8_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_8_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_8_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r11, r2, #18 ; adding back block width(=8)
+ pld [r0, r11] ; preload next low
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_8_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
;---------------------------------
; r0 short *src_ptr,
; r1 unsigned char *output_ptr,
@@ -262,6 +440,10 @@
|vp8_filter_block2d_first_pass_only_armv6| PROC
stmdb sp!, {r4 - r11, lr}
+ add r7, r2, r3 ; preload next low
+ add r7, r7, #2
+ pld [r0, r7]
+
ldr r4, [sp, #36] ; output pitch
ldr r11, [sp, #40] ; HFilter address
sub sp, sp, #8
@@ -330,16 +512,15 @@
bne width_loop_1st_only_6
- ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
- ;;IF ARCHITECTURE=6
- ;pld [r0, r2]
- ;;pld [r0, r9]
- ;;ENDIF
-
ldr lr, [sp] ; load back output pitch
ldr r12, [sp, #4] ; load back output pitch
subs r7, r7, #1
add r0, r0, r12 ; updata src for next loop
+
+ add r11, r12, r3 ; preload next low
+ add r11, r11, #2
+ pld [r0, r11]
+
add r1, r1, lr ; update dst for next loop
bne height_loop_1st_only_6
« no previous file with comments | « source/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm ('k') | source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698