Index: libvpx/source/libvpx/vp8/common/arm/armv6/filter_v6.asm |
diff --git a/libvpx/source/libvpx/vp8/common/arm/armv6/filter_v6.asm b/libvpx/source/libvpx/vp8/common/arm/armv6/filter_v6.asm |
index 1ba91ddd657a1da84f84426f82b3d9a37fcfb7cb..03b5bccd73bfc46e398bdf7f23355166069c1c5f 100644 |
--- a/libvpx/source/libvpx/vp8/common/arm/armv6/filter_v6.asm |
+++ b/libvpx/source/libvpx/vp8/common/arm/armv6/filter_v6.asm |
@@ -10,8 +10,6 @@ |
EXPORT |vp8_filter_block2d_first_pass_armv6| |
- EXPORT |vp8_filter_block2d_first_pass_16x16_armv6| |
- EXPORT |vp8_filter_block2d_first_pass_8x8_armv6| |
EXPORT |vp8_filter_block2d_second_pass_armv6| |
EXPORT |vp8_filter4_block2d_second_pass_armv6| |
EXPORT |vp8_filter_block2d_first_pass_only_armv6| |
@@ -42,6 +40,11 @@ |
add r12, r3, #16 ; square off the output |
sub sp, sp, #4 |
+ ;;IF ARCHITECTURE=6 |
+ ;pld [r0, #-2] |
+ ;;pld [r0, #30] |
+ ;;ENDIF |
+ |
ldr r4, [r11] ; load up packed filter coefficients |
ldr r5, [r11, #4] |
ldr r6, [r11, #8] |
@@ -98,200 +101,19 @@ |
bne width_loop_1st_6 |
- ldr r1, [sp] ; load and update dst address |
- subs r7, r7, #0x10000 |
- add r0, r0, r2 ; move to next input line |
- |
- add r1, r1, #2 ; move over to next column |
- str r1, [sp] |
- |
- bne height_loop_1st_6 |
- |
- add sp, sp, #4 |
- ldmia sp!, {r4 - r11, pc} |
- |
- ENDP |
- |
-; -------------------------- |
-; 16x16 version |
-; ----------------------------- |
-|vp8_filter_block2d_first_pass_16x16_armv6| PROC |
- stmdb sp!, {r4 - r11, lr} |
- |
- ldr r11, [sp, #40] ; vp8_filter address |
- ldr r7, [sp, #36] ; output height |
- |
- add r4, r2, #18 ; preload next low |
- pld [r0, r4] |
- |
- sub r2, r2, r3 ; inside loop increments input array, |
- ; so the height loop only needs to add |
- ; r2 - width to the input pointer |
- |
- mov r3, r3, lsl #1 ; multiply width by 2 because using shorts |
- add r12, r3, #16 ; square off the output |
- sub sp, sp, #4 |
- |
- ldr r4, [r11] ; load up packed filter coefficients |
- ldr r5, [r11, #4] |
- ldr r6, [r11, #8] |
- |
- str r1, [sp] ; push destination to stack |
- mov r7, r7, lsl #16 ; height is top part of counter |
- |
-; six tap filter |
-|height_loop_1st_16_6| |
- ldrb r8, [r0, #-2] ; load source data |
- ldrb r9, [r0, #-1] |
- ldrb r10, [r0], #2 |
- orr r7, r7, r3, lsr #2 ; construct loop counter |
- |
-|width_loop_1st_16_6| |
- ldrb r11, [r0, #-1] |
- |
- pkhbt lr, r8, r9, lsl #16 ; r9 | r8 |
- pkhbt r8, r9, r10, lsl #16 ; r10 | r9 |
- |
- ldrb r9, [r0] |
- |
- smuad lr, lr, r4 ; apply the filter |
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10 |
- smuad r8, r8, r4 |
- pkhbt r11, r11, r9, lsl #16 ; r9 | r11 |
- |
- smlad lr, r10, r5, lr |
- ldrb r10, [r0, #1] |
- smlad r8, r11, r5, r8 |
- ldrb r11, [r0, #2] |
- |
- sub r7, r7, #1 |
- |
- pkhbt r9, r9, r10, lsl #16 ; r10 | r9 |
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10 |
- |
- smlad lr, r9, r6, lr |
- smlad r11, r10, r6, r8 |
- |
- ands r10, r7, #0xff ; test loop counter |
- |
- add lr, lr, #0x40 ; round_shift_and_clamp |
- ldrneb r8, [r0, #-2] ; load data for next loop |
- usat lr, #8, lr, asr #7 |
- add r11, r11, #0x40 |
- ldrneb r9, [r0, #-1] |
- usat r11, #8, r11, asr #7 |
- |
- strh lr, [r1], r12 ; result is transposed and stored, which |
- ; will make second pass filtering easier. |
- ldrneb r10, [r0], #2 |
- strh r11, [r1], r12 |
- |
- bne width_loop_1st_16_6 |
+ ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines |
+ ;;IF ARCHITECTURE=6 |
+ ;pld [r0, r2] |
+ ;;pld [r0, r9] |
+ ;;ENDIF |
ldr r1, [sp] ; load and update dst address |
subs r7, r7, #0x10000 |
add r0, r0, r2 ; move to next input line |
- |
- add r11, r2, #34 ; adding back block width(=16) |
- pld [r0, r11] ; preload next low |
- |
add r1, r1, #2 ; move over to next column |
str r1, [sp] |
- bne height_loop_1st_16_6 |
- |
- add sp, sp, #4 |
- ldmia sp!, {r4 - r11, pc} |
- |
- ENDP |
- |
-; -------------------------- |
-; 8x8 version |
-; ----------------------------- |
-|vp8_filter_block2d_first_pass_8x8_armv6| PROC |
- stmdb sp!, {r4 - r11, lr} |
- |
- ldr r11, [sp, #40] ; vp8_filter address |
- ldr r7, [sp, #36] ; output height |
- |
- add r4, r2, #10 ; preload next low |
- pld [r0, r4] |
- |
- sub r2, r2, r3 ; inside loop increments input array, |
- ; so the height loop only needs to add |
- ; r2 - width to the input pointer |
- |
- mov r3, r3, lsl #1 ; multiply width by 2 because using shorts |
- add r12, r3, #16 ; square off the output |
- sub sp, sp, #4 |
- |
- ldr r4, [r11] ; load up packed filter coefficients |
- ldr r5, [r11, #4] |
- ldr r6, [r11, #8] |
- |
- str r1, [sp] ; push destination to stack |
- mov r7, r7, lsl #16 ; height is top part of counter |
- |
-; six tap filter |
-|height_loop_1st_8_6| |
- ldrb r8, [r0, #-2] ; load source data |
- ldrb r9, [r0, #-1] |
- ldrb r10, [r0], #2 |
- orr r7, r7, r3, lsr #2 ; construct loop counter |
- |
-|width_loop_1st_8_6| |
- ldrb r11, [r0, #-1] |
- |
- pkhbt lr, r8, r9, lsl #16 ; r9 | r8 |
- pkhbt r8, r9, r10, lsl #16 ; r10 | r9 |
- |
- ldrb r9, [r0] |
- |
- smuad lr, lr, r4 ; apply the filter |
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10 |
- smuad r8, r8, r4 |
- pkhbt r11, r11, r9, lsl #16 ; r9 | r11 |
- |
- smlad lr, r10, r5, lr |
- ldrb r10, [r0, #1] |
- smlad r8, r11, r5, r8 |
- ldrb r11, [r0, #2] |
- |
- sub r7, r7, #1 |
- |
- pkhbt r9, r9, r10, lsl #16 ; r10 | r9 |
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10 |
- |
- smlad lr, r9, r6, lr |
- smlad r11, r10, r6, r8 |
- |
- ands r10, r7, #0xff ; test loop counter |
- |
- add lr, lr, #0x40 ; round_shift_and_clamp |
- ldrneb r8, [r0, #-2] ; load data for next loop |
- usat lr, #8, lr, asr #7 |
- add r11, r11, #0x40 |
- ldrneb r9, [r0, #-1] |
- usat r11, #8, r11, asr #7 |
- |
- strh lr, [r1], r12 ; result is transposed and stored, which |
- ; will make second pass filtering easier. |
- ldrneb r10, [r0], #2 |
- strh r11, [r1], r12 |
- |
- bne width_loop_1st_8_6 |
- |
- ldr r1, [sp] ; load and update dst address |
- subs r7, r7, #0x10000 |
- add r0, r0, r2 ; move to next input line |
- |
- add r11, r2, #18 ; adding back block width(=8) |
- pld [r0, r11] ; preload next low |
- |
- add r1, r1, #2 ; move over to next column |
- str r1, [sp] |
- |
- bne height_loop_1st_8_6 |
+ bne height_loop_1st_6 |
add sp, sp, #4 |
ldmia sp!, {r4 - r11, pc} |
@@ -440,10 +262,6 @@ |
|vp8_filter_block2d_first_pass_only_armv6| PROC |
stmdb sp!, {r4 - r11, lr} |
- add r7, r2, r3 ; preload next low |
- add r7, r7, #2 |
- pld [r0, r7] |
- |
ldr r4, [sp, #36] ; output pitch |
ldr r11, [sp, #40] ; HFilter address |
sub sp, sp, #8 |
@@ -512,15 +330,16 @@ |
bne width_loop_1st_only_6 |
+ ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines |
+ ;;IF ARCHITECTURE=6 |
+ ;pld [r0, r2] |
+ ;;pld [r0, r9] |
+ ;;ENDIF |
+ |
ldr lr, [sp] ; load back output pitch |
ldr r12, [sp, #4] ; load back output pitch |
subs r7, r7, #1 |
add r0, r0, r12 ; updata src for next loop |
- |
- add r11, r12, r3 ; preload next low |
- add r11, r11, #2 |
- pld [r0, r11] |
- |
add r1, r1, lr ; update dst for next loop |
bne height_loop_1st_only_6 |