| Index: libvpx/source/libvpx/vp8/common/arm/armv6/filter_v6.asm
|
| diff --git a/libvpx/source/libvpx/vp8/common/arm/armv6/filter_v6.asm b/libvpx/source/libvpx/vp8/common/arm/armv6/filter_v6.asm
|
| index 1ba91ddd657a1da84f84426f82b3d9a37fcfb7cb..03b5bccd73bfc46e398bdf7f23355166069c1c5f 100644
|
| --- a/libvpx/source/libvpx/vp8/common/arm/armv6/filter_v6.asm
|
| +++ b/libvpx/source/libvpx/vp8/common/arm/armv6/filter_v6.asm
|
| @@ -10,8 +10,6 @@
|
|
|
|
|
| EXPORT |vp8_filter_block2d_first_pass_armv6|
|
| - EXPORT |vp8_filter_block2d_first_pass_16x16_armv6|
|
| - EXPORT |vp8_filter_block2d_first_pass_8x8_armv6|
|
| EXPORT |vp8_filter_block2d_second_pass_armv6|
|
| EXPORT |vp8_filter4_block2d_second_pass_armv6|
|
| EXPORT |vp8_filter_block2d_first_pass_only_armv6|
|
| @@ -42,6 +40,11 @@
|
| add r12, r3, #16 ; square off the output
|
| sub sp, sp, #4
|
|
|
| + ;;IF ARCHITECTURE=6
|
| + ;pld [r0, #-2]
|
| + ;;pld [r0, #30]
|
| + ;;ENDIF
|
| +
|
| ldr r4, [r11] ; load up packed filter coefficients
|
| ldr r5, [r11, #4]
|
| ldr r6, [r11, #8]
|
| @@ -98,200 +101,19 @@
|
|
|
| bne width_loop_1st_6
|
|
|
| - ldr r1, [sp] ; load and update dst address
|
| - subs r7, r7, #0x10000
|
| - add r0, r0, r2 ; move to next input line
|
| -
|
| - add r1, r1, #2 ; move over to next column
|
| - str r1, [sp]
|
| -
|
| - bne height_loop_1st_6
|
| -
|
| - add sp, sp, #4
|
| - ldmia sp!, {r4 - r11, pc}
|
| -
|
| - ENDP
|
| -
|
| -; --------------------------
|
| -; 16x16 version
|
| -; -----------------------------
|
| -|vp8_filter_block2d_first_pass_16x16_armv6| PROC
|
| - stmdb sp!, {r4 - r11, lr}
|
| -
|
| - ldr r11, [sp, #40] ; vp8_filter address
|
| - ldr r7, [sp, #36] ; output height
|
| -
|
| - add r4, r2, #18 ; preload next low
|
| - pld [r0, r4]
|
| -
|
| - sub r2, r2, r3 ; inside loop increments input array,
|
| - ; so the height loop only needs to add
|
| - ; r2 - width to the input pointer
|
| -
|
| - mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
|
| - add r12, r3, #16 ; square off the output
|
| - sub sp, sp, #4
|
| -
|
| - ldr r4, [r11] ; load up packed filter coefficients
|
| - ldr r5, [r11, #4]
|
| - ldr r6, [r11, #8]
|
| -
|
| - str r1, [sp] ; push destination to stack
|
| - mov r7, r7, lsl #16 ; height is top part of counter
|
| -
|
| -; six tap filter
|
| -|height_loop_1st_16_6|
|
| - ldrb r8, [r0, #-2] ; load source data
|
| - ldrb r9, [r0, #-1]
|
| - ldrb r10, [r0], #2
|
| - orr r7, r7, r3, lsr #2 ; construct loop counter
|
| -
|
| -|width_loop_1st_16_6|
|
| - ldrb r11, [r0, #-1]
|
| -
|
| - pkhbt lr, r8, r9, lsl #16 ; r9 | r8
|
| - pkhbt r8, r9, r10, lsl #16 ; r10 | r9
|
| -
|
| - ldrb r9, [r0]
|
| -
|
| - smuad lr, lr, r4 ; apply the filter
|
| - pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
| - smuad r8, r8, r4
|
| - pkhbt r11, r11, r9, lsl #16 ; r9 | r11
|
| -
|
| - smlad lr, r10, r5, lr
|
| - ldrb r10, [r0, #1]
|
| - smlad r8, r11, r5, r8
|
| - ldrb r11, [r0, #2]
|
| -
|
| - sub r7, r7, #1
|
| -
|
| - pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
| - pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
| -
|
| - smlad lr, r9, r6, lr
|
| - smlad r11, r10, r6, r8
|
| -
|
| - ands r10, r7, #0xff ; test loop counter
|
| -
|
| - add lr, lr, #0x40 ; round_shift_and_clamp
|
| - ldrneb r8, [r0, #-2] ; load data for next loop
|
| - usat lr, #8, lr, asr #7
|
| - add r11, r11, #0x40
|
| - ldrneb r9, [r0, #-1]
|
| - usat r11, #8, r11, asr #7
|
| -
|
| - strh lr, [r1], r12 ; result is transposed and stored, which
|
| - ; will make second pass filtering easier.
|
| - ldrneb r10, [r0], #2
|
| - strh r11, [r1], r12
|
| -
|
| - bne width_loop_1st_16_6
|
| + ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
|
| + ;;IF ARCHITECTURE=6
|
| + ;pld [r0, r2]
|
| + ;;pld [r0, r9]
|
| + ;;ENDIF
|
|
|
| ldr r1, [sp] ; load and update dst address
|
| subs r7, r7, #0x10000
|
| add r0, r0, r2 ; move to next input line
|
| -
|
| - add r11, r2, #34 ; adding back block width(=16)
|
| - pld [r0, r11] ; preload next low
|
| -
|
| add r1, r1, #2 ; move over to next column
|
| str r1, [sp]
|
|
|
| - bne height_loop_1st_16_6
|
| -
|
| - add sp, sp, #4
|
| - ldmia sp!, {r4 - r11, pc}
|
| -
|
| - ENDP
|
| -
|
| -; --------------------------
|
| -; 8x8 version
|
| -; -----------------------------
|
| -|vp8_filter_block2d_first_pass_8x8_armv6| PROC
|
| - stmdb sp!, {r4 - r11, lr}
|
| -
|
| - ldr r11, [sp, #40] ; vp8_filter address
|
| - ldr r7, [sp, #36] ; output height
|
| -
|
| - add r4, r2, #10 ; preload next low
|
| - pld [r0, r4]
|
| -
|
| - sub r2, r2, r3 ; inside loop increments input array,
|
| - ; so the height loop only needs to add
|
| - ; r2 - width to the input pointer
|
| -
|
| - mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
|
| - add r12, r3, #16 ; square off the output
|
| - sub sp, sp, #4
|
| -
|
| - ldr r4, [r11] ; load up packed filter coefficients
|
| - ldr r5, [r11, #4]
|
| - ldr r6, [r11, #8]
|
| -
|
| - str r1, [sp] ; push destination to stack
|
| - mov r7, r7, lsl #16 ; height is top part of counter
|
| -
|
| -; six tap filter
|
| -|height_loop_1st_8_6|
|
| - ldrb r8, [r0, #-2] ; load source data
|
| - ldrb r9, [r0, #-1]
|
| - ldrb r10, [r0], #2
|
| - orr r7, r7, r3, lsr #2 ; construct loop counter
|
| -
|
| -|width_loop_1st_8_6|
|
| - ldrb r11, [r0, #-1]
|
| -
|
| - pkhbt lr, r8, r9, lsl #16 ; r9 | r8
|
| - pkhbt r8, r9, r10, lsl #16 ; r10 | r9
|
| -
|
| - ldrb r9, [r0]
|
| -
|
| - smuad lr, lr, r4 ; apply the filter
|
| - pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
| - smuad r8, r8, r4
|
| - pkhbt r11, r11, r9, lsl #16 ; r9 | r11
|
| -
|
| - smlad lr, r10, r5, lr
|
| - ldrb r10, [r0, #1]
|
| - smlad r8, r11, r5, r8
|
| - ldrb r11, [r0, #2]
|
| -
|
| - sub r7, r7, #1
|
| -
|
| - pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
| - pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
| -
|
| - smlad lr, r9, r6, lr
|
| - smlad r11, r10, r6, r8
|
| -
|
| - ands r10, r7, #0xff ; test loop counter
|
| -
|
| - add lr, lr, #0x40 ; round_shift_and_clamp
|
| - ldrneb r8, [r0, #-2] ; load data for next loop
|
| - usat lr, #8, lr, asr #7
|
| - add r11, r11, #0x40
|
| - ldrneb r9, [r0, #-1]
|
| - usat r11, #8, r11, asr #7
|
| -
|
| - strh lr, [r1], r12 ; result is transposed and stored, which
|
| - ; will make second pass filtering easier.
|
| - ldrneb r10, [r0], #2
|
| - strh r11, [r1], r12
|
| -
|
| - bne width_loop_1st_8_6
|
| -
|
| - ldr r1, [sp] ; load and update dst address
|
| - subs r7, r7, #0x10000
|
| - add r0, r0, r2 ; move to next input line
|
| -
|
| - add r11, r2, #18 ; adding back block width(=8)
|
| - pld [r0, r11] ; preload next low
|
| -
|
| - add r1, r1, #2 ; move over to next column
|
| - str r1, [sp]
|
| -
|
| - bne height_loop_1st_8_6
|
| + bne height_loop_1st_6
|
|
|
| add sp, sp, #4
|
| ldmia sp!, {r4 - r11, pc}
|
| @@ -440,10 +262,6 @@
|
| |vp8_filter_block2d_first_pass_only_armv6| PROC
|
| stmdb sp!, {r4 - r11, lr}
|
|
|
| - add r7, r2, r3 ; preload next low
|
| - add r7, r7, #2
|
| - pld [r0, r7]
|
| -
|
| ldr r4, [sp, #36] ; output pitch
|
| ldr r11, [sp, #40] ; HFilter address
|
| sub sp, sp, #8
|
| @@ -512,15 +330,16 @@
|
|
|
| bne width_loop_1st_only_6
|
|
|
| + ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
|
| + ;;IF ARCHITECTURE=6
|
| + ;pld [r0, r2]
|
| + ;;pld [r0, r9]
|
| + ;;ENDIF
|
| +
|
| ldr lr, [sp] ; load back output pitch
|
| ldr r12, [sp, #4] ; load back src increment (added to r0 below)
|
| subs r7, r7, #1
|
| add r0, r0, r12 ; update src for next loop
|
| -
|
| - add r11, r12, r3 ; preload next low
|
| - add r11, r11, #2
|
| - pld [r0, r11]
|
| -
|
| add r1, r1, lr ; update dst for next loop
|
|
|
| bne height_loop_1st_only_6
|
|
|