| Index: source/libvpx/vp8/common/arm/armv6/filter_v6.asm
|
| ===================================================================
|
| --- source/libvpx/vp8/common/arm/armv6/filter_v6.asm (revision 96967)
|
| +++ source/libvpx/vp8/common/arm/armv6/filter_v6.asm (working copy)
|
| @@ -10,6 +10,8 @@
|
|
|
|
|
| EXPORT |vp8_filter_block2d_first_pass_armv6|
|
| + EXPORT |vp8_filter_block2d_first_pass_16x16_armv6|
|
| + EXPORT |vp8_filter_block2d_first_pass_8x8_armv6|
|
| EXPORT |vp8_filter_block2d_second_pass_armv6|
|
| EXPORT |vp8_filter4_block2d_second_pass_armv6|
|
| EXPORT |vp8_filter_block2d_first_pass_only_armv6|
|
| @@ -40,11 +42,6 @@
|
| add r12, r3, #16 ; square off the output
|
| sub sp, sp, #4
|
|
|
| - ;;IF ARCHITECTURE=6
|
| - ;pld [r0, #-2]
|
| - ;;pld [r0, #30]
|
| - ;;ENDIF
|
| -
|
| ldr r4, [r11] ; load up packed filter coefficients
|
| ldr r5, [r11, #4]
|
| ldr r6, [r11, #8]
|
| @@ -101,15 +98,10 @@
|
|
|
| bne width_loop_1st_6
|
|
|
| - ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
|
| - ;;IF ARCHITECTURE=6
|
| - ;pld [r0, r2]
|
| - ;;pld [r0, r9]
|
| - ;;ENDIF
|
| -
|
| ldr r1, [sp] ; load and update dst address
|
| subs r7, r7, #0x10000
|
| add r0, r0, r2 ; move to next input line
|
| +
|
| add r1, r1, #2 ; move over to next column
|
| str r1, [sp]
|
|
|
| @@ -120,6 +112,192 @@
|
|
|
| ENDP
|
|
|
| +; --------------------------
|
| +; 16x16 version: first-pass six-tap filter over bytes read via r0; each result is stored as a 16-bit value via r1, transposed
|
| +; -----------------------------
|
| +|vp8_filter_block2d_first_pass_16x16_armv6| PROC
|
| + stmdb sp!, {r4 - r11, lr} ; pushes 9 registers = 36 bytes
|
| +
|
| + ldr r11, [sp, #40] ; vp8_filter address (second word above the 36 saved bytes)
|
| + ldr r7, [sp, #36] ; output height (first word above the 36 saved bytes)
|
| +
|
| + add r4, r2, #18 ; offset = r2 + 18, i.e. 18 bytes into the next input line
|
| + pld [r0, r4] ; cache preload hint for that address
|
| +
|
| + sub r2, r2, r3 ; inside loop increments input array,
|
| + ; so the height loop only needs to add
|
| + ; r2 - width to the input pointer
|
| +
|
| + mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
|
| + add r12, r3, #16 ; r12 = 2*width + 16: byte step between successive stores (squares off the output)
|
| + sub sp, sp, #4 ; reserve one stack word for the destination pointer
|
| +
|
| + ldr r4, [r11] ; load up packed filter coefficients (three words: r4, r5, r6)
|
| + ldr r5, [r11, #4]
|
| + ldr r6, [r11, #8]
|
| +
|
| + str r1, [sp] ; save destination pointer in the reserved stack word
|
| + mov r7, r7, lsl #16 ; height is top part of counter
|
| +
|
| +; six tap filter: each width-loop pass produces two outputs (lr and r11)
|
| +|height_loop_1st_16_6|
|
| + ldrb r8, [r0, #-2] ; load source data
|
| + ldrb r9, [r0, #-1]
|
| + ldrb r10, [r0], #2 ; post-increment: r0 advances by 2 bytes
|
| + orr r7, r7, r3, lsr #2 ; construct loop counter: low bits = (2*width)/4 = width/2 passes
|
| +
|
| +|width_loop_1st_16_6|
|
| + ldrb r11, [r0, #-1]
|
| +
|
| + pkhbt lr, r8, r9, lsl #16 ; r9 | r8
|
| + pkhbt r8, r9, r10, lsl #16 ; r10 | r9
|
| +
|
| + ldrb r9, [r0]
|
| +
|
| + smuad lr, lr, r4 ; apply the filter: first coefficient pair, first output
|
| + pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
| + smuad r8, r8, r4 ; first coefficient pair, second output
|
| + pkhbt r11, r11, r9, lsl #16 ; r9 | r11
|
| +
|
| + smlad lr, r10, r5, lr ; accumulate second coefficient pair
|
| + ldrb r10, [r0, #1]
|
| + smlad r8, r11, r5, r8
|
| + ldrb r11, [r0, #2]
|
| +
|
| + sub r7, r7, #1 ; one fewer pass left in this row
|
| +
|
| + pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
| + pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
| +
|
| + smlad lr, r9, r6, lr ; accumulate third coefficient pair
|
| + smlad r11, r10, r6, r8
|
| +
|
| + ands r10, r7, #0xff ; test loop counter (Z set when the row is finished)
|
| +
|
| + add lr, lr, #0x40 ; round_shift_and_clamp: add 0x40 before the shift by 7
|
| + ldrneb r8, [r0, #-2] ; load data for next loop (skipped on the last pass)
|
| + usat lr, #8, lr, asr #7 ; (value >> 7) saturated to 0..255
|
| + add r11, r11, #0x40
|
| + ldrneb r9, [r0, #-1]
|
| + usat r11, #8, r11, asr #7
|
| +
|
| + strh lr, [r1], r12 ; result is transposed and stored, which
|
| + ; will make second pass filtering easier.
|
| + ldrneb r10, [r0], #2
|
| + strh r11, [r1], r12
|
| +
|
| + bne width_loop_1st_16_6 ; flags still come from the ands above
|
| +
|
| + ldr r1, [sp] ; load and update dst address
|
| + subs r7, r7, #0x10000 ; decrement the height held in the upper half
|
| + add r0, r0, r2 ; move to next input line
|
| +
|
| + add r11, r2, #34 ; adding back block width(=16): r2 + 34 = original r2 + 18
|
| + pld [r0, r11] ; preload the input line after the one r0 now points to
|
| +
|
| + add r1, r1, #2 ; move over to next column
|
| + str r1, [sp]
|
| +
|
| + bne height_loop_1st_16_6 ; flags still come from the subs above
|
| +
|
| + add sp, sp, #4 ; release the stack word reserved for the destination pointer
|
| + ldmia sp!, {r4 - r11, pc} ; restore registers and return
|
| +
|
| + ENDP
|
| +
|
| +; --------------------------
|
| +; 8x8 version: first-pass six-tap filter over bytes read via r0; each result is stored as a 16-bit value via r1, transposed
|
| +; -----------------------------
|
| +|vp8_filter_block2d_first_pass_8x8_armv6| PROC
|
| + stmdb sp!, {r4 - r11, lr} ; pushes 9 registers = 36 bytes
|
| +
|
| + ldr r11, [sp, #40] ; vp8_filter address (second word above the 36 saved bytes)
|
| + ldr r7, [sp, #36] ; output height (first word above the 36 saved bytes)
|
| +
|
| + add r4, r2, #10 ; offset = r2 + 10, i.e. 10 bytes into the next input line
|
| + pld [r0, r4] ; cache preload hint for that address
|
| +
|
| + sub r2, r2, r3 ; inside loop increments input array,
|
| + ; so the height loop only needs to add
|
| + ; r2 - width to the input pointer
|
| +
|
| + mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
|
| + add r12, r3, #16 ; r12 = 2*width + 16: byte step between successive stores (squares off the output)
|
| + sub sp, sp, #4 ; reserve one stack word for the destination pointer
|
| +
|
| + ldr r4, [r11] ; load up packed filter coefficients (three words: r4, r5, r6)
|
| + ldr r5, [r11, #4]
|
| + ldr r6, [r11, #8]
|
| +
|
| + str r1, [sp] ; save destination pointer in the reserved stack word
|
| + mov r7, r7, lsl #16 ; height is top part of counter
|
| +
|
| +; six tap filter: each width-loop pass produces two outputs (lr and r11)
|
| +|height_loop_1st_8_6|
|
| + ldrb r8, [r0, #-2] ; load source data
|
| + ldrb r9, [r0, #-1]
|
| + ldrb r10, [r0], #2 ; post-increment: r0 advances by 2 bytes
|
| + orr r7, r7, r3, lsr #2 ; construct loop counter: low bits = (2*width)/4 = width/2 passes
|
| +
|
| +|width_loop_1st_8_6|
|
| + ldrb r11, [r0, #-1]
|
| +
|
| + pkhbt lr, r8, r9, lsl #16 ; r9 | r8
|
| + pkhbt r8, r9, r10, lsl #16 ; r10 | r9
|
| +
|
| + ldrb r9, [r0]
|
| +
|
| + smuad lr, lr, r4 ; apply the filter: first coefficient pair, first output
|
| + pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
| + smuad r8, r8, r4 ; first coefficient pair, second output
|
| + pkhbt r11, r11, r9, lsl #16 ; r9 | r11
|
| +
|
| + smlad lr, r10, r5, lr ; accumulate second coefficient pair
|
| + ldrb r10, [r0, #1]
|
| + smlad r8, r11, r5, r8
|
| + ldrb r11, [r0, #2]
|
| +
|
| + sub r7, r7, #1 ; one fewer pass left in this row
|
| +
|
| + pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
| + pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
| +
|
| + smlad lr, r9, r6, lr ; accumulate third coefficient pair
|
| + smlad r11, r10, r6, r8
|
| +
|
| + ands r10, r7, #0xff ; test loop counter (Z set when the row is finished)
|
| +
|
| + add lr, lr, #0x40 ; round_shift_and_clamp: add 0x40 before the shift by 7
|
| + ldrneb r8, [r0, #-2] ; load data for next loop (skipped on the last pass)
|
| + usat lr, #8, lr, asr #7 ; (value >> 7) saturated to 0..255
|
| + add r11, r11, #0x40
|
| + ldrneb r9, [r0, #-1]
|
| + usat r11, #8, r11, asr #7
|
| +
|
| + strh lr, [r1], r12 ; result is transposed and stored, which
|
| + ; will make second pass filtering easier.
|
| + ldrneb r10, [r0], #2
|
| + strh r11, [r1], r12
|
| +
|
| + bne width_loop_1st_8_6 ; flags still come from the ands above
|
| +
|
| + ldr r1, [sp] ; load and update dst address
|
| + subs r7, r7, #0x10000 ; decrement the height held in the upper half
|
| + add r0, r0, r2 ; move to next input line
|
| +
|
| + add r11, r2, #18 ; adding back block width(=8): r2 + 18 = original r2 + 10
|
| + pld [r0, r11] ; preload the input line after the one r0 now points to
|
| +
|
| + add r1, r1, #2 ; move over to next column
|
| + str r1, [sp]
|
| +
|
| + bne height_loop_1st_8_6 ; flags still come from the subs above
|
| +
|
| + add sp, sp, #4 ; release the stack word reserved for the destination pointer
|
| + ldmia sp!, {r4 - r11, pc} ; restore registers and return
|
| +
|
| + ENDP
|
| +
|
| ;---------------------------------
|
| ; r0 short *src_ptr,
|
| ; r1 unsigned char *output_ptr,
|
| @@ -262,6 +440,10 @@
|
| |vp8_filter_block2d_first_pass_only_armv6| PROC
|
| stmdb sp!, {r4 - r11, lr}
|
|
|
| + add r7, r2, r3 ; preload next low
|
| + add r7, r7, #2
|
| + pld [r0, r7]
|
| +
|
| ldr r4, [sp, #36] ; output pitch
|
| ldr r11, [sp, #40] ; HFilter address
|
| sub sp, sp, #8
|
| @@ -330,16 +512,15 @@
|
|
|
| bne width_loop_1st_only_6
|
|
|
| - ;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
|
| - ;;IF ARCHITECTURE=6
|
| - ;pld [r0, r2]
|
| - ;;pld [r0, r9]
|
| - ;;ENDIF
|
| -
|
| ldr lr, [sp] ; load back output pitch
|
| ldr r12, [sp, #4] ; load back output pitch
|
| subs r7, r7, #1
|
| add r0, r0, r12 ; updata src for next loop
|
| +
|
| + add r11, r12, r3 ; preload next low
|
| + add r11, r11, #2
|
| + pld [r0, r11]
|
| +
|
| add r1, r1, lr ; update dst for next loop
|
|
|
| bne height_loop_1st_only_6
|
|
|