| Index: libvpx/source/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm
|
| diff --git a/libvpx/source/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/libvpx/source/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm
|
| index 5e00cf01bbd7296ea6bd9455470c1f2d2baa617b..0137120363bb095851cbe7953296b01c728e01c9 100644
|
| --- a/libvpx/source/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm
|
| +++ b/libvpx/source/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm
|
| @@ -45,28 +45,35 @@
|
| MEND
|
|
|
|
|
| -
|
| src RN r0
|
| pstep RN r1
|
|
|
| ;r0 unsigned char *src_ptr,
|
| ;r1 int src_pixel_step,
|
| -;r2 const char *blimit
|
| +;r2 const char *flimit,
|
| +;r3 const char *limit,
|
| +;stack const char *thresh,
|
| +;stack int count
|
| +
|
| +; All 16 elements in flimit are equal. So, in the code, only one load is needed
|
| +; for flimit. Same applies to limit. thresh is not used in simple loopfilter
|
|
|
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
| |vp8_loop_filter_simple_horizontal_edge_armv6| PROC
|
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
| stmdb sp!, {r4 - r11, lr}
|
|
|
| - ldrb r12, [r2] ; blimit
|
| + ldr r12, [r3] ; limit
|
| ldr r3, [src, -pstep, lsl #1] ; p1
|
| ldr r4, [src, -pstep] ; p0
|
| ldr r5, [src] ; q0
|
| ldr r6, [src, pstep] ; q1
|
| - orr r12, r12, r12, lsl #8 ; blimit
|
| + ldr r7, [r2] ; flimit
|
| ldr r2, c0x80808080
|
| - orr r12, r12, r12, lsl #16 ; blimit
|
| - mov r9, #4 ; double the count. we're doing 4 at a time
|
| + ldr r9, [sp, #40] ; count for 8-in-parallel
|
| + uadd8 r7, r7, r7 ; flimit * 2
|
| + mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time
|
| + uadd8 r12, r7, r12 ; flimit * 2 + limit
|
| mov lr, #0 ; need 0 in a couple places
|
|
|
| |simple_hnext8|
|
| @@ -141,32 +148,30 @@ pstep RN r1
|
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
| stmdb sp!, {r4 - r11, lr}
|
|
|
| - ldrb r12, [r2] ; r12: blimit
|
| + ldr r12, [r2] ; r12: flimit
|
| ldr r2, c0x80808080
|
| - orr r12, r12, r12, lsl #8
|
| + ldr r7, [r3] ; limit
|
|
|
| ; load soure data to r7, r8, r9, r10
|
| ldrh r3, [src, #-2]
|
| - pld [src, #23] ; preload for next block
|
| ldrh r4, [src], pstep
|
| - orr r12, r12, r12, lsl #16
|
| + uadd8 r12, r12, r12 ; flimit * 2
|
|
|
| ldrh r5, [src, #-2]
|
| - pld [src, #23]
|
| ldrh r6, [src], pstep
|
| + uadd8 r12, r12, r7 ; flimit * 2 + limit
|
|
|
| pkhbt r7, r3, r4, lsl #16
|
|
|
| ldrh r3, [src, #-2]
|
| - pld [src, #23]
|
| ldrh r4, [src], pstep
|
| + ldr r11, [sp, #40] ; count (r11) for 8-in-parallel
|
|
|
| pkhbt r8, r5, r6, lsl #16
|
|
|
| ldrh r5, [src, #-2]
|
| - pld [src, #23]
|
| ldrh r6, [src], pstep
|
| - mov r11, #4 ; double the count. we're doing 4 at a time
|
| + mov r11, r11, lsl #1 ; 4-in-parallel
|
|
|
| |simple_vnext8|
|
| ; vp8_simple_filter_mask() function
|
| @@ -254,23 +259,19 @@ pstep RN r1
|
|
|
| ; load soure data to r7, r8, r9, r10
|
| ldrneh r3, [src, #-2]
|
| - pld [src, #23] ; preload for next block
|
| ldrneh r4, [src], pstep
|
|
|
| ldrneh r5, [src, #-2]
|
| - pld [src, #23]
|
| ldrneh r6, [src], pstep
|
|
|
| pkhbt r7, r3, r4, lsl #16
|
|
|
| ldrneh r3, [src, #-2]
|
| - pld [src, #23]
|
| ldrneh r4, [src], pstep
|
|
|
| pkhbt r8, r5, r6, lsl #16
|
|
|
| ldrneh r5, [src, #-2]
|
| - pld [src, #23]
|
| ldrneh r6, [src], pstep
|
|
|
| bne simple_vnext8
|
|
|