| Index: libvpx/source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm
|
| diff --git a/libvpx/source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm b/libvpx/source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm
|
| index 1cbbbcdef5e2533b43a7095b9588d07b474d939b..b6417dee65f2b1606a86915035124f20136ba8b3 100644
|
| --- a/libvpx/source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm
|
| +++ b/libvpx/source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm
|
| @@ -53,11 +53,14 @@ count RN r5
|
|
|
| ;r0 unsigned char *src_ptr,
|
| ;r1 int src_pixel_step,
|
| -;r2 const char *blimit,
|
| +;r2 const char *flimit,
|
| ;r3 const char *limit,
|
| ;stack const char *thresh,
|
| ;stack int count
|
|
|
| +;Note: All 16 elements in flimit are equal, so the code needs only one load
|
| +;for flimit. The same applies to limit and thresh.
|
| +
|
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
| |vp8_loop_filter_horizontal_edge_armv6| PROC
|
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
| @@ -69,18 +72,14 @@ count RN r5
|
| sub sp, sp, #16 ; create temp buffer
|
|
|
| ldr r9, [src], pstep ; p3
|
| - ldrb r4, [r2] ; blimit
|
| + ldr r4, [r2], #4 ; flimit
|
| ldr r10, [src], pstep ; p2
|
| - ldrb r2, [r3] ; limit
|
| + ldr r2, [r3], #4 ; limit
|
| ldr r11, [src], pstep ; p1
|
| - orr r4, r4, r4, lsl #8
|
| - ldrb r3, [r6] ; thresh
|
| - orr r2, r2, r2, lsl #8
|
| + uadd8 r4, r4, r4 ; flimit * 2
|
| + ldr r3, [r6], #4 ; thresh
|
| mov count, count, lsl #1 ; 4-in-parallel
|
| - orr r4, r4, r4, lsl #16
|
| - orr r3, r3, r3, lsl #8
|
| - orr r2, r2, r2, lsl #16
|
| - orr r3, r3, r3, lsl #16
|
| + uadd8 r4, r4, r2 ; flimit * 2 + limit
|
|
|
| |Hnext8|
|
| ; vp8_filter_mask() function
|
| @@ -254,6 +253,12 @@ count RN r5
|
|
|
| subs count, count, #1
|
|
|
| + ;pld [src]
|
| + ;pld [src, pstep]
|
| + ;pld [src, pstep, lsl #1]
|
| + ;pld [src, pstep, lsl #2]
|
| + ;pld [src, pstep, lsl #3]
|
| +
|
| ldrne r9, [src], pstep ; p3
|
| ldrne r10, [src], pstep ; p2
|
| ldrne r11, [src], pstep ; p1
|
| @@ -276,18 +281,14 @@ count RN r5
|
| sub sp, sp, #16 ; create temp buffer
|
|
|
| ldr r9, [src], pstep ; p3
|
| - ldrb r4, [r2] ; blimit
|
| + ldr r4, [r2], #4 ; flimit
|
| ldr r10, [src], pstep ; p2
|
| - ldrb r2, [r3] ; limit
|
| + ldr r2, [r3], #4 ; limit
|
| ldr r11, [src], pstep ; p1
|
| - orr r4, r4, r4, lsl #8
|
| - ldrb r3, [r6] ; thresh
|
| - orr r2, r2, r2, lsl #8
|
| + uadd8 r4, r4, r4 ; flimit * 2
|
| + ldr r3, [r6], #4 ; thresh
|
| mov count, count, lsl #1 ; 4-in-parallel
|
| - orr r4, r4, r4, lsl #16
|
| - orr r3, r3, r3, lsl #8
|
| - orr r2, r2, r2, lsl #16
|
| - orr r3, r3, r3, lsl #16
|
| + uadd8 r4, r4, r2 ; flimit * 2 + limit
|
|
|
| |MBHnext8|
|
|
|
| @@ -589,19 +590,15 @@ count RN r5
|
| sub sp, sp, #16 ; create temp buffer
|
|
|
| ldr r6, [src], pstep ; load source data
|
| - ldrb r4, [r2] ; blimit
|
| + ldr r4, [r2], #4 ; flimit
|
| ldr r7, [src], pstep
|
| - ldrb r2, [r3] ; limit
|
| + ldr r2, [r3], #4 ; limit
|
| ldr r8, [src], pstep
|
| - orr r4, r4, r4, lsl #8
|
| - ldrb r3, [r12] ; thresh
|
| - orr r2, r2, r2, lsl #8
|
| + uadd8 r4, r4, r4 ; flimit * 2
|
| + ldr r3, [r12], #4 ; thresh
|
| ldr lr, [src], pstep
|
| mov count, count, lsl #1 ; 4-in-parallel
|
| - orr r4, r4, r4, lsl #16
|
| - orr r3, r3, r3, lsl #8
|
| - orr r2, r2, r2, lsl #16
|
| - orr r3, r3, r3, lsl #16
|
| + uadd8 r4, r4, r2 ; flimit * 2 + limit
|
|
|
| |Vnext8|
|
|
|
| @@ -860,26 +857,18 @@ count RN r5
|
| sub src, src, #4 ; move src pointer down by 4
|
| ldr count, [sp, #40] ; count for 8-in-parallel
|
| ldr r12, [sp, #36] ; load thresh address
|
| - pld [src, #23] ; preload for next block
|
| sub sp, sp, #16 ; create temp buffer
|
|
|
| ldr r6, [src], pstep ; load source data
|
| - ldrb r4, [r2] ; blimit
|
| - pld [src, #23]
|
| + ldr r4, [r2], #4 ; flimit
|
| ldr r7, [src], pstep
|
| - ldrb r2, [r3] ; limit
|
| - pld [src, #23]
|
| + ldr r2, [r3], #4 ; limit
|
| ldr r8, [src], pstep
|
| - orr r4, r4, r4, lsl #8
|
| - ldrb r3, [r12] ; thresh
|
| - orr r2, r2, r2, lsl #8
|
| - pld [src, #23]
|
| + uadd8 r4, r4, r4 ; flimit * 2
|
| + ldr r3, [r12], #4 ; thresh
|
| ldr lr, [src], pstep
|
| mov count, count, lsl #1 ; 4-in-parallel
|
| - orr r4, r4, r4, lsl #16
|
| - orr r3, r3, r3, lsl #8
|
| - orr r2, r2, r2, lsl #16
|
| - orr r3, r3, r3, lsl #16
|
| + uadd8 r4, r4, r2 ; flimit * 2 + limit
|
|
|
| |MBVnext8|
|
| ; vp8_filter_mask() function
|
| @@ -919,7 +908,6 @@ count RN r5
|
| str lr, [sp, #8]
|
| ldr lr, [src], pstep
|
|
|
| -
|
| TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
|
|
|
| ldr lr, [sp, #8] ; load back (f)limit accumulator
|
| @@ -968,7 +956,6 @@ count RN r5
|
| beq mbvskip_filter ; skip filtering
|
|
|
|
|
| -
|
| ;vp8_hevmask() function
|
| ;calculate high edge variance
|
|
|
| @@ -1136,7 +1123,6 @@ count RN r5
|
| smlabb r8, r6, lr, r7
|
| smlatb r6, r6, lr, r7
|
| smlabb r9, r10, lr, r7
|
| -
|
| smlatb r10, r10, lr, r7
|
| ssat r8, #8, r8, asr #7
|
| ssat r6, #8, r6, asr #7
|
| @@ -1256,13 +1242,9 @@ count RN r5
|
| sub src, src, #4
|
| subs count, count, #1
|
|
|
| - pld [src, #23] ; preload for next block
|
| ldrne r6, [src], pstep ; load source data
|
| - pld [src, #23]
|
| ldrne r7, [src], pstep
|
| - pld [src, #23]
|
| ldrne r8, [src], pstep
|
| - pld [src, #23]
|
| ldrne lr, [src], pstep
|
|
|
| bne MBVnext8
|
|
|