source/libvpx/vpx_dsp/arm/variance_media.asm - Issue 1162573005: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vpx_dsp/arm/variance_media.asm

Issue 1162573005: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 ;

	2 ; Copyright (c) 2011 The WebM project authors. All Rights Reserved.

	3 ;

	4 ; Use of this source code is governed by a BSD-style license

	5 ; that can be found in the LICENSE file in the root of the source

	6 ; tree. An additional intellectual property rights grant can be found

	7 ; in the file PATENTS. All contributing project authors may

	8 ; be found in the AUTHORS file in the root of the source tree.

	9 ;

	10

	11

	12 EXPORT \|vpx_variance16x16_media\|

	13 EXPORT \|vpx_variance8x8_media\|

	14 EXPORT \|vpx_mse16x16_media\|

	15

	16 ARM

	17 REQUIRE8

	18 PRESERVE8

	19

	20 AREA \|\|.text\|\|, CODE, READONLY, ALIGN=2

	21

	; ---------------------------------------------------------------------
	; vpx_variance16x16_media
	;
	; Walks a 16x16 block of bytes, accumulating the signed sum of the
	; per-pixel differences (src - ref) and the sum of their squares.
	; Stores the sum of squares through the sse pointer and returns
	; sse - ((sum * sum) >> 8).
	;
	; In:
	;   r0      unsigned char *src_ptr
	;   r1      int            source_stride
	;   r2      unsigned char *ref_ptr
	;   r3      int            recon_stride
	;   stack   unsigned int  *sse
	; Out:
	;   r0      sse - ((sum * sum) >> 8)
	;   [sse]   sum of squared differences
	;
	; Register roles:
	;   r8      running signed sum of differences
	;   r11     running sum of squared differences
	;   r12     rows remaining
	;   lr      constant zero (second operand of sel / usad8)
	;   r4-r7, r9, r10  scratch
	; ---------------------------------------------------------------------
|vpx_variance16x16_media| PROC

	stmfd   sp!, {r4-r12, lr}       ; 10 words saved -> stack args start at [sp, #40]

	pld     [r0, r1, lsl #0]        ; preload hint: src_ptr + source_stride
	pld     [r2, r3, lsl #0]        ; preload hint: ref_ptr + recon_stride

	mov     r8, #0                  ; sum = 0
	mov     r11, #0                 ; sse = 0
	mov     r12, #16                ; rows remaining = block height
	mov     lr, #0                  ; constant zero; never written in the loop,
	                                ; so it is set once here instead of per row

loop16x16
	; ---- pixels 0..3 of the row ----
	ldr     r4, [r0, #0]            ; 4 src pixels
	ldr     r5, [r2, #0]            ; 4 ref pixels

	usub8   r6, r4, r5              ; per-byte src - ref, GE[i] set where src >= ref
	pld     [r0, r1, lsl #1]        ; preload hint: src_ptr + 2 * source_stride
	sel     r7, r6, lr              ; keep (src - ref) where src >= ref, else 0
	usub8   r9, r5, r4              ; per-byte ref - src, GE[i] set where ref >= src
	pld     [r2, r3, lsl #1]        ; preload hint: ref_ptr + 2 * recon_stride
	sel     r6, r9, lr              ; keep (ref - src) where ref >= src, else 0

	usad8   r4, r7, lr              ; sum of the positive differences
	usad8   r5, r6, lr              ; sum of the magnitudes of the negative ones
	orr     r6, r6, r7              ; |src - ref| for all 4 pixels
	add     r8, r8, r4              ; sum += positives (flags are not needed here)
	sub     r8, r8, r5              ; sum -= negatives

	uxtb16  r5, r6                  ; |diff| bytes 0 and 2 -> two halfwords
	uxtb16  r10, r6, ror #8         ; |diff| bytes 1 and 3 -> two halfwords
	smlad   r11, r5, r5, r11        ; sse += d0*d0 + d2*d2

	; ---- pixels 4..7 ----
	ldr     r4, [r0, #4]            ; 4 src pixels
	ldr     r5, [r2, #4]            ; 4 ref pixels
	smlad   r11, r10, r10, r11      ; sse += d1*d1 + d3*d3 (previous group)

	usub8   r6, r4, r5
	sel     r7, r6, lr              ; positive differences
	usub8   r9, r5, r4
	sel     r6, r9, lr              ; negative differences (as magnitudes)

	usad8   r4, r7, lr
	usad8   r5, r6, lr
	orr     r6, r6, r7              ; |src - ref| for all 4 pixels

	add     r8, r8, r4
	sub     r8, r8, r5

	uxtb16  r5, r6
	uxtb16  r10, r6, ror #8
	smlad   r11, r5, r5, r11

	; ---- pixels 8..11 ----
	ldr     r4, [r0, #8]            ; 4 src pixels
	ldr     r5, [r2, #8]            ; 4 ref pixels
	smlad   r11, r10, r10, r11      ; second half of the previous group

	usub8   r6, r4, r5
	sel     r7, r6, lr
	usub8   r9, r5, r4
	sel     r6, r9, lr

	usad8   r4, r7, lr
	usad8   r5, r6, lr
	orr     r6, r6, r7

	add     r8, r8, r4
	sub     r8, r8, r5

	uxtb16  r5, r6
	uxtb16  r10, r6, ror #8
	smlad   r11, r5, r5, r11

	; ---- pixels 12..15 ----
	ldr     r4, [r0, #12]           ; 4 src pixels
	ldr     r5, [r2, #12]           ; 4 ref pixels
	smlad   r11, r10, r10, r11      ; second half of the previous group

	usub8   r6, r4, r5
	add     r0, r0, r1              ; src_ptr -> next row (add leaves GE untouched)
	sel     r7, r6, lr
	usub8   r9, r5, r4
	add     r2, r2, r3              ; ref_ptr -> next row
	sel     r6, r9, lr

	usad8   r4, r7, lr
	usad8   r5, r6, lr
	orr     r6, r6, r7

	add     r8, r8, r4
	sub     r8, r8, r5

	uxtb16  r5, r6
	uxtb16  r10, r6, ror #8
	smlad   r11, r5, r5, r11
	smlad   r11, r10, r10, r11

	subs    r12, r12, #1            ; one row done
	bne     loop16x16

	; ---- result ----
	ldr     r6, [sp, #40]           ; sse pointer (first stack argument)
	mul     r0, r8, r8              ; sum * sum
	str     r11, [r6]               ; *sse = sse
	; 256 byte differences give |sum| <= 65280, so sum*sum can exceed 2^31:
	; the shift must be logical, not arithmetic.
	sub     r0, r11, r0, lsr #8     ; return sse - ((sum * sum) >> 8)

	ldmfd   sp!, {r4-r12, pc}

	ENDP

	154

	; ---------------------------------------------------------------------
	; vpx_variance8x8_media
	;
	; Walks an 8x8 block of bytes, accumulating the signed sum of the
	; per-pixel differences (src - ref) and the sum of their squares.
	; Stores the sum of squares through the sse pointer and returns
	; sse - ((sum * sum) >> 6).
	;
	; In:
	;   r0      unsigned char *src_ptr
	;   r1      int            source_stride
	;   r2      unsigned char *ref_ptr
	;   r3      int            recon_stride
	;   stack   unsigned int  *sse
	; Out:
	;   r0      sse - ((sum * sum) >> 6)
	;   [sse]   sum of squared differences
	;
	; Register roles:
	;   r4      running signed sum of differences
	;   r5      running sum of squared differences
	;   r12     rows remaining
	;   lr      constant zero (second operand of sel / usad8)
	;   r6-r10  scratch
	; ---------------------------------------------------------------------
|vpx_variance8x8_media| PROC

	push    {r4-r10, lr}            ; 8 words saved -> stack args start at [sp, #32]

	pld     [r0, r1, lsl #0]        ; preload hint: src_ptr + source_stride
	pld     [r2, r3, lsl #0]        ; preload hint: ref_ptr + recon_stride

	mov     r12, #8                 ; rows remaining = block height
	mov     r4, #0                  ; sum = 0
	mov     r5, #0                  ; sse = 0
	mov     lr, #0                  ; constant zero; never written in the loop,
	                                ; so it is set once here instead of per row

loop8x8
	; ---- pixels 0..3 of the row ----
	ldr     r6, [r0, #0x0]          ; 4 src pixels
	ldr     r7, [r2, #0x0]          ; 4 ref pixels

	usub8   r8, r6, r7              ; per-byte src - ref, GE[i] set where src >= ref
	pld     [r0, r1, lsl #1]        ; preload hint: src_ptr + 2 * source_stride
	sel     r10, r8, lr             ; keep (src - ref) where src >= ref, else 0
	usub8   r9, r7, r6              ; per-byte ref - src, GE[i] set where ref >= src
	pld     [r2, r3, lsl #1]        ; preload hint: ref_ptr + 2 * recon_stride
	sel     r8, r9, lr              ; keep (ref - src) where ref >= src, else 0

	usad8   r6, r10, lr             ; sum of the positive differences
	usad8   r7, r8, lr              ; sum of the magnitudes of the negative ones
	orr     r8, r8, r10             ; |src - ref| for all 4 pixels
	add     r4, r4, r6              ; sum += positives
	sub     r4, r4, r7              ; sum -= negatives

	uxtb16  r7, r8                  ; |diff| bytes 0 and 2 -> two halfwords
	uxtb16  r10, r8, ror #8         ; |diff| bytes 1 and 3 -> two halfwords
	smlad   r5, r7, r7, r5          ; sse += d0*d0 + d2*d2

	; ---- pixels 4..7 ----
	ldr     r6, [r0, #0x4]          ; 4 src pixels
	ldr     r7, [r2, #0x4]          ; 4 ref pixels
	smlad   r5, r10, r10, r5        ; sse += d1*d1 + d3*d3 (previous group)

	usub8   r8, r6, r7
	add     r0, r0, r1              ; src_ptr -> next row (add leaves GE untouched)
	sel     r10, r8, lr
	usub8   r9, r7, r6
	add     r2, r2, r3              ; ref_ptr -> next row
	sel     r8, r9, lr

	usad8   r6, r10, lr
	usad8   r7, r8, lr
	orr     r8, r8, r10             ; |src - ref| for all 4 pixels

	add     r4, r4, r6
	sub     r4, r4, r7

	uxtb16  r7, r8
	uxtb16  r10, r8, ror #8
	smlad   r5, r7, r7, r5
	subs    r12, r12, #1            ; one row done; smlad below leaves Z intact
	smlad   r5, r10, r10, r5

	bne     loop8x8

	; ---- result ----
	ldr     r8, [sp, #32]           ; sse pointer (first stack argument)
	mul     r1, r4, r4              ; sum * sum (non-negative, at most 16320^2 < 2^31)
	str     r5, [r8]                ; *sse = sse
	; Logical shift, matching vpx_variance16x16_media; the product is a
	; non-negative square below 2^31, so lsr and asr give the same value.
	sub     r0, r5, r1, lsr #6      ; return sse - ((sum * sum) >> 6)

	pop     {r4-r10, pc}

	ENDP

	237

	; ---------------------------------------------------------------------
	; vpx_mse16x16_media
	;
	; Walks a 16x16 block of bytes and accumulates the sum of squared
	; per-pixel differences between src and ref. Stores that value through
	; the sse pointer and also returns it.
	;
	; In:
	;   r0      unsigned char *src_ptr
	;   r1      int            source_stride
	;   r2      unsigned char *ref_ptr
	;   r3      int            recon_stride
	;   stack   unsigned int  *sse
	; Out:
	;   r0      sum of squared differences
	;   [sse]   same value
	;
	; Note: based on vpx_variance16x16_media. The sum of differences is not
	; needed here, so the usad8 partial-sum instructions are omitted; only
	; the per-byte absolute differences are formed and squared.
	;
	; Register roles:
	;   r4      running sum of squared differences
	;   r12     rows remaining
	;   lr      constant zero (second operand of sel)
	;   r5-r9   scratch
	; ---------------------------------------------------------------------
|vpx_mse16x16_media| PROC

	push    {r4-r9, lr}             ; 7 words saved -> stack args start at [sp, #28]
	                                ; (no calls are made from this routine)

	pld     [r0, r1, lsl #0]        ; preload hint: src_ptr + source_stride
	pld     [r2, r3, lsl #0]        ; preload hint: ref_ptr + recon_stride

	mov     r12, #16                ; rows remaining = block height
	mov     r4, #0                  ; sse = 0
	mov     lr, #0                  ; constant zero; never written in the loop,
	                                ; so it is set once here instead of per row

loopmse
	; ---- pixels 0..3 of the row ----
	ldr     r5, [r0, #0x0]          ; 4 src pixels
	ldr     r6, [r2, #0x0]          ; 4 ref pixels

	usub8   r8, r5, r6              ; per-byte src - ref, GE[i] set where src >= ref
	pld     [r0, r1, lsl #1]        ; preload hint: src_ptr + 2 * source_stride
	sel     r7, r8, lr              ; keep (src - ref) where src >= ref, else 0
	usub8   r9, r6, r5              ; per-byte ref - src, GE[i] set where ref >= src
	pld     [r2, r3, lsl #1]        ; preload hint: ref_ptr + 2 * recon_stride
	sel     r8, r9, lr              ; keep (ref - src) where ref >= src, else 0
	orr     r8, r8, r7              ; |src - ref| for all 4 pixels

	ldr     r5, [r0, #0x4]          ; next 4 src pixels, loaded early

	uxtb16  r6, r8                  ; |diff| bytes 0 and 2 -> two halfwords
	uxtb16  r7, r8, ror #8          ; |diff| bytes 1 and 3 -> two halfwords
	smlad   r4, r6, r6, r4          ; sse += d0*d0 + d2*d2

	; ---- pixels 4..7 ----
	ldr     r6, [r2, #0x4]          ; 4 ref pixels
	smlad   r4, r7, r7, r4          ; sse += d1*d1 + d3*d3 (previous group)

	usub8   r8, r5, r6
	sel     r7, r8, lr
	usub8   r9, r6, r5
	sel     r8, r9, lr
	orr     r8, r8, r7              ; |src - ref| for all 4 pixels

	ldr     r5, [r0, #0x8]          ; next 4 src pixels, loaded early

	uxtb16  r6, r8
	uxtb16  r7, r8, ror #8
	smlad   r4, r6, r6, r4

	; ---- pixels 8..11 ----
	ldr     r6, [r2, #0x8]          ; 4 ref pixels
	smlad   r4, r7, r7, r4          ; second half of the previous group

	usub8   r8, r5, r6
	sel     r7, r8, lr
	usub8   r9, r6, r5
	sel     r8, r9, lr
	orr     r8, r8, r7

	ldr     r5, [r0, #0xc]          ; next 4 src pixels, loaded early

	uxtb16  r6, r8
	uxtb16  r7, r8, ror #8
	smlad   r4, r6, r6, r4

	; ---- pixels 12..15 ----
	ldr     r6, [r2, #0xc]          ; 4 ref pixels
	smlad   r4, r7, r7, r4          ; second half of the previous group

	usub8   r8, r5, r6
	add     r0, r0, r1              ; src_ptr -> next row (add leaves GE untouched)
	sel     r7, r8, lr
	usub8   r9, r6, r5
	add     r2, r2, r3              ; ref_ptr -> next row
	sel     r8, r9, lr
	orr     r8, r8, r7

	subs    r12, r12, #1            ; one row done; the instructions below
	                                ; do not alter Z before the branch

	uxtb16  r6, r8
	uxtb16  r7, r8, ror #8
	smlad   r4, r6, r6, r4
	smlad   r4, r7, r7, r4

	bne     loopmse

	; ---- result ----
	ldr     r1, [sp, #28]           ; sse pointer (first stack argument)
	mov     r0, r4                  ; return sse
	str     r4, [r1]                ; *sse = sse

	pop     {r4-r9, pc}

	ENDP

	357

	358 END

OLD	NEW

« no previous file with comments | « source/libvpx/vpx/vpx_codec.mk ('k') | source/libvpx/vpx_dsp/arm/variance_neon.c » ('j') | no next file with comments »