source/libvpx/vp9/common/x86/vp9_postproc_sse2.asm - Issue 11555023: libvpx: Add VP9 decoder.

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_postproc_sse2.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 ;

	2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.

	3 ;

	4 ; Use of this source code is governed by a BSD-style license

	5 ; that can be found in the LICENSE file in the root of the source

	6 ; tree. An additional intellectual property rights grant can be found

	7 ; in the file PATENTS. All contributing project authors may

	8 ; be found in the AUTHORS file in the root of the source tree.

	9 ;

	10

	11

	12 %include "vpx_ports/x86_abi_support.asm"

	13

	;void vp9_post_proc_down_and_across_xmm
	;(
	;    unsigned char *src_ptr,
	;    unsigned char *dst_ptr,
	;    int src_pixels_per_line,
	;    int dst_pixels_per_line,
	;    int rows,
	;    int cols,
	;    int flimit
	;)
	;
	; Two-pass 5-tap smoothing filter, processed 8 pixels at a time.
	;
	; For each row:
	;   1. "down" pass: reads src rows r-2 .. r+2 and writes one row to dst.
	;   2. "across" pass: filters that dst row horizontally, in place, using
	;      pixels p-2 .. p+2.
	;
	; In both passes a pixel is replaced by
	;       (4*p0 + n1 + n2 + n3 + n4 + 4) >> 3
	; (n1..n4 being the four neighbours) unless the absolute difference between
	; p0 and any neighbour is greater than flimit, in which case p0 is kept.
	;
	; Register roles inside the loops:
	;   rsi = src pointer, rdi = dst pointer, rax = src pitch,
	;   rcx = rows remaining, rdx = column offset within the row,
	;   xmm0 = zero, xmm2 = flimit replicated into 8 words,
	;   mm0  = previous 8 filtered bytes of the across pass (written one step late).
	;
	; The function uses mm0 and contains no emms; callers presumably reset the
	; MMX/x87 state themselves -- TODO confirm.
	global sym(vp9_post_proc_down_and_across_xmm)
	sym(vp9_post_proc_down_and_across_xmm):
	    push        rbp
	    mov         rbp, rsp
	    SHADOW_ARGS_TO_STACK 7
	    SAVE_XMM 7
	    GET_GOT     rbx
	    push        rsi
	    push        rdi
	    ; end prolog

	%if ABI_IS_32BIT=1 && CONFIG_PIC=1
	    ALIGN_STACK 16, rax
	    ; move the global rd onto the stack, since we don't have enough registers
	    ; to do PIC addressing
	    movdqa      xmm0, [GLOBAL(rd42)]
	    sub         rsp, 16
	    movdqa      [rsp], xmm0
	%define RD42 [rsp]
	%else
	%define RD42 [GLOBAL(rd42)]
	%endif


	    movd        xmm2, dword ptr arg(6)      ; flimit
	    punpcklwd   xmm2, xmm2
	    punpckldq   xmm2, xmm2
	    punpcklqdq  xmm2, xmm2                  ; xmm2 = flimit in all 8 words

	    mov         rsi, arg(0)                 ; src_ptr
	    mov         rdi, arg(1)                 ; dst_ptr

	    movsxd      rcx, DWORD PTR arg(4)       ; rows
	    movsxd      rax, DWORD PTR arg(2)       ; src_pixels_per_line
	    pxor        xmm0, xmm0                  ; xmm0 = 0, used to widen bytes to words

	.nextrow:

	    xor         rdx, rdx                    ; rdx = column counter for the down pass
	.nextcol:
	    movq        xmm3, QWORD PTR [rsi]       ; row r, 8 pixels
	    punpcklbw   xmm3, xmm0                  ; xmm3 = p0 as words
	    movdqa      xmm1, xmm3                  ; xmm1 = p0 (kept for thresholding/selection)
	    psllw       xmm3, 2                     ; xmm3 = 4*p0 (running sum)

	    movq        xmm5, QWORD PTR [rsi + rax] ; row r+1
	    punpcklbw   xmm5, xmm0
	    paddusw     xmm3, xmm5                  ; sum += r+1

	    ; thresholding
	    movdqa      xmm7, xmm1
	    psubusw     xmm7, xmm5                  ; max(r0 - r1, 0)
	    psubusw     xmm5, xmm1                  ; max(r1 - r0, 0)
	    paddusw     xmm7, xmm5                  ; xmm7 = abs(r0 - r1)
	    pcmpgtw     xmm7, xmm2                  ; xmm7 = mask where abs diff > flimit

	    movq        xmm5, QWORD PTR [rsi + 2*rax] ; row r+2
	    punpcklbw   xmm5, xmm0
	    paddusw     xmm3, xmm5                  ; sum += r+2

	    ; thresholding
	    movdqa      xmm6, xmm1
	    psubusw     xmm6, xmm5                  ; max(r0 - r2, 0)
	    psubusw     xmm5, xmm1                  ; max(r2 - r0, 0)
	    paddusw     xmm6, xmm5                  ; xmm6 = abs(r0 - r2)
	    pcmpgtw     xmm6, xmm2
	    por         xmm7, xmm6                  ; accumulate thresholds


	    neg         rax                         ; rax = -pitch to reach the rows above
	    movq        xmm5, QWORD PTR [rsi+2*rax] ; row r-2
	    punpcklbw   xmm5, xmm0
	    paddusw     xmm3, xmm5                  ; sum += r-2

	    ; thresholding
	    movdqa      xmm6, xmm1
	    psubusw     xmm6, xmm5                  ; max(r0 - r-2, 0)
	    psubusw     xmm5, xmm1                  ; max(r-2 - r0, 0)
	    paddusw     xmm6, xmm5                  ; xmm6 = abs(r0 - r-2)
	    pcmpgtw     xmm6, xmm2
	    por         xmm7, xmm6                  ; accumulate thresholds

	    movq        xmm4, QWORD PTR [rsi+rax]   ; row r-1
	    punpcklbw   xmm4, xmm0
	    paddusw     xmm3, xmm4                  ; sum += r-1

	    ; thresholding
	    movdqa      xmm6, xmm1
	    psubusw     xmm6, xmm4                  ; max(r0 - r-1, 0)
	    psubusw     xmm4, xmm1                  ; max(r-1 - r0, 0)
	    paddusw     xmm6, xmm4                  ; xmm6 = abs(r0 - r-1)
	    pcmpgtw     xmm6, xmm2
	    por         xmm7, xmm6                  ; accumulate thresholds


	    paddusw     xmm3, RD42                  ; sum += rounding value (4)
	    psraw       xmm3, 3                     ; sum /= 8

	    pand        xmm1, xmm7                  ; source where a threshold was exceeded
	    pandn       xmm7, xmm3                  ; blurred result everywhere else
	    paddusw     xmm1, xmm7                  ; combine (the two halves are disjoint)

	    packuswb    xmm1, xmm0                  ; pack to bytes
	    movq        QWORD PTR [rdi], xmm1

	    neg         rax                         ; pitch is positive again
	    add         rsi, 8
	    add         rdi, 8

	    add         rdx, 8
	    cmp         edx, dword arg(5)           ; cols

	    jl          .nextcol

	    ; done with all the cols, start the across filtering in place
	    sub         rsi, rdx                    ; rewind both pointers to the row start
	    sub         rdi, rdx

	    xor         rdx, rdx
	    ; Prime the delayed store: the first iteration writes mm0 back to
	    ; [rdi-8], so load what is already there.
	    movq        mm0, QWORD PTR [rdi-8]

	.acrossnextcol:
	    movq        xmm7, QWORD PTR [rdi +rdx -2] ; p-2 .. p5
	    movd        xmm4, DWORD PTR [rdi +rdx +6] ; p6 .. p9

	    pslldq      xmm4, 8
	    por         xmm4, xmm7                  ; xmm4 = p-2 .. p9 (12 bytes)

	    movdqa      xmm3, xmm4
	    psrldq      xmm3, 2
	    punpcklbw   xmm3, xmm0                  ; xmm3 = p0 .. p7 as words
	    movdqa      xmm1, xmm3                  ; xmm1 = centre pixels
	    psllw       xmm3, 2                     ; xmm3 = 4 * centre (running sum)


	    movdqa      xmm5, xmm4
	    psrldq      xmm5, 3
	    punpcklbw   xmm5, xmm0                  ; xmm5 = right neighbours (+1)
	    paddusw     xmm3, xmm5                  ; sum += p+1

	    ; thresholding
	    movdqa      xmm7, xmm1
	    psubusw     xmm7, xmm5
	    psubusw     xmm5, xmm1
	    paddusw     xmm7, xmm5                  ; xmm7 = abs(p0 - p+1)
	    pcmpgtw     xmm7, xmm2

	    movdqa      xmm5, xmm4
	    psrldq      xmm5, 4
	    punpcklbw   xmm5, xmm0                  ; xmm5 = neighbours at +2
	    paddusw     xmm3, xmm5                  ; sum += p+2

	    ; thresholding
	    movdqa      xmm6, xmm1
	    psubusw     xmm6, xmm5
	    psubusw     xmm5, xmm1
	    paddusw     xmm6, xmm5                  ; xmm6 = abs(p0 - p+2)
	    pcmpgtw     xmm6, xmm2
	    por         xmm7, xmm6                  ; accumulate thresholds


	    movdqa      xmm5, xmm4                  ; xmm5 = p-2 .. p9
	    punpcklbw   xmm5, xmm0                  ; xmm5 = neighbours at -2
	    paddusw     xmm3, xmm5                  ; sum += p-2

	    ; thresholding
	    movdqa      xmm6, xmm1
	    psubusw     xmm6, xmm5
	    psubusw     xmm5, xmm1
	    paddusw     xmm6, xmm5                  ; xmm6 = abs(p0 - p-2)
	    pcmpgtw     xmm6, xmm2
	    por         xmm7, xmm6                  ; accumulate thresholds

	    psrldq      xmm4, 1                     ; xmm4 = p-1 .. p9
	    punpcklbw   xmm4, xmm0                  ; xmm4 = neighbours at -1
	    paddusw     xmm3, xmm4                  ; sum += p-1

	    ; thresholding
	    movdqa      xmm6, xmm1
	    psubusw     xmm6, xmm4
	    psubusw     xmm4, xmm1
	    paddusw     xmm6, xmm4                  ; xmm6 = abs(p0 - p-1)
	    pcmpgtw     xmm6, xmm2
	    por         xmm7, xmm6                  ; accumulate thresholds

	    paddusw     xmm3, RD42                  ; sum += rounding value (4)
	    psraw       xmm3, 3                     ; sum /= 8

	    pand        xmm1, xmm7                  ; source where a threshold was exceeded
	    pandn       xmm7, xmm3                  ; blurred result everywhere else
	    paddusw     xmm1, xmm7                  ; combine

	    packuswb    xmm1, xmm0                  ; pack to bytes
	    ; The store is delayed by one 8-pixel group so that the next group's
	    ; loads of p-2/p-1 still see unfiltered values.
	    movq        QWORD PTR [rdi+rdx-8], mm0  ; store the previous 8 bytes
	    movdq2q     mm0, xmm1

	    add         rdx, 8
	    cmp         edx, dword arg(5)           ; cols
	    jl          .acrossnextcol

	    ; last 8 pixels
	    movq        QWORD PTR [rdi+rdx-8], mm0

	    ; done with this row
	    add         rsi, rax                    ; src += src_pixels_per_line
	    ; Sign-extend the pitches exactly as the prologue does, so a negative
	    ; stride is not turned into a huge positive offset on 64-bit targets.
	    movsxd      rax, dword arg(3)           ; dst_pixels_per_line
	    add         rdi, rax                    ; dst += dst_pixels_per_line
	    movsxd      rax, dword arg(2)           ; rax = src_pixels_per_line again

	    dec         rcx                         ; decrement row count
	    jnz         .nextrow                    ; next row

	%if ABI_IS_32BIT=1 && CONFIG_PIC=1
	    add         rsp, 16
	    pop         rsp
	%endif
	    ; begin epilog
	    pop         rdi
	    pop         rsi
	    RESTORE_GOT
	    RESTORE_XMM
	    UNSHADOW_ARGS
	    pop         rbp
	    ret
	%undef RD42

	249

	250

	;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
	;                              int pitch, int rows, int cols,int flimit)
	;
	; Vertical filter, applied in place, one 8-pixel-wide column strip at a time.
	;
	; For each strip a running sum (xmm5, 8 words) and a running sum of squares
	; (xmm6/xmm7, 2 x 4 dwords) are kept over a 15-row window. For every row,
	; where  15*sumsq - sum*sum < flimit  the pixel is replaced by
	;       (pixel + sum + vp9_rv[...]) >> 4
	; otherwise it is left unchanged.
	;
	; Results go through a 16-entry ring buffer d[16][8] on the stack and are
	; written back 8 rows late, so rows still needed as filter input are not
	; overwritten early. The row loop therefore runs rows+8 times.
	;
	; Stack frame after ALIGN_STACK:
	;   [rsp     .. rsp+127]  unsigned char d[16][8]
	;   [rsp+128 .. rsp+143]  flimit replicated into 4 dwords (flimit4)
	;
	; The function uses mm0 and contains no emms; callers presumably reset the
	; MMX/x87 state themselves -- TODO confirm.
	extern sym(vp9_rv)
	global sym(vp9_mbpost_proc_down_xmm)
	sym(vp9_mbpost_proc_down_xmm):
	    push        rbp
	    mov         rbp, rsp
	    SHADOW_ARGS_TO_STACK 5
	    SAVE_XMM 7
	    GET_GOT     rbx
	    push        rsi
	    push        rdi
	    ; end prolog

	    ALIGN_STACK 16, rax
	    sub         rsp, 128+16

	    ; unsigned char d[16][8] at [rsp]
	    ; create flimit4 at [rsp+128]
	    mov         eax, dword ptr arg(4)       ; flimit
	    mov         [rsp+128], eax
	    mov         [rsp+128+4], eax
	    mov         [rsp+128+8], eax
	    mov         [rsp+128+12], eax
	%define flimit4 [rsp+128]

	%if ABI_IS_32BIT=0
	    lea         r8, [GLOBAL(sym(vp9_rv))]   ; r8 = &vp9_rv[0] for the whole function
	%endif

	    ;rows +=8;
	    add         dword arg(2), 8

	    ;for(c=0; c<cols; c+=8)
	.loop_col:
	    mov         rsi, arg(0)                 ; s
	    pxor        xmm0, xmm0                  ; xmm0 = 0

	    movsxd      rax, dword ptr arg(1)       ; pitch
	    neg         rax                         ; rax = -pitch

	    lea         rsi, [rsi + rax*8]          ; rsi = s - 8*pitch
	    neg         rax                         ; rax = pitch


	    pxor        xmm5, xmm5                  ; sum   = 0
	    pxor        xmm6, xmm6                  ; sumsq = 0 (pixels 0..3)

	    pxor        xmm7, xmm7                  ; sumsq = 0 (pixels 4..7)
	    mov         rdi, rsi

	    mov         rcx, 15                     ; window height

	    ; Accumulate sum and sum of squares over the 15 rows starting at rsi.
	.loop_initvar:
	    movq        xmm1, QWORD PTR [rdi]
	    punpcklbw   xmm1, xmm0                  ; 8 pixels as words

	    paddw       xmm5, xmm1                  ; sum += p
	    pmullw      xmm1, xmm1                  ; p*p

	    movdqa      xmm2, xmm1
	    punpcklwd   xmm1, xmm0                  ; low 4 squares as dwords

	    punpckhwd   xmm2, xmm0                  ; high 4 squares as dwords
	    paddd       xmm6, xmm1

	    paddd       xmm7, xmm2
	    lea         rdi, [rdi+rax]              ; next row

	    dec         rcx
	    jne         .loop_initvar
	    ; here rdi = rsi + 15*pitch, the next row to enter the window
	    xor         rdx, rdx                    ; rdx = row counter
	.loop_row:
	    movq        xmm1, QWORD PTR [rsi]       ; row leaving the window
	    movq        xmm2, QWORD PTR [rdi]       ; row entering the window

	    punpcklbw   xmm1, xmm0
	    punpcklbw   xmm2, xmm0

	    paddw       xmm5, xmm2                  ; sum += entering
	    psubw       xmm5, xmm1                  ; sum -= leaving

	    pmullw      xmm2, xmm2
	    movdqa      xmm4, xmm2

	    punpcklwd   xmm2, xmm0
	    punpckhwd   xmm4, xmm0

	    paddd       xmm6, xmm2                  ; sumsq += entering^2
	    paddd       xmm7, xmm4

	    pmullw      xmm1, xmm1
	    movdqa      xmm2, xmm1

	    punpcklwd   xmm1, xmm0
	    psubd       xmm6, xmm1                  ; sumsq -= leaving^2

	    punpckhwd   xmm2, xmm0
	    psubd       xmm7, xmm2


	    movdqa      xmm3, xmm6
	    pslld       xmm3, 4

	    psubd       xmm3, xmm6                  ; xmm3 = 15*sumsq (pixels 0..3)
	    movdqa      xmm1, xmm5

	    movdqa      xmm4, xmm5
	    pmullw      xmm1, xmm1                  ; low 16 bits of sum*sum

	    pmulhw      xmm4, xmm4                  ; high 16 bits of sum*sum
	    movdqa      xmm2, xmm1

	    punpcklwd   xmm1, xmm4                  ; sum*sum as dwords (pixels 0..3)
	    punpckhwd   xmm2, xmm4                  ; sum*sum as dwords (pixels 4..7)

	    movdqa      xmm4, xmm7
	    pslld       xmm4, 4

	    psubd       xmm4, xmm7                  ; xmm4 = 15*sumsq (pixels 4..7)

	    psubd       xmm3, xmm1
	    psubd       xmm4, xmm2

	    psubd       xmm3, flimit4
	    psubd       xmm4, flimit4

	    psrad       xmm3, 31                    ; all-ones where 15*sumsq - sum^2 < flimit
	    psrad       xmm4, 31

	    packssdw    xmm3, xmm4
	    packsswb    xmm3, xmm0                  ; xmm3 = per-byte selection mask

	    movq        xmm1, QWORD PTR [rsi+rax*8] ; row being filtered (8 rows below rsi)

	    movq        xmm2, xmm1                  ; keep the unfiltered bytes
	    punpcklbw   xmm1, xmm0

	    paddw       xmm1, xmm5                  ; pixel + sum
	    mov         rcx, rdx

	    and         rcx, 127                    ; index into vp9_rv wraps at 128
	%if ABI_IS_32BIT=1 && CONFIG_PIC=1
	    push        rax
	    lea         rax, [GLOBAL(sym(vp9_rv))]
	    movdqu      xmm4, [rax + rcx*2]         ; vp9_rv[rcx*2]
	    pop         rax
	%elif ABI_IS_32BIT=0
	    movdqu      xmm4, [r8 + rcx*2]          ; vp9_rv[rcx*2]
	%else
	    movdqu      xmm4, [sym(vp9_rv) + rcx*2]
	%endif

	    paddw       xmm1, xmm4                  ; add the 8 words loaded from vp9_rv
	    ;paddw     xmm1, eight8s
	    psraw       xmm1, 4

	    packuswb    xmm1, xmm0
	    pand        xmm1, xmm3                  ; filtered value where the mask is set

	    pandn       xmm3, xmm2                  ; original value elsewhere
	    por         xmm1, xmm3

	    and         rcx, 15
	    movq        QWORD PTR [rsp + rcx*8], xmm1 ; d[row & 15] = result

	    mov         rcx, rdx
	    sub         rcx, 8

	    and         rcx, 15
	    movq        mm0, [rsp + rcx*8]          ; d[(row - 8) & 15]

	    movq        [rsi], mm0                  ; write back the result from 8 rows ago
	    lea         rsi, [rsi+rax]

	    lea         rdi, [rdi+rax]
	    add         rdx, 1

	    cmp         edx, dword arg(2)           ; rows (already increased by 8)
	    jl          .loop_row

	    ; s += 8. Done at full register width: a dword add on the stored
	    ; pointer would drop the carry into the upper 32 bits on 64-bit
	    ; targets. rsi is reloaded from arg(0) at .loop_col, so it is free here.
	    mov         rsi, arg(0)
	    add         rsi, 8
	    mov         arg(0), rsi
	    sub         dword arg(3), 8             ; cols -= 8
	    cmp         dword arg(3), 0
	    jg          .loop_col

	    add         rsp, 128+16
	    pop         rsp

	    ; begin epilog
	    pop         rdi
	    pop         rsi
	    RESTORE_GOT
	    RESTORE_XMM
	    UNSHADOW_ARGS
	    pop         rbp
	    ret
	%undef flimit4

	450

	451

	;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
	;                                   int pitch, int rows, int cols,int flimit)
	;
	; Horizontal counterpart of the "down" filter, applied in place, one row
	; at a time and four pixels per inner iteration.
	;
	; A running sum and a running sum of squares over a 15-pixel window are
	; kept per pixel. Where  15*sumsq - sum*sum < flimit  the pixel becomes
	;       (pixel + sum + 8) >> 4
	; otherwise it is left unchanged.
	;
	; Stores lag the computation by two iterations (8 pixels) through mm0/mm1,
	; so the inner loop runs until the column offset reaches cols+8.
	;
	; Stack frame after ALIGN_STACK: [rsp .. rsp+15] = flimit in 4 dwords.
	global sym(vp9_mbpost_proc_across_ip_xmm)
	sym(vp9_mbpost_proc_across_ip_xmm):
	    push        rbp
	    mov         rbp, rsp
	    SHADOW_ARGS_TO_STACK 5
	    SAVE_XMM 7
	    GET_GOT     rbx
	    push        rsi
	    push        rdi
	    ; end prolog

	    ALIGN_STACK 16, rax
	    sub         rsp, 16

	    ; broadcast flimit into the 16-byte slot at [rsp]
	    mov         eax, dword ptr arg(4)       ; flimit
	    mov         [rsp], eax
	    mov         [rsp+4], eax
	    mov         [rsp+8], eax
	    mov         [rsp+12], eax
	%define flimit4 [rsp]


	    ; for (r = 0; r < rows; r++)
	.row_loop:

	    xor         rdx, rdx                    ; sumsq = 0
	    xor         rcx, rcx                    ; sum   = 0
	    mov         rsi, arg(0)                 ; s
	    mov         rdi, -8
	.init_sums:
	    ; for (i = -8; i <= 6; i++) { sumsq += s[i]*s[i]; sum += s[i]; }
	    movzx       eax, byte [rsi+rdi]
	    add         ecx, eax
	    mul         al                          ; ax = al*al; upper half of eax is already 0
	    add         edx, eax
	    add         rdi, 1
	    cmp         rdi, 6
	    jle         .init_sums


	    movd        xmm7, edx                   ; xmm7 lane 0 = sumsq

	    movd        xmm6, ecx                   ; xmm6 lane 0 = sum

	    mov         rsi, arg(0)                 ; s
	    xor         rcx, rcx                    ; rcx = column offset

	    movsxd      rdx, dword arg(3)           ; cols
	    add         rdx, 8                      ; extra iterations flush the delayed stores
	    pxor        mm0, mm0
	    pxor        mm1, mm1

	    pxor        xmm0, xmm0                  ; xmm0 = 0
	.col4_loop:

	    movd        xmm1, DWORD PTR [rsi+rcx-8] ; pixels -8 -7 -6 -5 (leaving the window)
	    movd        xmm2, DWORD PTR [rsi+rcx+7] ; pixels +7 +8 +9 +10 (entering the window)

	    punpcklbw   xmm1, xmm0                  ; bytes -> words
	    punpcklbw   xmm2, xmm0

	    punpcklwd   xmm1, xmm0                  ; words -> dwords
	    punpcklwd   xmm2, xmm0

	    psubd       xmm2, xmm1                  ; xmm2 = entering - leaving
	    paddd       xmm1, xmm1                  ; xmm1 = 2 * leaving

	    paddd       xmm1, xmm2                  ; xmm1 = entering + leaving
	    pmaddwd     xmm1, xmm2                  ; xmm1 = entering^2 - leaving^2

	    ; Turn the four per-pixel deltas into running totals:
	    ; lane i ends up holding the window sum for column rcx+i.
	    paddd       xmm6, xmm2
	    paddd       xmm7, xmm1

	    pshufd      xmm6, xmm6, 0               ; broadcast lane 0 (carry-in + delta 0)
	    pshufd      xmm7, xmm7, 0

	    psrldq      xmm1, 4                     ; drop delta 0; lanes = d1 d2 d3 0
	    psrldq      xmm2, 4

	    pshufd      xmm3, xmm1, 3               ; lanes = 0 d1 d1 d1
	    pshufd      xmm4, xmm2, 3

	    paddd       xmm6, xmm4
	    paddd       xmm7, xmm3

	    pshufd      xmm3, xmm1, 01011111b       ; lanes = 0 0 d2 d2
	    pshufd      xmm4, xmm2, 01011111b

	    paddd       xmm7, xmm3
	    paddd       xmm6, xmm4

	    pshufd      xmm3, xmm1, 10111111b       ; lanes = 0 0 0 d3
	    pshufd      xmm4, xmm2, 10111111b

	    paddd       xmm7, xmm3
	    paddd       xmm6, xmm4

	    movdqa      xmm3, xmm6
	    pmaddwd     xmm3, xmm3                  ; xmm3 = sum*sum

	    movdqa      xmm5, xmm7
	    pslld       xmm5, 4

	    psubd       xmm5, xmm7                  ; xmm5 = 15*sumsq
	    psubd       xmm5, xmm3                  ;      - sum*sum

	    psubd       xmm5, flimit4
	    psrad       xmm5, 31                    ; all-ones where the result is negative

	    packssdw    xmm5, xmm0
	    packsswb    xmm5, xmm0                  ; xmm5 = 4-byte selection mask

	    movd        xmm1, DWORD PTR [rsi+rcx]   ; the 4 pixels being filtered
	    movq        xmm2, xmm1                  ; keep the unfiltered bytes

	    punpcklbw   xmm1, xmm0
	    punpcklwd   xmm1, xmm0

	    paddd       xmm1, xmm6                  ; pixel + sum
	    paddd       xmm1, [GLOBAL(four8s)]      ; + 8 (rounding)

	    psrad       xmm1, 4
	    packssdw    xmm1, xmm0

	    packuswb    xmm1, xmm0
	    pand        xmm1, xmm5                  ; filtered value where the mask is set

	    pandn       xmm5, xmm2                  ; original value elsewhere
	    por         xmm5, xmm1

	    ; two-stage delay line: mm1 = previous result, mm0 = the one before it
	    movd        [rsi+rcx-8], mm0
	    movq        mm0, mm1

	    movdq2q     mm1, xmm5
	    psrldq      xmm7, 12                    ; lane 3 becomes the carry-in for the next group

	    psrldq      xmm6, 12
	    add         rcx, 4

	    cmp         rcx, rdx
	    jl          .col4_loop

	    ; s += pitch
	    movsxd      rax, dword arg(1)
	    add         arg(0), rax

	    sub         dword arg(2), 1             ; rows -= 1
	    cmp         dword arg(2), 0
	    jg          .row_loop

	    add         rsp, 16
	    pop         rsp

	    ; begin epilog
	    pop         rdi
	    pop         rsi
	    RESTORE_GOT
	    RESTORE_XMM
	    UNSHADOW_ARGS
	    pop         rbp
	    ret
	%undef flimit4

	625

	626

	;void vp9_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
	;                              unsigned char blackclamp[16],
	;                              unsigned char whiteclamp[16],
	;                              unsigned char bothclamp[16],
	;                              unsigned int Width, unsigned int Height, int Pitch)
	;
	; Adds noise to a plane in place, one row per outer iteration and 16 bytes
	; per inner iteration. For every row rand() is called and its low 8 bits
	; are used as a byte offset into the noise buffer. Each 16-byte group of
	; the row is first clamped with the three clamp vectors and then has the
	; noise bytes added with wrap-around byte addition (paddb).
	;
	; Only the blackclamp pointer is read; whiteclamp and bothclamp are
	; addressed as blackclamp+16 and blackclamp+32.
	extern sym(rand)
	global sym(vp9_plane_add_noise_wmt)
	sym(vp9_plane_add_noise_wmt):
	    push        rbp
	    mov         rbp, rsp
	    SHADOW_ARGS_TO_STACK 8
	    GET_GOT     rbx
	    push        rsi
	    push        rdi
	    ; end prolog

	.next_row:
	    call        sym(rand) WRT_PLT
	    mov         rcx, arg(1)                 ; noise
	    and         rax, 0xff                   ; keep the low 8 bits of rand()
	    add         rcx, rax                    ; rcx = noise + random offset

	    ; The clamp vectors are expected to be stored contiguously in
	    ; black/white/both order. The pointer is reloaded every row because
	    ; rdx may have been overwritten by rand().
	    mov         rdx, arg(2)                 ; blackclamp


	    mov         rdi, rcx                    ; rdi = noise source for this row
	    movsxd      rcx, dword arg(5)           ; Width
	    mov         rsi, arg(0)                 ; current row of the plane
	    xor         rax, rax                    ; rax = byte offset within the row

	.next_16:
	    movdqu      xmm1, [rsi+rax]             ; 16 source pixels

	    ; clamp both ends so that adding noise stays in range
	    psubusb     xmm1, [rdx]                 ; blackclamp
	    paddusb     xmm1, [rdx+32]              ; bothclamp
	    psubusb     xmm1, [rdx+16]              ; whiteclamp

	    movdqu      xmm2, [rdi+rax]             ; 16 noise bytes
	    paddb       xmm1, xmm2                  ; add the noise
	    movdqu      [rsi+rax], xmm1             ; write the pixels back

	    add         rax, 16                     ; next 16 bytes of the row

	    cmp         rax, rcx
	    jl          .next_16

	    movsxd      rax, dword arg(7)           ; Pitch
	    add         arg(0), rax                 ; Start += Pitch
	    sub         dword arg(6), 1             ; Height -= 1
	    jg          .next_row

	    ; begin epilog
	    pop         rdi
	    pop         rsi
	    RESTORE_GOT
	    UNSHADOW_ARGS
	    pop         rbp
	    ret

	688

	689

	SECTION_RODATA
	align 16
	; Eight words of 4: added to the 5-tap sums in
	; vp9_post_proc_down_and_across_xmm before the arithmetic shift right by 3.
	rd42:
	    times 8 dw 4
	; Four dwords of 8: added in vp9_mbpost_proc_across_ip_xmm before the
	; arithmetic shift right by 4.
	four8s:
	    times 4 dd 8

OLD	NEW

« libvpx.gyp ('K') | « source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm ('k') | source/libvpx/vp9/common/x86/vp9_postproc_x86.h » ('j') | no next file with comments »