;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Fixed-point scale of the blur kernels in this file: products are
; accumulated at a scale of VP9_FILTER_WEIGHT and brought back to pixel
; range with an arithmetic right shift by VP9_FILTER_SHIFT.
%define VP9_FILTER_SHIFT  7
%define VP9_FILTER_WEIGHT (1 << VP9_FILTER_SHIFT)   ; = 128

;void vp9_post_proc_down_and_across_mmx
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
;
; For every row:
;   1. "down" pass  : 5-tap vertical blur of src rows -2..+2 (taps taken from
;                     Blur at offsets 0/16/32/48/64), written to dst.
;   2. "across" pass: the same 5-tap blur applied horizontally, in place, on
;                     the dst row that was just written.
; In both passes a pixel keeps its unblurred value when the absolute
; difference between it and ANY of its four neighbours exceeds flimit
; (signed 16-bit compare); otherwise it takes (sum + RD) >> VP9_FILTER_SHIFT.
;
; Pixels are handled four at a time and both column loops are do/while
; shaped, so at least one group of four is processed per row and cols is
; effectively rounded up to a multiple of 4.  The row loop is dec/jnz, so
; rows must be >= 1.
; The loads reach two rows above/below the source row and from 4 bytes before
; to 7 bytes past the current group on the destination row.
; NOTE(review): presumably the caller provides borders that make those
; accesses valid - confirm against the callers.
;
; Register roles:
;   rsi = src cursor            rdi = dst cursor
;   rax = src pitch (negated temporarily inside the down pass)
;   rbx = address of Blur       rcx = rows remaining
;   rdx = column offset         mm0 = 0
;   mm1 = centre pixels (words) mm2 = flimit broadcast to 4 words
;   mm3 = weighted sum          mm7 = "difference > flimit" mask
global sym(vp9_post_proc_down_and_across_mmx)
sym(vp9_post_proc_down_and_across_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rbx is reused below as the Blur base pointer, so save it first.  It is
    ; pushed BEFORE the optional stack copy of rd so that RD can stay at
    ; [rsp]: nothing else is pushed until that copy is released again.
    push        rbx

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing
    ; NOTE(review): assumes GLOBAL() relies on rbx in this configuration,
    ; which is why rd must be fetched before rbx is overwritten - confirm
    ; against x86_abi_support.asm.
    movq        mm0, [GLOBAL(rd)]
    sub         rsp, 8
    movq        [rsp], mm0
%define RD [rsp]
%else
%define RD [GLOBAL(rd)]
%endif

    lea         rbx, [GLOBAL(Blur)]
    movd        mm2, dword ptr arg(6)       ; flimit
    punpcklwd   mm2, mm2
    punpckldq   mm2, mm2                    ; mm2 = low word of flimit x4

    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(1)                 ; dst_ptr

    movsxd      rcx, DWORD PTR arg(4)       ; rows
    movsxd      rax, DWORD PTR arg(2)       ; src_pixels_per_line
    pxor        mm0, mm0                    ; mm0 = 00000000

.nextrow:

    xor         rdx, rdx                    ; rdx = column offset

    ; ---------------- down (vertical) pass: src -> dst ----------------
.nextcol:

    pxor        mm7, mm7                    ; mm7 = 00000000
    movq        mm6, [rbx + 32]             ; mm6 = centre taps
    movq        mm3, [rsi]                  ; mm3 = r0 p0..p7
    punpcklbw   mm3, mm0                    ; mm3 = r0 p0..p3 as words
    movq        mm1, mm3                    ; mm1 = r0 p0..p3 (kept for thresholding)
    pmullw      mm3, mm6                    ; mm3 = r0 * centre taps

    movq        mm6, [rbx + 48]             ; mm6 = taps for row +1
    movq        mm5, [rsi + rax]            ; mm5 = r1 p0..p7
    punpcklbw   mm5, mm0                    ; mm5 = r1 p0..p3
    pmullw      mm6, mm5                    ; mm6 = r1 * taps
    paddusw     mm3, mm6                    ; accumulate

    ; thresholding
    movq        mm7, mm1                    ; mm7 = r0
    psubusw     mm7, mm5                    ; mm7 = sat(r0 - r1)
    psubusw     mm5, mm1                    ; mm5 = sat(r1 - r0)
    paddusw     mm7, mm5                    ; mm7 = abs(r0 - r1)
    pcmpgtw     mm7, mm2                    ; mm7 = mask(abs diff > flimit)

    movq        mm6, [rbx + 64]             ; mm6 = taps for row +2
    movq        mm5, [rsi + 2*rax]          ; mm5 = r2 p0..p7
    punpcklbw   mm5, mm0                    ; mm5 = r2 p0..p3
    pmullw      mm6, mm5                    ; mm6 = r2 * taps
    paddusw     mm3, mm6                    ; accumulate

    ; thresholding
    movq        mm6, mm1                    ; mm6 = r0
    psubusw     mm6, mm5                    ; mm6 = sat(r0 - r2)
    psubusw     mm5, mm1                    ; mm5 = sat(r2 - r0)
    paddusw     mm6, mm5                    ; mm6 = abs(r0 - r2)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds


    neg         rax                         ; rax = -pitch, to reach the rows above
    movq        mm6, [rbx]                  ; mm6 = taps for row -2
    movq        mm5, [rsi + 2*rax]          ; mm5 = r-2 p0..p7
    punpcklbw   mm5, mm0                    ; mm5 = r-2 p0..p3
    pmullw      mm6, mm5                    ; mm6 = r-2 * taps
    paddusw     mm3, mm6                    ; accumulate

    ; thresholding
    movq        mm6, mm1                    ; mm6 = r0
    psubusw     mm6, mm5                    ; mm6 = sat(r0 - r-2)
    psubusw     mm5, mm1                    ; mm5 = sat(r-2 - r0)
    paddusw     mm6, mm5                    ; mm6 = abs(r0 - r-2)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds

    movq        mm6, [rbx + 16]             ; mm6 = taps for row -1
    movq        mm4, [rsi + rax]            ; mm4 = r-1 p0..p7
    punpcklbw   mm4, mm0                    ; mm4 = r-1 p0..p3
    pmullw      mm6, mm4                    ; mm6 = r-1 * taps
    paddusw     mm3, mm6                    ; accumulate

    ; thresholding
    movq        mm6, mm1                    ; mm6 = r0
    psubusw     mm6, mm4                    ; mm6 = sat(r0 - r-1)
    psubusw     mm4, mm1                    ; mm4 = sat(r-1 - r0)
    paddusw     mm6, mm4                    ; mm6 = abs(r0 - r-1)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds


    paddusw     mm3, RD                     ; mm3 += round value
    psraw       mm3, VP9_FILTER_SHIFT       ; mm3 /= 128

    pand        mm1, mm7                    ; keep source where a diff exceeded flimit
    pandn       mm7, mm3                    ; keep blurred result everywhere else
    paddusw     mm1, mm7                    ; combination

    packuswb    mm1, mm0                    ; pack to bytes

    movd        [rdi], mm1                  ; store 4 filtered pixels
    neg         rax                         ; pitch is positive again


    add         rsi, 4
    add         rdi, 4
    add         rdx, 4

    cmp         edx, dword ptr arg(5)       ; cols
    jl          .nextcol

    ; done with all the cols, start the across filtering in place
    sub         rsi, rdx                    ; rewind both cursors to the row start
    sub         rdi, rdx

    ; rsi is not touched by the across pass, so step it to the next source
    ; row now while rax still holds the pitch.  rax is about to be reused as
    ; scratch; doing this here avoids a push/pop around the loop, which would
    ; move rsp and invalidate RD when it lives at [rsp].
    add         rsi, rax                    ; next source line

    xor         rdx, rdx
    mov         eax, DWORD PTR [rdi-4]      ; eax = the 4 bytes preceding the row; the
                                            ; first store below writes them back unchanged

    ; --------------- across (horizontal) pass: dst in place ---------------
    ; Each group's result is held in eax and stored one iteration late so the
    ; next group's left-hand taps (p-2, p-1) still read unfiltered pixels.
.acrossnextcol:
    pxor        mm7, mm7                    ; mm7 = 00000000
    movq        mm6, [rbx + 32]             ; mm6 = centre taps
    movq        mm4, [rdi + rdx]            ; mm4 = p0..p7
    movq        mm3, mm4                    ; mm3 = p0..p7
    punpcklbw   mm3, mm0                    ; mm3 = p0..p3
    movq        mm1, mm3                    ; mm1 = p0..p3 (kept for thresholding)
    pmullw      mm3, mm6                    ; mm3 = p0..p3 * centre taps

    movq        mm6, [rbx + 48]             ; mm6 = taps for +1
    psrlq       mm4, 8                      ; mm4 = p1..p7
    movq        mm5, mm4                    ; mm5 = p1..p7
    punpcklbw   mm5, mm0                    ; mm5 = p1..p4
    pmullw      mm6, mm5                    ; mm6 = p1..p4 * taps
    paddusw     mm3, mm6                    ; accumulate

    ; thresholding
    movq        mm7, mm1                    ; mm7 = p0..p3
    psubusw     mm7, mm5                    ; mm7 = sat(p0..p3 - p1..p4)
    psubusw     mm5, mm1                    ; mm5 = sat(p1..p4 - p0..p3)
    paddusw     mm7, mm5                    ; mm7 = abs(p0..p3 - p1..p4)
    pcmpgtw     mm7, mm2

    movq        mm6, [rbx + 64]             ; mm6 = taps for +2
    psrlq       mm4, 8                      ; mm4 = p2..p7
    movq        mm5, mm4                    ; mm5 = p2..p7
    punpcklbw   mm5, mm0                    ; mm5 = p2..p5
    pmullw      mm6, mm5                    ; mm6 = p2..p5 * taps
    paddusw     mm3, mm6                    ; accumulate

    ; thresholding
    movq        mm6, mm1                    ; mm6 = p0..p3
    psubusw     mm6, mm5                    ; mm6 = sat(p0..p3 - p2..p5)
    psubusw     mm5, mm1                    ; mm5 = sat(p2..p5 - p0..p3)
    paddusw     mm6, mm5                    ; mm6 = abs(p0..p3 - p2..p5)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds


    movq        mm6, [rbx]                  ; mm6 = taps for -2
    movq        mm4, [rdi + rdx - 2]        ; mm4 = p-2..p5
    movq        mm5, mm4                    ; mm5 = p-2..p5
    punpcklbw   mm5, mm0                    ; mm5 = p-2..p1
    pmullw      mm6, mm5                    ; mm6 = p-2..p1 * taps
    paddusw     mm3, mm6                    ; accumulate

    ; thresholding
    movq        mm6, mm1                    ; mm6 = p0..p3
    psubusw     mm6, mm5                    ; mm6 = sat(p0..p3 - p-2..p1)
    psubusw     mm5, mm1                    ; mm5 = sat(p-2..p1 - p0..p3)
    paddusw     mm6, mm5                    ; mm6 = abs(p0..p3 - p-2..p1)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds

    movq        mm6, [rbx + 16]             ; mm6 = taps for -1
    psrlq       mm4, 8                      ; mm4 = p-1..p5
    punpcklbw   mm4, mm0                    ; mm4 = p-1..p2
    pmullw      mm6, mm4                    ; mm6 = p-1..p2 * taps
    paddusw     mm3, mm6                    ; accumulate

    ; thresholding
    movq        mm6, mm1                    ; mm6 = p0..p3
    psubusw     mm6, mm4                    ; mm6 = sat(p0..p3 - p-1..p2)
    psubusw     mm4, mm1                    ; mm4 = sat(p-1..p2 - p0..p3)
    paddusw     mm6, mm4                    ; mm6 = abs(p0..p3 - p-1..p2)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds

    paddusw     mm3, RD                     ; mm3 += round value
    psraw       mm3, VP9_FILTER_SHIFT       ; mm3 /= 128

    pand        mm1, mm7                    ; keep source where a diff exceeded flimit
    pandn       mm7, mm3                    ; keep blurred result everywhere else
    paddusw     mm1, mm7                    ; combination

    packuswb    mm1, mm0                    ; pack to bytes
    mov         DWORD PTR [rdi+rdx-4], eax  ; store previous four bytes
    movd        eax, mm1                    ; hold this group until the next iteration

    add         rdx, 4
    cmp         edx, dword ptr arg(5)       ; cols
    jl          .acrossnextcol

    mov         DWORD PTR [rdi+rdx-4], eax  ; flush the last group

    ; done with this row (rsi was already advanced before the across pass)
    movsxd      rax, dword ptr arg(3)       ; dst_pixels_per_line
    add         rdi, rax                    ; next destination line
    movsxd      rax, dword ptr arg(2)       ; rax = src_pixels_per_line again

    dec         rcx                         ; decrement row count
    jnz         .nextrow                    ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add         rsp, 8                      ; release the stack copy of rd
%endif
    pop         rbx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD


;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
;                             int pitch, int rows, int cols,int flimit)
;
; Vertical smoothing of dst, four columns per outer iteration.
; For each strip a running sum (mm5, 4 words) and running sum of squares
; (mm6/mm7, 4 dwords) are kept over a 15-row window.  A pixel is replaced by
; (pixel + sum + vp9_rv[row & 127 ..]) >> 4 when
;     15 * sumsq - sum * sum - flimit < 0
; and is left unchanged otherwise.
;
; Results are not written straight back: they go through a 16-entry ring of
; dwords on the stack and reach memory 8 rows late, so rows still inside the
; window are read unfiltered.  To drain the ring the row loop runs rows + 8
; times; during the first 8 iterations it therefore stores not-yet-written
; ring entries into the 8 rows above dst.
; NOTE(review): presumably callers reserve a border above (and below) the
; plane that makes those reads and writes harmless - confirm.
;
; The arg(0), arg(2) and arg(3) stack slots are used as loop variables
; (s, rows + 8, remaining cols) and are modified in place.
extern sym(vp9_rv)
global sym(vp9_mbpost_proc_down_mmx)
sym(vp9_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 136

    ; unsigned char d[16][8] at [rsp]
    ; create flimit2 at [rsp+128]
    mov         eax, dword ptr arg(4)       ; flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax            ; flimit duplicated into both dwords
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8, [GLOBAL(sym(vp9_rv))]
%endif

    ;rows +=8;
    add         dword ptr arg(2), 8

    ;for(c=0; c<cols; c+=4)
.loop_col:
    mov         rsi, arg(0)                 ; s
    pxor        mm0, mm0                    ; mm0 = 0

    movsxd      rax, dword ptr arg(1)       ; pitch
    neg         rax                         ; rax = -pitch

    lea         rsi, [rsi + rax*8]          ; rsi = s - 8*pitch
    neg         rax                         ; rax = +pitch


    pxor        mm5, mm5                    ; running sum        (4 words)
    pxor        mm6, mm6                    ; running sum of squares, columns 0..1

    pxor        mm7, mm7                    ; running sum of squares, columns 2..3
    mov         rdi, rsi

    mov         rcx, 15                     ; prime the window with 15 rows

.loop_initvar:
    movd        mm1, DWORD PTR [rdi]
    punpcklbw   mm1, mm0                    ; 4 pixels -> words

    paddw       mm5, mm1                    ; sum   += x
    pmullw      mm1, mm1                    ; x * x

    movq        mm2, mm1
    punpcklwd   mm1, mm0                    ; squares of columns 0..1 as dwords

    punpckhwd   mm2, mm0                    ; squares of columns 2..3 as dwords
    paddd       mm6, mm1                    ; sumsq += x * x

    paddd       mm7, mm2
    lea         rdi, [rdi+rax]              ; next row

    dec         rcx
    jne         .loop_initvar
    ; here rsi = s - 8*pitch and rdi = s + 7*pitch
    xor         rdx, rdx                    ; rdx = row counter
.loop_row:
    movd        mm1, DWORD PTR [rsi]        ; [s-pitch*8]  (row leaving the window)
    movd        mm2, DWORD PTR [rdi]        ; [s+pitch*7]  (row entering the window)

    punpcklbw   mm1, mm0
    punpcklbw   mm2, mm0

    paddw       mm5, mm2                    ; sum += entering
    psubw       mm5, mm1                    ; sum -= leaving

    pmullw      mm2, mm2
    movq        mm4, mm2

    punpcklwd   mm2, mm0
    punpckhwd   mm4, mm0

    paddd       mm6, mm2                    ; sumsq += entering^2
    paddd       mm7, mm4

    pmullw      mm1, mm1
    movq        mm2, mm1

    punpcklwd   mm1, mm0
    psubd       mm6, mm1                    ; sumsq -= leaving^2

    punpckhwd   mm2, mm0
    psubd       mm7, mm2


    movq        mm3, mm6
    pslld       mm3, 4

    psubd       mm3, mm6                    ; mm3 = 15 * sumsq (columns 0..1)
    movq        mm1, mm5

    movq        mm4, mm5
    pmullw      mm1, mm1                    ; low  16 bits of sum * sum

    pmulhw      mm4, mm4                    ; high 16 bits of sum * sum
    movq        mm2, mm1

    punpcklwd   mm1, mm4                    ; sum * sum as dwords, columns 0..1
    punpckhwd   mm2, mm4                    ; sum * sum as dwords, columns 2..3

    movq        mm4, mm7
    pslld       mm4, 4

    psubd       mm4, mm7                    ; mm4 = 15 * sumsq (columns 2..3)

    psubd       mm3, mm1
    psubd       mm4, mm2

    psubd       mm3, flimit2
    psubd       mm4, flimit2

    psrad       mm3, 31                     ; all ones where the value is negative
    psrad       mm4, 31

    packssdw    mm3, mm4
    packsswb    mm3, mm0                    ; mm3 = per-pixel byte mask

    movd        mm1, DWORD PTR [rsi+rax*8]  ; current row: rsi + 8*pitch

    movq        mm2, mm1                    ; mm2 = unfiltered pixels
    punpcklbw   mm1, mm0

    paddw       mm1, mm5                    ; pixel + sum
    mov         rcx, rdx

    and         rcx, 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    push        rax
    lea         rax, [GLOBAL(sym(vp9_rv))]
    movq        mm4, [rax + rcx*2]          ; vp9_rv[rcx*2]
    pop         rax
%elif ABI_IS_32BIT=0
    movq        mm4, [r8 + rcx*2]           ; vp9_rv[rcx*2]
%else
    movq        mm4, [sym(vp9_rv) + rcx*2]
%endif
    paddw       mm1, mm4                    ; + rounding value from vp9_rv
    ;paddw     xmm1, eight8s
    psraw       mm1, 4

    packuswb    mm1, mm0
    pand        mm1, mm3                    ; filtered value where mask is set

    pandn       mm3, mm2                    ; original value elsewhere
    por         mm1, mm3

    and         rcx, 15
    movd        DWORD PTR [rsp+rcx*4], mm1  ; d[rcx*4]  (ring slot for this row)

    mov         rcx, rdx
    sub         rcx, 8

    and         rcx, 15
    movd        mm1, DWORD PTR [rsp+rcx*4]  ; d[rcx*4]  (result from 8 rows ago)

    movd        [rsi], mm1                  ; write it back to that row
    lea         rsi, [rsi+rax]

    lea         rdi, [rdi+rax]
    add         rdx, 1

    cmp         edx, dword arg(2)           ; rows
    jl          .loop_row


    ; s += 4.  arg(0) is a pointer, so it is updated through a native-width
    ; register; a dword add would drop the carry into the upper half of the
    ; pointer on 64-bit targets.  rsi is dead here (reloaded at .loop_col).
    mov         rsi, arg(0)
    add         rsi, 4
    mov         arg(0), rsi
    sub         dword arg(3), 4             ; cols -= 4
    cmp         dword arg(3), 0
    jg          .loop_col

    add         rsp, 136
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit2


;void vp9_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; For every row: pick a start offset of (rand() & 0xff) into the noise
; buffer, then for each group of 8 pixels clamp the source
; (-blackclamp, +bothclamp, -whiteclamp, all saturating) and add the noise
; bytes with wrap-around addition.
;
; Only the blackclamp pointer is read; the white and both vectors are
; fetched at +16 and +32 from it (see the note inside the loop).
; Both loops are do/while shaped: at least one row and at least 8 bytes per
; row are processed, and Width is effectively rounded up to a multiple of 8.
; Width and Height are declared unsigned and are treated as such here
; (zero-extended load, jb / ja).
; The arg(0) and arg(6) stack slots are used as loop variables and are
; modified in place.
; NOTE(review): from the second row on, rand() is called while MMX registers
; are in use, and no stack space is reserved at the call site in this
; function - confirm the ABI macros and target platforms tolerate this.
extern sym(rand)
global sym(vp9_plane_add_noise_mmx)
sym(vp9_plane_add_noise_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.addnoise_loop:
    call        sym(rand) WRT_PLT
    mov         rcx, arg(1)                 ; noise
    and         rax, 0xff                   ; random offset 0..255
    add         rcx, rax

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov         rdx, arg(2)                 ; blackclamp


    mov         rdi, rcx                    ; rdi = noise + random offset
    mov         ecx, dword arg(5)           ; Width, zero-extended (unsigned int)
    mov         rsi, arg(0)                 ; Pos (start of the current row)
    xor         rax, rax                    ; rax = byte offset within the row

.addnoise_nextset:
    movq        mm1, [rsi+rax]              ; get the source

    ; clamp both sides so we don't outrange adding noise
    psubusb     mm1, [rdx]                  ; blackclamp
    paddusb     mm1, [rdx+32]               ; bothclamp
    psubusb     mm1, [rdx+16]               ; whiteclamp

    movq        mm2, [rdi+rax]              ; get the noise for this line
    paddb       mm1, mm2                    ; add it in
    movq        [rsi+rax], mm1              ; store the result

    add         rax, 8                      ; move to the next 8 pixels

    cmp         rax, rcx
    jb          .addnoise_nextset           ; unsigned: offset < Width

    movsxd      rax, dword arg(7)           ; Pitch
    add         arg(0), rax                 ; Start += Pitch
    sub         dword arg(6), 1             ; Height -= 1
    ja          .addnoise_loop              ; unsigned: loop while rows remain

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; 5-tap blur kernel, one 16-byte slot (8 words) per tap so the code can
; address tap k at Blur + 16*k.  The MMX routine in this file loads the
; first 8 bytes of each slot with movq.  Taps sum to 128.
Blur:
    times 8 dw 16                           ; +0  : tap -2
    times 8 dw 16                           ; +16 : tap -1
    times 8 dw 64                           ; +32 : centre tap
    times 8 dw 16                           ; +48 : tap +1
    times 8 dw 16                           ; +64 : tap +2
    times 8 dw 0                            ; +80 : zero slot, not read by the code in this file

; rounding term added before the >> VP9_FILTER_SHIFT (half of 128)
rd:
    times 4 dw 64