OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 291 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
302 | 302 |
303 pop esi | 303 pop esi |
304 vzeroupper | 304 vzeroupper |
305 ret | 305 ret |
306 } | 306 } |
307 } | 307 } |
308 #endif // HAS_SCALEROWDOWN2_AVX2 | 308 #endif // HAS_SCALEROWDOWN2_AVX2 |
309 | 309 |
310 // Point samples 32 pixels to 8 pixels. | 310 // Point samples 32 pixels to 8 pixels. |
311 __declspec(naked) | 311 __declspec(naked) |
312 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 312 void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
313 uint8* dst_ptr, int dst_width) { | 313 uint8* dst_ptr, int dst_width) { |
314 __asm { | 314 __asm { |
315 mov eax, [esp + 4] // src_ptr | 315 mov eax, [esp + 4] // src_ptr |
316 // src_stride ignored | 316 // src_stride ignored |
317 mov edx, [esp + 12] // dst_ptr | 317 mov edx, [esp + 12] // dst_ptr |
318 mov ecx, [esp + 16] // dst_width | 318 mov ecx, [esp + 16] // dst_width |
319 pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 | 319 pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 |
320 psrld xmm5, 24 | 320 psrld xmm5, 24 |
321 pslld xmm5, 16 | 321 pslld xmm5, 16 |
322 | 322 |
(...skipping 10 matching lines...) Expand all Loading... |
333 lea edx, [edx + 8] | 333 lea edx, [edx + 8] |
334 sub ecx, 8 | 334 sub ecx, 8 |
335 jg wloop | 335 jg wloop |
336 | 336 |
337 ret | 337 ret |
338 } | 338 } |
339 } | 339 } |
340 | 340 |
341 // Blends 32x4 rectangle to 8x1. | 341 // Blends 32x4 rectangle to 8x1. |
342 __declspec(naked) | 342 __declspec(naked) |
343 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 343 void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
344 uint8* dst_ptr, int dst_width) { | 344 uint8* dst_ptr, int dst_width) { |
345 __asm { | 345 __asm { |
346 push esi | 346 push esi |
347 push edi | 347 push edi |
348 mov eax, [esp + 8 + 4] // src_ptr | 348 mov eax, [esp + 8 + 4] // src_ptr |
349 mov esi, [esp + 8 + 8] // src_stride | 349 mov esi, [esp + 8 + 8] // src_stride |
350 mov edx, [esp + 8 + 12] // dst_ptr | 350 mov edx, [esp + 8 + 12] // dst_ptr |
351 mov ecx, [esp + 8 + 16] // dst_width | 351 mov ecx, [esp + 8 + 16] // dst_width |
352 lea edi, [esi + esi * 2] // src_stride * 3 | 352 lea edi, [esi + esi * 2] // src_stride * 3 |
353 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff | 353 pcmpeqb xmm4, xmm4 // constant 0x0101 |
354 psrlw xmm7, 8 | 354 psrlw xmm4, 15 |
| 355 movdqa xmm5, xmm4 |
| 356 packuswb xmm4, xmm4 |
| 357 psllw xmm5, 3 // constant 0x0008 |
355 | 358 |
356 wloop: | 359 wloop: |
357 movdqu xmm0, [eax] // average rows | 360 movdqu xmm0, [eax] // average rows |
358 movdqu xmm1, [eax + 16] | 361 movdqu xmm1, [eax + 16] |
359 movdqu xmm2, [eax + esi] | 362 movdqu xmm2, [eax + esi] |
360 movdqu xmm3, [eax + esi + 16] | 363 movdqu xmm3, [eax + esi + 16] |
361 pavgb xmm0, xmm2 | 364 pmaddubsw xmm0, xmm4 // horizontal add |
362 pavgb xmm1, xmm3 | 365 pmaddubsw xmm1, xmm4 |
| 366 pmaddubsw xmm2, xmm4 |
| 367 pmaddubsw xmm3, xmm4 |
| 368 paddw xmm0, xmm2 // vertical add rows 0, 1 |
| 369 paddw xmm1, xmm3 |
363 movdqu xmm2, [eax + esi * 2] | 370 movdqu xmm2, [eax + esi * 2] |
364 movdqu xmm3, [eax + esi * 2 + 16] | 371 movdqu xmm3, [eax + esi * 2 + 16] |
365 movdqu xmm4, [eax + edi] | 372 pmaddubsw xmm2, xmm4 |
366 movdqu xmm5, [eax + edi + 16] | 373 pmaddubsw xmm3, xmm4 |
| 374 paddw xmm0, xmm2 // add row 2 |
| 375 paddw xmm1, xmm3 |
| 376 movdqu xmm2, [eax + edi] |
| 377 movdqu xmm3, [eax + edi + 16] |
367 lea eax, [eax + 32] | 378 lea eax, [eax + 32] |
368 pavgb xmm2, xmm4 | 379 pmaddubsw xmm2, xmm4 |
369 pavgb xmm3, xmm5 | 380 pmaddubsw xmm3, xmm4 |
370 pavgb xmm0, xmm2 | 381 paddw xmm0, xmm2 // add row 3 |
371 pavgb xmm1, xmm3 | 382 paddw xmm1, xmm3 |
372 | 383 phaddw xmm0, xmm1 |
373 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) | 384 paddw xmm0, xmm5 // + 8 for round |
374 psrlw xmm0, 8 | 385 psrlw xmm0, 4 // /16 for average of 4 * 4 |
375 movdqa xmm3, xmm1 | |
376 psrlw xmm1, 8 | |
377 pand xmm2, xmm7 | |
378 pand xmm3, xmm7 | |
379 pavgw xmm0, xmm2 | |
380 pavgw xmm1, xmm3 | |
381 packuswb xmm0, xmm1 | |
382 | |
383 movdqa xmm2, xmm0 // average columns (16 to 8 pixels) | |
384 psrlw xmm0, 8 | |
385 pand xmm2, xmm7 | |
386 pavgw xmm0, xmm2 | |
387 packuswb xmm0, xmm0 | 386 packuswb xmm0, xmm0 |
388 | |
389 movq qword ptr [edx], xmm0 | 387 movq qword ptr [edx], xmm0 |
390 lea edx, [edx + 8] | 388 lea edx, [edx + 8] |
391 sub ecx, 8 | 389 sub ecx, 8 |
392 jg wloop | 390 jg wloop |
393 | 391 |
394 pop edi | 392 pop edi |
395 pop esi | 393 pop esi |
396 ret | 394 ret |
397 } | 395 } |
398 } | 396 } |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
437 void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, | 435 void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, |
438 uint8* dst_ptr, int dst_width) { | 436 uint8* dst_ptr, int dst_width) { |
439 __asm { | 437 __asm { |
440 push esi | 438 push esi |
441 push edi | 439 push edi |
442 mov eax, [esp + 8 + 4] // src_ptr | 440 mov eax, [esp + 8 + 4] // src_ptr |
443 mov esi, [esp + 8 + 8] // src_stride | 441 mov esi, [esp + 8 + 8] // src_stride |
444 mov edx, [esp + 8 + 12] // dst_ptr | 442 mov edx, [esp + 8 + 12] // dst_ptr |
445 mov ecx, [esp + 8 + 16] // dst_width | 443 mov ecx, [esp + 8 + 16] // dst_width |
446 lea edi, [esi + esi * 2] // src_stride * 3 | 444 lea edi, [esi + esi * 2] // src_stride * 3 |
447 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff | 445 vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 |
448 vpsrlw ymm7, ymm7, 8 | 446 vpsrlw ymm4, ymm4, 15 |
| 447 vpsllw ymm5, ymm4, 3 // constant 0x0008 |
| 448 vpackuswb ymm4, ymm4, ymm4 |
449 | 449 |
450 wloop: | 450 wloop: |
451 vmovdqu ymm0, [eax] // average rows | 451 vmovdqu ymm0, [eax] // average rows |
452 vmovdqu ymm1, [eax + 32] | 452 vmovdqu ymm1, [eax + 32] |
453 vpavgb ymm0, ymm0, [eax + esi] | 453 vmovdqu ymm2, [eax + esi] |
454 vpavgb ymm1, ymm1, [eax + esi + 32] | 454 vmovdqu ymm3, [eax + esi + 32] |
| 455 vpmaddubsw ymm0, ymm0, ymm4 // horizontal add |
| 456 vpmaddubsw ymm1, ymm1, ymm4 |
| 457 vpmaddubsw ymm2, ymm2, ymm4 |
| 458 vpmaddubsw ymm3, ymm3, ymm4 |
| 459 vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 |
| 460 vpaddw ymm1, ymm1, ymm3 |
455 vmovdqu ymm2, [eax + esi * 2] | 461 vmovdqu ymm2, [eax + esi * 2] |
456 vmovdqu ymm3, [eax + esi * 2 + 32] | 462 vmovdqu ymm3, [eax + esi * 2 + 32] |
457 vpavgb ymm2, ymm2, [eax + edi] | 463 vpmaddubsw ymm2, ymm2, ymm4 |
458 vpavgb ymm3, ymm3, [eax + edi + 32] | 464 vpmaddubsw ymm3, ymm3, ymm4 |
459 lea eax, [eax + 64] | 465 vpaddw ymm0, ymm0, ymm2 // add row 2 |
460 vpavgb ymm0, ymm0, ymm2 | 466 vpaddw ymm1, ymm1, ymm3 |
461 vpavgb ymm1, ymm1, ymm3 | 467 vmovdqu ymm2, [eax + edi] |
462 | 468 vmovdqu ymm3, [eax + edi + 32] |
463 vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels) | 469 lea eax, [eax + 64] |
464 vpand ymm3, ymm1, ymm7 | 470 vpmaddubsw ymm2, ymm2, ymm4 |
465 vpsrlw ymm0, ymm0, 8 | 471 vpmaddubsw ymm3, ymm3, ymm4 |
466 vpsrlw ymm1, ymm1, 8 | 472 vpaddw ymm0, ymm0, ymm2 // add row 3 |
467 vpavgw ymm0, ymm0, ymm2 | 473 vpaddw ymm1, ymm1, ymm3 |
468 vpavgw ymm1, ymm1, ymm3 | 474 vphaddw ymm0, ymm0, ymm1 // mutates |
469 vpackuswb ymm0, ymm0, ymm1 | 475 vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw |
470 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb | 476 vpaddw ymm0, ymm0, ymm5 // + 8 for round |
471 | 477 vpsrlw ymm0, ymm0, 4 // /16 for average of 4 * 4 |
472 vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels) | |
473 vpsrlw ymm0, ymm0, 8 | |
474 vpavgw ymm0, ymm0, ymm2 | |
475 vpackuswb ymm0, ymm0, ymm0 | 478 vpackuswb ymm0, ymm0, ymm0 |
476 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb | 479 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb |
477 | |
478 vmovdqu [edx], xmm0 | 480 vmovdqu [edx], xmm0 |
479 lea edx, [edx + 16] | 481 lea edx, [edx + 16] |
480 sub ecx, 16 | 482 sub ecx, 16 |
481 jg wloop | 483 jg wloop |
482 | 484 |
483 pop edi | 485 pop edi |
484 pop esi | 486 pop esi |
485 vzeroupper | 487 vzeroupper |
486 ret | 488 ret |
487 } | 489 } |
(...skipping 858 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1346 idiv ecx | 1348 idiv ecx |
1347 ret | 1349 ret |
1348 } | 1350 } |
1349 } | 1351 } |
1350 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) | 1352 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) |
1351 | 1353 |
1352 #ifdef __cplusplus | 1354 #ifdef __cplusplus |
1353 } // extern "C" | 1355 } // extern "C" |
1354 } // namespace libyuv | 1356 } // namespace libyuv |
1355 #endif | 1357 #endif |
OLD | NEW |