Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(53)

Side by Side Diff: source/scale_win.cc

Issue 1525033005: change scale down by 4 to use rounding. (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: scale by 4 uses ssse3 now Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/scale_gcc.cc ('k') | unit_test/scale_test.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 291 matching lines...) Expand 10 before | Expand all | Expand 10 after
302 302
303 pop esi 303 pop esi
304 vzeroupper 304 vzeroupper
305 ret 305 ret
306 } 306 }
307 } 307 }
308 #endif // HAS_SCALEROWDOWN2_AVX2 308 #endif // HAS_SCALEROWDOWN2_AVX2
309 309
310 // Point samples 32 pixels to 8 pixels. 310 // Point samples 32 pixels to 8 pixels.
311 __declspec(naked) 311 __declspec(naked)
312 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 312 void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
313 uint8* dst_ptr, int dst_width) { 313 uint8* dst_ptr, int dst_width) {
314 __asm { 314 __asm {
315 mov eax, [esp + 4] // src_ptr 315 mov eax, [esp + 4] // src_ptr
316 // src_stride ignored 316 // src_stride ignored
317 mov edx, [esp + 12] // dst_ptr 317 mov edx, [esp + 12] // dst_ptr
318 mov ecx, [esp + 16] // dst_width 318 mov ecx, [esp + 16] // dst_width
319 pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 319 pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
320 psrld xmm5, 24 320 psrld xmm5, 24
321 pslld xmm5, 16 321 pslld xmm5, 16
322 322
(...skipping 10 matching lines...) Expand all
333 lea edx, [edx + 8] 333 lea edx, [edx + 8]
334 sub ecx, 8 334 sub ecx, 8
335 jg wloop 335 jg wloop
336 336
337 ret 337 ret
338 } 338 }
339 } 339 }
340 340
341 // Blends 32x4 rectangle to 8x1. 341 // Blends 32x4 rectangle to 8x1.
342 __declspec(naked) 342 __declspec(naked)
343 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 343 void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
344 uint8* dst_ptr, int dst_width) { 344 uint8* dst_ptr, int dst_width) {
345 __asm { 345 __asm {
346 push esi 346 push esi
347 push edi 347 push edi
348 mov eax, [esp + 8 + 4] // src_ptr 348 mov eax, [esp + 8 + 4] // src_ptr
349 mov esi, [esp + 8 + 8] // src_stride 349 mov esi, [esp + 8 + 8] // src_stride
350 mov edx, [esp + 8 + 12] // dst_ptr 350 mov edx, [esp + 8 + 12] // dst_ptr
351 mov ecx, [esp + 8 + 16] // dst_width 351 mov ecx, [esp + 8 + 16] // dst_width
352 lea edi, [esi + esi * 2] // src_stride * 3 352 lea edi, [esi + esi * 2] // src_stride * 3
353 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff 353 pcmpeqb xmm4, xmm4 // constant 0x0101
354 psrlw xmm7, 8 354 psrlw xmm4, 15
355 movdqa xmm5, xmm4
356 packuswb xmm4, xmm4
357 psllw xmm5, 3 // constant 0x0008
355 358
356 wloop: 359 wloop:
357 movdqu xmm0, [eax] // average rows 360 movdqu xmm0, [eax] // average rows
358 movdqu xmm1, [eax + 16] 361 movdqu xmm1, [eax + 16]
359 movdqu xmm2, [eax + esi] 362 movdqu xmm2, [eax + esi]
360 movdqu xmm3, [eax + esi + 16] 363 movdqu xmm3, [eax + esi + 16]
361 pavgb xmm0, xmm2 364 pmaddubsw xmm0, xmm4 // horizontal add
362 pavgb xmm1, xmm3 365 pmaddubsw xmm1, xmm4
366 pmaddubsw xmm2, xmm4
367 pmaddubsw xmm3, xmm4
368 paddw xmm0, xmm2 // vertical add rows 0, 1
369 paddw xmm1, xmm3
363 movdqu xmm2, [eax + esi * 2] 370 movdqu xmm2, [eax + esi * 2]
364 movdqu xmm3, [eax + esi * 2 + 16] 371 movdqu xmm3, [eax + esi * 2 + 16]
365 movdqu xmm4, [eax + edi] 372 pmaddubsw xmm2, xmm4
366 movdqu xmm5, [eax + edi + 16] 373 pmaddubsw xmm3, xmm4
374 paddw xmm0, xmm2 // add row 2
375 paddw xmm1, xmm3
376 movdqu xmm2, [eax + edi]
377 movdqu xmm3, [eax + edi + 16]
367 lea eax, [eax + 32] 378 lea eax, [eax + 32]
368 pavgb xmm2, xmm4 379 pmaddubsw xmm2, xmm4
369 pavgb xmm3, xmm5 380 pmaddubsw xmm3, xmm4
370 pavgb xmm0, xmm2 381 paddw xmm0, xmm2 // add row 3
371 pavgb xmm1, xmm3 382 paddw xmm1, xmm3
372 383 phaddw xmm0, xmm1
373 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 384 paddw xmm0, xmm5 // + 8 for round
374 psrlw xmm0, 8 385 psrlw xmm0, 4 // /16 for average of 4 * 4
375 movdqa xmm3, xmm1
376 psrlw xmm1, 8
377 pand xmm2, xmm7
378 pand xmm3, xmm7
379 pavgw xmm0, xmm2
380 pavgw xmm1, xmm3
381 packuswb xmm0, xmm1
382
383 movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
384 psrlw xmm0, 8
385 pand xmm2, xmm7
386 pavgw xmm0, xmm2
387 packuswb xmm0, xmm0 386 packuswb xmm0, xmm0
388
389 movq qword ptr [edx], xmm0 387 movq qword ptr [edx], xmm0
390 lea edx, [edx + 8] 388 lea edx, [edx + 8]
391 sub ecx, 8 389 sub ecx, 8
392 jg wloop 390 jg wloop
393 391
394 pop edi 392 pop edi
395 pop esi 393 pop esi
396 ret 394 ret
397 } 395 }
398 } 396 }
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
437 void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, 435 void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
438 uint8* dst_ptr, int dst_width) { 436 uint8* dst_ptr, int dst_width) {
439 __asm { 437 __asm {
440 push esi 438 push esi
441 push edi 439 push edi
442 mov eax, [esp + 8 + 4] // src_ptr 440 mov eax, [esp + 8 + 4] // src_ptr
443 mov esi, [esp + 8 + 8] // src_stride 441 mov esi, [esp + 8 + 8] // src_stride
444 mov edx, [esp + 8 + 12] // dst_ptr 442 mov edx, [esp + 8 + 12] // dst_ptr
445 mov ecx, [esp + 8 + 16] // dst_width 443 mov ecx, [esp + 8 + 16] // dst_width
446 lea edi, [esi + esi * 2] // src_stride * 3 444 lea edi, [esi + esi * 2] // src_stride * 3
447 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff 445 vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
448 vpsrlw ymm7, ymm7, 8 446 vpsrlw ymm4, ymm4, 15
447 vpsllw ymm5, ymm4, 3 // constant 0x0008
448 vpackuswb ymm4, ymm4, ymm4
449 449
450 wloop: 450 wloop:
451 vmovdqu ymm0, [eax] // average rows 451 vmovdqu ymm0, [eax] // average rows
452 vmovdqu ymm1, [eax + 32] 452 vmovdqu ymm1, [eax + 32]
453 vpavgb ymm0, ymm0, [eax + esi] 453 vmovdqu ymm2, [eax + esi]
454 vpavgb ymm1, ymm1, [eax + esi + 32] 454 vmovdqu ymm3, [eax + esi + 32]
455 vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
456 vpmaddubsw ymm1, ymm1, ymm4
457 vpmaddubsw ymm2, ymm2, ymm4
458 vpmaddubsw ymm3, ymm3, ymm4
459 vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
460 vpaddw ymm1, ymm1, ymm3
455 vmovdqu ymm2, [eax + esi * 2] 461 vmovdqu ymm2, [eax + esi * 2]
456 vmovdqu ymm3, [eax + esi * 2 + 32] 462 vmovdqu ymm3, [eax + esi * 2 + 32]
457 vpavgb ymm2, ymm2, [eax + edi] 463 vpmaddubsw ymm2, ymm2, ymm4
458 vpavgb ymm3, ymm3, [eax + edi + 32] 464 vpmaddubsw ymm3, ymm3, ymm4
459 lea eax, [eax + 64] 465 vpaddw ymm0, ymm0, ymm2 // add row 2
460 vpavgb ymm0, ymm0, ymm2 466 vpaddw ymm1, ymm1, ymm3
461 vpavgb ymm1, ymm1, ymm3 467 vmovdqu ymm2, [eax + edi]
462 468 vmovdqu ymm3, [eax + edi + 32]
463 vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels) 469 lea eax, [eax + 64]
464 vpand ymm3, ymm1, ymm7 470 vpmaddubsw ymm2, ymm2, ymm4
465 vpsrlw ymm0, ymm0, 8 471 vpmaddubsw ymm3, ymm3, ymm4
466 vpsrlw ymm1, ymm1, 8 472 vpaddw ymm0, ymm0, ymm2 // add row 3
467 vpavgw ymm0, ymm0, ymm2 473 vpaddw ymm1, ymm1, ymm3
468 vpavgw ymm1, ymm1, ymm3 474 vphaddw ymm0, ymm0, ymm1 // mutates
469 vpackuswb ymm0, ymm0, ymm1 475 vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
470 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb 476 vpaddw ymm0, ymm0, ymm5 // + 8 for round
471 477 vpsrlw ymm0, ymm0, 4 // /16 for average of 4 * 4
472 vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)
473 vpsrlw ymm0, ymm0, 8
474 vpavgw ymm0, ymm0, ymm2
475 vpackuswb ymm0, ymm0, ymm0 478 vpackuswb ymm0, ymm0, ymm0
476 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb 479 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
477
478 vmovdqu [edx], xmm0 480 vmovdqu [edx], xmm0
479 lea edx, [edx + 16] 481 lea edx, [edx + 16]
480 sub ecx, 16 482 sub ecx, 16
481 jg wloop 483 jg wloop
482 484
483 pop edi 485 pop edi
484 pop esi 486 pop esi
485 vzeroupper 487 vzeroupper
486 ret 488 ret
487 } 489 }
(...skipping 858 matching lines...) Expand 10 before | Expand all | Expand 10 after
1346 idiv ecx 1348 idiv ecx
1347 ret 1349 ret
1348 } 1350 }
1349 } 1351 }
1350 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) 1352 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
1351 1353
1352 #ifdef __cplusplus 1354 #ifdef __cplusplus
1353 } // extern "C" 1355 } // extern "C"
1354 } // namespace libyuv 1356 } // namespace libyuv
1355 #endif 1357 #endif
OLDNEW
« no previous file with comments | « source/scale_gcc.cc ('k') | unit_test/scale_test.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698