Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(487)

Unified Diff: source/scale_win.cc

Issue 1525033005: change scale down by 4 to use rounding. (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: scale by 4 uses ssse3 now Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « source/scale_gcc.cc ('k') | unit_test/scale_test.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: source/scale_win.cc
diff --git a/source/scale_win.cc b/source/scale_win.cc
index 6930f729590dd6f4b9c64d5aa9cd8a642230c919..5ab4fa0ccc2524d78b4a939baa53dce464fe0905 100644
--- a/source/scale_win.cc
+++ b/source/scale_win.cc
@@ -309,7 +309,7 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
// Point samples 32 pixels to 8 pixels.
__declspec(naked)
-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -340,7 +340,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// Blends 32x4 rectangle to 8x1.
__declspec(naked)
-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
@@ -350,42 +350,40 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 8 + 12] // dst_ptr
mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
+ pcmpeqb xmm4, xmm4 // constant 0x0101
+ psrlw xmm4, 15
+ movdqa xmm5, xmm4
+ packuswb xmm4, xmm4
+ psllw xmm5, 3 // constant 0x0008
wloop:
movdqu xmm0, [eax] // average rows
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
+ pmaddubsw xmm0, xmm4 // horizontal add
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // vertical add rows 0, 1
+ paddw xmm1, xmm3
movdqu xmm2, [eax + esi * 2]
movdqu xmm3, [eax + esi * 2 + 16]
- movdqu xmm4, [eax + edi]
- movdqu xmm5, [eax + edi + 16]
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // add row 2
+ paddw xmm1, xmm3
+ movdqu xmm2, [eax + edi]
+ movdqu xmm3, [eax + edi + 16]
lea eax, [eax + 32]
- pavgb xmm2, xmm4
- pavgb xmm3, xmm5
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm7
- pand xmm3, xmm7
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
- psrlw xmm0, 8
- pand xmm2, xmm7
- pavgw xmm0, xmm2
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // add row 3
+ paddw xmm1, xmm3
+ phaddw xmm0, xmm1
+ paddw xmm0, xmm5 // + 8 for round
+ psrlw xmm0, 4 // /16 for average of 4 * 4
packuswb xmm0, xmm0
-
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
sub ecx, 8
@@ -444,37 +442,41 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 8 + 12] // dst_ptr
mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
- vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff
- vpsrlw ymm7, ymm7, 8
+ vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
+ vpsrlw ymm4, ymm4, 15
+ vpsllw ymm5, ymm4, 3 // constant 0x0008
+ vpackuswb ymm4, ymm4, ymm4
wloop:
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
+ vmovdqu ymm2, [eax + esi]
+ vmovdqu ymm3, [eax + esi + 32]
+ vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
+ vpaddw ymm1, ymm1, ymm3
vmovdqu ymm2, [eax + esi * 2]
vmovdqu ymm3, [eax + esi * 2 + 32]
- vpavgb ymm2, ymm2, [eax + edi]
- vpavgb ymm3, ymm3, [eax + edi + 32]
- lea eax, [eax + 64]
- vpavgb ymm0, ymm0, ymm2
- vpavgb ymm1, ymm1, ymm3
-
- vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels)
- vpand ymm3, ymm1, ymm7
- vpsrlw ymm0, ymm0, 8
- vpsrlw ymm1, ymm1, 8
- vpavgw ymm0, ymm0, ymm2
- vpavgw ymm1, ymm1, ymm3
- vpackuswb ymm0, ymm0, ymm1
- vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
-
- vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)
- vpsrlw ymm0, ymm0, 8
- vpavgw ymm0, ymm0, ymm2
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // add row 2
+ vpaddw ymm1, ymm1, ymm3
+ vmovdqu ymm2, [eax + edi]
+ vmovdqu ymm3, [eax + edi + 32]
+ lea eax, [eax + 64]
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // add row 3
+ vpaddw ymm1, ymm1, ymm3
+ vphaddw ymm0, ymm0, ymm1 // mutates
+ vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
+ vpaddw ymm0, ymm0, ymm5 // + 8 for round
+ vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
-
vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
« no previous file with comments | « source/scale_gcc.cc ('k') | unit_test/scale_test.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698