Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(27)

Unified Diff: source/row_win.cc

Issue 1535833003: avx2 interpolate use 8 bit (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: gcc version of interpolate Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « source/row_neon64.cc ('k') | unit_test/planar_test.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: source/row_win.cc
diff --git a/source/row_win.cc b/source/row_win.cc
index 494043c626f3107588c067accec0fce0ad4d4386..5cb5d1e4f5d9f20f7baa172be1d0fa4faa91af64 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -5566,24 +5566,22 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- shr eax, 1
// Dispatch to specialized filters if applicable.
cmp eax, 0
- je xloop100 // 0 / 128. Blend 100 / 0.
+ je xloop100 // 0 / 256. Blend 100 / 0.
sub edi, esi
- cmp eax, 64
- je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
+ cmp eax, 128
+ je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
- vmovd xmm0, eax // high fraction 0..127
+ vmovd xmm0, eax // high fraction 0..255
neg eax
- add eax, 128
- vmovd xmm5, eax // low fraction 128..1
+ add eax, 256
+ vmovd xmm5, eax // low fraction 255..1
vpunpcklbw xmm5, xmm5, xmm0
vpunpcklwd xmm5, xmm5, xmm5
- vpxor ymm0, ymm0, ymm0
- vpermd ymm5, ymm0, ymm5
+ vbroadcastss ymm5, xmm5
- mov eax, 0x00400040 // 64 for rounding.
+ mov eax, 0x80808080 // 128b for bias and rounding.
vmovd xmm4, eax
vbroadcastss ymm4, xmm4
@@ -5591,13 +5589,15 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vmovdqu ymm0, [esi]
vmovdqu ymm2, [esi + edx]
vpunpckhbw ymm1, ymm0, ymm2 // mutates
- vpunpcklbw ymm0, ymm0, ymm2 // mutates
- vpmaddubsw ymm0, ymm0, ymm5
- vpmaddubsw ymm1, ymm1, ymm5
+ vpunpcklbw ymm0, ymm0, ymm2
+ vpsubb ymm1, ymm1, ymm4 // bias to signed image
+ vpsubb ymm0, ymm0, ymm4
+ vpmaddubsw ymm1, ymm5, ymm1
+ vpmaddubsw ymm0, ymm5, ymm0
+ vpaddw ymm1, ymm1, ymm4 // unbias and round
vpaddw ymm0, ymm0, ymm4
- vpaddw ymm1, ymm1, ymm4
- vpsrlw ymm0, ymm0, 7
- vpsrlw ymm1, ymm1, 7
+ vpsrlw ymm1, ymm1, 8
+ vpsrlw ymm0, ymm0, 8
vpackuswb ymm0, ymm0, ymm1 // unmutates
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
@@ -5629,6 +5629,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
#endif // HAS_INTERPOLATEROW_AVX2
// Bilinear filter 16x2 -> 16x1
+// TODO(fbarchard): Consider allowing 256 using memcpy.
__declspec(naked)
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
@@ -5636,28 +5637,27 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
__asm {
push esi
push edi
+
mov edi, [esp + 8 + 4] // dst_ptr
mov esi, [esp + 8 + 8] // src_ptr
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
- shr eax, 1
// Dispatch to specialized filters if applicable.
cmp eax, 0
- je xloop100 // 0 / 128. Blend 100 / 0.
- cmp eax, 64
- je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
+ je xloop100 // 0 / 256. Blend 100 / 0.
+ cmp eax, 128
+ je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
- movd xmm0, eax // high fraction 0..127
+ movd xmm0, eax // high fraction 0..255
neg eax
- add eax, 128
- movd xmm5, eax // low fraction 128..1
+ add eax, 256
+ movd xmm5, eax // low fraction 255..1
punpcklbw xmm5, xmm0
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0
-
- mov eax, 0x00400040 // 64 for rounding.
+ mov eax, 0x80808080 // 128 for biasing image to signed.
movd xmm4, eax
pshufd xmm4, xmm4, 0x00
@@ -5667,14 +5667,18 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movdqu xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm1, xmm5
- paddw xmm0, xmm4
- paddw xmm1, xmm4
- psrlw xmm0, 7
- psrlw xmm1, 7
- packuswb xmm0, xmm1
- movdqu [esi + edi], xmm0
+ psubb xmm0, xmm4 // bias image by -128
+ psubb xmm1, xmm4
+ movdqa xmm2, xmm5
+ movdqa xmm3, xmm5
+ pmaddubsw xmm2, xmm0
+ pmaddubsw xmm3, xmm1
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+ psrlw xmm2, 8
+ psrlw xmm3, 8
+ packuswb xmm2, xmm3
+ movdqu [esi + edi], xmm2
lea esi, [esi + 16]
sub ecx, 16
jg xloop
« no previous file with comments | « source/row_neon64.cc ('k') | unit_test/planar_test.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698