Chromium Code Reviews

Unified Diff: source/libvpx/third_party/libyuv/source/scale_win.cc

Issue 1302353004: libvpx: Pull from upstream (Closed)
Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 4 months ago
Index: source/libvpx/third_party/libyuv/source/scale_win.cc
diff --git a/source/libvpx/third_party/libyuv/source/scale_win.cc b/source/libvpx/third_party/libyuv/source/scale_win.cc
index e0209cdec8c243d1b06dd4159c6f7c553b380798..c3896ebad2fd89869118c088f90bfe4c36dd9046 100644
--- a/source/libvpx/third_party/libyuv/source/scale_win.cc
+++ b/source/libvpx/third_party/libyuv/source/scale_win.cc
@@ -9,6 +9,7 @@
*/
#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
@@ -16,7 +17,8 @@ extern "C" {
#endif
// This module is for Visual C x86.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+ defined(_MSC_VER) && !defined(__clang__)
// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
@@ -93,8 +95,7 @@ static uvec16 kScaleAb2 =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@@ -120,8 +121,7 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Blends 32x1 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@@ -157,8 +157,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@@ -199,9 +198,116 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
+#ifdef HAS_SCALEROWDOWN2_AVX2
+// Reads 64 pixels, throws half away and writes 32 pixels.
+__declspec(naked)
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
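
For reference, the new AVX2 routine above is plain 2:1 point sampling: vpsrlw by 8 moves each odd source byte into the low byte of its word, and vpackuswb/vpermq compact those bytes, so every output pixel is the odd pixel of a source pair. A minimal scalar sketch of the same selection (the _Sketch name and stdint types are placeholders, not libyuv API):

#include <stdint.h>

/* Scalar sketch: 2:1 point sampling, keep the odd pixel of each pair. */
static void ScaleRowDown2_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];
  }
}
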
+
+// Blends 64x1 rectangle to 32x1.
+__declspec(naked)
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
+ vpsrlw ymm4, ymm4, 15
+ vpackuswb ymm4, ymm4, ymm4
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+
+ vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
+ vpavgw ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
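
The linear variant averages each horizontal pair instead of dropping one: vpmaddubsw against a vector of 0x01 bytes sums adjacent bytes into words, and vpavgw against zero turns that sum into the rounded half (sum + 1) / 2. Scalar sketch of the same arithmetic (placeholder name/types):

#include <stdint.h>

/* Scalar sketch: rounded average of each horizontal pair. */
static void ScaleRowDown2Linear_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                       int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
  }
}
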
+
+// Blends 64x2 rectangle to 32x1.
+__declspec(naked)
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
+ vpsrlw ymm4, ymm4, 15
+ vpackuswb ymm4, ymm4, ymm4
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ wloop:
+ vmovdqu ymm0, [eax] // average rows
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ lea eax, [eax + 64]
+
+ vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
+ vpavgw ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SCALEROWDOWN2_AVX2
+
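
ScaleRowDown2Box_AVX2 reduces a 64x2 block to 32 pixels: vpavgb first averages the two rows, then the same pmaddubsw/pavgw pair averages horizontally, i.e. a rounded 2x2 box filter built from nested rounded averages. Scalar sketch (placeholder name/types, not part of the patch):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the 2x2 box reduction: vertical pavgb, then horizontal average. */
static void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    int a = (s[2 * x] + t[2 * x] + 1) >> 1;          /* vpavgb of the two rows */
    int b = (s[2 * x + 1] + t[2 * x + 1] + 1) >> 1;
    dst_ptr[x] = (uint8_t)((a + b + 1) >> 1);        /* pmaddubsw + pavgw */
  }
}
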
// Point samples 32 pixels to 8 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@@ -232,8 +338,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Blends 32x4 rectangle to 8x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@@ -248,11 +353,11 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrlw xmm7, 8
wloop:
- movdqu xmm0, [eax]
+ movdqu xmm0, [eax] // average rows
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
- pavgb xmm0, xmm2 // average rows
+ pavgb xmm0, xmm2
pavgb xmm1, xmm3
movdqu xmm2, [eax + esi * 2]
movdqu xmm3, [eax + esi * 2 + 16]
@@ -291,13 +396,102 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
+#ifdef HAS_SCALEROWDOWN4_AVX2
+// Point samples 64 pixels to 16 pixels.
+__declspec(naked)
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
+ vpsrld ymm5, ymm5, 24
+ vpslld ymm5, ymm5, 16
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vpsrlw ymm0, ymm0, 8
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
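
ScaleRowDown4_AVX2 is 4:1 point sampling: the 0x00ff0000 mask keeps byte 2 of every dword, and the packuswb/permq/psrlw passes compact those bytes into 16 outputs per iteration. Scalar equivalent (placeholder name/types):

#include <stdint.h>

/* Scalar sketch: 4:1 point sampling, keep pixel 2 of each group of four. */
static void ScaleRowDown4_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[4 * x + 2];
  }
}
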
+
+// Blends 64x4 rectangle to 16x1.
+__declspec(naked)
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff
+ vpsrlw ymm7, ymm7, 8
+
+ wloop:
+ vmovdqu ymm0, [eax] // average rows
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ vmovdqu ymm2, [eax + esi * 2]
+ vmovdqu ymm3, [eax + esi * 2 + 32]
+ vpavgb ymm2, ymm2, [eax + edi]
+ vpavgb ymm3, ymm3, [eax + edi + 32]
+ lea eax, [eax + 64]
+ vpavgb ymm0, ymm0, ymm2
+ vpavgb ymm1, ymm1, ymm3
+
+ vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels)
+ vpand ymm3, ymm1, ymm7
+ vpsrlw ymm0, ymm0, 8
+ vpsrlw ymm1, ymm1, 8
+ vpavgw ymm0, ymm0, ymm2
+ vpavgw ymm1, ymm1, ymm3
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+
+ vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)
+ vpsrlw ymm0, ymm0, 8
+ vpavgw ymm0, ymm0, ymm2
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+
+ vmovdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SCALEROWDOWN4_AVX2
+
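
ScaleRowDown4Box_AVX2 folds a 64x4 block to 16 pixels: vpavgb collapses the four rows pairwise, then two mask/psrlw/pavgw column passes halve the width twice. Because every step is a rounded pairwise average, the result can differ slightly from an exact box filter; the sketch below shows the intended reduction (placeholder name/types, exact-sum rounding assumed):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch: average each 4x4 block down to one pixel. The AVX2 code nests
   pavg steps, so its rounding is only approximately (sum + 8) >> 4. */
static void ScaleRowDown4Box_Sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (j = 0; j < 4; ++j) {
      for (i = 0; i < 4; ++i) {
        sum += src_ptr[j * src_stride + 4 * x + i];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);
  }
}
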
// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@@ -344,8 +538,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// xmm7 kRound34
// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
@@ -402,8 +595,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
}
// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
@@ -465,7 +657,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
// 3/8 point sampler
// Scale 32 pixels to 12
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@@ -496,7 +688,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
@@ -561,7 +753,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
}
// Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
@@ -605,76 +797,68 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
}
}
-// Reads 16xN bytes and produces 16 shorts at a time.
-// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
-__declspec(naked) __declspec(align(16))
-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int src_width,
- int src_height) {
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+__declspec(naked)
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
__asm {
- push esi
- push edi
- push ebx
- push ebp
- mov esi, [esp + 16 + 4] // src_ptr
- mov edx, [esp + 16 + 8] // src_stride
- mov edi, [esp + 16 + 12] // dst_ptr
- mov ecx, [esp + 16 + 16] // dst_width
- mov ebx, [esp + 16 + 20] // height
- pxor xmm4, xmm4
- dec ebx
+ mov eax, [esp + 4] // src_ptr
+ mov edx, [esp + 8] // dst_ptr
+ mov ecx, [esp + 12] // src_width
+ pxor xmm5, xmm5
+ // sum rows
xloop:
- // first row
- movdqu xmm0, [esi]
- lea eax, [esi + edx]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm4
- punpckhbw xmm1, xmm4
- lea esi, [esi + 16]
- mov ebp, ebx
- test ebp, ebp
- je ydone
-
- // sum remaining rows
- yloop:
- movdqu xmm2, [eax] // read 16 pixels
- lea eax, [eax + edx] // advance to next row
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm4
- punpckhbw xmm3, xmm4
+ movdqu xmm3, [eax] // read 16 bytes
+ lea eax, [eax + 16]
+ movdqu xmm0, [edx] // read 16 words from destination
+ movdqu xmm1, [edx + 16]
+ movdqa xmm2, xmm3
+ punpcklbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
paddusw xmm0, xmm2 // sum 16 words
paddusw xmm1, xmm3
- sub ebp, 1
- jg yloop
-
- ydone:
- movdqu [edi], xmm0
- movdqu [edi + 16], xmm1
- lea edi, [edi + 32]
-
+ movdqu [edx], xmm0 // write 16 words to destination
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
sub ecx, 16
jg xloop
+ ret
+ }
+}
- pop ebp
- pop ebx
- pop edi
- pop esi
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time.
+__declspec(naked)
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ mov edx, [esp + 8] // dst_ptr
+ mov ecx, [esp + 12] // src_width
+ vpxor ymm5, ymm5, ymm5
+
+ // sum rows
+ xloop:
+ vmovdqu ymm3, [eax] // read 32 bytes
+ lea eax, [eax + 32]
+ vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
+ vpunpcklbw ymm2, ymm3, ymm5
+ vpunpckhbw ymm3, ymm3, ymm5
+ vpaddusw ymm0, ymm2, [edx] // sum 16 words
+ vpaddusw ymm1, ymm3, [edx + 32]
+ vmovdqu [edx], ymm0 // write 32 words to destination
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 32
+ jg xloop
+
+ vzeroupper
ret
}
}
+#endif // HAS_SCALEADDROW_AVX2
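
The rewritten ScaleAddRow_SSE2 (and the new ScaleAddRow_AVX2) drop the old multi-row loop: each call widens 16 or 32 source bytes to words and accumulates them into the uint16 destination row with saturating adds (paddusw/vpaddusw), so the caller now iterates over rows. Scalar sketch of one call (placeholder name/types):

#include <stdint.h>

/* Scalar sketch: accumulate one row of bytes into a uint16 row, saturating
   at 65535 the way paddusw does. */
static void ScaleAddRow_Sketch(const uint8_t* src_ptr, uint16_t* dst_ptr,
                               int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    uint32_t sum = (uint32_t)dst_ptr[x] + src_ptr[x];
    dst_ptr[x] = (uint16_t)(sum > 65535 ? 65535 : sum);
  }
}
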
// Bilinear column filtering. SSSE3 version.
-// TODO(fbarchard): Port to Neon
-// TODO(fbarchard): Switch the following:
-// xor ebx, ebx
-// mov bx, word ptr [esi + eax] // 2 source x0 pixels
-// To
-// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
-// when drmemory bug fixed.
-// https://code.google.com/p/drmemory/issues/detail?id=1396
-
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
__asm {
@@ -751,8 +935,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
}
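
ScaleFilterCols_SSSE3 performs bilinear column filtering: x and dx are 16.16 fixed-point source positions, and each output blends the two source pixels straddling x. Ignoring the exact weight quantization the SSSE3 code uses, the arithmetic is roughly (placeholder name/types):

#include <stdint.h>

/* Scalar sketch: bilinear interpolation along a row with 16.16 fixed-point x/dx. */
static void ScaleFilterCols_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                   int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;       /* integer source index */
    int f = x & 0xffff;     /* 16-bit fraction */
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)(a + (((b - a) * f) >> 16));
    x += dx;
  }
}
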
// Reads 16 pixels, duplicates them and writes 32 pixels.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
__asm {
@@ -777,8 +960,7 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
}
// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
@@ -803,8 +985,7 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
}
// Blends 8x1 rectangle to 4x1.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
@@ -832,8 +1013,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
}
// Blends 8x2 rectangle to 4x1.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
@@ -867,8 +1047,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
}
// Reads 4 pixels at a time.
-// Alignment requirement: dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
@@ -904,8 +1083,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
}
// Blends four 2x2 to 4x1.
-// Alignment requirement: dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
@@ -953,7 +1131,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
}
// Column scaling unfiltered. SSE2 version.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
__asm {
@@ -1044,7 +1222,7 @@ static uvec8 kShuffleFractions = {
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
__asm {
@@ -1115,8 +1293,7 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
}
// Reads 4 pixels, duplicates them and writes 8 pixels.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
__asm {
@@ -1141,7 +1318,7 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
}
// Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
int FixedDiv_X86(int num, int div) {
__asm {
mov eax, [esp + 4] // num
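
FixedDiv_X86 returns num / div as a 16.16 fixed-point value, i.e. (num << 16) / div computed through a 64-bit intermediate before the signed divide. A portable sketch of the same computation (placeholder name):

#include <stdint.h>

/* Sketch: num / div as a 16.16 fixed-point result via a 64-bit intermediate. */
static int FixedDiv_Sketch(int num, int div) {
  return (int)((((int64_t)num) << 16) / div);
}
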
@@ -1154,7 +1331,7 @@ int FixedDiv_X86(int num, int div) {
}
// Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
int FixedDiv1_X86(int num, int div) {
__asm {
mov eax, [esp + 4] // num
@@ -1169,8 +1346,7 @@ int FixedDiv1_X86(int num, int div) {
ret
}
}
-
-#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus
} // extern "C"