Index: source/row_gcc.cc |
diff --git a/source/row_gcc.cc b/source/row_gcc.cc |
index 348935405761d16bf12ed18278fe277a1559d3a6..dab651e7ecff2eab903f6e7419c2f886d7446ef0 100644 |
--- a/source/row_gcc.cc |
+++ b/source/row_gcc.cc |
@@ -2726,8 +2726,23 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
#ifdef HAS_COPYROW_SSE2 |
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { |
asm volatile ( |
+ "test $0xf,%0 \n" |
+ "jne 2f \n" |
+ "test $0xf,%1 \n" |
+ "jne 2f \n" |
LABELALIGN |
"1: \n" |
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
+ "lea " MEMLEA(0x20,0) ",%0 \n" |
+ "movdqa %%xmm0," MEMACCESS(1) " \n" |
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" |
+ "lea " MEMLEA(0x20,1) ",%1 \n" |
+ "sub $0x20,%2 \n" |
+ "jg 1b \n" |
+ "jmp 9f \n" |
+ LABELALIGN |
+ "2: \n" |
"movdqu " MEMACCESS(0) ",%%xmm0 \n" |
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
"lea " MEMLEA(0x20,0) ",%0 \n" |
@@ -2735,7 +2750,8 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { |
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
"lea " MEMLEA(0x20,1) ",%1 \n" |
"sub $0x20,%2 \n" |
- "jg 1b \n" |
+ "jg 2b \n" |
+ "9: \n" |
: "+r"(src), // %0 |
"+r"(dst), // %1 |
"+r"(count) // %2 |