Index: source/row_win.cc |
diff --git a/source/row_win.cc b/source/row_win.cc |
index 1b7322a9ffc005f60919d0c4b419847a334dbf72..54e1d9d240fa4e2de037b1eef6ffdeed882e475f 100644 |
--- a/source/row_win.cc |
+++ b/source/row_win.cc |
@@ -3376,8 +3376,23 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { |
mov eax, [esp + 4] // src |
mov edx, [esp + 8] // dst |
mov ecx, [esp + 12] // count |
+ test eax, 15 |
+ jne convertloopu |
+ test edx, 15 |
+ jne convertloopu |
- convertloop: |
+ convertloopa: |
+ movdqa xmm0, [eax] |
+ movdqa xmm1, [eax + 16] |
+ lea eax, [eax + 32] |
+ movdqa [edx], xmm0 |
+ movdqa [edx + 16], xmm1 |
+ lea edx, [edx + 32] |
+ sub ecx, 32 |
+ jg convertloopa |
+ ret |
+ |
+ convertloopu: |
movdqu xmm0, [eax] |
movdqu xmm1, [eax + 16] |
lea eax, [eax + 32] |
@@ -3385,7 +3400,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { |
movdqu [edx + 16], xmm1 |
lea edx, [edx + 32] |
sub ecx, 32 |
- jg convertloop |
+ jg convertloopu |
ret |
} |
} |