Index: source/row_gcc.cc |
diff --git a/source/row_gcc.cc b/source/row_gcc.cc |
index 03f7f1bdc55a5cfa59cd54a6ac0699057315ffb8..8020108d041ac71be101d3155b017a514ff42f21 100644 |
--- a/source/row_gcc.cc |
+++ b/source/row_gcc.cc |
@@ -5350,17 +5350,17 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { |
// 16 pixel loop. |
LABELALIGN |
"1: \n" |
- "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts |
+ "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts |
"lea " MEMLEA(0x20,0) ",%0 \n" |
- "vpunpckhwd %%ymm2,%%ymm5,%%ymm3 \n" |
- "vpunpcklwd %%ymm2,%%ymm5,%%ymm2 \n" |
+ "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates |
wangcheng
2016/10/20 22:38:30
Reverse the operand order of ymm5 and ymm2 in vpunpckhwd/vpunpcklwd.
|
+ "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" |
"vcvtdq2ps %%ymm3,%%ymm3 \n" |
"vcvtdq2ps %%ymm2,%%ymm2 \n" |
"vmulps %%ymm3,%%ymm4,%%ymm3 \n" |
"vmulps %%ymm2,%%ymm4,%%ymm2 \n" |
"vpsrld $0xd,%%ymm3,%%ymm3 \n" |
"vpsrld $0xd,%%ymm2,%%ymm2 \n" |
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates |
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates |
"vmovdqu %%ymm2," MEMACCESS(1) " \n" |
"lea " MEMLEA(0x20,1) ",%1 \n" |
"sub $0x10,%2 \n" |
@@ -5384,8 +5384,8 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { |
// 16 pixel loop. |
LABELALIGN |
"1: \n" |
- "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts -> 8 ints |
- "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" // 8 more |
+ "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints |
+ "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" |
"lea " MEMLEA(0x20,0) ",%0 \n" |
"vcvtdq2ps %%ymm2,%%ymm2 \n" |
"vcvtdq2ps %%ymm3,%%ymm3 \n" |