| 1 /* | 1 /* |
| 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 80 matching lines...) |
| 91 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; | 91 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; |
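Side note on this constant: in pshufb, a shuffle-control byte with its high bit set (the 128 values here) zeroes the destination lane, so this mask zero-extends source bytes 2, 5, 10 and 13 into 16-bit lanes. A minimal intrinsics sketch of that semantic (the helper name is illustrative, not libyuv API):

  #include <tmmintrin.h>  // SSSE3

  // Zero-extend the bytes selected by `mask` into 16-bit lanes; any control
  // byte >= 128 produces zero in _mm_shuffle_epi8.
  static inline __m128i GatherBytesToWords(__m128i row, __m128i mask) {
    return _mm_shuffle_epi8(row, mask);
  }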
| 92 | 92 |
| 93 // Scaling values for boxes of 3x2 and 2x2 | 93 // Scaling values for boxes of 3x2 and 2x2 |
| 94 static uvec16 kScaleAb2 = | 94 static uvec16 kScaleAb2 = |
| 95 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; | 95 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; |
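These are 0.16 fixed-point reciprocals: multiplying a 16-bit sum by 65536/3 (or 65536/2) and keeping the high 16 bits, which is exactly what pmulhuw computes, approximates division by 3 (or 2). A scalar sketch of the trick, assuming that use; the helper is illustrative only:

  #include <stdint.h>

  // (sum * (65536 / 3)) >> 16 ~= sum / 3. pmulhuw keeps the high 16 bits of
  // the unsigned 16x16 multiply, so SIMD needs no 32-bit temporaries.
  static inline uint16_t Div3Fixed(uint16_t sum) {
    return (uint16_t)(((uint32_t)sum * (65536 / 3)) >> 16);
  }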
| 96 | 96 |
| 97 // GCC versions of row functions are verbatim conversions from Visual C. | 97 // GCC versions of row functions are verbatim conversions from Visual C. |
| 98 // Generated using gcc disassembly on Visual C object file: | 98 // Generated using gcc disassembly on Visual C object file: |
| 99 // objdump -D yuvscaler.obj >yuvscaler.txt | 99 // objdump -D yuvscaler.obj >yuvscaler.txt |
| 100 | 100 |
| 101 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 101 void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
| 102 uint8* dst_ptr, int dst_width) { | 102 uint8* dst_ptr, int dst_width) { |
| 103 asm volatile ( | 103 asm volatile ( |
| 104 LABELALIGN | 104 LABELALIGN |
| 105 "1: \n" | 105 "1: \n" |
| 106 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 106 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 107 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 107 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
| 108 "lea " MEMLEA(0x20,0) ",%0 \n" | 108 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 109 "psrlw $0x8,%%xmm0 \n" | 109 "psrlw $0x8,%%xmm0 \n" |
| 110 "psrlw $0x8,%%xmm1 \n" | 110 "psrlw $0x8,%%xmm1 \n" |
| 111 "packuswb %%xmm1,%%xmm0 \n" | 111 "packuswb %%xmm1,%%xmm0 \n" |
| 112 "movdqu %%xmm0," MEMACCESS(1) " \n" | 112 "movdqu %%xmm0," MEMACCESS(1) " \n" |
| 113 "lea " MEMLEA(0x10,1) ",%1 \n" | 113 "lea " MEMLEA(0x10,1) ",%1 \n" |
| 114 "sub $0x10,%2 \n" | 114 "sub $0x10,%2 \n" |
| 115 "jg 1b \n" | 115 "jg 1b \n" |
| 116 : "+r"(src_ptr), // %0 | 116 : "+r"(src_ptr), // %0 |
| 117 "+r"(dst_ptr), // %1 | 117 "+r"(dst_ptr), // %1 |
| 118 "+r"(dst_width) // %2 | 118 "+r"(dst_width) // %2 |
| 119 :: "memory", "cc", "xmm0", "xmm1" | 119 :: "memory", "cc", "xmm0", "xmm1" |
| 120 ); | 120 ); |
| 121 } | 121 } |
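For reference, a scalar sketch of what this loop computes: psrlw $0x8 keeps the high byte of each 16-bit pair and packuswb repacks the results, i.e. point-sampling the odd-indexed source pixels (illustrative C, not the project's C fallback):

  #include <stdint.h>

  // Halve a row by taking every second (odd-indexed) source byte; the SIMD
  // loop above produces 16 output bytes per iteration.
  static void ScaleRowDown2_Sketch(const uint8_t* src, uint8_t* dst,
                                   int dst_width) {
    for (int x = 0; x < dst_width; ++x) {
      dst[x] = src[2 * x + 1];
    }
  }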
| 122 | 122 |
| 123 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 123 void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
| 124 uint8* dst_ptr, int dst_width) { | 124 uint8* dst_ptr, int dst_width) { |
| 125 asm volatile ( | 125 asm volatile ( |
| 126 "pcmpeqb %%xmm5,%%xmm5 \n" | 126 "pcmpeqb %%xmm4,%%xmm4 \n" |
| 127 "psrlw $0x8,%%xmm5 \n" | 127 "psrlw $0xf,%%xmm4 \n" |
| 128 "packuswb %%xmm4,%%xmm4 \n" |
| 129 "pxor %%xmm5,%%xmm5 \n" |
| 128 | 130 |
| 129 LABELALIGN | 131 LABELALIGN |
| 130 "1: \n" | 132 "1: \n" |
| 131 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 133 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 132 "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" | 134 "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" |
| 133 "lea " MEMLEA(0x20,0) ",%0 \n" | 135 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 134 "movdqa %%xmm0,%%xmm2 \n" | 136 "pmaddubsw %%xmm4,%%xmm0 \n" |
| 135 "psrlw $0x8,%%xmm0 \n" | 137 "pmaddubsw %%xmm4,%%xmm1 \n" |
| 136 "movdqa %%xmm1,%%xmm3 \n" | 138 "pavgw %%xmm5,%%xmm0 \n" |
| 137 "psrlw $0x8,%%xmm1 \n" | 139 "pavgw %%xmm5,%%xmm1 \n" |
| 138 "pand %%xmm5,%%xmm2 \n" | 140 "packuswb %%xmm1,%%xmm0 \n" |
| 139 "pand %%xmm5,%%xmm3 \n" | |
| 140 "pavgw %%xmm2,%%xmm0 \n" | |
| 141 "pavgw %%xmm3,%%xmm1 \n" | |
| 142 "packuswb %%xmm1,%%xmm0 \n" | |
| 143 "movdqu %%xmm0," MEMACCESS(1) " \n" | 141 "movdqu %%xmm0," MEMACCESS(1) " \n" |
| 144 "lea " MEMLEA(0x10,1) ",%1 \n" | 142 "lea " MEMLEA(0x10,1) ",%1 \n" |
| 145 "sub $0x10,%2 \n" | 143 "sub $0x10,%2 \n" |
| 146 "jg 1b \n" | 144 "jg 1b \n" |
| 147 : "+r"(src_ptr), // %0 | 145 : "+r"(src_ptr), // %0 |
| 148 "+r"(dst_ptr), // %1 | 146 "+r"(dst_ptr), // %1 |
| 149 "+r"(dst_width) // %2 | 147 "+r"(dst_width) // %2 |
| 150 :: "memory", "cc", "xmm0", "xmm1", "xmm5" | 148 :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" |
| 151 ); | 149 ); |
| 152 } | 150 } |
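Scalar sketch of the new math: pmaddubsw with all-ones byte weights (built by the pcmpeqb / psrlw $0xf / packuswb prolog) forms src[2x] + src[2x+1] in each word, and pavgw against the zeroed xmm5 rounds that sum to (sum + 1) >> 1 (illustrative C only):

  #include <stdint.h>

  // Rounded average of each horizontal pair of source pixels.
  static void ScaleRowDown2Linear_Sketch(const uint8_t* src, uint8_t* dst,
                                         int dst_width) {
    for (int x = 0; x < dst_width; ++x) {
      dst[x] = (uint8_t)((src[2 * x] + src[2 * x + 1] + 1) >> 1);
    }
  }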
| 153 | 151 |
| 154 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 152 void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
| 155 uint8* dst_ptr, int dst_width) { | 153 uint8* dst_ptr, int dst_width) { |
| 156 asm volatile ( | 154 asm volatile ( |
| 157 "pcmpeqb %%xmm5,%%xmm5 \n" | 155 "pcmpeqb %%xmm4,%%xmm4 \n" |
| 158 "psrlw $0x8,%%xmm5 \n" | 156 "psrlw $0xf,%%xmm4 \n" |
| 157 "packuswb %%xmm4,%%xmm4 \n" |
| 158 "pxor %%xmm5,%%xmm5 \n" |
| 159 | 159 |
| 160 LABELALIGN | 160 LABELALIGN |
| 161 "1: \n" | 161 "1: \n" |
| 162 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 162 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 163 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 163 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
| 164 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 | 164 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 |
| 165 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 | 165 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 |
| 166 "lea " MEMLEA(0x20,0) ",%0 \n" | 166 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 167 "pavgb %%xmm2,%%xmm0 \n" | 167 "pmaddubsw %%xmm4,%%xmm0 \n" |
| 168 "pavgb %%xmm3,%%xmm1 \n" | 168 "pmaddubsw %%xmm4,%%xmm1 \n" |
| 169 "movdqa %%xmm0,%%xmm2 \n" | 169 "pmaddubsw %%xmm4,%%xmm2 \n" |
| 170 "psrlw $0x8,%%xmm0 \n" | 170 "pmaddubsw %%xmm4,%%xmm3 \n" |
| 171 "movdqa %%xmm1,%%xmm3 \n" | 171 "paddw %%xmm2,%%xmm0 \n" |
| 172 "psrlw $0x8,%%xmm1 \n" | 172 "paddw %%xmm3,%%xmm1 \n" |
| 173 "pand %%xmm5,%%xmm2 \n" | 173 "psrlw $0x1,%%xmm0 \n" |
| 174 "pand %%xmm5,%%xmm3 \n" | 174 "psrlw $0x1,%%xmm1 \n" |
| 175 "pavgw %%xmm2,%%xmm0 \n" | 175 "pavgw %%xmm5,%%xmm0 \n" |
| 176 "pavgw %%xmm3,%%xmm1 \n" | 176 "pavgw %%xmm5,%%xmm1 \n" |
| 177 "packuswb %%xmm1,%%xmm0 \n" | 177 "packuswb %%xmm1,%%xmm0 \n" |
| 178 "movdqu %%xmm0," MEMACCESS(1) " \n" | 178 "movdqu %%xmm0," MEMACCESS(1) " \n" |
| 179 "lea " MEMLEA(0x10,1) ",%1 \n" | 179 "lea " MEMLEA(0x10,1) ",%1 \n" |
| 180 "sub $0x10,%2 \n" | 180 "sub $0x10,%2 \n" |
| 181 "jg 1b \n" | 181 "jg 1b \n" |
| 182 : "+r"(src_ptr), // %0 | 182 : "+r"(src_ptr), // %0 |
| 183 "+r"(dst_ptr), // %1 | 183 "+r"(dst_ptr), // %1 |
| 184 "+r"(dst_width) // %2 | 184 "+r"(dst_width) // %2 |
| 185 : "r"((intptr_t)(src_stride)) // %3 | 185 : "r"((intptr_t)(src_stride)) // %3 |
| 186 : "memory", "cc", NACL_R14 | 186 : "memory", "cc", NACL_R14 |
| 187 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 187 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| (...skipping 909 matching lines...) |
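Picking up the 2x2 box filter from the hunk above (its tail is elided here): the pmaddubsw pair forms per-row horizontal sums, paddw adds the two rows, and psrlw $0x1 followed by pavgw with zero rounds the 4-pixel sum to (sum + 2) >> 2. A scalar sketch, illustrative only:

  #include <stddef.h>
  #include <stdint.h>

  // Rounded 2x2 box average over two source rows, one stride apart.
  static void ScaleRowDown2Box_Sketch(const uint8_t* src, ptrdiff_t src_stride,
                                      uint8_t* dst, int dst_width) {
    const uint8_t* s2 = src + src_stride;
    for (int x = 0; x < dst_width; ++x) {
      const int sum = src[2 * x] + src[2 * x + 1] + s2[2 * x] + s2[2 * x + 1];
      dst[x] = (uint8_t)((sum + 2) >> 2);
    }
  }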
| 1097 ); | 1097 ); |
| 1098 return num; | 1098 return num; |
| 1099 } | 1099 } |
| 1100 | 1100 |
| 1101 #endif // defined(__x86_64__) || defined(__i386__) | 1101 #endif // defined(__x86_64__) || defined(__i386__) |
| 1102 | 1102 |
| 1103 #ifdef __cplusplus | 1103 #ifdef __cplusplus |
| 1104 } // extern "C" | 1104 } // extern "C" |
| 1105 } // namespace libyuv | 1105 } // namespace libyuv |
| 1106 #endif | 1106 #endif |