OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 80 matching lines...)
91 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; | 91 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; |
92 | 92 |
93 // Scaling values for boxes of 3x2 and 2x2 | 93 // Scaling values for boxes of 3x2 and 2x2 |
94 static uvec16 kScaleAb2 = | 94 static uvec16 kScaleAb2 = |
95 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; | 95 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; |
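
Reviewer note, for context: these constants are 16.16 fixed-point reciprocals, so a multiply-high by 65536 / n keeps the upper 16 bits of the product and approximates division by n, which is what a pmulhuw-style instruction does per lane. A minimal scalar sketch of the idea (the helper name and sample values are illustrative, not from this patch):

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar analogue of a multiply-high with one kScaleAb2 entry:
     * approximate sum / n by multiplying with a 16.16 reciprocal and
     * keeping the high 16 bits of the 32-bit product. */
    static uint16_t scale_sum(uint16_t sum, uint16_t reciprocal) {
      return (uint16_t)(((uint32_t)sum * reciprocal) >> 16);
    }

    int main(void) {
      /* 3-pixel box sum 100 + 120 + 140 = 360; exact average is 120. */
      /* Prints 119: the truncated reciprocal biases slightly low. */
      printf("%u\n", scale_sum(360, 65536 / 3));
      return 0;
    }
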
96 | 96 |
97 // GCC versions of row functions are verbatim conversions from Visual C. | 97 // GCC versions of row functions are verbatim conversions from Visual C. |
98 // Generated using gcc disassembly on Visual C object file: | 98 // Generated using gcc disassembly on Visual C object file: |
99 // objdump -D yuvscaler.obj >yuvscaler.txt | 99 // objdump -D yuvscaler.obj >yuvscaler.txt |
100 | 100 |
101 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 101 void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
102 uint8* dst_ptr, int dst_width) { | 102 uint8* dst_ptr, int dst_width) { |
103 asm volatile ( | 103 asm volatile ( |
104 LABELALIGN | 104 LABELALIGN |
105 "1: \n" | 105 "1: \n" |
106 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 106 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
107 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 107 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
108 "lea " MEMLEA(0x20,0) ",%0 \n" | 108 "lea " MEMLEA(0x20,0) ",%0 \n" |
109 "psrlw $0x8,%%xmm0 \n" | 109 "psrlw $0x8,%%xmm0 \n" |
110 "psrlw $0x8,%%xmm1 \n" | 110 "psrlw $0x8,%%xmm1 \n" |
111 "packuswb %%xmm1,%%xmm0 \n" | 111 "packuswb %%xmm1,%%xmm0 \n" |
112 "movdqu %%xmm0," MEMACCESS(1) " \n" | 112 "movdqu %%xmm0," MEMACCESS(1) " \n" |
113 "lea " MEMLEA(0x10,1) ",%1 \n" | 113 "lea " MEMLEA(0x10,1) ",%1 \n" |
114 "sub $0x10,%2 \n" | 114 "sub $0x10,%2 \n" |
115 "jg 1b \n" | 115 "jg 1b \n" |
116 : "+r"(src_ptr), // %0 | 116 : "+r"(src_ptr), // %0 |
117 "+r"(dst_ptr), // %1 | 117 "+r"(dst_ptr), // %1 |
118 "+r"(dst_width) // %2 | 118 "+r"(dst_width) // %2 |
119 :: "memory", "cc", "xmm0", "xmm1" | 119 :: "memory", "cc", "xmm0", "xmm1" |
120 ); | 120 ); |
121 } | 121 } |
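
Reviewer note: only the name changes here; the kernel body is the same. psrlw $0x8 keeps the high byte of each 16-bit pair and packuswb repacks, so each output pixel is the odd-indexed source pixel. A scalar sketch of what the loop computes (hypothetical name, not part of the patch; src_stride is unused, as in the kernel):

    #include <stdint.h>

    /* Scalar equivalent of the loop above: select every odd source byte. */
    static void ScaleRowDown2_C_sketch(const uint8_t* src_ptr,
                                       uint8_t* dst_ptr, int dst_width) {
      for (int x = 0; x < dst_width; ++x) {
        dst_ptr[x] = src_ptr[2 * x + 1];
      }
    }
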
122 | 122 |
123 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 123 void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
124 uint8* dst_ptr, int dst_width) { | 124 uint8* dst_ptr, int dst_width) { |
125 asm volatile ( | 125 asm volatile ( |
126 "pcmpeqb %%xmm5,%%xmm5 \n" | 126 "pcmpeqb %%xmm4,%%xmm4 \n" |
127 "psrlw $0x8,%%xmm5 \n" | 127 "psrlw $0xf,%%xmm4 \n" |
| 128 "packuswb %%xmm4,%%xmm4 \n" |
| 129 "pxor %%xmm5,%%xmm5 \n" |
128 | 130 |
129 LABELALIGN | 131 LABELALIGN |
130 "1: \n" | 132 "1: \n" |
131 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 133 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
132 "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" | 134 "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" |
133 "lea " MEMLEA(0x20,0) ",%0 \n" | 135 "lea " MEMLEA(0x20,0) ",%0 \n" |
134 "movdqa %%xmm0,%%xmm2 \n" | 136 "pmaddubsw %%xmm4,%%xmm0 \n" |
135 "psrlw $0x8,%%xmm0 \n" | 137 "pmaddubsw %%xmm4,%%xmm1 \n" |
136 "movdqa %%xmm1,%%xmm3 \n" | 138 "pavgw %%xmm5,%%xmm0 \n" |
137 "psrlw $0x8,%%xmm1 \n" | 139 "pavgw %%xmm5,%%xmm1 \n" |
138 "pand %%xmm5,%%xmm2 \n" | 140 "packuswb %%xmm1,%%xmm0 \n" |
139 "pand %%xmm5,%%xmm3 \n" | |
140 "pavgw %%xmm2,%%xmm0 \n" | |
141 "pavgw %%xmm3,%%xmm1 \n" | |
142 "packuswb %%xmm1,%%xmm0 \n" | |
143 "movdqu %%xmm0," MEMACCESS(1) " \n" | 141 "movdqu %%xmm0," MEMACCESS(1) " \n" |
144 "lea " MEMLEA(0x10,1) ",%1 \n" | 142 "lea " MEMLEA(0x10,1) ",%1 \n" |
145 "sub $0x10,%2 \n" | 143 "sub $0x10,%2 \n" |
146 "jg 1b \n" | 144 "jg 1b \n" |
147 : "+r"(src_ptr), // %0 | 145 : "+r"(src_ptr), // %0 |
148 "+r"(dst_ptr), // %1 | 146 "+r"(dst_ptr), // %1 |
149 "+r"(dst_width) // %2 | 147 "+r"(dst_width) // %2 |
150 :: "memory", "cc", "xmm0", "xmm1", "xmm5" | 148 :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" |
151 ); | 149 ); |
152 } | 150 } |
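
Reviewer note on the rewrite: xmm4 is built as sixteen 0x01 bytes (pcmpeqb, psrlw $0xf, packuswb), so pmaddubsw sums each horizontal byte pair into a 16-bit lane, and pavgw against the zeroed xmm5 rounds, giving (a + b + 1) >> 1. A scalar sketch (hypothetical name, not part of the patch):

    #include <stdint.h>

    /* Scalar equivalent: rounded average of each horizontal pixel pair. */
    static void ScaleRowDown2Linear_C_sketch(const uint8_t* src_ptr,
                                             uint8_t* dst_ptr,
                                             int dst_width) {
      for (int x = 0; x < dst_width; ++x) {
        dst_ptr[x] =
            (uint8_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
      }
    }
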
153 | 151 |
154 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 152 void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
155 uint8* dst_ptr, int dst_width) { | 153 uint8* dst_ptr, int dst_width) { |
156 asm volatile ( | 154 asm volatile ( |
157 "pcmpeqb %%xmm5,%%xmm5 \n" | 155 "pcmpeqb %%xmm4,%%xmm4 \n" |
158 "psrlw $0x8,%%xmm5 \n" | 156 "psrlw $0xf,%%xmm4 \n" |
| 157 "packuswb %%xmm4,%%xmm4 \n" |
| 158 "pxor %%xmm5,%%xmm5 \n" |
159 | 159 |
160 LABELALIGN | 160 LABELALIGN |
161 "1: \n" | 161 "1: \n" |
162 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 162 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
163 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 163 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
164 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 | 164 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 |
165 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 | 165 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 |
166 "lea " MEMLEA(0x20,0) ",%0 \n" | 166 "lea " MEMLEA(0x20,0) ",%0 \n" |
167 "pavgb %%xmm2,%%xmm0 \n" | 167 "pmaddubsw %%xmm4,%%xmm0 \n" |
168 "pavgb %%xmm3,%%xmm1 \n" | 168 "pmaddubsw %%xmm4,%%xmm1 \n" |
169 "movdqa %%xmm0,%%xmm2 \n" | 169 "pmaddubsw %%xmm4,%%xmm2 \n" |
170 "psrlw $0x8,%%xmm0 \n" | 170 "pmaddubsw %%xmm4,%%xmm3 \n" |
171 "movdqa %%xmm1,%%xmm3 \n" | 171 "paddw %%xmm2,%%xmm0 \n" |
172 "psrlw $0x8,%%xmm1 \n" | 172 "paddw %%xmm3,%%xmm1 \n" |
173 "pand %%xmm5,%%xmm2 \n" | 173 "psrlw $0x1,%%xmm0 \n" |
174 "pand %%xmm5,%%xmm3 \n" | 174 "psrlw $0x1,%%xmm1 \n" |
175 "pavgw %%xmm2,%%xmm0 \n" | 175 "pavgw %%xmm5,%%xmm0 \n" |
176 "pavgw %%xmm3,%%xmm1 \n" | 176 "pavgw %%xmm5,%%xmm1 \n" |
177 "packuswb %%xmm1,%%xmm0 \n" | 177 "packuswb %%xmm1,%%xmm0 \n" |
178 "movdqu %%xmm0," MEMACCESS(1) " \n" | 178 "movdqu %%xmm0," MEMACCESS(1) " \n" |
179 "lea " MEMLEA(0x10,1) ",%1 \n" | 179 "lea " MEMLEA(0x10,1) ",%1 \n" |
180 "sub $0x10,%2 \n" | 180 "sub $0x10,%2 \n" |
181 "jg 1b \n" | 181 "jg 1b \n" |
182 : "+r"(src_ptr), // %0 | 182 : "+r"(src_ptr), // %0 |
183 "+r"(dst_ptr), // %1 | 183 "+r"(dst_ptr), // %1 |
184 "+r"(dst_width) // %2 | 184 "+r"(dst_width) // %2 |
185 : "r"((intptr_t)(src_stride)) // %3 | 185 : "r"((intptr_t)(src_stride)) // %3 |
186 : "memory", "cc", NACL_R14 | 186 : "memory", "cc", NACL_R14 |
187 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 187 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
(...skipping 909 matching lines...)
1097 ); | 1097 ); |
1098 return num; | 1098 return num; |
1099 } | 1099 } |
1100 | 1100 |
1101 #endif // defined(__x86_64__) || defined(__i386__) | 1101 #endif // defined(__x86_64__) || defined(__i386__) |
1102 | 1102 |
1103 #ifdef __cplusplus | 1103 #ifdef __cplusplus |
1104 } // extern "C" | 1104 } // extern "C" |
1105 } // namespace libyuv | 1105 } // namespace libyuv |
1106 #endif | 1106 #endif |