OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 77 matching lines...)
88 // Arrange third value for pixels 0,1,2,3,4,5 | 88 // Arrange third value for pixels 0,1,2,3,4,5 |
89 static uvec8 kShufAb2 = | 89 static uvec8 kShufAb2 = |
90 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; | 90 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; |
91 | 91 |
92 // Scaling values for boxes of 3x2 and 2x2 | 92 // Scaling values for boxes of 3x2 and 2x2 |
93 static uvec16 kScaleAb2 = | 93 static uvec16 kScaleAb2 = |
94 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; | 94 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; |
95 | 95 |
96 // Reads 32 pixels, throws half away and writes 16 pixels. | 96 // Reads 32 pixels, throws half away and writes 16 pixels. |
97 __declspec(naked) | 97 __declspec(naked) |
98 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 98 void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
99 uint8* dst_ptr, int dst_width) { | 99 uint8* dst_ptr, int dst_width) { |
100 __asm { | 100 __asm { |
101 mov eax, [esp + 4] // src_ptr | 101 mov eax, [esp + 4] // src_ptr |
102 // src_stride ignored | 102 // src_stride ignored |
103 mov edx, [esp + 12] // dst_ptr | 103 mov edx, [esp + 12] // dst_ptr |
104 mov ecx, [esp + 16] // dst_width | 104 mov ecx, [esp + 16] // dst_width |
105 | 105 |
106 wloop: | 106 wloop: |
107 movdqu xmm0, [eax] | 107 movdqu xmm0, [eax] |
108 movdqu xmm1, [eax + 16] | 108 movdqu xmm1, [eax + 16] |
109 lea eax, [eax + 32] | 109 lea eax, [eax + 32] |
110 psrlw xmm0, 8 // isolate odd pixels. | 110 psrlw xmm0, 8 // isolate odd pixels. |
111 psrlw xmm1, 8 | 111 psrlw xmm1, 8 |
112 packuswb xmm0, xmm1 | 112 packuswb xmm0, xmm1 |
113 movdqu [edx], xmm0 | 113 movdqu [edx], xmm0 |
114 lea edx, [edx + 16] | 114 lea edx, [edx + 16] |
115 sub ecx, 16 | 115 sub ecx, 16 |
116 jg wloop | 116 jg wloop |
117 | 117 |
118 ret | 118 ret |
119 } | 119 } |
120 } | 120 } |
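For reference, the psrlw-by-8 plus packuswb sequence keeps the odd byte of each 16-bit pair, so this kernel is a point sample of every second source pixel rather than a filter. A minimal scalar sketch of the same result (an illustrative helper, not code from the patch):

#include <stdint.h>

// Each output pixel is the odd-numbered source pixel of its pair
// (point sampling, no filtering).
static void ScaleRowDown2_Ref(const uint8_t* src_ptr, uint8_t* dst_ptr,
                              int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    dst_ptr[i] = src_ptr[i * 2 + 1];
  }
}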
121 | 121 |
122 // Blends 32x1 rectangle to 16x1. | 122 // Blends 32x1 rectangle to 16x1. |
123 __declspec(naked) | 123 __declspec(naked) |
124 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 124 void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
125 uint8* dst_ptr, int dst_width) { | 125 uint8* dst_ptr, int dst_width) { |
126 __asm { | 126 __asm { |
127 mov eax, [esp + 4] // src_ptr | 127 mov eax, [esp + 4] // src_ptr |
128 // src_stride | 128 // src_stride |
129 mov edx, [esp + 12] // dst_ptr | 129 mov edx, [esp + 12] // dst_ptr |
130 mov ecx, [esp + 16] // dst_width | 130 mov ecx, [esp + 16] // dst_width |
131 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 131 |
132 psrlw xmm5, 8 | 132 pcmpeqb xmm4, xmm4 // constant 0x0101 |
| 133 psrlw xmm4, 15 |
| 134 packuswb xmm4, xmm4 |
| 135 pxor xmm5, xmm5 // constant 0 |
133 | 136 |
134 wloop: | 137 wloop: |
135 movdqu xmm0, [eax] | 138 movdqu xmm0, [eax] |
136 movdqu xmm1, [eax + 16] | 139 movdqu xmm1, [eax + 16] |
137 lea eax, [eax + 32] | 140 lea eax, [eax + 32] |
138 | 141 pmaddubsw xmm0, xmm4 // horizontal add |
139 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) | 142 pmaddubsw xmm1, xmm4 |
140 psrlw xmm0, 8 | 143 pavgw xmm0, xmm5 // (x + 1) / 2 |
141 movdqa xmm3, xmm1 | 144 pavgw xmm1, xmm5 |
142 psrlw xmm1, 8 | |
143 pand xmm2, xmm5 | |
144 pand xmm3, xmm5 | |
145 pavgw xmm0, xmm2 | |
146 pavgw xmm1, xmm3 | |
147 packuswb xmm0, xmm1 | 145 packuswb xmm0, xmm1 |
148 | |
149 movdqu [edx], xmm0 | 146 movdqu [edx], xmm0 |
150 lea edx, [edx + 16] | 147 lea edx, [edx + 16] |
151 sub ecx, 16 | 148 sub ecx, 16 |
152 jg wloop | 149 jg wloop |
153 | 150 |
154 ret | 151 ret |
155 } | 152 } |
156 } | 153 } |
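The new prologue builds a register of byte 1s without a memory load: pcmpeqb sets all bits, psrlw 15 leaves 0x0001 in each word, and packuswb narrows that to 0x01 per byte (hence the 0x0101 comment). pmaddubsw then sums each adjacent pair of unsigned bytes into a 16-bit word, and pavgw against zero is the rounded divide by 2. A rough intrinsics sketch of one 16-pixel iteration, using illustrative names that are not part of the patch:

#include <stdint.h>
#include <tmmintrin.h>  // SSSE3

static void ScaleRowDown2Linear_Step(const uint8_t* src, uint8_t* dst) {
  const __m128i ones = _mm_set1_epi8(1);   // same value the asm builds in xmm4
  const __m128i zero = _mm_setzero_si128();
  __m128i a = _mm_loadu_si128((const __m128i*)src);
  __m128i b = _mm_loadu_si128((const __m128i*)(src + 16));
  a = _mm_maddubs_epi16(a, ones);          // src[2i] + src[2i+1] as 16-bit
  b = _mm_maddubs_epi16(b, ones);
  a = _mm_avg_epu16(a, zero);              // (sum + 1) >> 1
  b = _mm_avg_epu16(b, zero);
  _mm_storeu_si128((__m128i*)dst, _mm_packus_epi16(a, b));
}

With weights of 1 the pair sums stay at or below 510, so the signed saturation in pmaddubsw never triggers.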
157 | 154 |
158 // Blends 32x2 rectangle to 16x1. | 155 // Blends 32x2 rectangle to 16x1. |
159 __declspec(naked) | 156 __declspec(naked) |
160 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 157 void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
161 uint8* dst_ptr, int dst_width) { | 158 uint8* dst_ptr, int dst_width) { |
162 __asm { | 159 __asm { |
163 push esi | 160 push esi |
164 mov eax, [esp + 4 + 4] // src_ptr | 161 mov eax, [esp + 4 + 4] // src_ptr |
165 mov esi, [esp + 4 + 8] // src_stride | 162 mov esi, [esp + 4 + 8] // src_stride |
166 mov edx, [esp + 4 + 12] // dst_ptr | 163 mov edx, [esp + 4 + 12] // dst_ptr |
167 mov ecx, [esp + 4 + 16] // dst_width | 164 mov ecx, [esp + 4 + 16] // dst_width |
168 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 165 |
169 psrlw xmm5, 8 | 166 pcmpeqb xmm4, xmm4 // constant 0x0101 |
| 167 psrlw xmm4, 15 |
| 168 packuswb xmm4, xmm4 |
| 169 pxor xmm5, xmm5 // constant 0 |
170 | 170 |
171 wloop: | 171 wloop: |
172 movdqu xmm0, [eax] | 172 movdqu xmm0, [eax] |
173 movdqu xmm1, [eax + 16] | 173 movdqu xmm1, [eax + 16] |
174 movdqu xmm2, [eax + esi] | 174 movdqu xmm2, [eax + esi] |
175 movdqu xmm3, [eax + esi + 16] | 175 movdqu xmm3, [eax + esi + 16] |
176 lea eax, [eax + 32] | 176 lea eax, [eax + 32] |
177 pavgb xmm0, xmm2 // average rows | 177 pmaddubsw xmm0, xmm4 // horizontal add |
178 pavgb xmm1, xmm3 | 178 pmaddubsw xmm1, xmm4 |
179 | 179 pmaddubsw xmm2, xmm4 |
180 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) | 180 pmaddubsw xmm3, xmm4 |
181 psrlw xmm0, 8 | 181 paddw xmm0, xmm2 // vertical add |
182 movdqa xmm3, xmm1 | 182 paddw xmm1, xmm3 |
183 psrlw xmm1, 8 | 183 psrlw xmm0, 1 |
184 pand xmm2, xmm5 | 184 psrlw xmm1, 1 |
185 pand xmm3, xmm5 | 185 pavgw xmm0, xmm5 // (x + 1) / 2 |
186 pavgw xmm0, xmm2 | 186 pavgw xmm1, xmm5 |
187 pavgw xmm1, xmm3 | |
188 packuswb xmm0, xmm1 | 187 packuswb xmm0, xmm1 |
189 | |
190 movdqu [edx], xmm0 | 188 movdqu [edx], xmm0 |
191 lea edx, [edx + 16] | 189 lea edx, [edx + 16] |
192 sub ecx, 16 | 190 sub ecx, 16 |
193 jg wloop | 191 jg wloop |
194 | 192 |
195 pop esi | 193 pop esi |
196 ret | 194 ret |
197 } | 195 } |
198 } | 196 } |
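Here each row is pair-summed with pmaddubsw, the two rows are added, and the four-pixel sum is reduced with psrlw 1 followed by pavgw against zero, which rounds like (sum + 2) >> 2 (the comment added before the AVX2 version below spells this out). A scalar sketch of the intended 2x2 box average, as an illustrative helper rather than libyuv's C fallback:

#include <stddef.h>
#include <stdint.h>

// Rounded average of each 2x2 block; (sum + 2) >> 2 matches
// psrlw 1 + pavgw 0, i.e. ((sum >> 1) + 1) >> 1.
static void ScaleRowDown2Box_Ref(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                 uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int i = 0; i < dst_width; ++i) {
    int sum = s[i * 2] + s[i * 2 + 1] + t[i * 2] + t[i * 2 + 1];
    dst_ptr[i] = (uint8_t)((sum + 2) >> 2);
  }
}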
199 | 197 |
(...skipping 38 matching lines...)
238 | 236 |
239 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b | 237 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b |
240 vpsrlw ymm4, ymm4, 15 | 238 vpsrlw ymm4, ymm4, 15 |
241 vpackuswb ymm4, ymm4, ymm4 | 239 vpackuswb ymm4, ymm4, ymm4 |
242 vpxor ymm5, ymm5, ymm5 // constant 0 | 240 vpxor ymm5, ymm5, ymm5 // constant 0 |
243 | 241 |
244 wloop: | 242 wloop: |
245 vmovdqu ymm0, [eax] | 243 vmovdqu ymm0, [eax] |
246 vmovdqu ymm1, [eax + 32] | 244 vmovdqu ymm1, [eax + 32] |
247 lea eax, [eax + 64] | 245 lea eax, [eax + 64] |
248 | 246 vpmaddubsw ymm0, ymm0, ymm4 // horizontal add |
249 vpmaddubsw ymm0, ymm0, ymm4 // average horizontally | |
250 vpmaddubsw ymm1, ymm1, ymm4 | 247 vpmaddubsw ymm1, ymm1, ymm4 |
251 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 | 248 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 |
252 vpavgw ymm1, ymm1, ymm5 | 249 vpavgw ymm1, ymm1, ymm5 |
253 vpackuswb ymm0, ymm0, ymm1 | 250 vpackuswb ymm0, ymm0, ymm1 |
254 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb | 251 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb |
255 | |
256 vmovdqu [edx], ymm0 | 252 vmovdqu [edx], ymm0 |
257 lea edx, [edx + 32] | 253 lea edx, [edx + 32] |
258 sub ecx, 32 | 254 sub ecx, 32 |
259 jg wloop | 255 jg wloop |
260 | 256 |
261 vzeroupper | 257 vzeroupper |
262 ret | 258 ret |
263 } | 259 } |
264 } | 260 } |
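The AVX2 variant is the same horizontal-add-then-round sequence, plus one lane fix-up: vpackuswb packs within each 128-bit lane, so vpermq with immediate 0xd8 reorders the 64-bit quarters back into pixel order (the "unmutate" comment). A rough intrinsics sketch of one 32-pixel iteration, with illustrative names not taken from the patch:

#include <stdint.h>
#include <immintrin.h>  // AVX2

static void ScaleRowDown2Linear_AVX2_Step(const uint8_t* src, uint8_t* dst) {
  const __m256i ones = _mm256_set1_epi8(1);
  const __m256i zero = _mm256_setzero_si256();
  __m256i a = _mm256_loadu_si256((const __m256i*)src);
  __m256i b = _mm256_loadu_si256((const __m256i*)(src + 32));
  a = _mm256_avg_epu16(_mm256_maddubs_epi16(a, ones), zero);  // rounded pair averages
  b = _mm256_avg_epu16(_mm256_maddubs_epi16(b, ones), zero);
  __m256i p = _mm256_packus_epi16(a, b);        // packs per 128-bit lane
  p = _mm256_permute4x64_epi64(p, 0xd8);        // restore pixel order
  _mm256_storeu_si256((__m256i*)dst, p);
}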
265 | 261 |
| 262 // For rounding, average = (sum + 2) / 4 |
| 263 // becomes average((sum >> 1), 0) |
266 // Blends 64x2 rectangle to 32x1. | 264 // Blends 64x2 rectangle to 32x1. |
267 __declspec(naked) | 265 __declspec(naked) |
268 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, | 266 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, |
269 uint8* dst_ptr, int dst_width) { | 267 uint8* dst_ptr, int dst_width) { |
270 __asm { | 268 __asm { |
271 push esi | 269 push esi |
272 mov eax, [esp + 4 + 4] // src_ptr | 270 mov eax, [esp + 4 + 4] // src_ptr |
273 mov esi, [esp + 4 + 8] // src_stride | 271 mov esi, [esp + 4 + 8] // src_stride |
274 mov edx, [esp + 4 + 12] // dst_ptr | 272 mov edx, [esp + 4 + 12] // dst_ptr |
275 mov ecx, [esp + 4 + 16] // dst_width | 273 mov ecx, [esp + 4 + 16] // dst_width |
276 | 274 |
277 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b | 275 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b |
278 vpsrlw ymm4, ymm4, 15 | 276 vpsrlw ymm4, ymm4, 15 |
279 vpackuswb ymm4, ymm4, ymm4 | 277 vpackuswb ymm4, ymm4, ymm4 |
280 vpxor ymm5, ymm5, ymm5 // constant 0 | 278 vpxor ymm5, ymm5, ymm5 // constant 0 |
281 | 279 |
282 wloop: | 280 wloop: |
283 vmovdqu ymm0, [eax] // average rows | 281 vmovdqu ymm0, [eax] |
284 vmovdqu ymm1, [eax + 32] | 282 vmovdqu ymm1, [eax + 32] |
285 vpavgb ymm0, ymm0, [eax + esi] | 283 vmovdqu ymm2, [eax + esi] |
286 vpavgb ymm1, ymm1, [eax + esi + 32] | 284 vmovdqu ymm3, [eax + esi + 32] |
287 lea eax, [eax + 64] | 285 lea eax, [eax + 64] |
288 | 286 vpmaddubsw ymm0, ymm0, ymm4 // horizontal add |
289 vpmaddubsw ymm0, ymm0, ymm4 // average horizontally | |
290 vpmaddubsw ymm1, ymm1, ymm4 | 287 vpmaddubsw ymm1, ymm1, ymm4 |
| 288 vpmaddubsw ymm2, ymm2, ymm4 |
| 289 vpmaddubsw ymm3, ymm3, ymm4 |
| 290 vpaddw ymm0, ymm0, ymm2 // vertical add |
| 291 vpaddw ymm1, ymm1, ymm3 |
| 292 vpsrlw ymm0, ymm0, 1 |
| 293 vpsrlw ymm1, ymm1, 1 |
291 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 | 294 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 |
292 vpavgw ymm1, ymm1, ymm5 | 295 vpavgw ymm1, ymm1, ymm5 |
293 vpackuswb ymm0, ymm0, ymm1 | 296 vpackuswb ymm0, ymm0, ymm1 |
294 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb | 297 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb |
295 | |
296 vmovdqu [edx], ymm0 | 298 vmovdqu [edx], ymm0 |
297 lea edx, [edx + 32] | 299 lea edx, [edx + 32] |
298 sub ecx, 32 | 300 sub ecx, 32 |
299 jg wloop | 301 jg wloop |
300 | 302 |
301 pop esi | 303 pop esi |
302 vzeroupper | 304 vzeroupper |
303 ret | 305 ret |
304 } | 306 } |
305 } | 307 } |
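The rounding comment added above ("(sum + 2) / 4 becomes average((sum >> 1), 0)") holds for every possible 2x2 sum of 8-bit pixels. A small standalone check, assuming nothing beyond standard C:

#include <assert.h>

int main(void) {
  for (int sum = 0; sum <= 1020; ++sum) {              // 4 pixels, each 0..255
    assert((sum + 2) / 4 == (((sum >> 1) + 1) >> 1));  // pavgw(sum >> 1, 0)
  }
  return 0;
}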
(...skipping 1038 matching lines...)
1344 idiv ecx | 1346 idiv ecx |
1345 ret | 1347 ret |
1346 } | 1348 } |
1347 } | 1349 } |
1348 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) | 1350 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) |
1349 | 1351 |
1350 #ifdef __cplusplus | 1352 #ifdef __cplusplus |
1351 } // extern "C" | 1353 } // extern "C" |
1352 } // namespace libyuv | 1354 } // namespace libyuv |
1353 #endif | 1355 #endif |