OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "libyuv/rotate.h" | 11 #include "libyuv/rotate.h" |
12 | 12 |
13 #include "libyuv/cpu_id.h" | 13 #include "libyuv/cpu_id.h" |
14 #include "libyuv/convert.h" | 14 #include "libyuv/convert.h" |
15 #include "libyuv/planar_functions.h" | 15 #include "libyuv/planar_functions.h" |
| 16 #include "libyuv/rotate_row.h" |
16 #include "libyuv/row.h" | 17 #include "libyuv/row.h" |
17 | 18 |
18 #ifdef __cplusplus | 19 #ifdef __cplusplus |
19 namespace libyuv { | 20 namespace libyuv { |
20 extern "C" { | 21 extern "C" { |
21 #endif | 22 #endif |
22 | 23 |
// DECLARE_FUNCTION(name): (OLD column only; the NEW column is empty here.)
// Expands to adjacent string literals holding assembler directives that open
// a function body written in file-scope asm: switch to .text, align, then
// emit the label for `name`.
//  - Apple i386 branch: additionally emits ".private_extern _name" and uses
//    the underscore-prefixed label "_name:".
//  - MinGW / Cygwin branch: underscore-prefixed label "_name:".
//  - Otherwise: plain label "name:".
// Only defined when LIBYUV_DISABLE_X86 is not set and one of _M_IX86,
// __x86_64__ or __i386__ is defined.
// NOTE(review): in the #elif below, && binds tighter than ||, so the test
// parses as  __MINGW32__ || (__CYGWIN__ && __i386__)  - confirm that MinGW is
// really meant to match regardless of __i386__.
23 #if !defined(LIBYUV_DISABLE_X86) && \ | |
24 (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) | |
25 #if defined(__APPLE__) && defined(__i386__) | |
26 #define DECLARE_FUNCTION(name) \ | |
27 ".text \n" \ | |
28 ".private_extern _" #name " \n" \ | |
29 ".align 4,0x90 \n" \ | |
30 "_" #name ": \n" | |
31 #elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__) | |
32 #define DECLARE_FUNCTION(name) \ | |
33 ".text \n" \ | |
34 ".align 4,0x90 \n" \ | |
35 "_" #name ": \n" | |
36 #else | |
37 #define DECLARE_FUNCTION(name) \ | |
38 ".text \n" \ | |
39 ".align 4,0x90 \n" \ | |
40 #name ": \n" | |
41 #endif | |
42 #endif | |
43 | |
// NEON transpose kernels (OLD column only; the NEW column is empty here).
// Enabled when LIBYUV_DISABLE_NEON and __native_client__ are both undefined
// and one of __ARM_NEON__, LIBYUV_NEON or __aarch64__ is defined.
// Only the prototypes and their HAS_* feature macros appear here; the
// implementations are not part of this view.
//  - TransposeWx8_NEON:   single-plane kernel (one src, one dst).
//  - TransposeUVWx8_NEON: one src, two destinations (dst_a / dst_b), each
//    with its own stride.
44 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ | |
45 (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) | |
46 #define HAS_TRANSPOSE_WX8_NEON | |
47 void TransposeWx8_NEON(const uint8* src, int src_stride, | |
48 uint8* dst, int dst_stride, int width); | |
49 #define HAS_TRANSPOSE_UVWX8_NEON | |
50 void TransposeUVWx8_NEON(const uint8* src, int src_stride, | |
51 uint8* dst_a, int dst_stride_a, | |
52 uint8* dst_b, int dst_stride_b, | |
53 int width); | |
54 #endif | |
55 | |
// MIPS DSPR2 transpose kernels (OLD column only; the NEW column is empty).
// Enabled when LIBYUV_DISABLE_MIPS and __native_client__ are both undefined,
// __mips__ and __mips_dsp are defined, and __mips_dsp_rev >= 2.
// Only prototypes and HAS_* feature macros appear here; the implementations
// are not part of this view.
// TransposeWx8_FAST_MIPS_DSPR2 has no HAS_* macro of its own; TransposePlane
// selects it under HAS_TRANSPOSE_WX8_MIPS_DSPR2 when width, src and
// src_stride are all 4-aligned.
// NOTE(review): HAS_TRANSPOSE_UVWx8_MIPS_DSPR2 is spelled with a lowercase
// 'x', unlike HAS_TRANSPOSE_UVWX8_NEON / HAS_TRANSPOSE_UVWX8_SSE2 - confirm
// the code that tests this macro uses the same spelling.
56 #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ | |
57 defined(__mips__) && \ | |
58 defined(__mips_dsp) && (__mips_dsp_rev >= 2) | |
59 #define HAS_TRANSPOSE_WX8_MIPS_DSPR2 | |
60 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, | |
61 uint8* dst, int dst_stride, int width); | |
62 | |
63 void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, | |
64 uint8* dst, int dst_stride, int width); | |
65 #define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2 | |
66 void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, | |
67 uint8* dst_a, int dst_stride_a, | |
68 uint8* dst_b, int dst_stride_b, | |
69 int width); | |
70 #endif // defined(__mips__) | |
71 | |
72 #if !defined(LIBYUV_DISABLE_X86) && \ | |
73 defined(_M_IX86) && defined(_MSC_VER) | |
74 #define HAS_TRANSPOSE_WX8_SSSE3 | |
// TransposeWx8_SSSE3, 32-bit MSVC inline-asm version (OLD column only).
// Declared naked, so the body does its own register save/restore and reads
// its arguments straight from the stack: three registers are pushed
// (12 bytes), hence the [esp + 12 + N] argument offsets.
//   eax = src, edi = src_stride, edx = dst, esi = dst_stride, ecx = width.
// Each pass of convertloop:
//  - loads 8 bytes (movq) from each of 8 consecutive source rows,
//  - interleaves them in three rounds (punpcklbw, punpcklwd, punpckldq),
//    using palignr by 8 to bring the upper half of a register down,
//  - stores 8 rows of 8 bytes to dst, stepping by dst_stride,
//  - moves src forward 8 bytes (ebp holds src + 8 and is copied back to eax)
//    and subtracts 8 from width; jg repeats while the result is positive.
// A width that is not a multiple of 8 therefore still runs a full 8-column
// pass for the last partial group; the OLD TransposePlane only selects this
// kernel when width is 8-aligned.
75 __declspec(naked) __declspec(align(16)) | |
76 static void TransposeWx8_SSSE3(const uint8* src, int src_stride, | |
77 uint8* dst, int dst_stride, int width) { | |
78 __asm { | |
79 push edi | |
80 push esi | |
81 push ebp | |
82 mov eax, [esp + 12 + 4] // src | |
83 mov edi, [esp + 12 + 8] // src_stride | |
84 mov edx, [esp + 12 + 12] // dst | |
85 mov esi, [esp + 12 + 16] // dst_stride | |
86 mov ecx, [esp + 12 + 20] // width | |
87 | |
88 // Read in the data from the source pointer. | |
89 // First round of bit swap. | |
90 align 4 | |
91 convertloop: | |
92 movq xmm0, qword ptr [eax] | |
93 lea ebp, [eax + 8] | |
94 movq xmm1, qword ptr [eax + edi] | |
95 lea eax, [eax + 2 * edi] | |
96 punpcklbw xmm0, xmm1 | |
97 movq xmm2, qword ptr [eax] | |
98 movdqa xmm1, xmm0 | |
99 palignr xmm1, xmm1, 8 | |
100 movq xmm3, qword ptr [eax + edi] | |
101 lea eax, [eax + 2 * edi] | |
102 punpcklbw xmm2, xmm3 | |
103 movdqa xmm3, xmm2 | |
104 movq xmm4, qword ptr [eax] | |
105 palignr xmm3, xmm3, 8 | |
106 movq xmm5, qword ptr [eax + edi] | |
107 punpcklbw xmm4, xmm5 | |
108 lea eax, [eax + 2 * edi] | |
109 movdqa xmm5, xmm4 | |
110 movq xmm6, qword ptr [eax] | |
111 palignr xmm5, xmm5, 8 | |
112 movq xmm7, qword ptr [eax + edi] | |
113 punpcklbw xmm6, xmm7 | |
114 mov eax, ebp | |
115 movdqa xmm7, xmm6 | |
116 palignr xmm7, xmm7, 8 | |
117 // Second round of bit swap. | |
118 punpcklwd xmm0, xmm2 | |
119 punpcklwd xmm1, xmm3 | |
120 movdqa xmm2, xmm0 | |
121 movdqa xmm3, xmm1 | |
122 palignr xmm2, xmm2, 8 | |
123 palignr xmm3, xmm3, 8 | |
124 punpcklwd xmm4, xmm6 | |
125 punpcklwd xmm5, xmm7 | |
126 movdqa xmm6, xmm4 | |
127 movdqa xmm7, xmm5 | |
128 palignr xmm6, xmm6, 8 | |
129 palignr xmm7, xmm7, 8 | |
130 // Third round of bit swap. | |
131 // Write to the destination pointer. | |
132 punpckldq xmm0, xmm4 | |
133 movq qword ptr [edx], xmm0 | |
134 movdqa xmm4, xmm0 | |
135 palignr xmm4, xmm4, 8 | |
136 movq qword ptr [edx + esi], xmm4 | |
137 lea edx, [edx + 2 * esi] | |
138 punpckldq xmm2, xmm6 | |
139 movdqa xmm6, xmm2 | |
140 palignr xmm6, xmm6, 8 | |
141 movq qword ptr [edx], xmm2 | |
142 punpckldq xmm1, xmm5 | |
143 movq qword ptr [edx + esi], xmm6 | |
144 lea edx, [edx + 2 * esi] | |
145 movdqa xmm5, xmm1 | |
146 movq qword ptr [edx], xmm1 | |
147 palignr xmm5, xmm5, 8 | |
148 punpckldq xmm3, xmm7 | |
149 movq qword ptr [edx + esi], xmm5 | |
150 lea edx, [edx + 2 * esi] | |
151 movq qword ptr [edx], xmm3 | |
152 movdqa xmm7, xmm3 | |
153 palignr xmm7, xmm7, 8 | |
154 sub ecx, 8 | |
155 movq qword ptr [edx + esi], xmm7 | |
156 lea edx, [edx + 2 * esi] | |
157 jg convertloop | |
158 | |
159 pop ebp | |
160 pop esi | |
161 pop edi | |
162 ret | |
163 } | |
164 } | |
165 | |
166 #define HAS_TRANSPOSE_UVWX8_SSE2 | |
// TransposeUVWx8_SSE2, 32-bit MSVC inline-asm version (OLD column only).
// Naked function: four registers are pushed (16 bytes), hence the
// [esp + 16 + N] argument offsets.
//   eax = src, edi = src_stride, edx = dst_a, esi = dst_stride_a,
//   ebx = dst_b, ebp = dst_stride_b, ecx = w (loaded last).
// Stack scratch: the incoming esp is kept in ecx, 20 bytes are reserved, esp
// is rounded down to a multiple of 16, and the saved esp is stored at
// [esp + 16]. The 16 bytes at [esp] are the spill slot used to back up xmm5
// and later xmm6, because only xmm0-xmm7 are used here.
// Each pass of convertloop:
//  - loads 16 bytes (movdqu) from each of 8 consecutive source rows,
//  - interleaves them in three rounds (byte, word, dword unpack low/high),
//  - for every result register stores the low 8 bytes (movlpd) to dst_a and
//    the high 8 bytes (movhpd) to dst_b; 8 rows are written to each
//    destination, stepping by that destination's stride,
//  - rewinds src by 8 rows and advances it 16 bytes (edi is negated around
//    the lea that computes eax + 8 * edi + 16, then negated back),
//  - subtracts 8 from w; jg repeats while the result is positive.
// The epilogue reloads the original esp from [esp + 16] before popping.
167 __declspec(naked) __declspec(align(16)) | |
168 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, | |
169 uint8* dst_a, int dst_stride_a, | |
170 uint8* dst_b, int dst_stride_b, | |
171 int w) { | |
172 __asm { | |
173 push ebx | |
174 push esi | |
175 push edi | |
176 push ebp | |
177 mov eax, [esp + 16 + 4] // src | |
178 mov edi, [esp + 16 + 8] // src_stride | |
179 mov edx, [esp + 16 + 12] // dst_a | |
180 mov esi, [esp + 16 + 16] // dst_stride_a | |
181 mov ebx, [esp + 16 + 20] // dst_b | |
182 mov ebp, [esp + 16 + 24] // dst_stride_b | |
183 mov ecx, esp | |
184 sub esp, 4 + 16 | |
185 and esp, ~15 | |
186 mov [esp + 16], ecx | |
187 mov ecx, [ecx + 16 + 28] // w | |
188 | |
189 align 4 | |
190 convertloop: | |
191 // Read in the data from the source pointer. | |
192 // First round of bit swap. | |
193 movdqu xmm0, [eax] | |
194 movdqu xmm1, [eax + edi] | |
195 lea eax, [eax + 2 * edi] | |
196 movdqa xmm7, xmm0 // use xmm7 as temp register. | |
197 punpcklbw xmm0, xmm1 | |
198 punpckhbw xmm7, xmm1 | |
199 movdqa xmm1, xmm7 | |
200 movdqu xmm2, [eax] | |
201 movdqu xmm3, [eax + edi] | |
202 lea eax, [eax + 2 * edi] | |
203 movdqa xmm7, xmm2 | |
204 punpcklbw xmm2, xmm3 | |
205 punpckhbw xmm7, xmm3 | |
206 movdqa xmm3, xmm7 | |
207 movdqu xmm4, [eax] | |
208 movdqu xmm5, [eax + edi] | |
209 lea eax, [eax + 2 * edi] | |
210 movdqa xmm7, xmm4 | |
211 punpcklbw xmm4, xmm5 | |
212 punpckhbw xmm7, xmm5 | |
213 movdqa xmm5, xmm7 | |
214 movdqu xmm6, [eax] | |
215 movdqu xmm7, [eax + edi] | |
216 lea eax, [eax + 2 * edi] | |
217 movdqu [esp], xmm5 // backup xmm5 | |
218 neg edi | |
219 movdqa xmm5, xmm6 // use xmm5 as temp register. | |
220 punpcklbw xmm6, xmm7 | |
221 punpckhbw xmm5, xmm7 | |
222 movdqa xmm7, xmm5 | |
223 lea eax, [eax + 8 * edi + 16] | |
224 neg edi | |
225 // Second round of bit swap. | |
226 movdqa xmm5, xmm0 | |
227 punpcklwd xmm0, xmm2 | |
228 punpckhwd xmm5, xmm2 | |
229 movdqa xmm2, xmm5 | |
230 movdqa xmm5, xmm1 | |
231 punpcklwd xmm1, xmm3 | |
232 punpckhwd xmm5, xmm3 | |
233 movdqa xmm3, xmm5 | |
234 movdqa xmm5, xmm4 | |
235 punpcklwd xmm4, xmm6 | |
236 punpckhwd xmm5, xmm6 | |
237 movdqa xmm6, xmm5 | |
238 movdqu xmm5, [esp] // restore xmm5 | |
239 movdqu [esp], xmm6 // backup xmm6 | |
240 movdqa xmm6, xmm5 // use xmm6 as temp register. | |
241 punpcklwd xmm5, xmm7 | |
242 punpckhwd xmm6, xmm7 | |
243 movdqa xmm7, xmm6 | |
244 // Third round of bit swap. | |
245 // Write to the destination pointer. | |
246 movdqa xmm6, xmm0 | |
247 punpckldq xmm0, xmm4 | |
248 punpckhdq xmm6, xmm4 | |
249 movdqa xmm4, xmm6 | |
250 movdqu xmm6, [esp] // restore xmm6 | |
251 movlpd qword ptr [edx], xmm0 | |
252 movhpd qword ptr [ebx], xmm0 | |
253 movlpd qword ptr [edx + esi], xmm4 | |
254 lea edx, [edx + 2 * esi] | |
255 movhpd qword ptr [ebx + ebp], xmm4 | |
256 lea ebx, [ebx + 2 * ebp] | |
257 movdqa xmm0, xmm2 // use xmm0 as the temp register. | |
258 punpckldq xmm2, xmm6 | |
259 movlpd qword ptr [edx], xmm2 | |
260 movhpd qword ptr [ebx], xmm2 | |
261 punpckhdq xmm0, xmm6 | |
262 movlpd qword ptr [edx + esi], xmm0 | |
263 lea edx, [edx + 2 * esi] | |
264 movhpd qword ptr [ebx + ebp], xmm0 | |
265 lea ebx, [ebx + 2 * ebp] | |
266 movdqa xmm0, xmm1 // use xmm0 as the temp register. | |
267 punpckldq xmm1, xmm5 | |
268 movlpd qword ptr [edx], xmm1 | |
269 movhpd qword ptr [ebx], xmm1 | |
270 punpckhdq xmm0, xmm5 | |
271 movlpd qword ptr [edx + esi], xmm0 | |
272 lea edx, [edx + 2 * esi] | |
273 movhpd qword ptr [ebx + ebp], xmm0 | |
274 lea ebx, [ebx + 2 * ebp] | |
275 movdqa xmm0, xmm3 // use xmm0 as the temp register. | |
276 punpckldq xmm3, xmm7 | |
277 movlpd qword ptr [edx], xmm3 | |
278 movhpd qword ptr [ebx], xmm3 | |
279 punpckhdq xmm0, xmm7 | |
280 sub ecx, 8 | |
281 movlpd qword ptr [edx + esi], xmm0 | |
282 lea edx, [edx + 2 * esi] | |
283 movhpd qword ptr [ebx + ebp], xmm0 | |
284 lea ebx, [ebx + 2 * ebp] | |
285 jg convertloop | |
286 | |
287 mov esp, [esp + 16] | |
288 pop ebp | |
289 pop edi | |
290 pop esi | |
291 pop ebx | |
292 ret | |
293 } | |
294 } | |
295 #endif | |
296 #if !defined(LIBYUV_DISABLE_X86) && \ | |
297 (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) | |
298 #define HAS_TRANSPOSE_WX8_SSSE3 | |
// TransposeWx8_SSSE3, GCC-style extended-asm version (OLD column only).
// Operands: %0 = src, %1 = dst, %2 = width (all read-write);
//           %3 = src_stride, %4 = dst_stride (inputs, widened to intptr_t).
// Each pass of the loop at label 1:
//  - loads 8 bytes (movq) from each of 8 consecutive source rows,
//  - interleaves them in three rounds (punpcklbw, punpcklwd, punpckldq),
//    using palignr $8 to bring the upper half of a register down,
//  - stores 8 rows of 8 bytes to dst, stepping by dst_stride,
//  - rewinds src by 8 rows and advances it 8 bytes via
//    "lea 0x8(%0,%3,8),%0" executed while %3 is negated,
//  - subtracts 8 from width; jg repeats while the result is positive.
// Clobbers declared: memory, cc, xmm0-xmm7.
// NOTE(review): %3 is declared as an input-only operand but is negated and
// then negated back inside the loop. Its value is restored before the asm
// ends, but GCC's extended-asm rules say input-only operands should not be
// modified - confirm this is acceptable for the supported compilers.
299 static void TransposeWx8_SSSE3(const uint8* src, int src_stride, | |
300 uint8* dst, int dst_stride, int width) { | |
301 asm volatile ( | |
302 // Read in the data from the source pointer. | |
303 // First round of bit swap. | |
304 ".p2align 2 \n" | |
305 "1: \n" | |
306 "movq (%0),%%xmm0 \n" | |
307 "movq (%0,%3),%%xmm1 \n" | |
308 "lea (%0,%3,2),%0 \n" | |
309 "punpcklbw %%xmm1,%%xmm0 \n" | |
310 "movq (%0),%%xmm2 \n" | |
311 "movdqa %%xmm0,%%xmm1 \n" | |
312 "palignr $0x8,%%xmm1,%%xmm1 \n" | |
313 "movq (%0,%3),%%xmm3 \n" | |
314 "lea (%0,%3,2),%0 \n" | |
315 "punpcklbw %%xmm3,%%xmm2 \n" | |
316 "movdqa %%xmm2,%%xmm3 \n" | |
317 "movq (%0),%%xmm4 \n" | |
318 "palignr $0x8,%%xmm3,%%xmm3 \n" | |
319 "movq (%0,%3),%%xmm5 \n" | |
320 "lea (%0,%3,2),%0 \n" | |
321 "punpcklbw %%xmm5,%%xmm4 \n" | |
322 "movdqa %%xmm4,%%xmm5 \n" | |
323 "movq (%0),%%xmm6 \n" | |
324 "palignr $0x8,%%xmm5,%%xmm5 \n" | |
325 "movq (%0,%3),%%xmm7 \n" | |
326 "lea (%0,%3,2),%0 \n" | |
327 "punpcklbw %%xmm7,%%xmm6 \n" | |
328 "neg %3 \n" | |
329 "movdqa %%xmm6,%%xmm7 \n" | |
330 "lea 0x8(%0,%3,8),%0 \n" | |
331 "palignr $0x8,%%xmm7,%%xmm7 \n" | |
332 "neg %3 \n" | |
333 // Second round of bit swap. | |
334 "punpcklwd %%xmm2,%%xmm0 \n" | |
335 "punpcklwd %%xmm3,%%xmm1 \n" | |
336 "movdqa %%xmm0,%%xmm2 \n" | |
337 "movdqa %%xmm1,%%xmm3 \n" | |
338 "palignr $0x8,%%xmm2,%%xmm2 \n" | |
339 "palignr $0x8,%%xmm3,%%xmm3 \n" | |
340 "punpcklwd %%xmm6,%%xmm4 \n" | |
341 "punpcklwd %%xmm7,%%xmm5 \n" | |
342 "movdqa %%xmm4,%%xmm6 \n" | |
343 "movdqa %%xmm5,%%xmm7 \n" | |
344 "palignr $0x8,%%xmm6,%%xmm6 \n" | |
345 "palignr $0x8,%%xmm7,%%xmm7 \n" | |
346 // Third round of bit swap. | |
347 // Write to the destination pointer. | |
348 "punpckldq %%xmm4,%%xmm0 \n" | |
349 "movq %%xmm0,(%1) \n" | |
350 "movdqa %%xmm0,%%xmm4 \n" | |
351 "palignr $0x8,%%xmm4,%%xmm4 \n" | |
352 "movq %%xmm4,(%1,%4) \n" | |
353 "lea (%1,%4,2),%1 \n" | |
354 "punpckldq %%xmm6,%%xmm2 \n" | |
355 "movdqa %%xmm2,%%xmm6 \n" | |
356 "movq %%xmm2,(%1) \n" | |
357 "palignr $0x8,%%xmm6,%%xmm6 \n" | |
358 "punpckldq %%xmm5,%%xmm1 \n" | |
359 "movq %%xmm6,(%1,%4) \n" | |
360 "lea (%1,%4,2),%1 \n" | |
361 "movdqa %%xmm1,%%xmm5 \n" | |
362 "movq %%xmm1,(%1) \n" | |
363 "palignr $0x8,%%xmm5,%%xmm5 \n" | |
364 "movq %%xmm5,(%1,%4) \n" | |
365 "lea (%1,%4,2),%1 \n" | |
366 "punpckldq %%xmm7,%%xmm3 \n" | |
367 "movq %%xmm3,(%1) \n" | |
368 "movdqa %%xmm3,%%xmm7 \n" | |
369 "palignr $0x8,%%xmm7,%%xmm7 \n" | |
370 "sub $0x8,%2 \n" | |
371 "movq %%xmm7,(%1,%4) \n" | |
372 "lea (%1,%4,2),%1 \n" | |
373 "jg 1b \n" | |
374 : "+r"(src), // %0 | |
375 "+r"(dst), // %1 | |
376 "+r"(width) // %2 | |
377 : "r"((intptr_t)(src_stride)), // %3 | |
378 "r"((intptr_t)(dst_stride)) // %4 | |
379 : "memory", "cc", | |
380 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
381 ); | |
382 } | |
383 | |
384 #if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) | |
385 #define HAS_TRANSPOSE_UVWX8_SSE2 | |
// TransposeUVWx8_SSE2, 32-bit GCC version (OLD column only).
// The C prototype is followed by a file-scope asm statement whose body is
// the function itself; DECLARE_FUNCTION emits the section, alignment and
// label for the symbol.
// Prologue: four registers are pushed (16 bytes) and the return address
// adds 4 more, so the arguments sit at 0x14(%esp) .. 0x2c(original %esp):
//   eax = src, edi = src_stride, edx = dst_a, esi = dst_stride_a,
//   ebx = dst_b, ebp = dst_stride_b, ecx = w (loaded last via 0x2c(%ecx)).
// Stack scratch: the incoming esp is kept in ecx, 0x14 bytes are reserved,
// esp is masked to a multiple of 16 and the saved esp is stored at
// 0x10(%esp); the 16 bytes at (%esp) are the spill slot for xmm5 / xmm6.
// Each pass of the loop at label 1:
//  - loads 16 bytes (movdqu) from each of 8 consecutive source rows,
//  - interleaves them in three rounds (byte, word, dword unpack low/high),
//  - stores the low 8 bytes of each result (movlpd) to dst_a and the high
//    8 bytes (movhpd) to dst_b, 8 rows per destination,
//  - rewinds src by 8 rows and advances it 16 bytes
//    ("lea 0x10(%eax,%edi,8),%eax" with edi negated),
//  - subtracts 8 from w; jg repeats while the result is positive.
// Epilogue: restores esp and the saved registers. Under __native_client__
// the return is done by popping the return address into ecx, masking it
// with 0xffffffe0 and jumping through it; otherwise a plain ret is used.
386 void TransposeUVWx8_SSE2(const uint8* src, int src_stride, | |
387 uint8* dst_a, int dst_stride_a, | |
388 uint8* dst_b, int dst_stride_b, | |
389 int w); | |
390 asm ( | |
391 DECLARE_FUNCTION(TransposeUVWx8_SSE2) | |
392 "push %ebx \n" | |
393 "push %esi \n" | |
394 "push %edi \n" | |
395 "push %ebp \n" | |
396 "mov 0x14(%esp),%eax \n" | |
397 "mov 0x18(%esp),%edi \n" | |
398 "mov 0x1c(%esp),%edx \n" | |
399 "mov 0x20(%esp),%esi \n" | |
400 "mov 0x24(%esp),%ebx \n" | |
401 "mov 0x28(%esp),%ebp \n" | |
402 "mov %esp,%ecx \n" | |
403 "sub $0x14,%esp \n" | |
404 "and $0xfffffff0,%esp \n" | |
405 "mov %ecx,0x10(%esp) \n" | |
406 "mov 0x2c(%ecx),%ecx \n" | |
407 | |
408 "1: \n" | |
409 "movdqu (%eax),%xmm0 \n" | |
410 "movdqu (%eax,%edi,1),%xmm1 \n" | |
411 "lea (%eax,%edi,2),%eax \n" | |
412 "movdqa %xmm0,%xmm7 \n" | |
413 "punpcklbw %xmm1,%xmm0 \n" | |
414 "punpckhbw %xmm1,%xmm7 \n" | |
415 "movdqa %xmm7,%xmm1 \n" | |
416 "movdqu (%eax),%xmm2 \n" | |
417 "movdqu (%eax,%edi,1),%xmm3 \n" | |
418 "lea (%eax,%edi,2),%eax \n" | |
419 "movdqa %xmm2,%xmm7 \n" | |
420 "punpcklbw %xmm3,%xmm2 \n" | |
421 "punpckhbw %xmm3,%xmm7 \n" | |
422 "movdqa %xmm7,%xmm3 \n" | |
423 "movdqu (%eax),%xmm4 \n" | |
424 "movdqu (%eax,%edi,1),%xmm5 \n" | |
425 "lea (%eax,%edi,2),%eax \n" | |
426 "movdqa %xmm4,%xmm7 \n" | |
427 "punpcklbw %xmm5,%xmm4 \n" | |
428 "punpckhbw %xmm5,%xmm7 \n" | |
429 "movdqa %xmm7,%xmm5 \n" | |
430 "movdqu (%eax),%xmm6 \n" | |
431 "movdqu (%eax,%edi,1),%xmm7 \n" | |
432 "lea (%eax,%edi,2),%eax \n" | |
433 "movdqu %xmm5,(%esp) \n" | |
434 "neg %edi \n" | |
435 "movdqa %xmm6,%xmm5 \n" | |
436 "punpcklbw %xmm7,%xmm6 \n" | |
437 "punpckhbw %xmm7,%xmm5 \n" | |
438 "movdqa %xmm5,%xmm7 \n" | |
439 "lea 0x10(%eax,%edi,8),%eax \n" | |
440 "neg %edi \n" | |
441 "movdqa %xmm0,%xmm5 \n" | |
442 "punpcklwd %xmm2,%xmm0 \n" | |
443 "punpckhwd %xmm2,%xmm5 \n" | |
444 "movdqa %xmm5,%xmm2 \n" | |
445 "movdqa %xmm1,%xmm5 \n" | |
446 "punpcklwd %xmm3,%xmm1 \n" | |
447 "punpckhwd %xmm3,%xmm5 \n" | |
448 "movdqa %xmm5,%xmm3 \n" | |
449 "movdqa %xmm4,%xmm5 \n" | |
450 "punpcklwd %xmm6,%xmm4 \n" | |
451 "punpckhwd %xmm6,%xmm5 \n" | |
452 "movdqa %xmm5,%xmm6 \n" | |
453 "movdqu (%esp),%xmm5 \n" | |
454 "movdqu %xmm6,(%esp) \n" | |
455 "movdqa %xmm5,%xmm6 \n" | |
456 "punpcklwd %xmm7,%xmm5 \n" | |
457 "punpckhwd %xmm7,%xmm6 \n" | |
458 "movdqa %xmm6,%xmm7 \n" | |
459 "movdqa %xmm0,%xmm6 \n" | |
460 "punpckldq %xmm4,%xmm0 \n" | |
461 "punpckhdq %xmm4,%xmm6 \n" | |
462 "movdqa %xmm6,%xmm4 \n" | |
463 "movdqu (%esp),%xmm6 \n" | |
464 "movlpd %xmm0,(%edx) \n" | |
465 "movhpd %xmm0,(%ebx) \n" | |
466 "movlpd %xmm4,(%edx,%esi,1) \n" | |
467 "lea (%edx,%esi,2),%edx \n" | |
468 "movhpd %xmm4,(%ebx,%ebp,1) \n" | |
469 "lea (%ebx,%ebp,2),%ebx \n" | |
470 "movdqa %xmm2,%xmm0 \n" | |
471 "punpckldq %xmm6,%xmm2 \n" | |
472 "movlpd %xmm2,(%edx) \n" | |
473 "movhpd %xmm2,(%ebx) \n" | |
474 "punpckhdq %xmm6,%xmm0 \n" | |
475 "movlpd %xmm0,(%edx,%esi,1) \n" | |
476 "lea (%edx,%esi,2),%edx \n" | |
477 "movhpd %xmm0,(%ebx,%ebp,1) \n" | |
478 "lea (%ebx,%ebp,2),%ebx \n" | |
479 "movdqa %xmm1,%xmm0 \n" | |
480 "punpckldq %xmm5,%xmm1 \n" | |
481 "movlpd %xmm1,(%edx) \n" | |
482 "movhpd %xmm1,(%ebx) \n" | |
483 "punpckhdq %xmm5,%xmm0 \n" | |
484 "movlpd %xmm0,(%edx,%esi,1) \n" | |
485 "lea (%edx,%esi,2),%edx \n" | |
486 "movhpd %xmm0,(%ebx,%ebp,1) \n" | |
487 "lea (%ebx,%ebp,2),%ebx \n" | |
488 "movdqa %xmm3,%xmm0 \n" | |
489 "punpckldq %xmm7,%xmm3 \n" | |
490 "movlpd %xmm3,(%edx) \n" | |
491 "movhpd %xmm3,(%ebx) \n" | |
492 "punpckhdq %xmm7,%xmm0 \n" | |
493 "sub $0x8,%ecx \n" | |
494 "movlpd %xmm0,(%edx,%esi,1) \n" | |
495 "lea (%edx,%esi,2),%edx \n" | |
496 "movhpd %xmm0,(%ebx,%ebp,1) \n" | |
497 "lea (%ebx,%ebp,2),%ebx \n" | |
498 "jg 1b \n" | |
499 "mov 0x10(%esp),%esp \n" | |
500 "pop %ebp \n" | |
501 "pop %edi \n" | |
502 "pop %esi \n" | |
503 "pop %ebx \n" | |
504 #if defined(__native_client__) | |
505 "pop %ecx \n" | |
506 "and $0xffffffe0,%ecx \n" | |
507 "jmp *%ecx \n" | |
508 #else | |
509 "ret \n" | |
510 #endif | |
511 ); | |
512 #endif | |
513 #if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ | |
514 defined(__x86_64__) | |
515 // 64 bit version has enough registers to do 16x8 to 8x16 at a time. | |
516 #define HAS_TRANSPOSE_WX8_FAST_SSSE3 | |
// TransposeWx8_FAST_SSSE3, x86_64 GCC extended-asm version (OLD column only).
// Uses xmm0-xmm15 so that one pass handles a 16-column by 8-row tile.
// Operands: %0 = src, %1 = dst, %2 = width (all read-write);
//           %3 = src_stride, %4 = dst_stride (inputs, widened to intptr_t).
// Each pass of the loop at label 1:
//  - loads 16 bytes (movdqu) from each of 8 consecutive source rows,
//  - splits every row pair into low (xmm0-xmm7) and high (xmm8-xmm15) halves
//    with punpcklbw / punpckhbw, then finishes the interleave with
//    punpcklwd / punpckldq and palignr $8,
//  - stores 16 rows of 8 bytes to dst, stepping by dst_stride (results from
//    xmm0-xmm7 first, then those from xmm8-xmm15),
//  - rewinds src by 8 rows and advances it 16 bytes via
//    "lea 0x10(%0,%3,8),%0" executed while %3 is negated,
//  - subtracts 16 from width; jg repeats while the result is positive.
// Clobbers declared: memory, cc, xmm0-xmm15.
// NOTE(review): %3 is an input-only operand that is negated and restored
// inside the loop; GCC's extended-asm rules say input-only operands should
// not be modified - confirm this is acceptable.
517 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, | |
518 uint8* dst, int dst_stride, int width) { | |
519 asm volatile ( | |
520 // Read in the data from the source pointer. | |
521 // First round of bit swap. | |
522 ".p2align 2 \n" | |
523 "1: \n" | |
524 "movdqu (%0),%%xmm0 \n" | |
525 "movdqu (%0,%3),%%xmm1 \n" | |
526 "lea (%0,%3,2),%0 \n" | |
527 "movdqa %%xmm0,%%xmm8 \n" | |
528 "punpcklbw %%xmm1,%%xmm0 \n" | |
529 "punpckhbw %%xmm1,%%xmm8 \n" | |
530 "movdqu (%0),%%xmm2 \n" | |
531 "movdqa %%xmm0,%%xmm1 \n" | |
532 "movdqa %%xmm8,%%xmm9 \n" | |
533 "palignr $0x8,%%xmm1,%%xmm1 \n" | |
534 "palignr $0x8,%%xmm9,%%xmm9 \n" | |
535 "movdqu (%0,%3),%%xmm3 \n" | |
536 "lea (%0,%3,2),%0 \n" | |
537 "movdqa %%xmm2,%%xmm10 \n" | |
538 "punpcklbw %%xmm3,%%xmm2 \n" | |
539 "punpckhbw %%xmm3,%%xmm10 \n" | |
540 "movdqa %%xmm2,%%xmm3 \n" | |
541 "movdqa %%xmm10,%%xmm11 \n" | |
542 "movdqu (%0),%%xmm4 \n" | |
543 "palignr $0x8,%%xmm3,%%xmm3 \n" | |
544 "palignr $0x8,%%xmm11,%%xmm11 \n" | |
545 "movdqu (%0,%3),%%xmm5 \n" | |
546 "lea (%0,%3,2),%0 \n" | |
547 "movdqa %%xmm4,%%xmm12 \n" | |
548 "punpcklbw %%xmm5,%%xmm4 \n" | |
549 "punpckhbw %%xmm5,%%xmm12 \n" | |
550 "movdqa %%xmm4,%%xmm5 \n" | |
551 "movdqa %%xmm12,%%xmm13 \n" | |
552 "movdqu (%0),%%xmm6 \n" | |
553 "palignr $0x8,%%xmm5,%%xmm5 \n" | |
554 "palignr $0x8,%%xmm13,%%xmm13 \n" | |
555 "movdqu (%0,%3),%%xmm7 \n" | |
556 "lea (%0,%3,2),%0 \n" | |
557 "movdqa %%xmm6,%%xmm14 \n" | |
558 "punpcklbw %%xmm7,%%xmm6 \n" | |
559 "punpckhbw %%xmm7,%%xmm14 \n" | |
560 "neg %3 \n" | |
561 "movdqa %%xmm6,%%xmm7 \n" | |
562 "movdqa %%xmm14,%%xmm15 \n" | |
563 "lea 0x10(%0,%3,8),%0 \n" | |
564 "palignr $0x8,%%xmm7,%%xmm7 \n" | |
565 "palignr $0x8,%%xmm15,%%xmm15 \n" | |
566 "neg %3 \n" | |
567 // Second round of bit swap. | |
568 "punpcklwd %%xmm2,%%xmm0 \n" | |
569 "punpcklwd %%xmm3,%%xmm1 \n" | |
570 "movdqa %%xmm0,%%xmm2 \n" | |
571 "movdqa %%xmm1,%%xmm3 \n" | |
572 "palignr $0x8,%%xmm2,%%xmm2 \n" | |
573 "palignr $0x8,%%xmm3,%%xmm3 \n" | |
574 "punpcklwd %%xmm6,%%xmm4 \n" | |
575 "punpcklwd %%xmm7,%%xmm5 \n" | |
576 "movdqa %%xmm4,%%xmm6 \n" | |
577 "movdqa %%xmm5,%%xmm7 \n" | |
578 "palignr $0x8,%%xmm6,%%xmm6 \n" | |
579 "palignr $0x8,%%xmm7,%%xmm7 \n" | |
580 "punpcklwd %%xmm10,%%xmm8 \n" | |
581 "punpcklwd %%xmm11,%%xmm9 \n" | |
582 "movdqa %%xmm8,%%xmm10 \n" | |
583 "movdqa %%xmm9,%%xmm11 \n" | |
584 "palignr $0x8,%%xmm10,%%xmm10 \n" | |
585 "palignr $0x8,%%xmm11,%%xmm11 \n" | |
586 "punpcklwd %%xmm14,%%xmm12 \n" | |
587 "punpcklwd %%xmm15,%%xmm13 \n" | |
588 "movdqa %%xmm12,%%xmm14 \n" | |
589 "movdqa %%xmm13,%%xmm15 \n" | |
590 "palignr $0x8,%%xmm14,%%xmm14 \n" | |
591 "palignr $0x8,%%xmm15,%%xmm15 \n" | |
592 // Third round of bit swap. | |
593 // Write to the destination pointer. | |
594 "punpckldq %%xmm4,%%xmm0 \n" | |
595 "movq %%xmm0,(%1) \n" | |
596 "movdqa %%xmm0,%%xmm4 \n" | |
597 "palignr $0x8,%%xmm4,%%xmm4 \n" | |
598 "movq %%xmm4,(%1,%4) \n" | |
599 "lea (%1,%4,2),%1 \n" | |
600 "punpckldq %%xmm6,%%xmm2 \n" | |
601 "movdqa %%xmm2,%%xmm6 \n" | |
602 "movq %%xmm2,(%1) \n" | |
603 "palignr $0x8,%%xmm6,%%xmm6 \n" | |
604 "punpckldq %%xmm5,%%xmm1 \n" | |
605 "movq %%xmm6,(%1,%4) \n" | |
606 "lea (%1,%4,2),%1 \n" | |
607 "movdqa %%xmm1,%%xmm5 \n" | |
608 "movq %%xmm1,(%1) \n" | |
609 "palignr $0x8,%%xmm5,%%xmm5 \n" | |
610 "movq %%xmm5,(%1,%4) \n" | |
611 "lea (%1,%4,2),%1 \n" | |
612 "punpckldq %%xmm7,%%xmm3 \n" | |
613 "movq %%xmm3,(%1) \n" | |
614 "movdqa %%xmm3,%%xmm7 \n" | |
615 "palignr $0x8,%%xmm7,%%xmm7 \n" | |
616 "movq %%xmm7,(%1,%4) \n" | |
617 "lea (%1,%4,2),%1 \n" | |
618 "punpckldq %%xmm12,%%xmm8 \n" | |
619 "movq %%xmm8,(%1) \n" | |
620 "movdqa %%xmm8,%%xmm12 \n" | |
621 "palignr $0x8,%%xmm12,%%xmm12 \n" | |
622 "movq %%xmm12,(%1,%4) \n" | |
623 "lea (%1,%4,2),%1 \n" | |
624 "punpckldq %%xmm14,%%xmm10 \n" | |
625 "movdqa %%xmm10,%%xmm14 \n" | |
626 "movq %%xmm10,(%1) \n" | |
627 "palignr $0x8,%%xmm14,%%xmm14 \n" | |
628 "punpckldq %%xmm13,%%xmm9 \n" | |
629 "movq %%xmm14,(%1,%4) \n" | |
630 "lea (%1,%4,2),%1 \n" | |
631 "movdqa %%xmm9,%%xmm13 \n" | |
632 "movq %%xmm9,(%1) \n" | |
633 "palignr $0x8,%%xmm13,%%xmm13 \n" | |
634 "movq %%xmm13,(%1,%4) \n" | |
635 "lea (%1,%4,2),%1 \n" | |
636 "punpckldq %%xmm15,%%xmm11 \n" | |
637 "movq %%xmm11,(%1) \n" | |
638 "movdqa %%xmm11,%%xmm15 \n" | |
639 "palignr $0x8,%%xmm15,%%xmm15 \n" | |
640 "sub $0x10,%2 \n" | |
641 "movq %%xmm15,(%1,%4) \n" | |
642 "lea (%1,%4,2),%1 \n" | |
643 "jg 1b \n" | |
644 : "+r"(src), // %0 | |
645 "+r"(dst), // %1 | |
646 "+r"(width) // %2 | |
647 : "r"((intptr_t)(src_stride)), // %3 | |
648 "r"((intptr_t)(dst_stride)) // %4 | |
649 : "memory", "cc", | |
650 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", | |
651 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" | |
652 ); | |
653 } | |
654 | |
655 #define HAS_TRANSPOSE_UVWX8_SSE2 | |
// TransposeUVWx8_SSE2, x86_64 GCC extended-asm version (OLD column only).
// Same data flow as the 32-bit versions, but xmm8 / xmm9 serve as the
// temporaries, so no stack spill slot is needed.
// Operands: %0 = src, %1 = dst_a, %2 = dst_b, %3 = w (all read-write);
//           %4 = src_stride, %5 = dst_stride_a, %6 = dst_stride_b
//           (inputs, widened to intptr_t).
// Each pass of the loop at label 1:
//  - loads 16 bytes (movdqu) from each of 8 consecutive source rows,
//  - interleaves them in three rounds (byte, word, dword unpack low/high),
//  - stores the low 8 bytes of each result (movlpd) to dst_a and the high
//    8 bytes (movhpd) to dst_b, 8 rows per destination, stepping by the
//    matching stride,
//  - rewinds src by 8 rows and advances it 16 bytes via
//    "lea 0x10(%0,%4,8),%0" executed while %4 is negated,
//  - subtracts 8 from w; jg repeats while the result is positive.
// Clobbers declared: memory, cc, xmm0-xmm9.
// NOTE(review): %4 is an input-only operand that is negated and restored
// inside the loop; GCC's extended-asm rules say input-only operands should
// not be modified - confirm this is acceptable.
656 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, | |
657 uint8* dst_a, int dst_stride_a, | |
658 uint8* dst_b, int dst_stride_b, | |
659 int w) { | |
660 asm volatile ( | |
661 // Read in the data from the source pointer. | |
662 // First round of bit swap. | |
663 ".p2align 2 \n" | |
664 "1: \n" | |
665 "movdqu (%0),%%xmm0 \n" | |
666 "movdqu (%0,%4),%%xmm1 \n" | |
667 "lea (%0,%4,2),%0 \n" | |
668 "movdqa %%xmm0,%%xmm8 \n" | |
669 "punpcklbw %%xmm1,%%xmm0 \n" | |
670 "punpckhbw %%xmm1,%%xmm8 \n" | |
671 "movdqa %%xmm8,%%xmm1 \n" | |
672 "movdqu (%0),%%xmm2 \n" | |
673 "movdqu (%0,%4),%%xmm3 \n" | |
674 "lea (%0,%4,2),%0 \n" | |
675 "movdqa %%xmm2,%%xmm8 \n" | |
676 "punpcklbw %%xmm3,%%xmm2 \n" | |
677 "punpckhbw %%xmm3,%%xmm8 \n" | |
678 "movdqa %%xmm8,%%xmm3 \n" | |
679 "movdqu (%0),%%xmm4 \n" | |
680 "movdqu (%0,%4),%%xmm5 \n" | |
681 "lea (%0,%4,2),%0 \n" | |
682 "movdqa %%xmm4,%%xmm8 \n" | |
683 "punpcklbw %%xmm5,%%xmm4 \n" | |
684 "punpckhbw %%xmm5,%%xmm8 \n" | |
685 "movdqa %%xmm8,%%xmm5 \n" | |
686 "movdqu (%0),%%xmm6 \n" | |
687 "movdqu (%0,%4),%%xmm7 \n" | |
688 "lea (%0,%4,2),%0 \n" | |
689 "movdqa %%xmm6,%%xmm8 \n" | |
690 "punpcklbw %%xmm7,%%xmm6 \n" | |
691 "neg %4 \n" | |
692 "lea 0x10(%0,%4,8),%0 \n" | |
693 "punpckhbw %%xmm7,%%xmm8 \n" | |
694 "movdqa %%xmm8,%%xmm7 \n" | |
695 "neg %4 \n" | |
696 // Second round of bit swap. | |
697 "movdqa %%xmm0,%%xmm8 \n" | |
698 "movdqa %%xmm1,%%xmm9 \n" | |
699 "punpckhwd %%xmm2,%%xmm8 \n" | |
700 "punpckhwd %%xmm3,%%xmm9 \n" | |
701 "punpcklwd %%xmm2,%%xmm0 \n" | |
702 "punpcklwd %%xmm3,%%xmm1 \n" | |
703 "movdqa %%xmm8,%%xmm2 \n" | |
704 "movdqa %%xmm9,%%xmm3 \n" | |
705 "movdqa %%xmm4,%%xmm8 \n" | |
706 "movdqa %%xmm5,%%xmm9 \n" | |
707 "punpckhwd %%xmm6,%%xmm8 \n" | |
708 "punpckhwd %%xmm7,%%xmm9 \n" | |
709 "punpcklwd %%xmm6,%%xmm4 \n" | |
710 "punpcklwd %%xmm7,%%xmm5 \n" | |
711 "movdqa %%xmm8,%%xmm6 \n" | |
712 "movdqa %%xmm9,%%xmm7 \n" | |
713 // Third round of bit swap. | |
714 // Write to the destination pointer. | |
715 "movdqa %%xmm0,%%xmm8 \n" | |
716 "punpckldq %%xmm4,%%xmm0 \n" | |
717 "movlpd %%xmm0,(%1) \n" // Write back U channel | |
718 "movhpd %%xmm0,(%2) \n" // Write back V channel | |
719 "punpckhdq %%xmm4,%%xmm8 \n" | |
720 "movlpd %%xmm8,(%1,%5) \n" | |
721 "lea (%1,%5,2),%1 \n" | |
722 "movhpd %%xmm8,(%2,%6) \n" | |
723 "lea (%2,%6,2),%2 \n" | |
724 "movdqa %%xmm2,%%xmm8 \n" | |
725 "punpckldq %%xmm6,%%xmm2 \n" | |
726 "movlpd %%xmm2,(%1) \n" | |
727 "movhpd %%xmm2,(%2) \n" | |
728 "punpckhdq %%xmm6,%%xmm8 \n" | |
729 "movlpd %%xmm8,(%1,%5) \n" | |
730 "lea (%1,%5,2),%1 \n" | |
731 "movhpd %%xmm8,(%2,%6) \n" | |
732 "lea (%2,%6,2),%2 \n" | |
733 "movdqa %%xmm1,%%xmm8 \n" | |
734 "punpckldq %%xmm5,%%xmm1 \n" | |
735 "movlpd %%xmm1,(%1) \n" | |
736 "movhpd %%xmm1,(%2) \n" | |
737 "punpckhdq %%xmm5,%%xmm8 \n" | |
738 "movlpd %%xmm8,(%1,%5) \n" | |
739 "lea (%1,%5,2),%1 \n" | |
740 "movhpd %%xmm8,(%2,%6) \n" | |
741 "lea (%2,%6,2),%2 \n" | |
742 "movdqa %%xmm3,%%xmm8 \n" | |
743 "punpckldq %%xmm7,%%xmm3 \n" | |
744 "movlpd %%xmm3,(%1) \n" | |
745 "movhpd %%xmm3,(%2) \n" | |
746 "punpckhdq %%xmm7,%%xmm8 \n" | |
747 "sub $0x8,%3 \n" | |
748 "movlpd %%xmm8,(%1,%5) \n" | |
749 "lea (%1,%5,2),%1 \n" | |
750 "movhpd %%xmm8,(%2,%6) \n" | |
751 "lea (%2,%6,2),%2 \n" | |
752 "jg 1b \n" | |
753 : "+r"(src), // %0 | |
754 "+r"(dst_a), // %1 | |
755 "+r"(dst_b), // %2 | |
756 "+r"(w) // %3 | |
757 : "r"((intptr_t)(src_stride)), // %4 | |
758 "r"((intptr_t)(dst_stride_a)), // %5 | |
759 "r"((intptr_t)(dst_stride_b)) // %6 | |
760 : "memory", "cc", | |
761 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", | |
762 "xmm8", "xmm9" | |
763 ); | |
764 } | |
765 #endif | |
766 #endif | |
767 | |
// TransposeWx8_C: portable C transpose of an 8-row strip (OLD column only;
// the NEW column no longer contains this function).
// For each of the `width` source columns, the 8 bytes found in that column
// on rows 0..7 (row pitch src_stride) are written to 8 consecutive bytes of
// dst; dst then advances by dst_stride, so source column i becomes
// destination row i.
// Reads src[r * src_stride + c] for r in 0..7 and c in 0..width-1, and
// writes dst[c * dst_stride + r]. No bounds checking is performed.
768 static void TransposeWx8_C(const uint8* src, int src_stride, | |
769 uint8* dst, int dst_stride, | |
770 int width) { | |
771 int i; | |
772 for (i = 0; i < width; ++i) { | |
773 dst[0] = src[0 * src_stride]; | |
774 dst[1] = src[1 * src_stride]; | |
775 dst[2] = src[2 * src_stride]; | |
776 dst[3] = src[3 * src_stride]; | |
777 dst[4] = src[4 * src_stride]; | |
778 dst[5] = src[5 * src_stride]; | |
779 dst[6] = src[6 * src_stride]; | |
780 dst[7] = src[7 * src_stride]; | |
781 ++src; | |
782 dst += dst_stride; | |
783 } | |
784 } | |
785 | |
// TransposeWxH_C: portable C transpose of a width x height block (OLD
// column only; the NEW column no longer contains this function).
// Copies src[j * src_stride + i] to dst[i * dst_stride + j] for every
// column i in 0..width-1 and row j in 0..height-1.
// When width or height is <= 0 the loops do not execute and nothing is
// written.
786 static void TransposeWxH_C(const uint8* src, int src_stride, | |
787 uint8* dst, int dst_stride, | |
788 int width, int height) { | |
789 int i; | |
790 for (i = 0; i < width; ++i) { | |
791 int j; | |
792 for (j = 0; j < height; ++j) { | |
793 dst[i * dst_stride + j] = src[j * src_stride + i]; | |
794 } | |
795 } | |
796 } | |
797 | |
// TransposePlane: transposes a width x height byte plane from src into dst.
// The rows below are a side-by-side diff: the left cell is the OLD revision
// and the right cell the NEW revision.
// Common structure in both revisions:
//  - TransposeWx8 starts as the C kernel and is replaced by a SIMD kernel
//    when the matching HAS_* macro is defined and TestCpuFlag reports the
//    CPU feature. The #if blocks are evaluated in order, so a later match
//    overrides an earlier one.
//  - The source is consumed 8 rows at a time: each call handles `width`
//    columns, then src moves down 8 rows and dst moves right 8 columns.
//  - The remaining i (< 8) rows are handled by TransposeWxH_C.
// What the NEW column changes:
//  - Feature macros are spelled HAS_TRANSPOSEWX8_* (no underscore between
//    TRANSPOSE and WX8).
//  - For SSSE3 the *_Any_* kernel is chosen whenever the CPU flag is set, and
//    the exact kernel only when width is aligned (8, or 16 for the Fast
//    variant). The OLD column used the SIMD kernel only for aligned widths
//    and otherwise stayed on the C kernel.
//  - The FAST kernels are referenced as ..._Fast_... instead of ..._FAST_....
//  - The trailing TransposeWxH_C call is skipped when no rows remain.
// NOTE(review): the *_Any_* kernels and the renamed macros are not defined
// in this view; presumably they come from the newly included
// libyuv/rotate_row.h - confirm.
798 LIBYUV_API | 24 LIBYUV_API |
799 void TransposePlane(const uint8* src, int src_stride, | 25 void TransposePlane(const uint8* src, int src_stride, |
800 uint8* dst, int dst_stride, | 26 uint8* dst, int dst_stride, |
801 int width, int height) { | 27 int width, int height) { |
802 int i = height; | 28 int i = height; |
803 void (*TransposeWx8)(const uint8* src, int src_stride, | 29 void (*TransposeWx8)(const uint8* src, int src_stride, |
804 uint8* dst, int dst_stride, | 30 uint8* dst, int dst_stride, int width) = TransposeWx8_C; |
805 int width) = TransposeWx8_C; | 31 #if defined(HAS_TRANSPOSEWX8_NEON) |
806 #if defined(HAS_TRANSPOSE_WX8_NEON) | |
807 if (TestCpuFlag(kCpuHasNEON)) { | 32 if (TestCpuFlag(kCpuHasNEON)) { |
808 TransposeWx8 = TransposeWx8_NEON; | 33 TransposeWx8 = TransposeWx8_NEON; |
809 } | 34 } |
810 #endif | 35 #endif |
811 #if defined(HAS_TRANSPOSE_WX8_SSSE3) | 36 #if defined(HAS_TRANSPOSEWX8_SSSE3) |
812 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { | 37 if (TestCpuFlag(kCpuHasSSSE3)) { |
813 TransposeWx8 = TransposeWx8_SSSE3; | 38 TransposeWx8 = TransposeWx8_Any_SSSE3; |
| 39 if (IS_ALIGNED(width, 8)) { |
| 40 TransposeWx8 = TransposeWx8_SSSE3; |
| 41 } |
814 } | 42 } |
815 #endif | 43 #endif |
816 #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) | 44 #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) |
817 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { | 45 if (TestCpuFlag(kCpuHasSSSE3)) { |
818 TransposeWx8 = TransposeWx8_FAST_SSSE3; | 46 TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; |
| 47 if (IS_ALIGNED(width, 16)) { |
| 48 TransposeWx8 = TransposeWx8_Fast_SSSE3; |
| 49 } |
819 } | 50 } |
820 #endif | 51 #endif |
821 #if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2) | 52 #if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2) |
822 if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { | 53 if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { |
823 if (IS_ALIGNED(width, 4) && | 54 if (IS_ALIGNED(width, 4) && |
824 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { | 55 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { |
825 TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2; | 56 TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2; |
826 } else { | 57 } else { |
827 TransposeWx8 = TransposeWx8_MIPS_DSPR2; | 58 TransposeWx8 = TransposeWx8_MIPS_DSPR2; |
828 } | 59 } |
829 } | 60 } |
830 #endif | 61 #endif |
831 | 62 |
832 // Work across the source in 8x8 tiles | 63 // Work across the source in 8x8 tiles |
833 while (i >= 8) { | 64 while (i >= 8) { |
834 TransposeWx8(src, src_stride, dst, dst_stride, width); | 65 TransposeWx8(src, src_stride, dst, dst_stride, width); |
835 src += 8 * src_stride; // Go down 8 rows. | 66 src += 8 * src_stride; // Go down 8 rows. |
836 dst += 8; // Move over 8 columns. | 67 dst += 8; // Move over 8 columns. |
837 i -= 8; | 68 i -= 8; |
838 } | 69 } |
839 | 70 |
840 TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); | 71 if (i > 0) { |
| 72 TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); |
| 73 } |
841 } | 74 } |
842 | 75 |
843 LIBYUV_API | 76 LIBYUV_API |
844 void RotatePlane90(const uint8* src, int src_stride, | 77 void RotatePlane90(const uint8* src, int src_stride, |
845 uint8* dst, int dst_stride, | 78 uint8* dst, int dst_stride, |
846 int width, int height) { | 79 int width, int height) { |
847 // Rotate by 90 is a transpose with the source read | 80 // Rotate by 90 is a transpose with the source read |
848 // from bottom to top. So set the source pointer to the end | 81 // from bottom to top. So set the source pointer to the end |
849 // of the buffer and flip the sign of the source stride. | 82 // of the buffer and flip the sign of the source stride. |
850 src += src_stride * (height - 1); | 83 src += src_stride * (height - 1); |
(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
948 src += src_stride; | 181 src += src_stride; |
949 MirrorRow(src_bot, dst, width); // Mirror last row into first row | 182 MirrorRow(src_bot, dst, width); // Mirror last row into first row |
950 dst += dst_stride; | 183 dst += dst_stride; |
951 CopyRow(row, dst_bot, width); // Copy first mirrored row into last | 184 CopyRow(row, dst_bot, width); // Copy first mirrored row into last |
952 src_bot -= src_stride; | 185 src_bot -= src_stride; |
953 dst_bot -= dst_stride; | 186 dst_bot -= dst_stride; |
954 } | 187 } |
955 free_aligned_buffer_64(row); | 188 free_aligned_buffer_64(row); |
956 } | 189 } |
957 | 190 |
// TransposeUVWx8_C: portable C transpose of an 8-row strip of two-byte
// interleaved samples into two separate planes (OLD column only; the NEW
// column no longer contains this function).
// src is read two bytes per step: on each of rows 0..7 the byte at offset 0
// goes to dst_a and the byte at offset 1 goes to dst_b, both at index r
// (the row number). After each step src advances 2 bytes and dst_a / dst_b
// advance by their own strides, so `width` counts byte pairs, not bytes.
// No bounds checking is performed.
958 static void TransposeUVWx8_C(const uint8* src, int src_stride, | |
959 uint8* dst_a, int dst_stride_a, | |
960 uint8* dst_b, int dst_stride_b, | |
961 int width) { | |
962 int i; | |
963 for (i = 0; i < width; ++i) { | |
964 dst_a[0] = src[0 * src_stride + 0]; | |
965 dst_b[0] = src[0 * src_stride + 1]; | |
966 dst_a[1] = src[1 * src_stride + 0]; | |
967 dst_b[1] = src[1 * src_stride + 1]; | |
968 dst_a[2] = src[2 * src_stride + 0]; | |
969 dst_b[2] = src[2 * src_stride + 1]; | |
970 dst_a[3] = src[3 * src_stride + 0]; | |
971 dst_b[3] = src[3 * src_stride + 1]; | |
972 dst_a[4] = src[4 * src_stride + 0]; | |
973 dst_b[4] = src[4 * src_stride + 1]; | |
974 dst_a[5] = src[5 * src_stride + 0]; | |
975 dst_b[5] = src[5 * src_stride + 1]; | |
976 dst_a[6] = src[6 * src_stride + 0]; | |
977 dst_b[6] = src[6 * src_stride + 1]; | |
978 dst_a[7] = src[7 * src_stride + 0]; | |
979 dst_b[7] = src[7 * src_stride + 1]; | |
980 src += 2; | |
981 dst_a += dst_stride_a; | |
982 dst_b += dst_stride_b; | |
983 } | |
984 } | |
985 | |
986 static void TransposeUVWxH_C(const uint8* src, int src_stride, | |
987 uint8* dst_a, int dst_stride_a, | |
988 uint8* dst_b, int dst_stride_b, | |
989 int width, int height) { | |
990 int i; | |
991 for (i = 0; i < width * 2; i += 2) { | |
992 int j; | |
993 for (j = 0; j < height; ++j) { | |
994 dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; | |
995 dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; | |
996 } | |
997 } | |
998 } | |
999 | |
1000 LIBYUV_API | 191 LIBYUV_API |
1001 void TransposeUV(const uint8* src, int src_stride, | 192 void TransposeUV(const uint8* src, int src_stride, |
1002 uint8* dst_a, int dst_stride_a, | 193 uint8* dst_a, int dst_stride_a, |
1003 uint8* dst_b, int dst_stride_b, | 194 uint8* dst_b, int dst_stride_b, |
1004 int width, int height) { | 195 int width, int height) { |
1005 int i = height; | 196 int i = height; |
1006 void (*TransposeUVWx8)(const uint8* src, int src_stride, | 197 void (*TransposeUVWx8)(const uint8* src, int src_stride, |
1007 uint8* dst_a, int dst_stride_a, | 198 uint8* dst_a, int dst_stride_a, |
1008 uint8* dst_b, int dst_stride_b, | 199 uint8* dst_b, int dst_stride_b, |
1009 int width) = TransposeUVWx8_C; | 200 int width) = TransposeUVWx8_C; |
1010 #if defined(HAS_TRANSPOSE_UVWX8_NEON) | 201 #if defined(HAS_TRANSPOSEUVWX8_NEON) |
1011 if (TestCpuFlag(kCpuHasNEON)) { | 202 if (TestCpuFlag(kCpuHasNEON)) { |
1012 TransposeUVWx8 = TransposeUVWx8_NEON; | 203 TransposeUVWx8 = TransposeUVWx8_NEON; |
1013 } | 204 } |
1014 #endif | 205 #endif |
1015 #if defined(HAS_TRANSPOSE_UVWX8_SSE2) | 206 #if defined(HAS_TRANSPOSEUVWX8_SSE2) |
1016 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { | 207 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { |
1017 TransposeUVWx8 = TransposeUVWx8_SSE2; | 208 TransposeUVWx8 = TransposeUVWx8_SSE2; |
1018 } | 209 } |
1019 #endif | 210 #endif |
1020 #if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) | 211 #if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2) |
1021 if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && | 212 if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && |
1022 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { | 213 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { |
1023 TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; | 214 TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; |
1024 } | 215 } |
1025 #endif | 216 #endif |
1026 | 217 |
1027 // Work through the source in 8x8 tiles. | 218 // Work through the source in 8x8 tiles. |
1028 while (i >= 8) { | 219 while (i >= 8) { |
1029 TransposeUVWx8(src, src_stride, | 220 TransposeUVWx8(src, src_stride, |
1030 dst_a, dst_stride_a, | 221 dst_a, dst_stride_a, |
1031 dst_b, dst_stride_b, | 222 dst_b, dst_stride_b, |
1032 width); | 223 width); |
1033 src += 8 * src_stride; // Go down 8 rows. | 224 src += 8 * src_stride; // Go down 8 rows. |
1034 dst_a += 8; // Move over 8 columns. | 225 dst_a += 8; // Move over 8 columns. |
1035 dst_b += 8; // Move over 8 columns. | 226 dst_b += 8; // Move over 8 columns. |
1036 i -= 8; | 227 i -= 8; |
1037 } | 228 } |
1038 | 229 |
1039 TransposeUVWxH_C(src, src_stride, | 230 if (i > 0) { |
1040 dst_a, dst_stride_a, | 231 TransposeUVWxH_C(src, src_stride, |
1041 dst_b, dst_stride_b, | 232 dst_a, dst_stride_a, |
1042 width, i); | 233 dst_b, dst_stride_b, |
| 234 width, i); |
| 235 } |
1043 } | 236 } |
1044 | 237 |
1045 LIBYUV_API | 238 LIBYUV_API |
1046 void RotateUV90(const uint8* src, int src_stride, | 239 void RotateUV90(const uint8* src, int src_stride, |
1047 uint8* dst_a, int dst_stride_a, | 240 uint8* dst_a, int dst_stride_a, |
1048 uint8* dst_b, int dst_stride_b, | 241 uint8* dst_b, int dst_stride_b, |
1049 int width, int height) { | 242 int width, int height) { |
1050 src += src_stride * (height - 1); | 243 src += src_stride * (height - 1); |
1051 src_stride = -src_stride; | 244 src_stride = -src_stride; |
1052 | 245 |
(...skipping 241 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1294 default: | 487 default: |
1295 break; | 488 break; |
1296 } | 489 } |
1297 return -1; | 490 return -1; |
1298 } | 491 } |
1299 | 492 |
1300 #ifdef __cplusplus | 493 #ifdef __cplusplus |
1301 } // extern "C" | 494 } // extern "C" |
1302 } // namespace libyuv | 495 } // namespace libyuv |
1303 #endif | 496 #endif |
OLD | NEW |