Chromium Code Reviews

Side by Side Diff: source/libvpx/third_party/libyuv/source/rotate.cc

Issue 1302353004: libvpx: Pull from upstream (Closed)
Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include "libyuv/rotate.h" 11 #include "libyuv/rotate.h"
12 12
13 #include "libyuv/cpu_id.h" 13 #include "libyuv/cpu_id.h"
14 #include "libyuv/convert.h" 14 #include "libyuv/convert.h"
15 #include "libyuv/planar_functions.h" 15 #include "libyuv/planar_functions.h"
16 #include "libyuv/rotate_row.h"
16 #include "libyuv/row.h" 17 #include "libyuv/row.h"
17 18
18 #ifdef __cplusplus 19 #ifdef __cplusplus
19 namespace libyuv { 20 namespace libyuv {
20 extern "C" { 21 extern "C" {
21 #endif 22 #endif
22 23
23 #if !defined(LIBYUV_DISABLE_X86) && \
24 (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
25 #if defined(__APPLE__) && defined(__i386__)
26 #define DECLARE_FUNCTION(name) \
27 ".text \n" \
28 ".private_extern _" #name " \n" \
29 ".align 4,0x90 \n" \
30 "_" #name ": \n"
31 #elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
32 #define DECLARE_FUNCTION(name) \
33 ".text \n" \
34 ".align 4,0x90 \n" \
35 "_" #name ": \n"
36 #else
37 #define DECLARE_FUNCTION(name) \
38 ".text \n" \
39 ".align 4,0x90 \n" \
40 #name ": \n"
41 #endif
42 #endif
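
DECLARE_FUNCTION only emits the directives and label that open a file-scope asm() function; the instruction strings that follow it supply the body (see TransposeUVWx8_SSE2 further down in this file). As a rough illustration, the default branch expands for that function to:

    .text
    .align 4,0x90
    TransposeUVWx8_SSE2:

The Apple and MinGW/Cygwin branches differ only in prefixing the symbol with an underscore to match those platforms' C symbol naming, and the Apple branch additionally marks the symbol .private_extern.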
43
44 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
45 (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
46 #define HAS_TRANSPOSE_WX8_NEON
47 void TransposeWx8_NEON(const uint8* src, int src_stride,
48 uint8* dst, int dst_stride, int width);
49 #define HAS_TRANSPOSE_UVWX8_NEON
50 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
51 uint8* dst_a, int dst_stride_a,
52 uint8* dst_b, int dst_stride_b,
53 int width);
54 #endif
55
56 #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
57 defined(__mips__) && \
58 defined(__mips_dsp) && (__mips_dsp_rev >= 2)
59 #define HAS_TRANSPOSE_WX8_MIPS_DSPR2
60 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
61 uint8* dst, int dst_stride, int width);
62
63 void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
64 uint8* dst, int dst_stride, int width);
65 #define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
66 void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
67 uint8* dst_a, int dst_stride_a,
68 uint8* dst_b, int dst_stride_b,
69 int width);
70 #endif // defined(__mips__)
71
72 #if !defined(LIBYUV_DISABLE_X86) && \
73 defined(_M_IX86) && defined(_MSC_VER)
74 #define HAS_TRANSPOSE_WX8_SSSE3
75 __declspec(naked) __declspec(align(16))
76 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
77 uint8* dst, int dst_stride, int width) {
78 __asm {
79 push edi
80 push esi
81 push ebp
82 mov eax, [esp + 12 + 4] // src
83 mov edi, [esp + 12 + 8] // src_stride
84 mov edx, [esp + 12 + 12] // dst
85 mov esi, [esp + 12 + 16] // dst_stride
86 mov ecx, [esp + 12 + 20] // width
87
88 // Read in the data from the source pointer.
89 // First round of bit swap.
90 align 4
91 convertloop:
92 movq xmm0, qword ptr [eax]
93 lea ebp, [eax + 8]
94 movq xmm1, qword ptr [eax + edi]
95 lea eax, [eax + 2 * edi]
96 punpcklbw xmm0, xmm1
97 movq xmm2, qword ptr [eax]
98 movdqa xmm1, xmm0
99 palignr xmm1, xmm1, 8
100 movq xmm3, qword ptr [eax + edi]
101 lea eax, [eax + 2 * edi]
102 punpcklbw xmm2, xmm3
103 movdqa xmm3, xmm2
104 movq xmm4, qword ptr [eax]
105 palignr xmm3, xmm3, 8
106 movq xmm5, qword ptr [eax + edi]
107 punpcklbw xmm4, xmm5
108 lea eax, [eax + 2 * edi]
109 movdqa xmm5, xmm4
110 movq xmm6, qword ptr [eax]
111 palignr xmm5, xmm5, 8
112 movq xmm7, qword ptr [eax + edi]
113 punpcklbw xmm6, xmm7
114 mov eax, ebp
115 movdqa xmm7, xmm6
116 palignr xmm7, xmm7, 8
117 // Second round of bit swap.
118 punpcklwd xmm0, xmm2
119 punpcklwd xmm1, xmm3
120 movdqa xmm2, xmm0
121 movdqa xmm3, xmm1
122 palignr xmm2, xmm2, 8
123 palignr xmm3, xmm3, 8
124 punpcklwd xmm4, xmm6
125 punpcklwd xmm5, xmm7
126 movdqa xmm6, xmm4
127 movdqa xmm7, xmm5
128 palignr xmm6, xmm6, 8
129 palignr xmm7, xmm7, 8
130 // Third round of bit swap.
131 // Write to the destination pointer.
132 punpckldq xmm0, xmm4
133 movq qword ptr [edx], xmm0
134 movdqa xmm4, xmm0
135 palignr xmm4, xmm4, 8
136 movq qword ptr [edx + esi], xmm4
137 lea edx, [edx + 2 * esi]
138 punpckldq xmm2, xmm6
139 movdqa xmm6, xmm2
140 palignr xmm6, xmm6, 8
141 movq qword ptr [edx], xmm2
142 punpckldq xmm1, xmm5
143 movq qword ptr [edx + esi], xmm6
144 lea edx, [edx + 2 * esi]
145 movdqa xmm5, xmm1
146 movq qword ptr [edx], xmm1
147 palignr xmm5, xmm5, 8
148 punpckldq xmm3, xmm7
149 movq qword ptr [edx + esi], xmm5
150 lea edx, [edx + 2 * esi]
151 movq qword ptr [edx], xmm3
152 movdqa xmm7, xmm3
153 palignr xmm7, xmm7, 8
154 sub ecx, 8
155 movq qword ptr [edx + esi], xmm7
156 lea edx, [edx + 2 * esi]
157 jg convertloop
158
159 pop ebp
160 pop esi
161 pop edi
162 ret
163 }
164 }
165
166 #define HAS_TRANSPOSE_UVWX8_SSE2
167 __declspec(naked) __declspec(align(16))
168 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
169 uint8* dst_a, int dst_stride_a,
170 uint8* dst_b, int dst_stride_b,
171 int w) {
172 __asm {
173 push ebx
174 push esi
175 push edi
176 push ebp
177 mov eax, [esp + 16 + 4] // src
178 mov edi, [esp + 16 + 8] // src_stride
179 mov edx, [esp + 16 + 12] // dst_a
180 mov esi, [esp + 16 + 16] // dst_stride_a
181 mov ebx, [esp + 16 + 20] // dst_b
182 mov ebp, [esp + 16 + 24] // dst_stride_b
183 mov ecx, esp
184 sub esp, 4 + 16
185 and esp, ~15
186 mov [esp + 16], ecx
187 mov ecx, [ecx + 16 + 28] // w
188
189 align 4
190 convertloop:
191 // Read in the data from the source pointer.
192 // First round of bit swap.
193 movdqu xmm0, [eax]
194 movdqu xmm1, [eax + edi]
195 lea eax, [eax + 2 * edi]
196 movdqa xmm7, xmm0 // use xmm7 as temp register.
197 punpcklbw xmm0, xmm1
198 punpckhbw xmm7, xmm1
199 movdqa xmm1, xmm7
200 movdqu xmm2, [eax]
201 movdqu xmm3, [eax + edi]
202 lea eax, [eax + 2 * edi]
203 movdqa xmm7, xmm2
204 punpcklbw xmm2, xmm3
205 punpckhbw xmm7, xmm3
206 movdqa xmm3, xmm7
207 movdqu xmm4, [eax]
208 movdqu xmm5, [eax + edi]
209 lea eax, [eax + 2 * edi]
210 movdqa xmm7, xmm4
211 punpcklbw xmm4, xmm5
212 punpckhbw xmm7, xmm5
213 movdqa xmm5, xmm7
214 movdqu xmm6, [eax]
215 movdqu xmm7, [eax + edi]
216 lea eax, [eax + 2 * edi]
217 movdqu [esp], xmm5 // backup xmm5
218 neg edi
219 movdqa xmm5, xmm6 // use xmm5 as temp register.
220 punpcklbw xmm6, xmm7
221 punpckhbw xmm5, xmm7
222 movdqa xmm7, xmm5
223 lea eax, [eax + 8 * edi + 16]
224 neg edi
225 // Second round of bit swap.
226 movdqa xmm5, xmm0
227 punpcklwd xmm0, xmm2
228 punpckhwd xmm5, xmm2
229 movdqa xmm2, xmm5
230 movdqa xmm5, xmm1
231 punpcklwd xmm1, xmm3
232 punpckhwd xmm5, xmm3
233 movdqa xmm3, xmm5
234 movdqa xmm5, xmm4
235 punpcklwd xmm4, xmm6
236 punpckhwd xmm5, xmm6
237 movdqa xmm6, xmm5
238 movdqu xmm5, [esp] // restore xmm5
239 movdqu [esp], xmm6 // backup xmm6
240 movdqa xmm6, xmm5 // use xmm6 as temp register.
241 punpcklwd xmm5, xmm7
242 punpckhwd xmm6, xmm7
243 movdqa xmm7, xmm6
244 // Third round of bit swap.
245 // Write to the destination pointer.
246 movdqa xmm6, xmm0
247 punpckldq xmm0, xmm4
248 punpckhdq xmm6, xmm4
249 movdqa xmm4, xmm6
250 movdqu xmm6, [esp] // restore xmm6
251 movlpd qword ptr [edx], xmm0
252 movhpd qword ptr [ebx], xmm0
253 movlpd qword ptr [edx + esi], xmm4
254 lea edx, [edx + 2 * esi]
255 movhpd qword ptr [ebx + ebp], xmm4
256 lea ebx, [ebx + 2 * ebp]
257 movdqa xmm0, xmm2 // use xmm0 as the temp register.
258 punpckldq xmm2, xmm6
259 movlpd qword ptr [edx], xmm2
260 movhpd qword ptr [ebx], xmm2
261 punpckhdq xmm0, xmm6
262 movlpd qword ptr [edx + esi], xmm0
263 lea edx, [edx + 2 * esi]
264 movhpd qword ptr [ebx + ebp], xmm0
265 lea ebx, [ebx + 2 * ebp]
266 movdqa xmm0, xmm1 // use xmm0 as the temp register.
267 punpckldq xmm1, xmm5
268 movlpd qword ptr [edx], xmm1
269 movhpd qword ptr [ebx], xmm1
270 punpckhdq xmm0, xmm5
271 movlpd qword ptr [edx + esi], xmm0
272 lea edx, [edx + 2 * esi]
273 movhpd qword ptr [ebx + ebp], xmm0
274 lea ebx, [ebx + 2 * ebp]
275 movdqa xmm0, xmm3 // use xmm0 as the temp register.
276 punpckldq xmm3, xmm7
277 movlpd qword ptr [edx], xmm3
278 movhpd qword ptr [ebx], xmm3
279 punpckhdq xmm0, xmm7
280 sub ecx, 8
281 movlpd qword ptr [edx + esi], xmm0
282 lea edx, [edx + 2 * esi]
283 movhpd qword ptr [ebx + ebp], xmm0
284 lea ebx, [ebx + 2 * ebp]
285 jg convertloop
286
287 mov esp, [esp + 16]
288 pop ebp
289 pop edi
290 pop esi
291 pop ebx
292 ret
293 }
294 }
295 #endif
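
The UV variant above needs more 128-bit temporaries than the eight xmm registers available to 32-bit code, so its prologue builds a 16-byte-aligned scratch slot on the stack: it saves esp in ecx, reserves 4 + 16 bytes, masks esp down to a 16-byte boundary, keeps the old esp at [esp + 16], and restores it with mov esp, [esp + 16] before returning. The loop then spills xmm5 and xmm6 to that slot (the "backup"/"restore" comments) instead of re-reading source memory. As a worked example with illustrative numbers: if esp were 0x0040FF2C on entry, sub esp, 20 gives 0x0040FF18 and the and with ~15 rounds it down to 0x0040FF10, so [esp] is 16-byte aligned for the spills.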
296 #if !defined(LIBYUV_DISABLE_X86) && \
297 (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
298 #define HAS_TRANSPOSE_WX8_SSSE3
299 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
300 uint8* dst, int dst_stride, int width) {
301 asm volatile (
302 // Read in the data from the source pointer.
303 // First round of bit swap.
304 ".p2align 2 \n"
305 "1: \n"
306 "movq (%0),%%xmm0 \n"
307 "movq (%0,%3),%%xmm1 \n"
308 "lea (%0,%3,2),%0 \n"
309 "punpcklbw %%xmm1,%%xmm0 \n"
310 "movq (%0),%%xmm2 \n"
311 "movdqa %%xmm0,%%xmm1 \n"
312 "palignr $0x8,%%xmm1,%%xmm1 \n"
313 "movq (%0,%3),%%xmm3 \n"
314 "lea (%0,%3,2),%0 \n"
315 "punpcklbw %%xmm3,%%xmm2 \n"
316 "movdqa %%xmm2,%%xmm3 \n"
317 "movq (%0),%%xmm4 \n"
318 "palignr $0x8,%%xmm3,%%xmm3 \n"
319 "movq (%0,%3),%%xmm5 \n"
320 "lea (%0,%3,2),%0 \n"
321 "punpcklbw %%xmm5,%%xmm4 \n"
322 "movdqa %%xmm4,%%xmm5 \n"
323 "movq (%0),%%xmm6 \n"
324 "palignr $0x8,%%xmm5,%%xmm5 \n"
325 "movq (%0,%3),%%xmm7 \n"
326 "lea (%0,%3,2),%0 \n"
327 "punpcklbw %%xmm7,%%xmm6 \n"
328 "neg %3 \n"
329 "movdqa %%xmm6,%%xmm7 \n"
330 "lea 0x8(%0,%3,8),%0 \n"
331 "palignr $0x8,%%xmm7,%%xmm7 \n"
332 "neg %3 \n"
333 // Second round of bit swap.
334 "punpcklwd %%xmm2,%%xmm0 \n"
335 "punpcklwd %%xmm3,%%xmm1 \n"
336 "movdqa %%xmm0,%%xmm2 \n"
337 "movdqa %%xmm1,%%xmm3 \n"
338 "palignr $0x8,%%xmm2,%%xmm2 \n"
339 "palignr $0x8,%%xmm3,%%xmm3 \n"
340 "punpcklwd %%xmm6,%%xmm4 \n"
341 "punpcklwd %%xmm7,%%xmm5 \n"
342 "movdqa %%xmm4,%%xmm6 \n"
343 "movdqa %%xmm5,%%xmm7 \n"
344 "palignr $0x8,%%xmm6,%%xmm6 \n"
345 "palignr $0x8,%%xmm7,%%xmm7 \n"
346 // Third round of bit swap.
347 // Write to the destination pointer.
348 "punpckldq %%xmm4,%%xmm0 \n"
349 "movq %%xmm0,(%1) \n"
350 "movdqa %%xmm0,%%xmm4 \n"
351 "palignr $0x8,%%xmm4,%%xmm4 \n"
352 "movq %%xmm4,(%1,%4) \n"
353 "lea (%1,%4,2),%1 \n"
354 "punpckldq %%xmm6,%%xmm2 \n"
355 "movdqa %%xmm2,%%xmm6 \n"
356 "movq %%xmm2,(%1) \n"
357 "palignr $0x8,%%xmm6,%%xmm6 \n"
358 "punpckldq %%xmm5,%%xmm1 \n"
359 "movq %%xmm6,(%1,%4) \n"
360 "lea (%1,%4,2),%1 \n"
361 "movdqa %%xmm1,%%xmm5 \n"
362 "movq %%xmm1,(%1) \n"
363 "palignr $0x8,%%xmm5,%%xmm5 \n"
364 "movq %%xmm5,(%1,%4) \n"
365 "lea (%1,%4,2),%1 \n"
366 "punpckldq %%xmm7,%%xmm3 \n"
367 "movq %%xmm3,(%1) \n"
368 "movdqa %%xmm3,%%xmm7 \n"
369 "palignr $0x8,%%xmm7,%%xmm7 \n"
370 "sub $0x8,%2 \n"
371 "movq %%xmm7,(%1,%4) \n"
372 "lea (%1,%4,2),%1 \n"
373 "jg 1b \n"
374 : "+r"(src), // %0
375 "+r"(dst), // %1
376 "+r"(width) // %2
377 : "r"((intptr_t)(src_stride)), // %3
378 "r"((intptr_t)(dst_stride)) // %4
379 : "memory", "cc",
380 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
381 );
382 }
383
384 #if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
385 #define HAS_TRANSPOSE_UVWX8_SSE2
386 void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
387 uint8* dst_a, int dst_stride_a,
388 uint8* dst_b, int dst_stride_b,
389 int w);
390 asm (
391 DECLARE_FUNCTION(TransposeUVWx8_SSE2)
392 "push %ebx \n"
393 "push %esi \n"
394 "push %edi \n"
395 "push %ebp \n"
396 "mov 0x14(%esp),%eax \n"
397 "mov 0x18(%esp),%edi \n"
398 "mov 0x1c(%esp),%edx \n"
399 "mov 0x20(%esp),%esi \n"
400 "mov 0x24(%esp),%ebx \n"
401 "mov 0x28(%esp),%ebp \n"
402 "mov %esp,%ecx \n"
403 "sub $0x14,%esp \n"
404 "and $0xfffffff0,%esp \n"
405 "mov %ecx,0x10(%esp) \n"
406 "mov 0x2c(%ecx),%ecx \n"
407
408 "1: \n"
409 "movdqu (%eax),%xmm0 \n"
410 "movdqu (%eax,%edi,1),%xmm1 \n"
411 "lea (%eax,%edi,2),%eax \n"
412 "movdqa %xmm0,%xmm7 \n"
413 "punpcklbw %xmm1,%xmm0 \n"
414 "punpckhbw %xmm1,%xmm7 \n"
415 "movdqa %xmm7,%xmm1 \n"
416 "movdqu (%eax),%xmm2 \n"
417 "movdqu (%eax,%edi,1),%xmm3 \n"
418 "lea (%eax,%edi,2),%eax \n"
419 "movdqa %xmm2,%xmm7 \n"
420 "punpcklbw %xmm3,%xmm2 \n"
421 "punpckhbw %xmm3,%xmm7 \n"
422 "movdqa %xmm7,%xmm3 \n"
423 "movdqu (%eax),%xmm4 \n"
424 "movdqu (%eax,%edi,1),%xmm5 \n"
425 "lea (%eax,%edi,2),%eax \n"
426 "movdqa %xmm4,%xmm7 \n"
427 "punpcklbw %xmm5,%xmm4 \n"
428 "punpckhbw %xmm5,%xmm7 \n"
429 "movdqa %xmm7,%xmm5 \n"
430 "movdqu (%eax),%xmm6 \n"
431 "movdqu (%eax,%edi,1),%xmm7 \n"
432 "lea (%eax,%edi,2),%eax \n"
433 "movdqu %xmm5,(%esp) \n"
434 "neg %edi \n"
435 "movdqa %xmm6,%xmm5 \n"
436 "punpcklbw %xmm7,%xmm6 \n"
437 "punpckhbw %xmm7,%xmm5 \n"
438 "movdqa %xmm5,%xmm7 \n"
439 "lea 0x10(%eax,%edi,8),%eax \n"
440 "neg %edi \n"
441 "movdqa %xmm0,%xmm5 \n"
442 "punpcklwd %xmm2,%xmm0 \n"
443 "punpckhwd %xmm2,%xmm5 \n"
444 "movdqa %xmm5,%xmm2 \n"
445 "movdqa %xmm1,%xmm5 \n"
446 "punpcklwd %xmm3,%xmm1 \n"
447 "punpckhwd %xmm3,%xmm5 \n"
448 "movdqa %xmm5,%xmm3 \n"
449 "movdqa %xmm4,%xmm5 \n"
450 "punpcklwd %xmm6,%xmm4 \n"
451 "punpckhwd %xmm6,%xmm5 \n"
452 "movdqa %xmm5,%xmm6 \n"
453 "movdqu (%esp),%xmm5 \n"
454 "movdqu %xmm6,(%esp) \n"
455 "movdqa %xmm5,%xmm6 \n"
456 "punpcklwd %xmm7,%xmm5 \n"
457 "punpckhwd %xmm7,%xmm6 \n"
458 "movdqa %xmm6,%xmm7 \n"
459 "movdqa %xmm0,%xmm6 \n"
460 "punpckldq %xmm4,%xmm0 \n"
461 "punpckhdq %xmm4,%xmm6 \n"
462 "movdqa %xmm6,%xmm4 \n"
463 "movdqu (%esp),%xmm6 \n"
464 "movlpd %xmm0,(%edx) \n"
465 "movhpd %xmm0,(%ebx) \n"
466 "movlpd %xmm4,(%edx,%esi,1) \n"
467 "lea (%edx,%esi,2),%edx \n"
468 "movhpd %xmm4,(%ebx,%ebp,1) \n"
469 "lea (%ebx,%ebp,2),%ebx \n"
470 "movdqa %xmm2,%xmm0 \n"
471 "punpckldq %xmm6,%xmm2 \n"
472 "movlpd %xmm2,(%edx) \n"
473 "movhpd %xmm2,(%ebx) \n"
474 "punpckhdq %xmm6,%xmm0 \n"
475 "movlpd %xmm0,(%edx,%esi,1) \n"
476 "lea (%edx,%esi,2),%edx \n"
477 "movhpd %xmm0,(%ebx,%ebp,1) \n"
478 "lea (%ebx,%ebp,2),%ebx \n"
479 "movdqa %xmm1,%xmm0 \n"
480 "punpckldq %xmm5,%xmm1 \n"
481 "movlpd %xmm1,(%edx) \n"
482 "movhpd %xmm1,(%ebx) \n"
483 "punpckhdq %xmm5,%xmm0 \n"
484 "movlpd %xmm0,(%edx,%esi,1) \n"
485 "lea (%edx,%esi,2),%edx \n"
486 "movhpd %xmm0,(%ebx,%ebp,1) \n"
487 "lea (%ebx,%ebp,2),%ebx \n"
488 "movdqa %xmm3,%xmm0 \n"
489 "punpckldq %xmm7,%xmm3 \n"
490 "movlpd %xmm3,(%edx) \n"
491 "movhpd %xmm3,(%ebx) \n"
492 "punpckhdq %xmm7,%xmm0 \n"
493 "sub $0x8,%ecx \n"
494 "movlpd %xmm0,(%edx,%esi,1) \n"
495 "lea (%edx,%esi,2),%edx \n"
496 "movhpd %xmm0,(%ebx,%ebp,1) \n"
497 "lea (%ebx,%ebp,2),%ebx \n"
498 "jg 1b \n"
499 "mov 0x10(%esp),%esp \n"
500 "pop %ebp \n"
501 "pop %edi \n"
502 "pop %esi \n"
503 "pop %ebx \n"
504 #if defined(__native_client__)
505 "pop %ecx \n"
506 "and $0xffffffe0,%ecx \n"
507 "jmp *%ecx \n"
508 #else
509 "ret \n"
510 #endif
511 );
512 #endif
513 #if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
514 defined(__x86_64__)
515 // 64 bit version has enough registers to do 16x8 to 8x16 at a time.
516 #define HAS_TRANSPOSE_WX8_FAST_SSSE3
517 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
518 uint8* dst, int dst_stride, int width) {
519 asm volatile (
520 // Read in the data from the source pointer.
521 // First round of bit swap.
522 ".p2align 2 \n"
523 "1: \n"
524 "movdqu (%0),%%xmm0 \n"
525 "movdqu (%0,%3),%%xmm1 \n"
526 "lea (%0,%3,2),%0 \n"
527 "movdqa %%xmm0,%%xmm8 \n"
528 "punpcklbw %%xmm1,%%xmm0 \n"
529 "punpckhbw %%xmm1,%%xmm8 \n"
530 "movdqu (%0),%%xmm2 \n"
531 "movdqa %%xmm0,%%xmm1 \n"
532 "movdqa %%xmm8,%%xmm9 \n"
533 "palignr $0x8,%%xmm1,%%xmm1 \n"
534 "palignr $0x8,%%xmm9,%%xmm9 \n"
535 "movdqu (%0,%3),%%xmm3 \n"
536 "lea (%0,%3,2),%0 \n"
537 "movdqa %%xmm2,%%xmm10 \n"
538 "punpcklbw %%xmm3,%%xmm2 \n"
539 "punpckhbw %%xmm3,%%xmm10 \n"
540 "movdqa %%xmm2,%%xmm3 \n"
541 "movdqa %%xmm10,%%xmm11 \n"
542 "movdqu (%0),%%xmm4 \n"
543 "palignr $0x8,%%xmm3,%%xmm3 \n"
544 "palignr $0x8,%%xmm11,%%xmm11 \n"
545 "movdqu (%0,%3),%%xmm5 \n"
546 "lea (%0,%3,2),%0 \n"
547 "movdqa %%xmm4,%%xmm12 \n"
548 "punpcklbw %%xmm5,%%xmm4 \n"
549 "punpckhbw %%xmm5,%%xmm12 \n"
550 "movdqa %%xmm4,%%xmm5 \n"
551 "movdqa %%xmm12,%%xmm13 \n"
552 "movdqu (%0),%%xmm6 \n"
553 "palignr $0x8,%%xmm5,%%xmm5 \n"
554 "palignr $0x8,%%xmm13,%%xmm13 \n"
555 "movdqu (%0,%3),%%xmm7 \n"
556 "lea (%0,%3,2),%0 \n"
557 "movdqa %%xmm6,%%xmm14 \n"
558 "punpcklbw %%xmm7,%%xmm6 \n"
559 "punpckhbw %%xmm7,%%xmm14 \n"
560 "neg %3 \n"
561 "movdqa %%xmm6,%%xmm7 \n"
562 "movdqa %%xmm14,%%xmm15 \n"
563 "lea 0x10(%0,%3,8),%0 \n"
564 "palignr $0x8,%%xmm7,%%xmm7 \n"
565 "palignr $0x8,%%xmm15,%%xmm15 \n"
566 "neg %3 \n"
567 // Second round of bit swap.
568 "punpcklwd %%xmm2,%%xmm0 \n"
569 "punpcklwd %%xmm3,%%xmm1 \n"
570 "movdqa %%xmm0,%%xmm2 \n"
571 "movdqa %%xmm1,%%xmm3 \n"
572 "palignr $0x8,%%xmm2,%%xmm2 \n"
573 "palignr $0x8,%%xmm3,%%xmm3 \n"
574 "punpcklwd %%xmm6,%%xmm4 \n"
575 "punpcklwd %%xmm7,%%xmm5 \n"
576 "movdqa %%xmm4,%%xmm6 \n"
577 "movdqa %%xmm5,%%xmm7 \n"
578 "palignr $0x8,%%xmm6,%%xmm6 \n"
579 "palignr $0x8,%%xmm7,%%xmm7 \n"
580 "punpcklwd %%xmm10,%%xmm8 \n"
581 "punpcklwd %%xmm11,%%xmm9 \n"
582 "movdqa %%xmm8,%%xmm10 \n"
583 "movdqa %%xmm9,%%xmm11 \n"
584 "palignr $0x8,%%xmm10,%%xmm10 \n"
585 "palignr $0x8,%%xmm11,%%xmm11 \n"
586 "punpcklwd %%xmm14,%%xmm12 \n"
587 "punpcklwd %%xmm15,%%xmm13 \n"
588 "movdqa %%xmm12,%%xmm14 \n"
589 "movdqa %%xmm13,%%xmm15 \n"
590 "palignr $0x8,%%xmm14,%%xmm14 \n"
591 "palignr $0x8,%%xmm15,%%xmm15 \n"
592 // Third round of bit swap.
593 // Write to the destination pointer.
594 "punpckldq %%xmm4,%%xmm0 \n"
595 "movq %%xmm0,(%1) \n"
596 "movdqa %%xmm0,%%xmm4 \n"
597 "palignr $0x8,%%xmm4,%%xmm4 \n"
598 "movq %%xmm4,(%1,%4) \n"
599 "lea (%1,%4,2),%1 \n"
600 "punpckldq %%xmm6,%%xmm2 \n"
601 "movdqa %%xmm2,%%xmm6 \n"
602 "movq %%xmm2,(%1) \n"
603 "palignr $0x8,%%xmm6,%%xmm6 \n"
604 "punpckldq %%xmm5,%%xmm1 \n"
605 "movq %%xmm6,(%1,%4) \n"
606 "lea (%1,%4,2),%1 \n"
607 "movdqa %%xmm1,%%xmm5 \n"
608 "movq %%xmm1,(%1) \n"
609 "palignr $0x8,%%xmm5,%%xmm5 \n"
610 "movq %%xmm5,(%1,%4) \n"
611 "lea (%1,%4,2),%1 \n"
612 "punpckldq %%xmm7,%%xmm3 \n"
613 "movq %%xmm3,(%1) \n"
614 "movdqa %%xmm3,%%xmm7 \n"
615 "palignr $0x8,%%xmm7,%%xmm7 \n"
616 "movq %%xmm7,(%1,%4) \n"
617 "lea (%1,%4,2),%1 \n"
618 "punpckldq %%xmm12,%%xmm8 \n"
619 "movq %%xmm8,(%1) \n"
620 "movdqa %%xmm8,%%xmm12 \n"
621 "palignr $0x8,%%xmm12,%%xmm12 \n"
622 "movq %%xmm12,(%1,%4) \n"
623 "lea (%1,%4,2),%1 \n"
624 "punpckldq %%xmm14,%%xmm10 \n"
625 "movdqa %%xmm10,%%xmm14 \n"
626 "movq %%xmm10,(%1) \n"
627 "palignr $0x8,%%xmm14,%%xmm14 \n"
628 "punpckldq %%xmm13,%%xmm9 \n"
629 "movq %%xmm14,(%1,%4) \n"
630 "lea (%1,%4,2),%1 \n"
631 "movdqa %%xmm9,%%xmm13 \n"
632 "movq %%xmm9,(%1) \n"
633 "palignr $0x8,%%xmm13,%%xmm13 \n"
634 "movq %%xmm13,(%1,%4) \n"
635 "lea (%1,%4,2),%1 \n"
636 "punpckldq %%xmm15,%%xmm11 \n"
637 "movq %%xmm11,(%1) \n"
638 "movdqa %%xmm11,%%xmm15 \n"
639 "palignr $0x8,%%xmm15,%%xmm15 \n"
640 "sub $0x10,%2 \n"
641 "movq %%xmm15,(%1,%4) \n"
642 "lea (%1,%4,2),%1 \n"
643 "jg 1b \n"
644 : "+r"(src), // %0
645 "+r"(dst), // %1
646 "+r"(width) // %2
647 : "r"((intptr_t)(src_stride)), // %3
648 "r"((intptr_t)(dst_stride)) // %4
649 : "memory", "cc",
650 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
651 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
652 );
653 }
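
Because x86-64 adds xmm8-xmm15, this variant keeps both halves of each 16-byte source row in registers, so one pass of the loop transposes a 16x8 block into an 8x16 block and the width counter is decremented by 16 (sub $0x10,%2) instead of 8. For an illustrative width of 64, the 32-bit kernels above need eight passes per 8-row strip where this one needs four.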
654
655 #define HAS_TRANSPOSE_UVWX8_SSE2
656 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
657 uint8* dst_a, int dst_stride_a,
658 uint8* dst_b, int dst_stride_b,
659 int w) {
660 asm volatile (
661 // Read in the data from the source pointer.
662 // First round of bit swap.
663 ".p2align 2 \n"
664 "1: \n"
665 "movdqu (%0),%%xmm0 \n"
666 "movdqu (%0,%4),%%xmm1 \n"
667 "lea (%0,%4,2),%0 \n"
668 "movdqa %%xmm0,%%xmm8 \n"
669 "punpcklbw %%xmm1,%%xmm0 \n"
670 "punpckhbw %%xmm1,%%xmm8 \n"
671 "movdqa %%xmm8,%%xmm1 \n"
672 "movdqu (%0),%%xmm2 \n"
673 "movdqu (%0,%4),%%xmm3 \n"
674 "lea (%0,%4,2),%0 \n"
675 "movdqa %%xmm2,%%xmm8 \n"
676 "punpcklbw %%xmm3,%%xmm2 \n"
677 "punpckhbw %%xmm3,%%xmm8 \n"
678 "movdqa %%xmm8,%%xmm3 \n"
679 "movdqu (%0),%%xmm4 \n"
680 "movdqu (%0,%4),%%xmm5 \n"
681 "lea (%0,%4,2),%0 \n"
682 "movdqa %%xmm4,%%xmm8 \n"
683 "punpcklbw %%xmm5,%%xmm4 \n"
684 "punpckhbw %%xmm5,%%xmm8 \n"
685 "movdqa %%xmm8,%%xmm5 \n"
686 "movdqu (%0),%%xmm6 \n"
687 "movdqu (%0,%4),%%xmm7 \n"
688 "lea (%0,%4,2),%0 \n"
689 "movdqa %%xmm6,%%xmm8 \n"
690 "punpcklbw %%xmm7,%%xmm6 \n"
691 "neg %4 \n"
692 "lea 0x10(%0,%4,8),%0 \n"
693 "punpckhbw %%xmm7,%%xmm8 \n"
694 "movdqa %%xmm8,%%xmm7 \n"
695 "neg %4 \n"
696 // Second round of bit swap.
697 "movdqa %%xmm0,%%xmm8 \n"
698 "movdqa %%xmm1,%%xmm9 \n"
699 "punpckhwd %%xmm2,%%xmm8 \n"
700 "punpckhwd %%xmm3,%%xmm9 \n"
701 "punpcklwd %%xmm2,%%xmm0 \n"
702 "punpcklwd %%xmm3,%%xmm1 \n"
703 "movdqa %%xmm8,%%xmm2 \n"
704 "movdqa %%xmm9,%%xmm3 \n"
705 "movdqa %%xmm4,%%xmm8 \n"
706 "movdqa %%xmm5,%%xmm9 \n"
707 "punpckhwd %%xmm6,%%xmm8 \n"
708 "punpckhwd %%xmm7,%%xmm9 \n"
709 "punpcklwd %%xmm6,%%xmm4 \n"
710 "punpcklwd %%xmm7,%%xmm5 \n"
711 "movdqa %%xmm8,%%xmm6 \n"
712 "movdqa %%xmm9,%%xmm7 \n"
713 // Third round of bit swap.
714 // Write to the destination pointer.
715 "movdqa %%xmm0,%%xmm8 \n"
716 "punpckldq %%xmm4,%%xmm0 \n"
717 "movlpd %%xmm0,(%1) \n" // Write back U channel
718 "movhpd %%xmm0,(%2) \n" // Write back V channel
719 "punpckhdq %%xmm4,%%xmm8 \n"
720 "movlpd %%xmm8,(%1,%5) \n"
721 "lea (%1,%5,2),%1 \n"
722 "movhpd %%xmm8,(%2,%6) \n"
723 "lea (%2,%6,2),%2 \n"
724 "movdqa %%xmm2,%%xmm8 \n"
725 "punpckldq %%xmm6,%%xmm2 \n"
726 "movlpd %%xmm2,(%1) \n"
727 "movhpd %%xmm2,(%2) \n"
728 "punpckhdq %%xmm6,%%xmm8 \n"
729 "movlpd %%xmm8,(%1,%5) \n"
730 "lea (%1,%5,2),%1 \n"
731 "movhpd %%xmm8,(%2,%6) \n"
732 "lea (%2,%6,2),%2 \n"
733 "movdqa %%xmm1,%%xmm8 \n"
734 "punpckldq %%xmm5,%%xmm1 \n"
735 "movlpd %%xmm1,(%1) \n"
736 "movhpd %%xmm1,(%2) \n"
737 "punpckhdq %%xmm5,%%xmm8 \n"
738 "movlpd %%xmm8,(%1,%5) \n"
739 "lea (%1,%5,2),%1 \n"
740 "movhpd %%xmm8,(%2,%6) \n"
741 "lea (%2,%6,2),%2 \n"
742 "movdqa %%xmm3,%%xmm8 \n"
743 "punpckldq %%xmm7,%%xmm3 \n"
744 "movlpd %%xmm3,(%1) \n"
745 "movhpd %%xmm3,(%2) \n"
746 "punpckhdq %%xmm7,%%xmm8 \n"
747 "sub $0x8,%3 \n"
748 "movlpd %%xmm8,(%1,%5) \n"
749 "lea (%1,%5,2),%1 \n"
750 "movhpd %%xmm8,(%2,%6) \n"
751 "lea (%2,%6,2),%2 \n"
752 "jg 1b \n"
753 : "+r"(src), // %0
754 "+r"(dst_a), // %1
755 "+r"(dst_b), // %2
756 "+r"(w) // %3
757 : "r"((intptr_t)(src_stride)), // %4
758 "r"((intptr_t)(dst_stride_a)), // %5
759 "r"((intptr_t)(dst_stride_b)) // %6
760 : "memory", "cc",
761 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
762 "xmm8", "xmm9"
763 );
764 }
765 #endif
766 #endif
767
768 static void TransposeWx8_C(const uint8* src, int src_stride,
769 uint8* dst, int dst_stride,
770 int width) {
771 int i;
772 for (i = 0; i < width; ++i) {
773 dst[0] = src[0 * src_stride];
774 dst[1] = src[1 * src_stride];
775 dst[2] = src[2 * src_stride];
776 dst[3] = src[3 * src_stride];
777 dst[4] = src[4 * src_stride];
778 dst[5] = src[5 * src_stride];
779 dst[6] = src[6 * src_stride];
780 dst[7] = src[7 * src_stride];
781 ++src;
782 dst += dst_stride;
783 }
784 }
785
786 static void TransposeWxH_C(const uint8* src, int src_stride,
787 uint8* dst, int dst_stride,
788 int width, int height) {
789 int i;
790 for (i = 0; i < width; ++i) {
791 int j;
792 for (j = 0; j < height; ++j) {
793 dst[i * dst_stride + j] = src[j * src_stride + i];
794 }
795 }
796 }
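
Both plain-C routines implement the same relation — the byte at source row y, column x moves to destination row x, column y — with TransposeWx8_C being the unrolled height-8 special case used for full tiles. A minimal sanity-check sketch (hypothetical, not part of the file; it would have to sit below these static definitions to call them):

    /* Hypothetical check: transpose a 3-row by 4-column tile and spot-check
     * the relation dst[x * dst_stride + y] == src[y * src_stride + x]. */
    static int CheckTransposeWxH_C(void) {
      /* stride 4: row 0 = 0..3, row 1 = 10..13, row 2 = 20..23. */
      uint8 src[3 * 4] = { 0,  1,  2,  3,
                          10, 11, 12, 13,
                          20, 21, 22, 23};
      uint8 dst[4 * 3];
      TransposeWxH_C(src, 4, dst, 3, 4, 3);  /* width = 4, height = 3 */
      return dst[1 * 3 + 2] == 21 &&  /* source row 2, column 1 */
             dst[3 * 3 + 0] == 3;     /* source row 0, column 3 */
    }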
797
798 LIBYUV_API 24 LIBYUV_API
799 void TransposePlane(const uint8* src, int src_stride, 25 void TransposePlane(const uint8* src, int src_stride,
800 uint8* dst, int dst_stride, 26 uint8* dst, int dst_stride,
801 int width, int height) { 27 int width, int height) {
802 int i = height; 28 int i = height;
803 void (*TransposeWx8)(const uint8* src, int src_stride, 29 void (*TransposeWx8)(const uint8* src, int src_stride,
804 uint8* dst, int dst_stride, 30 uint8* dst, int dst_stride, int width) = TransposeWx8_C;
805 int width) = TransposeWx8_C; 31 #if defined(HAS_TRANSPOSEWX8_NEON)
806 #if defined(HAS_TRANSPOSE_WX8_NEON)
807 if (TestCpuFlag(kCpuHasNEON)) { 32 if (TestCpuFlag(kCpuHasNEON)) {
808 TransposeWx8 = TransposeWx8_NEON; 33 TransposeWx8 = TransposeWx8_NEON;
809 } 34 }
810 #endif 35 #endif
811 #if defined(HAS_TRANSPOSE_WX8_SSSE3) 36 #if defined(HAS_TRANSPOSEWX8_SSSE3)
812 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { 37 if (TestCpuFlag(kCpuHasSSSE3)) {
813 TransposeWx8 = TransposeWx8_SSSE3; 38 TransposeWx8 = TransposeWx8_Any_SSSE3;
39 if (IS_ALIGNED(width, 8)) {
40 TransposeWx8 = TransposeWx8_SSSE3;
41 }
814 } 42 }
815 #endif 43 #endif
816 #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) 44 #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
817 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { 45 if (TestCpuFlag(kCpuHasSSSE3)) {
818 TransposeWx8 = TransposeWx8_FAST_SSSE3; 46 TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
47 if (IS_ALIGNED(width, 16)) {
48 TransposeWx8 = TransposeWx8_Fast_SSSE3;
49 }
819 } 50 }
820 #endif 51 #endif
821 #if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2) 52 #if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2)
822 if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { 53 if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
823 if (IS_ALIGNED(width, 4) && 54 if (IS_ALIGNED(width, 4) &&
824 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { 55 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
825 TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2; 56 TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2;
826 } else { 57 } else {
827 TransposeWx8 = TransposeWx8_MIPS_DSPR2; 58 TransposeWx8 = TransposeWx8_MIPS_DSPR2;
828 } 59 }
829 } 60 }
830 #endif 61 #endif
831 62
832 // Work across the source in 8x8 tiles 63 // Work across the source in 8x8 tiles
833 while (i >= 8) { 64 while (i >= 8) {
834 TransposeWx8(src, src_stride, dst, dst_stride, width); 65 TransposeWx8(src, src_stride, dst, dst_stride, width);
835 src += 8 * src_stride; // Go down 8 rows. 66 src += 8 * src_stride; // Go down 8 rows.
836 dst += 8; // Move over 8 columns. 67 dst += 8; // Move over 8 columns.
837 i -= 8; 68 i -= 8;
838 } 69 }
839 70
840 TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); 71 if (i > 0) {
72 TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
73 }
841 } 74 }
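
For instance (illustrative numbers, not from the source), a plane with height = 29 takes three passes of the 8-row loop, after which i = 5 and the remaining five rows are finished by TransposeWxH_C; the new version's if (i > 0) guard merely skips that tail call when the height is already a multiple of 8.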
842 75
843 LIBYUV_API 76 LIBYUV_API
844 void RotatePlane90(const uint8* src, int src_stride, 77 void RotatePlane90(const uint8* src, int src_stride,
845 uint8* dst, int dst_stride, 78 uint8* dst, int dst_stride,
846 int width, int height) { 79 int width, int height) {
847 // Rotate by 90 is a transpose with the source read 80 // Rotate by 90 is a transpose with the source read
848 // from bottom to top. So set the source pointer to the end 81 // from bottom to top. So set the source pointer to the end
849 // of the buffer and flip the sign of the source stride. 82 // of the buffer and flip the sign of the source stride.
850 src += src_stride * (height - 1); 83 src += src_stride * (height - 1);
(...skipping 97 matching lines...)
948 src += src_stride; 181 src += src_stride;
949 MirrorRow(src_bot, dst, width); // Mirror last row into first row 182 MirrorRow(src_bot, dst, width); // Mirror last row into first row
950 dst += dst_stride; 183 dst += dst_stride;
951 CopyRow(row, dst_bot, width); // Copy first mirrored row into last 184 CopyRow(row, dst_bot, width); // Copy first mirrored row into last
952 src_bot -= src_stride; 185 src_bot -= src_stride;
953 dst_bot -= dst_stride; 186 dst_bot -= dst_stride;
954 } 187 }
955 free_aligned_buffer_64(row); 188 free_aligned_buffer_64(row);
956 } 189 }
957 190
958 static void TransposeUVWx8_C(const uint8* src, int src_stride,
959 uint8* dst_a, int dst_stride_a,
960 uint8* dst_b, int dst_stride_b,
961 int width) {
962 int i;
963 for (i = 0; i < width; ++i) {
964 dst_a[0] = src[0 * src_stride + 0];
965 dst_b[0] = src[0 * src_stride + 1];
966 dst_a[1] = src[1 * src_stride + 0];
967 dst_b[1] = src[1 * src_stride + 1];
968 dst_a[2] = src[2 * src_stride + 0];
969 dst_b[2] = src[2 * src_stride + 1];
970 dst_a[3] = src[3 * src_stride + 0];
971 dst_b[3] = src[3 * src_stride + 1];
972 dst_a[4] = src[4 * src_stride + 0];
973 dst_b[4] = src[4 * src_stride + 1];
974 dst_a[5] = src[5 * src_stride + 0];
975 dst_b[5] = src[5 * src_stride + 1];
976 dst_a[6] = src[6 * src_stride + 0];
977 dst_b[6] = src[6 * src_stride + 1];
978 dst_a[7] = src[7 * src_stride + 0];
979 dst_b[7] = src[7 * src_stride + 1];
980 src += 2;
981 dst_a += dst_stride_a;
982 dst_b += dst_stride_b;
983 }
984 }
985
986 static void TransposeUVWxH_C(const uint8* src, int src_stride,
987 uint8* dst_a, int dst_stride_a,
988 uint8* dst_b, int dst_stride_b,
989 int width, int height) {
990 int i;
991 for (i = 0; i < width * 2; i += 2) {
992 int j;
993 for (j = 0; j < height; ++j) {
994 dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
995 dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
996 }
997 }
998 }
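
In the interleaved source each pixel is a pair of bytes, so even offsets feed dst_a and odd offsets feed dst_b (the SSE2 version's comments above label these the U and V channels). As a worked example with src_stride = 8: the second byte of the second pixel on row 3 sits at src[2 + 3 * 8 + 1] = src[27], and TransposeUVWxH_C writes it to dst_b[3 + 1 * dst_stride_b].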
999
1000 LIBYUV_API 191 LIBYUV_API
1001 void TransposeUV(const uint8* src, int src_stride, 192 void TransposeUV(const uint8* src, int src_stride,
1002 uint8* dst_a, int dst_stride_a, 193 uint8* dst_a, int dst_stride_a,
1003 uint8* dst_b, int dst_stride_b, 194 uint8* dst_b, int dst_stride_b,
1004 int width, int height) { 195 int width, int height) {
1005 int i = height; 196 int i = height;
1006 void (*TransposeUVWx8)(const uint8* src, int src_stride, 197 void (*TransposeUVWx8)(const uint8* src, int src_stride,
1007 uint8* dst_a, int dst_stride_a, 198 uint8* dst_a, int dst_stride_a,
1008 uint8* dst_b, int dst_stride_b, 199 uint8* dst_b, int dst_stride_b,
1009 int width) = TransposeUVWx8_C; 200 int width) = TransposeUVWx8_C;
1010 #if defined(HAS_TRANSPOSE_UVWX8_NEON) 201 #if defined(HAS_TRANSPOSEUVWX8_NEON)
1011 if (TestCpuFlag(kCpuHasNEON)) { 202 if (TestCpuFlag(kCpuHasNEON)) {
1012 TransposeUVWx8 = TransposeUVWx8_NEON; 203 TransposeUVWx8 = TransposeUVWx8_NEON;
1013 } 204 }
1014 #endif 205 #endif
1015 #if defined(HAS_TRANSPOSE_UVWX8_SSE2) 206 #if defined(HAS_TRANSPOSEUVWX8_SSE2)
1016 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { 207 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
1017 TransposeUVWx8 = TransposeUVWx8_SSE2; 208 TransposeUVWx8 = TransposeUVWx8_SSE2;
1018 } 209 }
1019 #endif 210 #endif
1020 #if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) 211 #if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2)
1021 if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && 212 if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
1022 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { 213 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
1023 TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; 214 TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
1024 } 215 }
1025 #endif 216 #endif
1026 217
1027 // Work through the source in 8x8 tiles. 218 // Work through the source in 8x8 tiles.
1028 while (i >= 8) { 219 while (i >= 8) {
1029 TransposeUVWx8(src, src_stride, 220 TransposeUVWx8(src, src_stride,
1030 dst_a, dst_stride_a, 221 dst_a, dst_stride_a,
1031 dst_b, dst_stride_b, 222 dst_b, dst_stride_b,
1032 width); 223 width);
1033 src += 8 * src_stride; // Go down 8 rows. 224 src += 8 * src_stride; // Go down 8 rows.
1034 dst_a += 8; // Move over 8 columns. 225 dst_a += 8; // Move over 8 columns.
1035 dst_b += 8; // Move over 8 columns. 226 dst_b += 8; // Move over 8 columns.
1036 i -= 8; 227 i -= 8;
1037 } 228 }
1038 229
1039 TransposeUVWxH_C(src, src_stride, 230 if (i > 0) {
1040 dst_a, dst_stride_a, 231 TransposeUVWxH_C(src, src_stride,
1041 dst_b, dst_stride_b, 232 dst_a, dst_stride_a,
1042 width, i); 233 dst_b, dst_stride_b,
234 width, i);
235 }
1043 } 236 }
1044 237
1045 LIBYUV_API 238 LIBYUV_API
1046 void RotateUV90(const uint8* src, int src_stride, 239 void RotateUV90(const uint8* src, int src_stride,
1047 uint8* dst_a, int dst_stride_a, 240 uint8* dst_a, int dst_stride_a,
1048 uint8* dst_b, int dst_stride_b, 241 uint8* dst_b, int dst_stride_b,
1049 int width, int height) { 242 int width, int height) {
1050 src += src_stride * (height - 1); 243 src += src_stride * (height - 1);
1051 src_stride = -src_stride; 244 src_stride = -src_stride;
1052 245
(...skipping 241 matching lines...)
1294 default: 487 default:
1295 break; 488 break;
1296 } 489 }
1297 return -1; 490 return -1;
1298 } 491 }
1299 492
1300 #ifdef __cplusplus 493 #ifdef __cplusplus
1301 } // extern "C" 494 } // extern "C"
1302 } // namespace libyuv 495 } // namespace libyuv
1303 #endif 496 #endif