| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include "libyuv/rotate.h" | 11 #include "libyuv/rotate.h" |
| 12 | 12 |
| 13 #include "libyuv/cpu_id.h" | 13 #include "libyuv/cpu_id.h" |
| 14 #include "libyuv/convert.h" | 14 #include "libyuv/convert.h" |
| 15 #include "libyuv/planar_functions.h" | 15 #include "libyuv/planar_functions.h" |
| 16 #include "libyuv/rotate_row.h" |
| 16 #include "libyuv/row.h" | 17 #include "libyuv/row.h" |
| 17 | 18 |
| 18 #ifdef __cplusplus | 19 #ifdef __cplusplus |
| 19 namespace libyuv { | 20 namespace libyuv { |
| 20 extern "C" { | 21 extern "C" { |
| 21 #endif | 22 #endif |
| 22 | 23 |
| 23 #if !defined(LIBYUV_DISABLE_X86) && \ | |
| 24 (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) | |
| 25 #if defined(__APPLE__) && defined(__i386__) | |
| 26 #define DECLARE_FUNCTION(name) \ | |
| 27 ".text \n" \ | |
| 28 ".private_extern _" #name " \n" \ | |
| 29 ".align 4,0x90 \n" \ | |
| 30 "_" #name ": \n" | |
| 31 #elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__) | |
| 32 #define DECLARE_FUNCTION(name) \ | |
| 33 ".text \n" \ | |
| 34 ".align 4,0x90 \n" \ | |
| 35 "_" #name ": \n" | |
| 36 #else | |
| 37 #define DECLARE_FUNCTION(name) \ | |
| 38 ".text \n" \ | |
| 39 ".align 4,0x90 \n" \ | |
| 40 #name ": \n" | |
| 41 #endif | |
| 42 #endif | |
| 43 | |
| 44 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ | |
| 45 (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) | |
| 46 #define HAS_TRANSPOSE_WX8_NEON | |
| 47 void TransposeWx8_NEON(const uint8* src, int src_stride, | |
| 48 uint8* dst, int dst_stride, int width); | |
| 49 #define HAS_TRANSPOSE_UVWX8_NEON | |
| 50 void TransposeUVWx8_NEON(const uint8* src, int src_stride, | |
| 51 uint8* dst_a, int dst_stride_a, | |
| 52 uint8* dst_b, int dst_stride_b, | |
| 53 int width); | |
| 54 #endif | |
| 55 | |
| 56 #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ | |
| 57 defined(__mips__) && \ | |
| 58 defined(__mips_dsp) && (__mips_dsp_rev >= 2) | |
| 59 #define HAS_TRANSPOSE_WX8_MIPS_DSPR2 | |
| 60 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, | |
| 61 uint8* dst, int dst_stride, int width); | |
| 62 | |
| 63 void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, | |
| 64 uint8* dst, int dst_stride, int width); | |
| 65 #define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2 | |
| 66 void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, | |
| 67 uint8* dst_a, int dst_stride_a, | |
| 68 uint8* dst_b, int dst_stride_b, | |
| 69 int width); | |
| 70 #endif // defined(__mips__) | |
| 71 | |
| 72 #if !defined(LIBYUV_DISABLE_X86) && \ | |
| 73 defined(_M_IX86) && defined(_MSC_VER) | |
| 74 #define HAS_TRANSPOSE_WX8_SSSE3 | |
| 75 __declspec(naked) __declspec(align(16)) | |
| 76 static void TransposeWx8_SSSE3(const uint8* src, int src_stride, | |
| 77 uint8* dst, int dst_stride, int width) { | |
| 78 __asm { | |
| 79 push edi | |
| 80 push esi | |
| 81 push ebp | |
| 82 mov eax, [esp + 12 + 4] // src | |
| 83 mov edi, [esp + 12 + 8] // src_stride | |
| 84 mov edx, [esp + 12 + 12] // dst | |
| 85 mov esi, [esp + 12 + 16] // dst_stride | |
| 86 mov ecx, [esp + 12 + 20] // width | |
| 87 | |
| 88 // Read in the data from the source pointer. | |
| 89 // First round of bit swap. | |
| 90 align 4 | |
| 91 convertloop: | |
| 92 movq xmm0, qword ptr [eax] | |
| 93 lea ebp, [eax + 8] | |
| 94 movq xmm1, qword ptr [eax + edi] | |
| 95 lea eax, [eax + 2 * edi] | |
| 96 punpcklbw xmm0, xmm1 | |
| 97 movq xmm2, qword ptr [eax] | |
| 98 movdqa xmm1, xmm0 | |
| 99 palignr xmm1, xmm1, 8 | |
| 100 movq xmm3, qword ptr [eax + edi] | |
| 101 lea eax, [eax + 2 * edi] | |
| 102 punpcklbw xmm2, xmm3 | |
| 103 movdqa xmm3, xmm2 | |
| 104 movq xmm4, qword ptr [eax] | |
| 105 palignr xmm3, xmm3, 8 | |
| 106 movq xmm5, qword ptr [eax + edi] | |
| 107 punpcklbw xmm4, xmm5 | |
| 108 lea eax, [eax + 2 * edi] | |
| 109 movdqa xmm5, xmm4 | |
| 110 movq xmm6, qword ptr [eax] | |
| 111 palignr xmm5, xmm5, 8 | |
| 112 movq xmm7, qword ptr [eax + edi] | |
| 113 punpcklbw xmm6, xmm7 | |
| 114 mov eax, ebp | |
| 115 movdqa xmm7, xmm6 | |
| 116 palignr xmm7, xmm7, 8 | |
| 117 // Second round of bit swap. | |
| 118 punpcklwd xmm0, xmm2 | |
| 119 punpcklwd xmm1, xmm3 | |
| 120 movdqa xmm2, xmm0 | |
| 121 movdqa xmm3, xmm1 | |
| 122 palignr xmm2, xmm2, 8 | |
| 123 palignr xmm3, xmm3, 8 | |
| 124 punpcklwd xmm4, xmm6 | |
| 125 punpcklwd xmm5, xmm7 | |
| 126 movdqa xmm6, xmm4 | |
| 127 movdqa xmm7, xmm5 | |
| 128 palignr xmm6, xmm6, 8 | |
| 129 palignr xmm7, xmm7, 8 | |
| 130 // Third round of bit swap. | |
| 131 // Write to the destination pointer. | |
| 132 punpckldq xmm0, xmm4 | |
| 133 movq qword ptr [edx], xmm0 | |
| 134 movdqa xmm4, xmm0 | |
| 135 palignr xmm4, xmm4, 8 | |
| 136 movq qword ptr [edx + esi], xmm4 | |
| 137 lea edx, [edx + 2 * esi] | |
| 138 punpckldq xmm2, xmm6 | |
| 139 movdqa xmm6, xmm2 | |
| 140 palignr xmm6, xmm6, 8 | |
| 141 movq qword ptr [edx], xmm2 | |
| 142 punpckldq xmm1, xmm5 | |
| 143 movq qword ptr [edx + esi], xmm6 | |
| 144 lea edx, [edx + 2 * esi] | |
| 145 movdqa xmm5, xmm1 | |
| 146 movq qword ptr [edx], xmm1 | |
| 147 palignr xmm5, xmm5, 8 | |
| 148 punpckldq xmm3, xmm7 | |
| 149 movq qword ptr [edx + esi], xmm5 | |
| 150 lea edx, [edx + 2 * esi] | |
| 151 movq qword ptr [edx], xmm3 | |
| 152 movdqa xmm7, xmm3 | |
| 153 palignr xmm7, xmm7, 8 | |
| 154 sub ecx, 8 | |
| 155 movq qword ptr [edx + esi], xmm7 | |
| 156 lea edx, [edx + 2 * esi] | |
| 157 jg convertloop | |
| 158 | |
| 159 pop ebp | |
| 160 pop esi | |
| 161 pop edi | |
| 162 ret | |
| 163 } | |
| 164 } | |
| 165 | |
| 166 #define HAS_TRANSPOSE_UVWX8_SSE2 | |
| 167 __declspec(naked) __declspec(align(16)) | |
| 168 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, | |
| 169 uint8* dst_a, int dst_stride_a, | |
| 170 uint8* dst_b, int dst_stride_b, | |
| 171 int w) { | |
| 172 __asm { | |
| 173 push ebx | |
| 174 push esi | |
| 175 push edi | |
| 176 push ebp | |
| 177 mov eax, [esp + 16 + 4] // src | |
| 178 mov edi, [esp + 16 + 8] // src_stride | |
| 179 mov edx, [esp + 16 + 12] // dst_a | |
| 180 mov esi, [esp + 16 + 16] // dst_stride_a | |
| 181 mov ebx, [esp + 16 + 20] // dst_b | |
| 182 mov ebp, [esp + 16 + 24] // dst_stride_b | |
| 183 mov ecx, esp | |
| 184 sub esp, 4 + 16 | |
| 185 and esp, ~15 | |
| 186 mov [esp + 16], ecx | |
| 187 mov ecx, [ecx + 16 + 28] // w | |
| 188 | |
| 189 align 4 | |
| 190 convertloop: | |
| 191 // Read in the data from the source pointer. | |
| 192 // First round of bit swap. | |
| 193 movdqu xmm0, [eax] | |
| 194 movdqu xmm1, [eax + edi] | |
| 195 lea eax, [eax + 2 * edi] | |
| 196 movdqa xmm7, xmm0 // use xmm7 as temp register. | |
| 197 punpcklbw xmm0, xmm1 | |
| 198 punpckhbw xmm7, xmm1 | |
| 199 movdqa xmm1, xmm7 | |
| 200 movdqu xmm2, [eax] | |
| 201 movdqu xmm3, [eax + edi] | |
| 202 lea eax, [eax + 2 * edi] | |
| 203 movdqa xmm7, xmm2 | |
| 204 punpcklbw xmm2, xmm3 | |
| 205 punpckhbw xmm7, xmm3 | |
| 206 movdqa xmm3, xmm7 | |
| 207 movdqu xmm4, [eax] | |
| 208 movdqu xmm5, [eax + edi] | |
| 209 lea eax, [eax + 2 * edi] | |
| 210 movdqa xmm7, xmm4 | |
| 211 punpcklbw xmm4, xmm5 | |
| 212 punpckhbw xmm7, xmm5 | |
| 213 movdqa xmm5, xmm7 | |
| 214 movdqu xmm6, [eax] | |
| 215 movdqu xmm7, [eax + edi] | |
| 216 lea eax, [eax + 2 * edi] | |
| 217 movdqu [esp], xmm5 // backup xmm5 | |
| 218 neg edi | |
| 219 movdqa xmm5, xmm6 // use xmm5 as temp register. | |
| 220 punpcklbw xmm6, xmm7 | |
| 221 punpckhbw xmm5, xmm7 | |
| 222 movdqa xmm7, xmm5 | |
| 223 lea eax, [eax + 8 * edi + 16] | |
| 224 neg edi | |
| 225 // Second round of bit swap. | |
| 226 movdqa xmm5, xmm0 | |
| 227 punpcklwd xmm0, xmm2 | |
| 228 punpckhwd xmm5, xmm2 | |
| 229 movdqa xmm2, xmm5 | |
| 230 movdqa xmm5, xmm1 | |
| 231 punpcklwd xmm1, xmm3 | |
| 232 punpckhwd xmm5, xmm3 | |
| 233 movdqa xmm3, xmm5 | |
| 234 movdqa xmm5, xmm4 | |
| 235 punpcklwd xmm4, xmm6 | |
| 236 punpckhwd xmm5, xmm6 | |
| 237 movdqa xmm6, xmm5 | |
| 238 movdqu xmm5, [esp] // restore xmm5 | |
| 239 movdqu [esp], xmm6 // backup xmm6 | |
| 240 movdqa xmm6, xmm5 // use xmm6 as temp register. | |
| 241 punpcklwd xmm5, xmm7 | |
| 242 punpckhwd xmm6, xmm7 | |
| 243 movdqa xmm7, xmm6 | |
| 244 // Third round of bit swap. | |
| 245 // Write to the destination pointer. | |
| 246 movdqa xmm6, xmm0 | |
| 247 punpckldq xmm0, xmm4 | |
| 248 punpckhdq xmm6, xmm4 | |
| 249 movdqa xmm4, xmm6 | |
| 250 movdqu xmm6, [esp] // restore xmm6 | |
| 251 movlpd qword ptr [edx], xmm0 | |
| 252 movhpd qword ptr [ebx], xmm0 | |
| 253 movlpd qword ptr [edx + esi], xmm4 | |
| 254 lea edx, [edx + 2 * esi] | |
| 255 movhpd qword ptr [ebx + ebp], xmm4 | |
| 256 lea ebx, [ebx + 2 * ebp] | |
| 257 movdqa xmm0, xmm2 // use xmm0 as the temp register. | |
| 258 punpckldq xmm2, xmm6 | |
| 259 movlpd qword ptr [edx], xmm2 | |
| 260 movhpd qword ptr [ebx], xmm2 | |
| 261 punpckhdq xmm0, xmm6 | |
| 262 movlpd qword ptr [edx + esi], xmm0 | |
| 263 lea edx, [edx + 2 * esi] | |
| 264 movhpd qword ptr [ebx + ebp], xmm0 | |
| 265 lea ebx, [ebx + 2 * ebp] | |
| 266 movdqa xmm0, xmm1 // use xmm0 as the temp register. | |
| 267 punpckldq xmm1, xmm5 | |
| 268 movlpd qword ptr [edx], xmm1 | |
| 269 movhpd qword ptr [ebx], xmm1 | |
| 270 punpckhdq xmm0, xmm5 | |
| 271 movlpd qword ptr [edx + esi], xmm0 | |
| 272 lea edx, [edx + 2 * esi] | |
| 273 movhpd qword ptr [ebx + ebp], xmm0 | |
| 274 lea ebx, [ebx + 2 * ebp] | |
| 275 movdqa xmm0, xmm3 // use xmm0 as the temp register. | |
| 276 punpckldq xmm3, xmm7 | |
| 277 movlpd qword ptr [edx], xmm3 | |
| 278 movhpd qword ptr [ebx], xmm3 | |
| 279 punpckhdq xmm0, xmm7 | |
| 280 sub ecx, 8 | |
| 281 movlpd qword ptr [edx + esi], xmm0 | |
| 282 lea edx, [edx + 2 * esi] | |
| 283 movhpd qword ptr [ebx + ebp], xmm0 | |
| 284 lea ebx, [ebx + 2 * ebp] | |
| 285 jg convertloop | |
| 286 | |
| 287 mov esp, [esp + 16] | |
| 288 pop ebp | |
| 289 pop edi | |
| 290 pop esi | |
| 291 pop ebx | |
| 292 ret | |
| 293 } | |
| 294 } | |
| 295 #endif | |
| 296 #if !defined(LIBYUV_DISABLE_X86) && \ | |
| 297 (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) | |
| 298 #define HAS_TRANSPOSE_WX8_SSSE3 | |
| 299 static void TransposeWx8_SSSE3(const uint8* src, int src_stride, | |
| 300 uint8* dst, int dst_stride, int width) { | |
| 301 asm volatile ( | |
| 302 // Read in the data from the source pointer. | |
| 303 // First round of bit swap. | |
| 304 ".p2align 2 \n" | |
| 305 "1: \n" | |
| 306 "movq (%0),%%xmm0 \n" | |
| 307 "movq (%0,%3),%%xmm1 \n" | |
| 308 "lea (%0,%3,2),%0 \n" | |
| 309 "punpcklbw %%xmm1,%%xmm0 \n" | |
| 310 "movq (%0),%%xmm2 \n" | |
| 311 "movdqa %%xmm0,%%xmm1 \n" | |
| 312 "palignr $0x8,%%xmm1,%%xmm1 \n" | |
| 313 "movq (%0,%3),%%xmm3 \n" | |
| 314 "lea (%0,%3,2),%0 \n" | |
| 315 "punpcklbw %%xmm3,%%xmm2 \n" | |
| 316 "movdqa %%xmm2,%%xmm3 \n" | |
| 317 "movq (%0),%%xmm4 \n" | |
| 318 "palignr $0x8,%%xmm3,%%xmm3 \n" | |
| 319 "movq (%0,%3),%%xmm5 \n" | |
| 320 "lea (%0,%3,2),%0 \n" | |
| 321 "punpcklbw %%xmm5,%%xmm4 \n" | |
| 322 "movdqa %%xmm4,%%xmm5 \n" | |
| 323 "movq (%0),%%xmm6 \n" | |
| 324 "palignr $0x8,%%xmm5,%%xmm5 \n" | |
| 325 "movq (%0,%3),%%xmm7 \n" | |
| 326 "lea (%0,%3,2),%0 \n" | |
| 327 "punpcklbw %%xmm7,%%xmm6 \n" | |
| 328 "neg %3 \n" | |
| 329 "movdqa %%xmm6,%%xmm7 \n" | |
| 330 "lea 0x8(%0,%3,8),%0 \n" | |
| 331 "palignr $0x8,%%xmm7,%%xmm7 \n" | |
| 332 "neg %3 \n" | |
| 333 // Second round of bit swap. | |
| 334 "punpcklwd %%xmm2,%%xmm0 \n" | |
| 335 "punpcklwd %%xmm3,%%xmm1 \n" | |
| 336 "movdqa %%xmm0,%%xmm2 \n" | |
| 337 "movdqa %%xmm1,%%xmm3 \n" | |
| 338 "palignr $0x8,%%xmm2,%%xmm2 \n" | |
| 339 "palignr $0x8,%%xmm3,%%xmm3 \n" | |
| 340 "punpcklwd %%xmm6,%%xmm4 \n" | |
| 341 "punpcklwd %%xmm7,%%xmm5 \n" | |
| 342 "movdqa %%xmm4,%%xmm6 \n" | |
| 343 "movdqa %%xmm5,%%xmm7 \n" | |
| 344 "palignr $0x8,%%xmm6,%%xmm6 \n" | |
| 345 "palignr $0x8,%%xmm7,%%xmm7 \n" | |
| 346 // Third round of bit swap. | |
| 347 // Write to the destination pointer. | |
| 348 "punpckldq %%xmm4,%%xmm0 \n" | |
| 349 "movq %%xmm0,(%1) \n" | |
| 350 "movdqa %%xmm0,%%xmm4 \n" | |
| 351 "palignr $0x8,%%xmm4,%%xmm4 \n" | |
| 352 "movq %%xmm4,(%1,%4) \n" | |
| 353 "lea (%1,%4,2),%1 \n" | |
| 354 "punpckldq %%xmm6,%%xmm2 \n" | |
| 355 "movdqa %%xmm2,%%xmm6 \n" | |
| 356 "movq %%xmm2,(%1) \n" | |
| 357 "palignr $0x8,%%xmm6,%%xmm6 \n" | |
| 358 "punpckldq %%xmm5,%%xmm1 \n" | |
| 359 "movq %%xmm6,(%1,%4) \n" | |
| 360 "lea (%1,%4,2),%1 \n" | |
| 361 "movdqa %%xmm1,%%xmm5 \n" | |
| 362 "movq %%xmm1,(%1) \n" | |
| 363 "palignr $0x8,%%xmm5,%%xmm5 \n" | |
| 364 "movq %%xmm5,(%1,%4) \n" | |
| 365 "lea (%1,%4,2),%1 \n" | |
| 366 "punpckldq %%xmm7,%%xmm3 \n" | |
| 367 "movq %%xmm3,(%1) \n" | |
| 368 "movdqa %%xmm3,%%xmm7 \n" | |
| 369 "palignr $0x8,%%xmm7,%%xmm7 \n" | |
| 370 "sub $0x8,%2 \n" | |
| 371 "movq %%xmm7,(%1,%4) \n" | |
| 372 "lea (%1,%4,2),%1 \n" | |
| 373 "jg 1b \n" | |
| 374 : "+r"(src), // %0 | |
| 375 "+r"(dst), // %1 | |
| 376 "+r"(width) // %2 | |
| 377 : "r"((intptr_t)(src_stride)), // %3 | |
| 378 "r"((intptr_t)(dst_stride)) // %4 | |
| 379 : "memory", "cc", | |
| 380 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
| 381 ); | |
| 382 } | |
| 383 | |
| 384 #if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) | |
| 385 #define HAS_TRANSPOSE_UVWX8_SSE2 | |
| 386 void TransposeUVWx8_SSE2(const uint8* src, int src_stride, | |
| 387 uint8* dst_a, int dst_stride_a, | |
| 388 uint8* dst_b, int dst_stride_b, | |
| 389 int w); | |
| 390 asm ( | |
| 391 DECLARE_FUNCTION(TransposeUVWx8_SSE2) | |
| 392 "push %ebx \n" | |
| 393 "push %esi \n" | |
| 394 "push %edi \n" | |
| 395 "push %ebp \n" | |
| 396 "mov 0x14(%esp),%eax \n" | |
| 397 "mov 0x18(%esp),%edi \n" | |
| 398 "mov 0x1c(%esp),%edx \n" | |
| 399 "mov 0x20(%esp),%esi \n" | |
| 400 "mov 0x24(%esp),%ebx \n" | |
| 401 "mov 0x28(%esp),%ebp \n" | |
| 402 "mov %esp,%ecx \n" | |
| 403 "sub $0x14,%esp \n" | |
| 404 "and $0xfffffff0,%esp \n" | |
| 405 "mov %ecx,0x10(%esp) \n" | |
| 406 "mov 0x2c(%ecx),%ecx \n" | |
| 407 | |
| 408 "1: \n" | |
| 409 "movdqu (%eax),%xmm0 \n" | |
| 410 "movdqu (%eax,%edi,1),%xmm1 \n" | |
| 411 "lea (%eax,%edi,2),%eax \n" | |
| 412 "movdqa %xmm0,%xmm7 \n" | |
| 413 "punpcklbw %xmm1,%xmm0 \n" | |
| 414 "punpckhbw %xmm1,%xmm7 \n" | |
| 415 "movdqa %xmm7,%xmm1 \n" | |
| 416 "movdqu (%eax),%xmm2 \n" | |
| 417 "movdqu (%eax,%edi,1),%xmm3 \n" | |
| 418 "lea (%eax,%edi,2),%eax \n" | |
| 419 "movdqa %xmm2,%xmm7 \n" | |
| 420 "punpcklbw %xmm3,%xmm2 \n" | |
| 421 "punpckhbw %xmm3,%xmm7 \n" | |
| 422 "movdqa %xmm7,%xmm3 \n" | |
| 423 "movdqu (%eax),%xmm4 \n" | |
| 424 "movdqu (%eax,%edi,1),%xmm5 \n" | |
| 425 "lea (%eax,%edi,2),%eax \n" | |
| 426 "movdqa %xmm4,%xmm7 \n" | |
| 427 "punpcklbw %xmm5,%xmm4 \n" | |
| 428 "punpckhbw %xmm5,%xmm7 \n" | |
| 429 "movdqa %xmm7,%xmm5 \n" | |
| 430 "movdqu (%eax),%xmm6 \n" | |
| 431 "movdqu (%eax,%edi,1),%xmm7 \n" | |
| 432 "lea (%eax,%edi,2),%eax \n" | |
| 433 "movdqu %xmm5,(%esp) \n" | |
| 434 "neg %edi \n" | |
| 435 "movdqa %xmm6,%xmm5 \n" | |
| 436 "punpcklbw %xmm7,%xmm6 \n" | |
| 437 "punpckhbw %xmm7,%xmm5 \n" | |
| 438 "movdqa %xmm5,%xmm7 \n" | |
| 439 "lea 0x10(%eax,%edi,8),%eax \n" | |
| 440 "neg %edi \n" | |
| 441 "movdqa %xmm0,%xmm5 \n" | |
| 442 "punpcklwd %xmm2,%xmm0 \n" | |
| 443 "punpckhwd %xmm2,%xmm5 \n" | |
| 444 "movdqa %xmm5,%xmm2 \n" | |
| 445 "movdqa %xmm1,%xmm5 \n" | |
| 446 "punpcklwd %xmm3,%xmm1 \n" | |
| 447 "punpckhwd %xmm3,%xmm5 \n" | |
| 448 "movdqa %xmm5,%xmm3 \n" | |
| 449 "movdqa %xmm4,%xmm5 \n" | |
| 450 "punpcklwd %xmm6,%xmm4 \n" | |
| 451 "punpckhwd %xmm6,%xmm5 \n" | |
| 452 "movdqa %xmm5,%xmm6 \n" | |
| 453 "movdqu (%esp),%xmm5 \n" | |
| 454 "movdqu %xmm6,(%esp) \n" | |
| 455 "movdqa %xmm5,%xmm6 \n" | |
| 456 "punpcklwd %xmm7,%xmm5 \n" | |
| 457 "punpckhwd %xmm7,%xmm6 \n" | |
| 458 "movdqa %xmm6,%xmm7 \n" | |
| 459 "movdqa %xmm0,%xmm6 \n" | |
| 460 "punpckldq %xmm4,%xmm0 \n" | |
| 461 "punpckhdq %xmm4,%xmm6 \n" | |
| 462 "movdqa %xmm6,%xmm4 \n" | |
| 463 "movdqu (%esp),%xmm6 \n" | |
| 464 "movlpd %xmm0,(%edx) \n" | |
| 465 "movhpd %xmm0,(%ebx) \n" | |
| 466 "movlpd %xmm4,(%edx,%esi,1) \n" | |
| 467 "lea (%edx,%esi,2),%edx \n" | |
| 468 "movhpd %xmm4,(%ebx,%ebp,1) \n" | |
| 469 "lea (%ebx,%ebp,2),%ebx \n" | |
| 470 "movdqa %xmm2,%xmm0 \n" | |
| 471 "punpckldq %xmm6,%xmm2 \n" | |
| 472 "movlpd %xmm2,(%edx) \n" | |
| 473 "movhpd %xmm2,(%ebx) \n" | |
| 474 "punpckhdq %xmm6,%xmm0 \n" | |
| 475 "movlpd %xmm0,(%edx,%esi,1) \n" | |
| 476 "lea (%edx,%esi,2),%edx \n" | |
| 477 "movhpd %xmm0,(%ebx,%ebp,1) \n" | |
| 478 "lea (%ebx,%ebp,2),%ebx \n" | |
| 479 "movdqa %xmm1,%xmm0 \n" | |
| 480 "punpckldq %xmm5,%xmm1 \n" | |
| 481 "movlpd %xmm1,(%edx) \n" | |
| 482 "movhpd %xmm1,(%ebx) \n" | |
| 483 "punpckhdq %xmm5,%xmm0 \n" | |
| 484 "movlpd %xmm0,(%edx,%esi,1) \n" | |
| 485 "lea (%edx,%esi,2),%edx \n" | |
| 486 "movhpd %xmm0,(%ebx,%ebp,1) \n" | |
| 487 "lea (%ebx,%ebp,2),%ebx \n" | |
| 488 "movdqa %xmm3,%xmm0 \n" | |
| 489 "punpckldq %xmm7,%xmm3 \n" | |
| 490 "movlpd %xmm3,(%edx) \n" | |
| 491 "movhpd %xmm3,(%ebx) \n" | |
| 492 "punpckhdq %xmm7,%xmm0 \n" | |
| 493 "sub $0x8,%ecx \n" | |
| 494 "movlpd %xmm0,(%edx,%esi,1) \n" | |
| 495 "lea (%edx,%esi,2),%edx \n" | |
| 496 "movhpd %xmm0,(%ebx,%ebp,1) \n" | |
| 497 "lea (%ebx,%ebp,2),%ebx \n" | |
| 498 "jg 1b \n" | |
| 499 "mov 0x10(%esp),%esp \n" | |
| 500 "pop %ebp \n" | |
| 501 "pop %edi \n" | |
| 502 "pop %esi \n" | |
| 503 "pop %ebx \n" | |
| 504 #if defined(__native_client__) | |
| 505 "pop %ecx \n" | |
| 506 "and $0xffffffe0,%ecx \n" | |
| 507 "jmp *%ecx \n" | |
| 508 #else | |
| 509 "ret \n" | |
| 510 #endif | |
| 511 ); | |
| 512 #endif | |
| 513 #if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ | |
| 514 defined(__x86_64__) | |
| 515 // 64 bit version has enough registers to do 16x8 to 8x16 at a time. | |
| 516 #define HAS_TRANSPOSE_WX8_FAST_SSSE3 | |
| 517 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, | |
| 518 uint8* dst, int dst_stride, int width) { | |
| 519 asm volatile ( | |
| 520 // Read in the data from the source pointer. | |
| 521 // First round of bit swap. | |
| 522 ".p2align 2 \n" | |
| 523 "1: \n" | |
| 524 "movdqu (%0),%%xmm0 \n" | |
| 525 "movdqu (%0,%3),%%xmm1 \n" | |
| 526 "lea (%0,%3,2),%0 \n" | |
| 527 "movdqa %%xmm0,%%xmm8 \n" | |
| 528 "punpcklbw %%xmm1,%%xmm0 \n" | |
| 529 "punpckhbw %%xmm1,%%xmm8 \n" | |
| 530 "movdqu (%0),%%xmm2 \n" | |
| 531 "movdqa %%xmm0,%%xmm1 \n" | |
| 532 "movdqa %%xmm8,%%xmm9 \n" | |
| 533 "palignr $0x8,%%xmm1,%%xmm1 \n" | |
| 534 "palignr $0x8,%%xmm9,%%xmm9 \n" | |
| 535 "movdqu (%0,%3),%%xmm3 \n" | |
| 536 "lea (%0,%3,2),%0 \n" | |
| 537 "movdqa %%xmm2,%%xmm10 \n" | |
| 538 "punpcklbw %%xmm3,%%xmm2 \n" | |
| 539 "punpckhbw %%xmm3,%%xmm10 \n" | |
| 540 "movdqa %%xmm2,%%xmm3 \n" | |
| 541 "movdqa %%xmm10,%%xmm11 \n" | |
| 542 "movdqu (%0),%%xmm4 \n" | |
| 543 "palignr $0x8,%%xmm3,%%xmm3 \n" | |
| 544 "palignr $0x8,%%xmm11,%%xmm11 \n" | |
| 545 "movdqu (%0,%3),%%xmm5 \n" | |
| 546 "lea (%0,%3,2),%0 \n" | |
| 547 "movdqa %%xmm4,%%xmm12 \n" | |
| 548 "punpcklbw %%xmm5,%%xmm4 \n" | |
| 549 "punpckhbw %%xmm5,%%xmm12 \n" | |
| 550 "movdqa %%xmm4,%%xmm5 \n" | |
| 551 "movdqa %%xmm12,%%xmm13 \n" | |
| 552 "movdqu (%0),%%xmm6 \n" | |
| 553 "palignr $0x8,%%xmm5,%%xmm5 \n" | |
| 554 "palignr $0x8,%%xmm13,%%xmm13 \n" | |
| 555 "movdqu (%0,%3),%%xmm7 \n" | |
| 556 "lea (%0,%3,2),%0 \n" | |
| 557 "movdqa %%xmm6,%%xmm14 \n" | |
| 558 "punpcklbw %%xmm7,%%xmm6 \n" | |
| 559 "punpckhbw %%xmm7,%%xmm14 \n" | |
| 560 "neg %3 \n" | |
| 561 "movdqa %%xmm6,%%xmm7 \n" | |
| 562 "movdqa %%xmm14,%%xmm15 \n" | |
| 563 "lea 0x10(%0,%3,8),%0 \n" | |
| 564 "palignr $0x8,%%xmm7,%%xmm7 \n" | |
| 565 "palignr $0x8,%%xmm15,%%xmm15 \n" | |
| 566 "neg %3 \n" | |
| 567 // Second round of bit swap. | |
| 568 "punpcklwd %%xmm2,%%xmm0 \n" | |
| 569 "punpcklwd %%xmm3,%%xmm1 \n" | |
| 570 "movdqa %%xmm0,%%xmm2 \n" | |
| 571 "movdqa %%xmm1,%%xmm3 \n" | |
| 572 "palignr $0x8,%%xmm2,%%xmm2 \n" | |
| 573 "palignr $0x8,%%xmm3,%%xmm3 \n" | |
| 574 "punpcklwd %%xmm6,%%xmm4 \n" | |
| 575 "punpcklwd %%xmm7,%%xmm5 \n" | |
| 576 "movdqa %%xmm4,%%xmm6 \n" | |
| 577 "movdqa %%xmm5,%%xmm7 \n" | |
| 578 "palignr $0x8,%%xmm6,%%xmm6 \n" | |
| 579 "palignr $0x8,%%xmm7,%%xmm7 \n" | |
| 580 "punpcklwd %%xmm10,%%xmm8 \n" | |
| 581 "punpcklwd %%xmm11,%%xmm9 \n" | |
| 582 "movdqa %%xmm8,%%xmm10 \n" | |
| 583 "movdqa %%xmm9,%%xmm11 \n" | |
| 584 "palignr $0x8,%%xmm10,%%xmm10 \n" | |
| 585 "palignr $0x8,%%xmm11,%%xmm11 \n" | |
| 586 "punpcklwd %%xmm14,%%xmm12 \n" | |
| 587 "punpcklwd %%xmm15,%%xmm13 \n" | |
| 588 "movdqa %%xmm12,%%xmm14 \n" | |
| 589 "movdqa %%xmm13,%%xmm15 \n" | |
| 590 "palignr $0x8,%%xmm14,%%xmm14 \n" | |
| 591 "palignr $0x8,%%xmm15,%%xmm15 \n" | |
| 592 // Third round of bit swap. | |
| 593 // Write to the destination pointer. | |
| 594 "punpckldq %%xmm4,%%xmm0 \n" | |
| 595 "movq %%xmm0,(%1) \n" | |
| 596 "movdqa %%xmm0,%%xmm4 \n" | |
| 597 "palignr $0x8,%%xmm4,%%xmm4 \n" | |
| 598 "movq %%xmm4,(%1,%4) \n" | |
| 599 "lea (%1,%4,2),%1 \n" | |
| 600 "punpckldq %%xmm6,%%xmm2 \n" | |
| 601 "movdqa %%xmm2,%%xmm6 \n" | |
| 602 "movq %%xmm2,(%1) \n" | |
| 603 "palignr $0x8,%%xmm6,%%xmm6 \n" | |
| 604 "punpckldq %%xmm5,%%xmm1 \n" | |
| 605 "movq %%xmm6,(%1,%4) \n" | |
| 606 "lea (%1,%4,2),%1 \n" | |
| 607 "movdqa %%xmm1,%%xmm5 \n" | |
| 608 "movq %%xmm1,(%1) \n" | |
| 609 "palignr $0x8,%%xmm5,%%xmm5 \n" | |
| 610 "movq %%xmm5,(%1,%4) \n" | |
| 611 "lea (%1,%4,2),%1 \n" | |
| 612 "punpckldq %%xmm7,%%xmm3 \n" | |
| 613 "movq %%xmm3,(%1) \n" | |
| 614 "movdqa %%xmm3,%%xmm7 \n" | |
| 615 "palignr $0x8,%%xmm7,%%xmm7 \n" | |
| 616 "movq %%xmm7,(%1,%4) \n" | |
| 617 "lea (%1,%4,2),%1 \n" | |
| 618 "punpckldq %%xmm12,%%xmm8 \n" | |
| 619 "movq %%xmm8,(%1) \n" | |
| 620 "movdqa %%xmm8,%%xmm12 \n" | |
| 621 "palignr $0x8,%%xmm12,%%xmm12 \n" | |
| 622 "movq %%xmm12,(%1,%4) \n" | |
| 623 "lea (%1,%4,2),%1 \n" | |
| 624 "punpckldq %%xmm14,%%xmm10 \n" | |
| 625 "movdqa %%xmm10,%%xmm14 \n" | |
| 626 "movq %%xmm10,(%1) \n" | |
| 627 "palignr $0x8,%%xmm14,%%xmm14 \n" | |
| 628 "punpckldq %%xmm13,%%xmm9 \n" | |
| 629 "movq %%xmm14,(%1,%4) \n" | |
| 630 "lea (%1,%4,2),%1 \n" | |
| 631 "movdqa %%xmm9,%%xmm13 \n" | |
| 632 "movq %%xmm9,(%1) \n" | |
| 633 "palignr $0x8,%%xmm13,%%xmm13 \n" | |
| 634 "movq %%xmm13,(%1,%4) \n" | |
| 635 "lea (%1,%4,2),%1 \n" | |
| 636 "punpckldq %%xmm15,%%xmm11 \n" | |
| 637 "movq %%xmm11,(%1) \n" | |
| 638 "movdqa %%xmm11,%%xmm15 \n" | |
| 639 "palignr $0x8,%%xmm15,%%xmm15 \n" | |
| 640 "sub $0x10,%2 \n" | |
| 641 "movq %%xmm15,(%1,%4) \n" | |
| 642 "lea (%1,%4,2),%1 \n" | |
| 643 "jg 1b \n" | |
| 644 : "+r"(src), // %0 | |
| 645 "+r"(dst), // %1 | |
| 646 "+r"(width) // %2 | |
| 647 : "r"((intptr_t)(src_stride)), // %3 | |
| 648 "r"((intptr_t)(dst_stride)) // %4 | |
| 649 : "memory", "cc", | |
| 650 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", | |
| 651 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" | |
| 652 ); | |
| 653 } | |
| 654 | |
| 655 #define HAS_TRANSPOSE_UVWX8_SSE2 | |
| 656 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, | |
| 657 uint8* dst_a, int dst_stride_a, | |
| 658 uint8* dst_b, int dst_stride_b, | |
| 659 int w) { | |
| 660 asm volatile ( | |
| 661 // Read in the data from the source pointer. | |
| 662 // First round of bit swap. | |
| 663 ".p2align 2 \n" | |
| 664 "1: \n" | |
| 665 "movdqu (%0),%%xmm0 \n" | |
| 666 "movdqu (%0,%4),%%xmm1 \n" | |
| 667 "lea (%0,%4,2),%0 \n" | |
| 668 "movdqa %%xmm0,%%xmm8 \n" | |
| 669 "punpcklbw %%xmm1,%%xmm0 \n" | |
| 670 "punpckhbw %%xmm1,%%xmm8 \n" | |
| 671 "movdqa %%xmm8,%%xmm1 \n" | |
| 672 "movdqu (%0),%%xmm2 \n" | |
| 673 "movdqu (%0,%4),%%xmm3 \n" | |
| 674 "lea (%0,%4,2),%0 \n" | |
| 675 "movdqa %%xmm2,%%xmm8 \n" | |
| 676 "punpcklbw %%xmm3,%%xmm2 \n" | |
| 677 "punpckhbw %%xmm3,%%xmm8 \n" | |
| 678 "movdqa %%xmm8,%%xmm3 \n" | |
| 679 "movdqu (%0),%%xmm4 \n" | |
| 680 "movdqu (%0,%4),%%xmm5 \n" | |
| 681 "lea (%0,%4,2),%0 \n" | |
| 682 "movdqa %%xmm4,%%xmm8 \n" | |
| 683 "punpcklbw %%xmm5,%%xmm4 \n" | |
| 684 "punpckhbw %%xmm5,%%xmm8 \n" | |
| 685 "movdqa %%xmm8,%%xmm5 \n" | |
| 686 "movdqu (%0),%%xmm6 \n" | |
| 687 "movdqu (%0,%4),%%xmm7 \n" | |
| 688 "lea (%0,%4,2),%0 \n" | |
| 689 "movdqa %%xmm6,%%xmm8 \n" | |
| 690 "punpcklbw %%xmm7,%%xmm6 \n" | |
| 691 "neg %4 \n" | |
| 692 "lea 0x10(%0,%4,8),%0 \n" | |
| 693 "punpckhbw %%xmm7,%%xmm8 \n" | |
| 694 "movdqa %%xmm8,%%xmm7 \n" | |
| 695 "neg %4 \n" | |
| 696 // Second round of bit swap. | |
| 697 "movdqa %%xmm0,%%xmm8 \n" | |
| 698 "movdqa %%xmm1,%%xmm9 \n" | |
| 699 "punpckhwd %%xmm2,%%xmm8 \n" | |
| 700 "punpckhwd %%xmm3,%%xmm9 \n" | |
| 701 "punpcklwd %%xmm2,%%xmm0 \n" | |
| 702 "punpcklwd %%xmm3,%%xmm1 \n" | |
| 703 "movdqa %%xmm8,%%xmm2 \n" | |
| 704 "movdqa %%xmm9,%%xmm3 \n" | |
| 705 "movdqa %%xmm4,%%xmm8 \n" | |
| 706 "movdqa %%xmm5,%%xmm9 \n" | |
| 707 "punpckhwd %%xmm6,%%xmm8 \n" | |
| 708 "punpckhwd %%xmm7,%%xmm9 \n" | |
| 709 "punpcklwd %%xmm6,%%xmm4 \n" | |
| 710 "punpcklwd %%xmm7,%%xmm5 \n" | |
| 711 "movdqa %%xmm8,%%xmm6 \n" | |
| 712 "movdqa %%xmm9,%%xmm7 \n" | |
| 713 // Third round of bit swap. | |
| 714 // Write to the destination pointer. | |
| 715 "movdqa %%xmm0,%%xmm8 \n" | |
| 716 "punpckldq %%xmm4,%%xmm0 \n" | |
| 717 "movlpd %%xmm0,(%1) \n" // Write back U channel | |
| 718 "movhpd %%xmm0,(%2) \n" // Write back V channel | |
| 719 "punpckhdq %%xmm4,%%xmm8 \n" | |
| 720 "movlpd %%xmm8,(%1,%5) \n" | |
| 721 "lea (%1,%5,2),%1 \n" | |
| 722 "movhpd %%xmm8,(%2,%6) \n" | |
| 723 "lea (%2,%6,2),%2 \n" | |
| 724 "movdqa %%xmm2,%%xmm8 \n" | |
| 725 "punpckldq %%xmm6,%%xmm2 \n" | |
| 726 "movlpd %%xmm2,(%1) \n" | |
| 727 "movhpd %%xmm2,(%2) \n" | |
| 728 "punpckhdq %%xmm6,%%xmm8 \n" | |
| 729 "movlpd %%xmm8,(%1,%5) \n" | |
| 730 "lea (%1,%5,2),%1 \n" | |
| 731 "movhpd %%xmm8,(%2,%6) \n" | |
| 732 "lea (%2,%6,2),%2 \n" | |
| 733 "movdqa %%xmm1,%%xmm8 \n" | |
| 734 "punpckldq %%xmm5,%%xmm1 \n" | |
| 735 "movlpd %%xmm1,(%1) \n" | |
| 736 "movhpd %%xmm1,(%2) \n" | |
| 737 "punpckhdq %%xmm5,%%xmm8 \n" | |
| 738 "movlpd %%xmm8,(%1,%5) \n" | |
| 739 "lea (%1,%5,2),%1 \n" | |
| 740 "movhpd %%xmm8,(%2,%6) \n" | |
| 741 "lea (%2,%6,2),%2 \n" | |
| 742 "movdqa %%xmm3,%%xmm8 \n" | |
| 743 "punpckldq %%xmm7,%%xmm3 \n" | |
| 744 "movlpd %%xmm3,(%1) \n" | |
| 745 "movhpd %%xmm3,(%2) \n" | |
| 746 "punpckhdq %%xmm7,%%xmm8 \n" | |
| 747 "sub $0x8,%3 \n" | |
| 748 "movlpd %%xmm8,(%1,%5) \n" | |
| 749 "lea (%1,%5,2),%1 \n" | |
| 750 "movhpd %%xmm8,(%2,%6) \n" | |
| 751 "lea (%2,%6,2),%2 \n" | |
| 752 "jg 1b \n" | |
| 753 : "+r"(src), // %0 | |
| 754 "+r"(dst_a), // %1 | |
| 755 "+r"(dst_b), // %2 | |
| 756 "+r"(w) // %3 | |
| 757 : "r"((intptr_t)(src_stride)), // %4 | |
| 758 "r"((intptr_t)(dst_stride_a)), // %5 | |
| 759 "r"((intptr_t)(dst_stride_b)) // %6 | |
| 760 : "memory", "cc", | |
| 761 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", | |
| 762 "xmm8", "xmm9" | |
| 763 ); | |
| 764 } | |
| 765 #endif | |
| 766 #endif | |
| 767 | |
| 768 static void TransposeWx8_C(const uint8* src, int src_stride, | |
| 769 uint8* dst, int dst_stride, | |
| 770 int width) { | |
| 771 int i; | |
| 772 for (i = 0; i < width; ++i) { | |
| 773 dst[0] = src[0 * src_stride]; | |
| 774 dst[1] = src[1 * src_stride]; | |
| 775 dst[2] = src[2 * src_stride]; | |
| 776 dst[3] = src[3 * src_stride]; | |
| 777 dst[4] = src[4 * src_stride]; | |
| 778 dst[5] = src[5 * src_stride]; | |
| 779 dst[6] = src[6 * src_stride]; | |
| 780 dst[7] = src[7 * src_stride]; | |
| 781 ++src; | |
| 782 dst += dst_stride; | |
| 783 } | |
| 784 } | |
| 785 | |
| 786 static void TransposeWxH_C(const uint8* src, int src_stride, | |
| 787 uint8* dst, int dst_stride, | |
| 788 int width, int height) { | |
| 789 int i; | |
| 790 for (i = 0; i < width; ++i) { | |
| 791 int j; | |
| 792 for (j = 0; j < height; ++j) { | |
| 793 dst[i * dst_stride + j] = src[j * src_stride + i]; | |
| 794 } | |
| 795 } | |
| 796 } | |
| 797 | |
| 798 LIBYUV_API | 24 LIBYUV_API |
| 799 void TransposePlane(const uint8* src, int src_stride, | 25 void TransposePlane(const uint8* src, int src_stride, |
| 800 uint8* dst, int dst_stride, | 26 uint8* dst, int dst_stride, |
| 801 int width, int height) { | 27 int width, int height) { |
| 802 int i = height; | 28 int i = height; |
| 803 void (*TransposeWx8)(const uint8* src, int src_stride, | 29 void (*TransposeWx8)(const uint8* src, int src_stride, |
| 804 uint8* dst, int dst_stride, | 30 uint8* dst, int dst_stride, int width) = TransposeWx8_C; |
| 805 int width) = TransposeWx8_C; | 31 #if defined(HAS_TRANSPOSEWX8_NEON) |
| 806 #if defined(HAS_TRANSPOSE_WX8_NEON) | |
| 807 if (TestCpuFlag(kCpuHasNEON)) { | 32 if (TestCpuFlag(kCpuHasNEON)) { |
| 808 TransposeWx8 = TransposeWx8_NEON; | 33 TransposeWx8 = TransposeWx8_NEON; |
| 809 } | 34 } |
| 810 #endif | 35 #endif |
| 811 #if defined(HAS_TRANSPOSE_WX8_SSSE3) | 36 #if defined(HAS_TRANSPOSEWX8_SSSE3) |
| 812 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { | 37 if (TestCpuFlag(kCpuHasSSSE3)) { |
| 813 TransposeWx8 = TransposeWx8_SSSE3; | 38 TransposeWx8 = TransposeWx8_Any_SSSE3; |
| 39 if (IS_ALIGNED(width, 8)) { |
| 40 TransposeWx8 = TransposeWx8_SSSE3; |
| 41 } |
| 814 } | 42 } |
| 815 #endif | 43 #endif |
| 816 #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) | 44 #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) |
| 817 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { | 45 if (TestCpuFlag(kCpuHasSSSE3)) { |
| 818 TransposeWx8 = TransposeWx8_FAST_SSSE3; | 46 TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; |
| 47 if (IS_ALIGNED(width, 16)) { |
| 48 TransposeWx8 = TransposeWx8_Fast_SSSE3; |
| 49 } |
| 819 } | 50 } |
| 820 #endif | 51 #endif |
| 821 #if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2) | 52 #if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2) |
| 822 if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { | 53 if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { |
| 823 if (IS_ALIGNED(width, 4) && | 54 if (IS_ALIGNED(width, 4) && |
| 824 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { | 55 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { |
| 825 TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2; | 56 TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2; |
| 826 } else { | 57 } else { |
| 827 TransposeWx8 = TransposeWx8_MIPS_DSPR2; | 58 TransposeWx8 = TransposeWx8_MIPS_DSPR2; |
| 828 } | 59 } |
| 829 } | 60 } |
| 830 #endif | 61 #endif |
| 831 | 62 |
| 832 // Work across the source in 8x8 tiles | 63 // Work across the source in 8x8 tiles |
| 833 while (i >= 8) { | 64 while (i >= 8) { |
| 834 TransposeWx8(src, src_stride, dst, dst_stride, width); | 65 TransposeWx8(src, src_stride, dst, dst_stride, width); |
| 835 src += 8 * src_stride; // Go down 8 rows. | 66 src += 8 * src_stride; // Go down 8 rows. |
| 836 dst += 8; // Move over 8 columns. | 67 dst += 8; // Move over 8 columns. |
| 837 i -= 8; | 68 i -= 8; |
| 838 } | 69 } |
| 839 | 70 |
| 840 TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); | 71 if (i > 0) { |
| 72 TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); |
| 73 } |
| 841 } | 74 } |
| 842 | 75 |
| 843 LIBYUV_API | 76 LIBYUV_API |
| 844 void RotatePlane90(const uint8* src, int src_stride, | 77 void RotatePlane90(const uint8* src, int src_stride, |
| 845 uint8* dst, int dst_stride, | 78 uint8* dst, int dst_stride, |
| 846 int width, int height) { | 79 int width, int height) { |
| 847 // Rotate by 90 is a transpose with the source read | 80 // Rotate by 90 is a transpose with the source read |
| 848 // from bottom to top. So set the source pointer to the end | 81 // from bottom to top. So set the source pointer to the end |
| 849 // of the buffer and flip the sign of the source stride. | 82 // of the buffer and flip the sign of the source stride. |
| 850 src += src_stride * (height - 1); | 83 src += src_stride * (height - 1); |
| (...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 948 src += src_stride; | 181 src += src_stride; |
| 949 MirrorRow(src_bot, dst, width); // Mirror last row into first row | 182 MirrorRow(src_bot, dst, width); // Mirror last row into first row |
| 950 dst += dst_stride; | 183 dst += dst_stride; |
| 951 CopyRow(row, dst_bot, width); // Copy first mirrored row into last | 184 CopyRow(row, dst_bot, width); // Copy first mirrored row into last |
| 952 src_bot -= src_stride; | 185 src_bot -= src_stride; |
| 953 dst_bot -= dst_stride; | 186 dst_bot -= dst_stride; |
| 954 } | 187 } |
| 955 free_aligned_buffer_64(row); | 188 free_aligned_buffer_64(row); |
| 956 } | 189 } |
| 957 | 190 |
| 958 static void TransposeUVWx8_C(const uint8* src, int src_stride, | |
| 959 uint8* dst_a, int dst_stride_a, | |
| 960 uint8* dst_b, int dst_stride_b, | |
| 961 int width) { | |
| 962 int i; | |
| 963 for (i = 0; i < width; ++i) { | |
| 964 dst_a[0] = src[0 * src_stride + 0]; | |
| 965 dst_b[0] = src[0 * src_stride + 1]; | |
| 966 dst_a[1] = src[1 * src_stride + 0]; | |
| 967 dst_b[1] = src[1 * src_stride + 1]; | |
| 968 dst_a[2] = src[2 * src_stride + 0]; | |
| 969 dst_b[2] = src[2 * src_stride + 1]; | |
| 970 dst_a[3] = src[3 * src_stride + 0]; | |
| 971 dst_b[3] = src[3 * src_stride + 1]; | |
| 972 dst_a[4] = src[4 * src_stride + 0]; | |
| 973 dst_b[4] = src[4 * src_stride + 1]; | |
| 974 dst_a[5] = src[5 * src_stride + 0]; | |
| 975 dst_b[5] = src[5 * src_stride + 1]; | |
| 976 dst_a[6] = src[6 * src_stride + 0]; | |
| 977 dst_b[6] = src[6 * src_stride + 1]; | |
| 978 dst_a[7] = src[7 * src_stride + 0]; | |
| 979 dst_b[7] = src[7 * src_stride + 1]; | |
| 980 src += 2; | |
| 981 dst_a += dst_stride_a; | |
| 982 dst_b += dst_stride_b; | |
| 983 } | |
| 984 } | |
| 985 | |
| 986 static void TransposeUVWxH_C(const uint8* src, int src_stride, | |
| 987 uint8* dst_a, int dst_stride_a, | |
| 988 uint8* dst_b, int dst_stride_b, | |
| 989 int width, int height) { | |
| 990 int i; | |
| 991 for (i = 0; i < width * 2; i += 2) { | |
| 992 int j; | |
| 993 for (j = 0; j < height; ++j) { | |
| 994 dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; | |
| 995 dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; | |
| 996 } | |
| 997 } | |
| 998 } | |
| 999 | |
| 1000 LIBYUV_API | 191 LIBYUV_API |
| 1001 void TransposeUV(const uint8* src, int src_stride, | 192 void TransposeUV(const uint8* src, int src_stride, |
| 1002 uint8* dst_a, int dst_stride_a, | 193 uint8* dst_a, int dst_stride_a, |
| 1003 uint8* dst_b, int dst_stride_b, | 194 uint8* dst_b, int dst_stride_b, |
| 1004 int width, int height) { | 195 int width, int height) { |
| 1005 int i = height; | 196 int i = height; |
| 1006 void (*TransposeUVWx8)(const uint8* src, int src_stride, | 197 void (*TransposeUVWx8)(const uint8* src, int src_stride, |
| 1007 uint8* dst_a, int dst_stride_a, | 198 uint8* dst_a, int dst_stride_a, |
| 1008 uint8* dst_b, int dst_stride_b, | 199 uint8* dst_b, int dst_stride_b, |
| 1009 int width) = TransposeUVWx8_C; | 200 int width) = TransposeUVWx8_C; |
| 1010 #if defined(HAS_TRANSPOSE_UVWX8_NEON) | 201 #if defined(HAS_TRANSPOSEUVWX8_NEON) |
| 1011 if (TestCpuFlag(kCpuHasNEON)) { | 202 if (TestCpuFlag(kCpuHasNEON)) { |
| 1012 TransposeUVWx8 = TransposeUVWx8_NEON; | 203 TransposeUVWx8 = TransposeUVWx8_NEON; |
| 1013 } | 204 } |
| 1014 #endif | 205 #endif |
| 1015 #if defined(HAS_TRANSPOSE_UVWX8_SSE2) | 206 #if defined(HAS_TRANSPOSEUVWX8_SSE2) |
| 1016 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { | 207 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { |
| 1017 TransposeUVWx8 = TransposeUVWx8_SSE2; | 208 TransposeUVWx8 = TransposeUVWx8_SSE2; |
| 1018 } | 209 } |
| 1019 #endif | 210 #endif |
| 1020 #if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) | 211 #if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2) |
| 1021 if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && | 212 if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && |
| 1022 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { | 213 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { |
| 1023 TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; | 214 TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; |
| 1024 } | 215 } |
| 1025 #endif | 216 #endif |
| 1026 | 217 |
| 1027 // Work through the source in 8x8 tiles. | 218 // Work through the source in 8x8 tiles. |
| 1028 while (i >= 8) { | 219 while (i >= 8) { |
| 1029 TransposeUVWx8(src, src_stride, | 220 TransposeUVWx8(src, src_stride, |
| 1030 dst_a, dst_stride_a, | 221 dst_a, dst_stride_a, |
| 1031 dst_b, dst_stride_b, | 222 dst_b, dst_stride_b, |
| 1032 width); | 223 width); |
| 1033 src += 8 * src_stride; // Go down 8 rows. | 224 src += 8 * src_stride; // Go down 8 rows. |
| 1034 dst_a += 8; // Move over 8 columns. | 225 dst_a += 8; // Move over 8 columns. |
| 1035 dst_b += 8; // Move over 8 columns. | 226 dst_b += 8; // Move over 8 columns. |
| 1036 i -= 8; | 227 i -= 8; |
| 1037 } | 228 } |
| 1038 | 229 |
| 1039 TransposeUVWxH_C(src, src_stride, | 230 if (i > 0) { |
| 1040 dst_a, dst_stride_a, | 231 TransposeUVWxH_C(src, src_stride, |
| 1041 dst_b, dst_stride_b, | 232 dst_a, dst_stride_a, |
| 1042 width, i); | 233 dst_b, dst_stride_b, |
| 234 width, i); |
| 235 } |
| 1043 } | 236 } |
| 1044 | 237 |
| 1045 LIBYUV_API | 238 LIBYUV_API |
| 1046 void RotateUV90(const uint8* src, int src_stride, | 239 void RotateUV90(const uint8* src, int src_stride, |
| 1047 uint8* dst_a, int dst_stride_a, | 240 uint8* dst_a, int dst_stride_a, |
| 1048 uint8* dst_b, int dst_stride_b, | 241 uint8* dst_b, int dst_stride_b, |
| 1049 int width, int height) { | 242 int width, int height) { |
| 1050 src += src_stride * (height - 1); | 243 src += src_stride * (height - 1); |
| 1051 src_stride = -src_stride; | 244 src_stride = -src_stride; |
| 1052 | 245 |
| (...skipping 241 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1294 default: | 487 default: |
| 1295 break; | 488 break; |
| 1296 } | 489 } |
| 1297 return -1; | 490 return -1; |
| 1298 } | 491 } |
| 1299 | 492 |
| 1300 #ifdef __cplusplus | 493 #ifdef __cplusplus |
| 1301 } // extern "C" | 494 } // extern "C" |
| 1302 } // namespace libyuv | 495 } // namespace libyuv |
| 1303 #endif | 496 #endif |
| OLD | NEW |