OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright 2015 The LibYuv Project Authors. All rights reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #include "libyuv/row.h" |
| 12 #include "libyuv/rotate_row.h" |
| 13 |
| 14 #ifdef __cplusplus |
| 15 namespace libyuv { |
| 16 extern "C" { |
| 17 #endif |
| 18 |
| 19 // This module is for GCC x86 and x64. |
| 20 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) |
| 21 |
| 22 #if !defined(LIBYUV_DISABLE_X86) && \ |
| 23 (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) |
// Transposes an image tile 8 rows high: for each loop iteration it reads an
// 8x8 block of bytes from src (rows src_stride apart) and writes the
// transposed 8x8 block to dst (rows dst_stride apart).  width is consumed 8
// columns at a time ("sub $0x8,%2" / "jg 1b"); each iteration advances src
// 8 bytes right and dst 8 rows down.
// Register plan: even xmm registers hold interleaved low halves, odd xmm
// registers are peeled off with "palignr $0x8" (shift right 8 bytes).
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap: load 8 rows of 8 bytes and interleave
    // adjacent rows bytewise (punpcklbw).
    ".p2align 2 \n"
    "1: \n"
    "movq (%0),%%xmm0 \n"
    "movq (%0,%3),%%xmm1 \n"
    "lea (%0,%3,2),%0 \n"              // step src down two rows
    "punpcklbw %%xmm1,%%xmm0 \n"
    "movq (%0),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "palignr $0x8,%%xmm1,%%xmm1 \n"    // xmm1 = high 8 bytes of xmm0
    "movq (%0,%3),%%xmm3 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "movq (%0),%%xmm4 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "movq (%0,%3),%%xmm5 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "movq (%0),%%xmm6 \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "movq (%0,%3),%%xmm7 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
    // Rewind src to the top row and advance 8 bytes right:
    // negate stride, src += stride*(-8) + 8, negate stride back.
    "neg %3 \n"
    "movdqa %%xmm6,%%xmm7 \n"
    "lea 0x8(%0,%3,8),%0 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "neg %3 \n"
    // Second round of bit swap: interleave 16-bit words.
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpcklwd %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "palignr $0x8,%%xmm2,%%xmm2 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "punpcklwd %%xmm6,%%xmm4 \n"
    "punpcklwd %%xmm7,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "movdqa %%xmm5,%%xmm7 \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    // Third round of bit swap: interleave 32-bit dwords.
    // Write to the destination pointer, two 8-byte rows per register
    // (low half via movq, high half extracted with palignr).
    "punpckldq %%xmm4,%%xmm0 \n"
    "movq %%xmm0,(%1) \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "palignr $0x8,%%xmm4,%%xmm4 \n"
    "movq %%xmm4,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"              // step dst down two rows
    "punpckldq %%xmm6,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "movq %%xmm2,(%1) \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "punpckldq %%xmm5,%%xmm1 \n"
    "movq %%xmm6,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "movdqa %%xmm1,%%xmm5 \n"
    "movq %%xmm1,(%1) \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "movq %%xmm5,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm7,%%xmm3 \n"
    "movq %%xmm3,(%1) \n"
    "movdqa %%xmm3,%%xmm7 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "sub $0x8,%2 \n"                   // 8 columns done; sets flags for jg
    "movq %%xmm7,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "r"((intptr_t)(dst_stride))   // %4
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
| 108 |
| 109 #if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__) |
// 32-bit gcc (non-clang) version of TransposeUVWx8_SSE2, written as a
// file-scope asm function (via DECLARE_FUNCTION) with hand-managed cdecl
// arguments and stack, since it needs eax/ebx/ecx/edx/esi/edi/ebp plus a
// 16-byte-aligned xmm spill slot.
// Transposes an interleaved-UV tile 8 rows high: per iteration it reads 8
// rows of 16 bytes (8 U/V pairs) from src and writes 8 bytes of U per row
// to dst_a and 8 bytes of V per row to dst_b.  width is consumed 8 pairs
// at a time ("sub $0x8,%ecx" / "jg 1b").
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b, int width);
asm (
  DECLARE_FUNCTION(TransposeUVWx8_SSE2)
    // Save callee-saved registers; after the 4 pushes + return address the
    // cdecl arguments start at 0x14(%esp).
    "push %ebx \n"
    "push %esi \n"
    "push %edi \n"
    "push %ebp \n"
    "mov 0x14(%esp),%eax \n"           // eax = src
    "mov 0x18(%esp),%edi \n"           // edi = src_stride
    "mov 0x1c(%esp),%edx \n"           // edx = dst_a
    "mov 0x20(%esp),%esi \n"           // esi = dst_stride_a
    "mov 0x24(%esp),%ebx \n"           // ebx = dst_b
    "mov 0x28(%esp),%ebp \n"           // ebp = dst_stride_b
    // Carve out a 16-byte-aligned scratch area for one xmm spill; the old
    // esp is saved at 0x10(%esp) and restored before returning.
    "mov %esp,%ecx \n"
    "sub $0x14,%esp \n"
    "and $0xfffffff0,%esp \n"
    "mov %ecx,0x10(%esp) \n"
    "mov 0x2c(%ecx),%ecx \n"           // ecx = width (via saved old esp)

    // First round of bit swap: load 8 rows, interleave adjacent rows
    // bytewise (punpcklbw/punpckhbw), using xmm7 as scratch.
    "1: \n"
    "movdqu (%eax),%xmm0 \n"
    "movdqu (%eax,%edi,1),%xmm1 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm0,%xmm7 \n"
    "punpcklbw %xmm1,%xmm0 \n"
    "punpckhbw %xmm1,%xmm7 \n"
    "movdqa %xmm7,%xmm1 \n"
    "movdqu (%eax),%xmm2 \n"
    "movdqu (%eax,%edi,1),%xmm3 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm2,%xmm7 \n"
    "punpcklbw %xmm3,%xmm2 \n"
    "punpckhbw %xmm3,%xmm7 \n"
    "movdqa %xmm7,%xmm3 \n"
    "movdqu (%eax),%xmm4 \n"
    "movdqu (%eax,%edi,1),%xmm5 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm4,%xmm7 \n"
    "punpcklbw %xmm5,%xmm4 \n"
    "punpckhbw %xmm5,%xmm7 \n"
    "movdqa %xmm7,%xmm5 \n"
    "movdqu (%eax),%xmm6 \n"
    "movdqu (%eax,%edi,1),%xmm7 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqu %xmm5,(%esp) \n"           // spill xmm5: out of free registers
    "neg %edi \n"
    "movdqa %xmm6,%xmm5 \n"
    "punpcklbw %xmm7,%xmm6 \n"
    "punpckhbw %xmm7,%xmm5 \n"
    "movdqa %xmm5,%xmm7 \n"
    // Rewind src to the top row and advance 16 bytes right
    // (stride negated above, restored below).
    "lea 0x10(%eax,%edi,8),%eax \n"
    "neg %edi \n"
    // Second round of bit swap: interleave 16-bit words.
    "movdqa %xmm0,%xmm5 \n"
    "punpcklwd %xmm2,%xmm0 \n"
    "punpckhwd %xmm2,%xmm5 \n"
    "movdqa %xmm5,%xmm2 \n"
    "movdqa %xmm1,%xmm5 \n"
    "punpcklwd %xmm3,%xmm1 \n"
    "punpckhwd %xmm3,%xmm5 \n"
    "movdqa %xmm5,%xmm3 \n"
    "movdqa %xmm4,%xmm5 \n"
    "punpcklwd %xmm6,%xmm4 \n"
    "punpckhwd %xmm6,%xmm5 \n"
    "movdqa %xmm5,%xmm6 \n"
    "movdqu (%esp),%xmm5 \n"           // reload spilled xmm5
    "movdqu %xmm6,(%esp) \n"           // spill xmm6 in its place
    "movdqa %xmm5,%xmm6 \n"
    "punpcklwd %xmm7,%xmm5 \n"
    "punpckhwd %xmm7,%xmm6 \n"
    "movdqa %xmm6,%xmm7 \n"
    // Third round of bit swap: interleave dwords, then write each result
    // half to the two planes (movlpd = low 8 bytes to dst_a,
    // movhpd = high 8 bytes to dst_b).
    "movdqa %xmm0,%xmm6 \n"
    "punpckldq %xmm4,%xmm0 \n"
    "punpckhdq %xmm4,%xmm6 \n"
    "movdqa %xmm6,%xmm4 \n"
    "movdqu (%esp),%xmm6 \n"           // reload spilled xmm6
    "movlpd %xmm0,(%edx) \n"
    "movhpd %xmm0,(%ebx) \n"
    "movlpd %xmm4,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm4,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm2,%xmm0 \n"
    "punpckldq %xmm6,%xmm2 \n"
    "movlpd %xmm2,(%edx) \n"
    "movhpd %xmm2,(%ebx) \n"
    "punpckhdq %xmm6,%xmm0 \n"
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm1,%xmm0 \n"
    "punpckldq %xmm5,%xmm1 \n"
    "movlpd %xmm1,(%edx) \n"
    "movhpd %xmm1,(%ebx) \n"
    "punpckhdq %xmm5,%xmm0 \n"
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm3,%xmm0 \n"
    "punpckldq %xmm7,%xmm3 \n"
    "movlpd %xmm3,(%edx) \n"
    "movhpd %xmm3,(%ebx) \n"
    "punpckhdq %xmm7,%xmm0 \n"
    "sub $0x8,%ecx \n"                 // 8 UV pairs done; sets flags for jg
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "jg 1b \n"
    // Restore the original stack pointer and callee-saved registers.
    "mov 0x10(%esp),%esp \n"
    "pop %ebp \n"
    "pop %edi \n"
    "pop %esi \n"
    "pop %ebx \n"
#if defined(__native_client__)
    // Native Client sandbox requires an aligned indirect return sequence.
    "pop %ecx \n"
    "and $0xffffffe0,%ecx \n"
    "jmp *%ecx \n"
#else
    "ret \n"
#endif
);
| 235 #endif |
| 236 #if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ |
| 237 defined(__x86_64__) |
| 238 // 64 bit version has enough registers to do 16x8 to 8x16 at a time. |
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
// Per iteration: reads 8 rows of 16 bytes from src and writes 16 rows of
// 8 bytes to dst; the extra xmm8-xmm15 registers hold the high halves so
// width is consumed 16 columns at a time ("sub $0x10,%2" / "jg 1b").
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
                             uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap: interleave adjacent rows bytewise; low
    // halves stay in xmm0-xmm7, high halves go to xmm8-xmm15.
    ".p2align 2 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu (%0,%3),%%xmm1 \n"
    "lea (%0,%3,2),%0 \n"              // step src down two rows
    "movdqa %%xmm0,%%xmm8 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm8 \n"
    "movdqu (%0),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm8,%%xmm9 \n"
    "palignr $0x8,%%xmm1,%%xmm1 \n"    // extract high 8 bytes
    "palignr $0x8,%%xmm9,%%xmm9 \n"
    "movdqu (%0,%3),%%xmm3 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm2,%%xmm10 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "punpckhbw %%xmm3,%%xmm10 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "movdqa %%xmm10,%%xmm11 \n"
    "movdqu (%0),%%xmm4 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "palignr $0x8,%%xmm11,%%xmm11 \n"
    "movdqu (%0,%3),%%xmm5 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm4,%%xmm12 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "punpckhbw %%xmm5,%%xmm12 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "movdqa %%xmm12,%%xmm13 \n"
    "movdqu (%0),%%xmm6 \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "palignr $0x8,%%xmm13,%%xmm13 \n"
    "movdqu (%0,%3),%%xmm7 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm6,%%xmm14 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
    "punpckhbw %%xmm7,%%xmm14 \n"
    // Rewind src to the top row and advance 16 bytes right:
    // negate stride, src += stride*(-8) + 16, negate stride back.
    "neg %3 \n"
    "movdqa %%xmm6,%%xmm7 \n"
    "movdqa %%xmm14,%%xmm15 \n"
    "lea 0x10(%0,%3,8),%0 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "palignr $0x8,%%xmm15,%%xmm15 \n"
    "neg %3 \n"
    // Second round of bit swap: interleave 16-bit words in both banks.
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpcklwd %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "palignr $0x8,%%xmm2,%%xmm2 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "punpcklwd %%xmm6,%%xmm4 \n"
    "punpcklwd %%xmm7,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "movdqa %%xmm5,%%xmm7 \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "punpcklwd %%xmm10,%%xmm8 \n"
    "punpcklwd %%xmm11,%%xmm9 \n"
    "movdqa %%xmm8,%%xmm10 \n"
    "movdqa %%xmm9,%%xmm11 \n"
    "palignr $0x8,%%xmm10,%%xmm10 \n"
    "palignr $0x8,%%xmm11,%%xmm11 \n"
    "punpcklwd %%xmm14,%%xmm12 \n"
    "punpcklwd %%xmm15,%%xmm13 \n"
    "movdqa %%xmm12,%%xmm14 \n"
    "movdqa %%xmm13,%%xmm15 \n"
    "palignr $0x8,%%xmm14,%%xmm14 \n"
    "palignr $0x8,%%xmm15,%%xmm15 \n"
    // Third round of bit swap: interleave dwords.
    // Write to the destination pointer: 16 rows of 8 bytes, the low bank
    // (xmm0-7) first, then the high bank (xmm8-15).
    "punpckldq %%xmm4,%%xmm0 \n"
    "movq %%xmm0,(%1) \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "palignr $0x8,%%xmm4,%%xmm4 \n"
    "movq %%xmm4,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"              // step dst down two rows
    "punpckldq %%xmm6,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "movq %%xmm2,(%1) \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "punpckldq %%xmm5,%%xmm1 \n"
    "movq %%xmm6,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "movdqa %%xmm1,%%xmm5 \n"
    "movq %%xmm1,(%1) \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "movq %%xmm5,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm7,%%xmm3 \n"
    "movq %%xmm3,(%1) \n"
    "movdqa %%xmm3,%%xmm7 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "movq %%xmm7,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm12,%%xmm8 \n"
    "movq %%xmm8,(%1) \n"
    "movdqa %%xmm8,%%xmm12 \n"
    "palignr $0x8,%%xmm12,%%xmm12 \n"
    "movq %%xmm12,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm14,%%xmm10 \n"
    "movdqa %%xmm10,%%xmm14 \n"
    "movq %%xmm10,(%1) \n"
    "palignr $0x8,%%xmm14,%%xmm14 \n"
    "punpckldq %%xmm13,%%xmm9 \n"
    "movq %%xmm14,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "movdqa %%xmm9,%%xmm13 \n"
    "movq %%xmm9,(%1) \n"
    "palignr $0x8,%%xmm13,%%xmm13 \n"
    "movq %%xmm13,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm15,%%xmm11 \n"
    "movq %%xmm11,(%1) \n"
    "movdqa %%xmm11,%%xmm15 \n"
    "palignr $0x8,%%xmm15,%%xmm15 \n"
    "sub $0x10,%2 \n"                  // 16 columns done; sets flags for jg
    "movq %%xmm15,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "r"((intptr_t)(dst_stride))   // %4
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
  );
}
| 376 |
// 64-bit version of TransposeUVWx8_SSE2.  Transposes an interleaved-UV tile
// 8 rows high: per iteration it reads 8 rows of 16 bytes (8 U/V pairs) from
// src and writes 8 bytes of U per row to dst_a and 8 bytes of V per row to
// dst_b.  width is consumed 8 pairs at a time ("sub $0x8,%3" / "jg 1b").
// xmm8/xmm9 serve as scratch, so no stack spill is needed (unlike the
// 32-bit version above).
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap: interleave adjacent rows bytewise, using
    // xmm8 as scratch for the high halves.
    ".p2align 2 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu (%0,%4),%%xmm1 \n"
    "lea (%0,%4,2),%0 \n"              // step src down two rows
    "movdqa %%xmm0,%%xmm8 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm1 \n"
    "movdqu (%0),%%xmm2 \n"
    "movdqu (%0,%4),%%xmm3 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm2,%%xmm8 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "punpckhbw %%xmm3,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm3 \n"
    "movdqu (%0),%%xmm4 \n"
    "movdqu (%0,%4),%%xmm5 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm4,%%xmm8 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "punpckhbw %%xmm5,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm5 \n"
    "movdqu (%0),%%xmm6 \n"
    "movdqu (%0,%4),%%xmm7 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm6,%%xmm8 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
    // Rewind src to the top row and advance 16 bytes right:
    // negate stride, src += stride*(-8) + 16, negate stride back.
    "neg %4 \n"
    "lea 0x10(%0,%4,8),%0 \n"
    "punpckhbw %%xmm7,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm7 \n"
    "neg %4 \n"
    // Second round of bit swap: interleave 16-bit words, with xmm8/xmm9
    // holding the high results.
    "movdqa %%xmm0,%%xmm8 \n"
    "movdqa %%xmm1,%%xmm9 \n"
    "punpckhwd %%xmm2,%%xmm8 \n"
    "punpckhwd %%xmm3,%%xmm9 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpcklwd %%xmm3,%%xmm1 \n"
    "movdqa %%xmm8,%%xmm2 \n"
    "movdqa %%xmm9,%%xmm3 \n"
    "movdqa %%xmm4,%%xmm8 \n"
    "movdqa %%xmm5,%%xmm9 \n"
    "punpckhwd %%xmm6,%%xmm8 \n"
    "punpckhwd %%xmm7,%%xmm9 \n"
    "punpcklwd %%xmm6,%%xmm4 \n"
    "punpcklwd %%xmm7,%%xmm5 \n"
    "movdqa %%xmm8,%%xmm6 \n"
    "movdqa %%xmm9,%%xmm7 \n"
    // Third round of bit swap: interleave dwords.
    // Write to the destination pointer: for each result register, the low
    // 8 bytes (movlpd) are the U column and the high 8 bytes (movhpd) the
    // V column.
    "movdqa %%xmm0,%%xmm8 \n"
    "punpckldq %%xmm4,%%xmm0 \n"
    "movlpd %%xmm0,(%1) \n"  // Write back U channel
    "movhpd %%xmm0,(%2) \n"  // Write back V channel
    "punpckhdq %%xmm4,%%xmm8 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"              // step dst_a down two rows
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"              // step dst_b down two rows
    "movdqa %%xmm2,%%xmm8 \n"
    "punpckldq %%xmm6,%%xmm2 \n"
    "movlpd %%xmm2,(%1) \n"
    "movhpd %%xmm2,(%2) \n"
    "punpckhdq %%xmm6,%%xmm8 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "movdqa %%xmm1,%%xmm8 \n"
    "punpckldq %%xmm5,%%xmm1 \n"
    "movlpd %%xmm1,(%1) \n"
    "movhpd %%xmm1,(%2) \n"
    "punpckhdq %%xmm5,%%xmm8 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "movdqa %%xmm3,%%xmm8 \n"
    "punpckldq %%xmm7,%%xmm3 \n"
    "movlpd %%xmm3,(%1) \n"
    "movhpd %%xmm3,(%2) \n"
    "punpckhdq %%xmm7,%%xmm8 \n"
    "sub $0x8,%3 \n"                   // 8 UV pairs done; sets flags for jg
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst_a),  // %1
    "+r"(dst_b),  // %2
    "+r"(width)   // %3
  : "r"((intptr_t)(src_stride)),    // %4
    "r"((intptr_t)(dst_stride_a)),  // %5
    "r"((intptr_t)(dst_stride_b))   // %6
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
    "xmm8", "xmm9"
  );
}
| 485 #endif |
| 486 #endif |
| 487 |
| 488 #endif // defined(__x86_64__) || defined(__i386__) |
| 489 |
| 490 #ifdef __cplusplus |
| 491 } // extern "C" |
| 492 } // namespace libyuv |
| 493 #endif |
OLD | NEW |