OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #include "third_party/libyuv/include/libyuv/row.h" |
| 12 |
| 13 #ifdef __cplusplus |
| 14 namespace libyuv { |
| 15 extern "C" { |
| 16 #endif |
| 17 |
| 18 // This module is for GCC x86 and x64. |
| 19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) |
| 20 |
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB.
// Per-byte Y coefficients in little-endian ARGB memory order (B, G, R, A),
// scaled by 128 (the Y rows shift the pmaddubsw/phaddw sums right by 7),
// replicated to fill a 16-byte vector.  The Y rows add kAddY16 afterwards,
// so this appears to be the studio-swing (16..235) matrix — TODO confirm.
static vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
// Full-swing Y coefficients; the YJ rows add kAddYJ64 (a rounding bias)
// before the shift instead of adding the +16 offset after it.
static vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
| 33 |
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

// U coefficients for ARGB in memory order B, G, R, A.  Scaled by 256
// (the UV rows shift right by 8); the signed result is re-biased into
// unsigned range with kAddUV128.
static vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

// Full-range (JPeg) U coefficients; used with kAddUVJ128 rounding bias.
static vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

// V coefficients for ARGB in memory order B, G, R, A.
static vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

// Full-range (JPeg) V coefficients.
static vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// Constants for BGRA.  Same coefficients as ARGB, permuted for the
// BGRA byte order (A, R, G, B in memory).
static vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR (R, G, B, A in memory).
static vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA (A, B, G, R in memory).
static vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

// Byte bias added to Y after the shift (studio-swing offset).
static uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// Word rounding bias added before the >>7 in the full-range YJ rows.
static vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

// Byte bias that recenters signed U/V output around 128.
static uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Word bias (0x8080 = 128<<8 + 128) added before the >>8 in the
// full-range UVJ rows: recenters and rounds in one paddw.
static uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
| 108 |
#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
// Indices 12-15 select don't-care bytes that are overwritten by the
// alpha mask (por with 0xff000000) in the row functions.
static uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.  Same as above with the
// R and B positions swapped.
static uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
// 128 (bit 7 set) zeroes the destination byte in pshufb.
static uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGBToRAW for I422ToRAW.  First 8 + next 4
static uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
#endif  // HAS_RGB24TOARGBROW_SSSE3
| 141 |
#if defined(TESTING) && defined(__x86_64__)
// Test scaffolding, compiled only with -DTESTING on x86_64.
// The long runs of self-referential mov/lea/add appear to exercise
// instruction encodings/alignment on every general purpose register
// (NOTE(review): presumably for NaCl validator or encoder testing —
// confirm against project history).  The final loop copies 8 bytes of
// src_y per iteration into dst_argb.  Not used in production.
void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    ".p2align 5 \n"
    "mov %%eax,%%eax \n"
    "mov %%ebx,%%ebx \n"
    "mov %%ecx,%%ecx \n"
    "mov %%edx,%%edx \n"
    "mov %%esi,%%esi \n"
    "mov %%edi,%%edi \n"
    "mov %%ebp,%%ebp \n"
    "mov %%esp,%%esp \n"
    ".p2align 5 \n"
    "mov %%r8d,%%r8d \n"
    "mov %%r9d,%%r9d \n"
    "mov %%r10d,%%r10d \n"
    "mov %%r11d,%%r11d \n"
    "mov %%r12d,%%r12d \n"
    "mov %%r13d,%%r13d \n"
    "mov %%r14d,%%r14d \n"
    "mov %%r15d,%%r15d \n"
    ".p2align 5 \n"
    "lea (%%rax),%%eax \n"
    "lea (%%rbx),%%ebx \n"
    "lea (%%rcx),%%ecx \n"
    "lea (%%rdx),%%edx \n"
    "lea (%%rsi),%%esi \n"
    "lea (%%rdi),%%edi \n"
    "lea (%%rbp),%%ebp \n"
    "lea (%%rsp),%%esp \n"
    ".p2align 5 \n"
    "lea (%%r8),%%r8d \n"
    "lea (%%r9),%%r9d \n"
    "lea (%%r10),%%r10d \n"
    "lea (%%r11),%%r11d \n"
    "lea (%%r12),%%r12d \n"
    "lea (%%r13),%%r13d \n"
    "lea (%%r14),%%r14d \n"
    "lea (%%r15),%%r15d \n"

    ".p2align 5 \n"
    "lea 0x10(%%rax),%%eax \n"
    "lea 0x10(%%rbx),%%ebx \n"
    "lea 0x10(%%rcx),%%ecx \n"
    "lea 0x10(%%rdx),%%edx \n"
    "lea 0x10(%%rsi),%%esi \n"
    "lea 0x10(%%rdi),%%edi \n"
    "lea 0x10(%%rbp),%%ebp \n"
    "lea 0x10(%%rsp),%%esp \n"
    ".p2align 5 \n"
    "lea 0x10(%%r8),%%r8d \n"
    "lea 0x10(%%r9),%%r9d \n"
    "lea 0x10(%%r10),%%r10d \n"
    "lea 0x10(%%r11),%%r11d \n"
    "lea 0x10(%%r12),%%r12d \n"
    "lea 0x10(%%r13),%%r13d \n"
    "lea 0x10(%%r14),%%r14d \n"
    "lea 0x10(%%r15),%%r15d \n"

    ".p2align 5 \n"
    "add 0x10,%%eax \n"
    "add 0x10,%%ebx \n"
    "add 0x10,%%ecx \n"
    "add 0x10,%%edx \n"
    "add 0x10,%%esi \n"
    "add 0x10,%%edi \n"
    "add 0x10,%%ebp \n"
    "add 0x10,%%esp \n"
    ".p2align 5 \n"
    "add 0x10,%%r8d \n"
    "add 0x10,%%r9d \n"
    "add 0x10,%%r10d \n"
    "add 0x10,%%r11d \n"
    "add 0x10,%%r12d \n"
    "add 0x10,%%r13d \n"
    "add 0x10,%%r14d \n"
    "add 0x10,%%r15d \n"

    // Copy loop: 8 bytes in, one movdqa store out per iteration.
    ".p2align 2 \n"
  "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif  // TESTING
| 239 |
#ifdef HAS_I400TOARGBROW_SSE2
// Expand 8 grey (Y) bytes per iteration into 8 ARGB pixels with
// B = G = R = Y and A = 0xff.  Requires 16-byte aligned dst (movdqa).
// pix must be a multiple of 8 and > 0.
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate alpha mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"   // load 8 Y bytes
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"        // YY
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"        // YYYY (low 4 pixels)
    "punpckhwd %%xmm1,%%xmm1 \n"        // YYYY (high 4 pixels)
    "por %%xmm5,%%xmm0 \n"              // set alpha
    "por %%xmm5,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
| 270 |
// Same as I400ToARGBRow_SSE2 but uses unaligned stores (movdqu), so
// dst_argb may have any alignment.
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                  int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate alpha mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif  // HAS_I400TOARGBROW_SSE2
| 302 |
#ifdef HAS_RGB24TOARGBROW_SSSE3
// Convert 16 RGB24 pixels (48 bytes) per iteration to 16 ARGB pixels
// (64 bytes) using palignr to realign 12-byte groups and pshufb with
// kShuffleMaskRGB24ToARGB to insert the alpha slot, then por the
// 0xff000000 alpha mask.  Unaligned loads; aligned (movdqa) stores.
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"   // pixels 8-11
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"   // pixels 4-7
    "pshufb %%xmm4,%%xmm0 \n"         // pixels 0-3
    "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"   // pixels 12-15
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),   // %1
    "+r"(pix)         // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 344 |
// Same structure as RGB24ToARGBRow_SSSE3 but uses kShuffleMaskRAWToARGB,
// which additionally swaps R and B for the RAW (bgr) byte order.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 385 |
// Convert 8 RGB565 pixels (16 bytes) per iteration to 8 ARGB pixels.
// Each 5/6-bit field is expanded to 8 bits by multiply-high tricks
// (0x108 replicates 5-bit fields, 0x2080 replicates the 6-bit green),
// then alpha 0xff is or'd in before interleaving bytes into ARGB.
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"          // 5-bit -> 8-bit expansion factor
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20802080,%%eax \n"         // 6-bit -> 8-bit expansion factor
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"         // red field mask 0xf800
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"         // green field mask 0x07e0
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"         // alpha mask 0xff00 per word
    "psllw $0x8,%%xmm7 \n"
    // dst <- dst - 2*src so the stores below can address the output
    // as (%1,%0,2) while only src (%0) advances in the loop.
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"            // isolate red
    "psllw $0xb,%%xmm2 \n"             // isolate blue (shifted to top)
    "pmulhuw %%xmm5,%%xmm1 \n"         // expand red to 8 bits
    "pmulhuw %%xmm5,%%xmm2 \n"         // expand blue to 8 bits
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"             // words of R|B
    "pand %%xmm4,%%xmm0 \n"            // isolate green
    "pmulhuw %%xmm6,%%xmm0 \n"         // expand green to 8 bits
    "por %%xmm7,%%xmm0 \n"             // words of A|G
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"       // interleave to BGRA bytes
    "punpckhbw %%xmm0,%%xmm2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm1,0x00,1,0,2)  //  movdqa  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqa,xmm2,0x10,1,0,2)  //  movdqa  %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
| 439 |
// Convert 8 ARGB1555 pixels (16 bytes) per iteration to 8 ARGB pixels.
// 5-bit channels are expanded via pmulhuw with 0x108 / 0x4200; the
// 1-bit alpha is sign-replicated with psraw to 0x00 or 0xff.
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"          // 5-bit -> 8-bit expansion factor
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"         // green expansion factor
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"         // mask 0xf800
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"          // green mask 0x03e0
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"         // mask 0xff00 per word
    "psllw $0x8,%%xmm7 \n"
    // dst <- dst - 2*src; stores below address output as (%1,%0,2).
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"             // red into 0xf800 position
    "psllw $0xb,%%xmm2 \n"             // blue into top bits
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"         // expand blue
    "pmulhuw %%xmm5,%%xmm1 \n"         // expand red
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"             // words of R|B
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"            // isolate green
    "psraw $0x8,%%xmm2 \n"             // replicate alpha bit via sign
    "pmulhuw %%xmm6,%%xmm0 \n"         // expand green
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"             // words of A|G
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm1,0x00,1,0,2)  //  movdqa  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqa,xmm2,0x10,1,0,2)  //  movdqa  %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
| 496 |
// Convert 8 ARGB4444 pixels (16 bytes) per iteration to 8 ARGB pixels.
// Each 4-bit nibble is duplicated into both halves of a byte
// (x -> x | x<<4 for low nibbles, x | x>>4 for high nibbles).
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"          // low-nibble mask 0x0f0f...
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"          // high-nibble mask 0xf0f0...
    "pslld $0x4,%%xmm5 \n"
    // dst <- dst - 2*src; stores below address output as (%1,%0,2).
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"            // low nibbles (B, R)
    "pand %%xmm5,%%xmm2 \n"            // high nibbles (G, A)
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"             // duplicate low nibble up
    "por %%xmm3,%%xmm2 \n"             // duplicate high nibble down
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,2)  //  movdqa  %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqa,xmm1,0x10,1,0,2)  //  movdqa  %%xmm1,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 540 |
// Convert 16 ARGB pixels (64 bytes) per iteration to 16 RGB24 pixels
// (48 bytes): drop alpha with pshufb/kShuffleMaskARGBToRGB24, then
// stitch the four 12-byte results into three 16-byte stores with
// pslldq/psrldq/por.  Unaligned loads and stores throughout.
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"          // 12 valid bytes each, rest zero
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"             // bytes 0-15
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"             // bytes 16-31
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"             // bytes 32-47
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
| 581 |
// Same structure as ARGBToRGB24Row_SSSE3 but with kShuffleMaskARGBToRAW,
// which also swaps R and B for the RAW byte order.
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
| 622 |
// Convert 4 ARGB pixels (16 bytes) per iteration to 4 RGB565 pixels
// (8 bytes): shift each channel to its field, mask, or together and
// pack with packssdw.  Requires 16-byte aligned src (movdqa load).
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"         // blue mask 0x0000001f
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"         // green mask 0x000007e0
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"         // red mask 0x0000f800
    "pslld $0xb,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"             // blue >> 3
    "psrld $0x5,%%xmm2 \n"             // green >> 5 (net << 3 in field)
    "psrad $0x10,%%xmm0 \n"            // red into low word
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"        // 4 dwords -> 4 words
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 662 |
| 663 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { |
| 664 asm volatile ( |
| 665 "pcmpeqb %%xmm4,%%xmm4 \n" |
| 666 "psrld $0x1b,%%xmm4 \n" |
| 667 "movdqa %%xmm4,%%xmm5 \n" |
| 668 "pslld $0x5,%%xmm5 \n" |
| 669 "movdqa %%xmm4,%%xmm6 \n" |
| 670 "pslld $0xa,%%xmm6 \n" |
| 671 "pcmpeqb %%xmm7,%%xmm7 \n" |
| 672 "pslld $0xf,%%xmm7 \n" |
| 673 LABELALIGN |
| 674 "1: \n" |
| 675 "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
| 676 "movdqa %%xmm0,%%xmm1 \n" |
| 677 "movdqa %%xmm0,%%xmm2 \n" |
| 678 "movdqa %%xmm0,%%xmm3 \n" |
| 679 "psrad $0x10,%%xmm0 \n" |
| 680 "psrld $0x3,%%xmm1 \n" |
| 681 "psrld $0x6,%%xmm2 \n" |
| 682 "psrld $0x9,%%xmm3 \n" |
| 683 "pand %%xmm7,%%xmm0 \n" |
| 684 "pand %%xmm4,%%xmm1 \n" |
| 685 "pand %%xmm5,%%xmm2 \n" |
| 686 "pand %%xmm6,%%xmm3 \n" |
| 687 "por %%xmm1,%%xmm0 \n" |
| 688 "por %%xmm3,%%xmm2 \n" |
| 689 "por %%xmm2,%%xmm0 \n" |
| 690 "packssdw %%xmm0,%%xmm0 \n" |
| 691 "lea " MEMLEA(0x10,0) ",%0 \n" |
| 692 "movq %%xmm0," MEMACCESS(1) " \n" |
| 693 "lea " MEMACCESS2(0x8,1) ",%1 \n" |
| 694 "sub $0x4,%2 \n" |
| 695 "jg 1b \n" |
| 696 : "+r"(src), // %0 |
| 697 "+r"(dst), // %1 |
| 698 "+r"(pix) // %2 |
| 699 : |
| 700 : "memory", "cc" |
| 701 #if defined(__SSE2__) |
| 702 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
| 703 #endif |
| 704 ); |
| 705 } |
| 706 |
// Convert 4 ARGB pixels (16 bytes) per iteration to 4 ARGB4444 pixels
// (8 bytes): keep the high nibble of each byte, shift pairs together
// and pack with packuswb.  Requires 16-byte aligned src (movdqa load).
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"         // word mask 0xf000
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"          // word mask 0x00f0
    "psrlw $0x8,%%xmm3 \n"
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"            // low-byte high nibbles
    "pand %%xmm4,%%xmm1 \n"            // high-byte high nibbles
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"             // nibble pairs in low bytes
    "packuswb %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    // NOTE(review): xmm2 is listed but never used; harmless over-clobber.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_RGB24TOARGBROW_SSSE3
| 739 |
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) per iteration to 16 Y bytes:
// pmaddubsw with kARGBToY, phaddw to combine pixel sums, >>7 to undo
// the coefficient scale, pack, then add the +16 bias (kAddY16).
// Requires 16-byte aligned src and dst (movdqa).
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"              // kAddY16
    "movdqa %3,%%xmm4 \n"              // kARGBToY
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 777 |
// Same as ARGBToYRow_SSSE3 but with unaligned loads/stores (movdqu).
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBTOYROW_SSSE3
| 815 |
#ifdef HAS_ARGBTOYJROW_SSSE3
// Full-range (JPeg) variant of ARGBToYRow_SSSE3: uses kARGBToYJ and
// adds the kAddYJ64 rounding bias before the >>7 instead of adding
// +16 after packing.  Requires 16-byte aligned src and dst.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"              // kARGBToYJ
    "movdqa %4,%%xmm5 \n"              // kAddYJ64
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "paddw %%xmm5,%%xmm0 \n"           // round before shift
    "paddw %%xmm5,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 854 |
// Same as ARGBToYJRow_SSSE3 but with unaligned loads/stores (movdqu).
void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBTOYJROW_SSSE3
| 893 |
#ifdef HAS_ARGBTOUVROW_SSSE3
// TODO(fbarchard): pass xmm constants to single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
// and considered unsafe.
//
// Convert 16 ARGB pixels from two adjacent rows into 8 U and 8 V bytes
// (2x2 subsampling): average vertically (pavgb with the stride row),
// average horizontally (shufps + pavgb), then apply the U/V matrices
// and recenter with kAddUV128.  Requires 16-byte aligned src rows.
// The first asm block pre-loads constants into xmm3/xmm4/xmm5, which
// the second block relies on surviving — hence "considered unsafe".
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"              // kARGBToU
    "movdqa %1,%%xmm3 \n"              // kARGBToV
    "movdqa %2,%%xmm5 \n"              // kAddUV128
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"                     // dst_v as offset from dst_u
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)   //  pavgb  (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)   //  pavgb  0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)   //  pavgb  0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)   //  pavgb  0x30(%0,%4,1),%%xmm6
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"    // even pixels
    "shufps $0xdd,%%xmm1,%%xmm7 \n"    // odd pixels
    "pavgb %%xmm7,%%xmm0 \n"           // horizontal average
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"       // U
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"       // V
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"        // U in low 8, V in high 8
    "paddb %%xmm5,%%xmm0 \n"           // recenter around 128
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  //  movhps  %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
| 965 |
// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
// Full-range (JPeg) variant: uses kARGBToUJ/kARGBToVJ and adds the
// kAddUVJ128 word bias (recenter + round) before the >>8, instead of
// the byte bias after packing.  Same two-asm-block constant-loading
// workaround (and the same caveat) as ARGBToUVRow_SSSE3.
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"              // kARGBToUJ
    "movdqa %1,%%xmm3 \n"              // kARGBToVJ
    "movdqa %2,%%xmm5 \n"              // kAddUVJ128
  :
  : "m"(kARGBToUJ),   // %0
    "m"(kARGBToVJ),   // %1
    "m"(kAddUVJ128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)   //  pavgb  (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)   //  pavgb  0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)   //  pavgb  0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)   //  pavgb  0x30(%0,%4,1),%%xmm6
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "paddw %%xmm5,%%xmm0 \n"           // recenter + round before shift
    "paddw %%xmm5,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  //  movhps  %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
| 1033 |
// Convert 16 ARGB pixels from each of two adjacent rows into 8 U and 8 V
// bytes (2x2 subsampled chroma), BT.601 variant (kARGBToU/kARGBToV with a
// kAddUV128 byte bias added after packing).
// Unaligned variant: movdqu loads, so no source alignment is required; the
// second row is loaded into xmm7 and pavgb'd (unlike the aligned version,
// which uses pavgb with a memory operand).
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  // Load U coefficients (xmm4), V coefficients (xmm3) and bias (xmm5).
  asm volatile (
    "movdqa    %0,%%xmm4                   \n"
    "movdqa    %1,%%xmm3                   \n"
    "movdqa    %2,%%xmm5                   \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  // NOTE(review): xmm3/xmm4/xmm5 loaded above are assumed to survive into
  // the asm block below; they are deliberately not in its clobber list.
  asm volatile (
    // Address dst_v relative to dst_u: %2 becomes dst_v - dst_u.
    "sub       %1,%2                       \n"
    LABELALIGN
  "1:                                      \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    // Average vertically with the matching 16 pixels of the second row.
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           // movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0               \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1               \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2               \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6               \n"
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    // Average horizontally: gather even/odd pixels and pavgb them together.
    "movdqa    %%xmm0,%%xmm7               \n"
    "shufps    $0x88,%%xmm1,%%xmm0         \n"
    "shufps    $0xdd,%%xmm1,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm0               \n"
    "movdqa    %%xmm2,%%xmm7               \n"
    "shufps    $0x88,%%xmm6,%%xmm2         \n"
    "shufps    $0xdd,%%xmm6,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm2               \n"
    // Apply U (xmm4) and V (xmm3) coefficients to the averaged pixels.
    "movdqa    %%xmm0,%%xmm1               \n"
    "movdqa    %%xmm2,%%xmm6               \n"
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm3,%%xmm1               \n"
    "pmaddubsw %%xmm3,%%xmm6               \n"
    "phaddw    %%xmm2,%%xmm0               \n"
    "phaddw    %%xmm6,%%xmm1               \n"
    "psraw     $0x8,%%xmm0                 \n"
    "psraw     $0x8,%%xmm1                 \n"
    // Pack to signed bytes, then add the 128 bias (kAddUV128).
    "packsswb  %%xmm1,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%3                    \n"
    // Low 8 bytes -> dst_u; high 8 bytes -> dst_v (dst_u + %2).
    "movlps    %%xmm0," MEMACCESS(1) "     \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           // movhps %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1        \n"
    "jg        1b                          \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
| 1103 |
// Convert 16 ARGB pixels from each of two adjacent rows into 8 U and 8 V
// bytes (2x2 subsampled chroma). JPeg full-range variant: kARGBToUJ/
// kARGBToVJ coefficients; kAddUVJ128 bias added as words before the shift.
// Unaligned variant: movdqu loads, so no source alignment is required.
void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
  // Load U coefficients (xmm4), V coefficients (xmm3) and bias (xmm5).
  asm volatile (
    "movdqa    %0,%%xmm4                   \n"
    "movdqa    %1,%%xmm3                   \n"
    "movdqa    %2,%%xmm5                   \n"
  :
  : "m"(kARGBToUJ),  // %0
    "m"(kARGBToVJ),  // %1
    "m"(kAddUVJ128)  // %2
  );
  // NOTE(review): xmm3/xmm4/xmm5 loaded above are assumed to survive into
  // the asm block below; they are deliberately not in its clobber list.
  asm volatile (
    // Address dst_v relative to dst_u: %2 becomes dst_v - dst_u.
    "sub       %1,%2                       \n"
    LABELALIGN
  "1:                                      \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    // Average vertically with the matching 16 pixels of the second row.
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           // movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0               \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1               \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2               \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6               \n"
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    // Average horizontally: gather even/odd pixels and pavgb them together.
    "movdqa    %%xmm0,%%xmm7               \n"
    "shufps    $0x88,%%xmm1,%%xmm0         \n"
    "shufps    $0xdd,%%xmm1,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm0               \n"
    "movdqa    %%xmm2,%%xmm7               \n"
    "shufps    $0x88,%%xmm6,%%xmm2         \n"
    "shufps    $0xdd,%%xmm6,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm2               \n"
    // Apply U (xmm4) and V (xmm3) coefficients to the averaged pixels.
    "movdqa    %%xmm0,%%xmm1               \n"
    "movdqa    %%xmm2,%%xmm6               \n"
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm3,%%xmm1               \n"
    "pmaddubsw %%xmm3,%%xmm6               \n"
    "phaddw    %%xmm2,%%xmm0               \n"
    "phaddw    %%xmm6,%%xmm1               \n"
    // JPeg variant: bias is added as 16-bit words before the shift.
    "paddw     %%xmm5,%%xmm0               \n"
    "paddw     %%xmm5,%%xmm1               \n"
    "psraw     $0x8,%%xmm0                 \n"
    "psraw     $0x8,%%xmm1                 \n"
    "packsswb  %%xmm1,%%xmm0               \n"
    "sub       $0x10,%3                    \n"
    // Low 8 bytes -> dst_u; high 8 bytes -> dst_v (dst_u + %2).
    "movlps    %%xmm0," MEMACCESS(1) "     \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           // movhps %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1        \n"
    "jg        1b                          \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
| 1174 |
// Convert a single row of ARGB to full-resolution (4:4:4) chroma: one U and
// one V byte per source pixel, 16 pixels (0x40 source bytes) per iteration.
// The loop reads the same 64 source bytes twice: the first pass applies the
// U coefficients (xmm4) and stores 16 U bytes, the second pass applies the
// V coefficients (xmm3) and stores 16 V bytes.
// Aligned variant: movdqa requires 16-byte aligned src and dst rows.
void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int width) {
  // Load U coefficients (xmm4), V coefficients (xmm3) and bias (xmm5).
  asm volatile (
    "movdqa    %0,%%xmm4                   \n"
    "movdqa    %1,%%xmm3                   \n"
    "movdqa    %2,%%xmm5                   \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  // NOTE(review): xmm3/xmm4/xmm5 loaded above are assumed to survive into
  // the asm block below; they are deliberately not in its clobber list.
  asm volatile (
    // Address dst_v relative to dst_u: %2 becomes dst_v - dst_u.
    "sub       %1,%2                       \n"
    LABELALIGN
  "1:                                      \n"
    // Pass 1: U channel for 16 pixels.
    "movdqa    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm1               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm4,%%xmm6               \n"
    "phaddw    %%xmm1,%%xmm0               \n"
    "phaddw    %%xmm6,%%xmm2               \n"
    "psraw     $0x8,%%xmm0                 \n"
    "psraw     $0x8,%%xmm2                 \n"
    "packsswb  %%xmm2,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%3                    \n"
    "movdqa    %%xmm0," MEMACCESS(1) "     \n"
    // Pass 2: reload the same 64 source bytes and compute the V channel.
    "movdqa    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm3,%%xmm0               \n"
    "pmaddubsw %%xmm3,%%xmm1               \n"
    "pmaddubsw %%xmm3,%%xmm2               \n"
    "pmaddubsw %%xmm3,%%xmm6               \n"
    "phaddw    %%xmm1,%%xmm0               \n"
    "phaddw    %%xmm6,%%xmm2               \n"
    "psraw     $0x8,%%xmm0                 \n"
    "psraw     $0x8,%%xmm2                 \n"
    "packsswb  %%xmm2,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    BUNDLEALIGN
    // Store 16 V bytes at dst_u + (dst_v - dst_u) == dst_v.
    MEMOPMEM(movdqa,xmm0,0x00,1,2,1)           // movdqa %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1       \n"
    "jg        1b                          \n"
  : "+r"(src_argb),        // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6"
#endif
  );
}
| 1239 |
// Convert a single row of ARGB to full-resolution (4:4:4) chroma: one U and
// one V byte per source pixel, 16 pixels per iteration, in two passes over
// the same 64 source bytes (pass 1 = U via xmm4, pass 2 = V via xmm3).
// Unaligned variant: movdqu loads/stores, so no alignment is required.
void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
                                    uint8* dst_v, int width) {
  // Load U coefficients (xmm4), V coefficients (xmm3) and bias (xmm5).
  asm volatile (
    "movdqa    %0,%%xmm4                   \n"
    "movdqa    %1,%%xmm3                   \n"
    "movdqa    %2,%%xmm5                   \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  // NOTE(review): xmm3/xmm4/xmm5 loaded above are assumed to survive into
  // the asm block below; they are deliberately not in its clobber list.
  asm volatile (
    // Address dst_v relative to dst_u: %2 becomes dst_v - dst_u.
    "sub       %1,%2                       \n"
    LABELALIGN
  "1:                                      \n"
    // Pass 1: U channel for 16 pixels.
    "movdqu    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm1               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm4,%%xmm6               \n"
    "phaddw    %%xmm1,%%xmm0               \n"
    "phaddw    %%xmm6,%%xmm2               \n"
    "psraw     $0x8,%%xmm0                 \n"
    "psraw     $0x8,%%xmm2                 \n"
    "packsswb  %%xmm2,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%3                    \n"
    "movdqu    %%xmm0," MEMACCESS(1) "     \n"
    // Pass 2: reload the same 64 source bytes and compute the V channel.
    "movdqu    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm3,%%xmm0               \n"
    "pmaddubsw %%xmm3,%%xmm1               \n"
    "pmaddubsw %%xmm3,%%xmm2               \n"
    "pmaddubsw %%xmm3,%%xmm6               \n"
    "phaddw    %%xmm1,%%xmm0               \n"
    "phaddw    %%xmm6,%%xmm2               \n"
    "psraw     $0x8,%%xmm0                 \n"
    "psraw     $0x8,%%xmm2                 \n"
    "packsswb  %%xmm2,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    BUNDLEALIGN
    // Store 16 V bytes at dst_u + (dst_v - dst_u) == dst_v.
    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           // movdqu %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1       \n"
    "jg        1b                          \n"
  : "+r"(src_argb),        // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6"
#endif
  );
}
| 1304 |
// Convert a SINGLE row of 16 ARGB pixels into 8 U and 8 V bytes (4:2:2:
// horizontal 2x1 subsampling only -- no second row, hence no stride
// parameter; chroma is averaged across horizontal pixel pairs only).
// Aligned variant: movdqa requires a 16-byte aligned source row.
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  // Load U coefficients (xmm4), V coefficients (xmm3) and bias (xmm5).
  asm volatile (
    "movdqa    %0,%%xmm4                   \n"
    "movdqa    %1,%%xmm3                   \n"
    "movdqa    %2,%%xmm5                   \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  // NOTE(review): xmm3/xmm4/xmm5 loaded above are assumed to survive into
  // the asm block below; they are deliberately not in its clobber list.
  asm volatile (
    // Address dst_v relative to dst_u: %2 becomes dst_v - dst_u.
    "sub       %1,%2                       \n"
    LABELALIGN
  "1:                                      \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    // Average horizontally: gather even/odd pixels and pavgb them together.
    "movdqa    %%xmm0,%%xmm7               \n"
    "shufps    $0x88,%%xmm1,%%xmm0         \n"
    "shufps    $0xdd,%%xmm1,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm0               \n"
    "movdqa    %%xmm2,%%xmm7               \n"
    "shufps    $0x88,%%xmm6,%%xmm2         \n"
    "shufps    $0xdd,%%xmm6,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm2               \n"
    // Apply U (xmm4) and V (xmm3) coefficients to the averaged pixels.
    "movdqa    %%xmm0,%%xmm1               \n"
    "movdqa    %%xmm2,%%xmm6               \n"
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm3,%%xmm1               \n"
    "pmaddubsw %%xmm3,%%xmm6               \n"
    "phaddw    %%xmm2,%%xmm0               \n"
    "phaddw    %%xmm6,%%xmm1               \n"
    "psraw     $0x8,%%xmm0                 \n"
    "psraw     $0x8,%%xmm1                 \n"
    // Pack to signed bytes, then add the 128 bias (kAddUV128).
    "packsswb  %%xmm1,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%3                    \n"
    // Low 8 bytes -> dst_u; high 8 bytes -> dst_v (dst_u + %2).
    "movlps    %%xmm0," MEMACCESS(1) "     \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           // movhps %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1        \n"
    "jg        1b                          \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
| 1365 |
// Convert a SINGLE row of 16 ARGB pixels into 8 U and 8 V bytes (4:2:2:
// horizontal 2x1 subsampling only, no second row / stride parameter).
// Unaligned variant: movdqu loads, so no source alignment is required.
void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  // Load U coefficients (xmm4), V coefficients (xmm3) and bias (xmm5).
  asm volatile (
    "movdqa    %0,%%xmm4                   \n"
    "movdqa    %1,%%xmm3                   \n"
    "movdqa    %2,%%xmm5                   \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  // NOTE(review): xmm3/xmm4/xmm5 loaded above are assumed to survive into
  // the asm block below; they are deliberately not in its clobber list.
  asm volatile (
    // Address dst_v relative to dst_u: %2 becomes dst_v - dst_u.
    "sub       %1,%2                       \n"
    LABELALIGN
  "1:                                      \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    // Average horizontally: gather even/odd pixels and pavgb them together.
    "movdqa    %%xmm0,%%xmm7               \n"
    "shufps    $0x88,%%xmm1,%%xmm0         \n"
    "shufps    $0xdd,%%xmm1,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm0               \n"
    "movdqa    %%xmm2,%%xmm7               \n"
    "shufps    $0x88,%%xmm6,%%xmm2         \n"
    "shufps    $0xdd,%%xmm6,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm2               \n"
    // Apply U (xmm4) and V (xmm3) coefficients to the averaged pixels.
    "movdqa    %%xmm0,%%xmm1               \n"
    "movdqa    %%xmm2,%%xmm6               \n"
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm3,%%xmm1               \n"
    "pmaddubsw %%xmm3,%%xmm6               \n"
    "phaddw    %%xmm2,%%xmm0               \n"
    "phaddw    %%xmm6,%%xmm1               \n"
    "psraw     $0x8,%%xmm0                 \n"
    "psraw     $0x8,%%xmm1                 \n"
    // Pack to signed bytes, then add the 128 bias (kAddUV128).
    "packsswb  %%xmm1,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%3                    \n"
    // Low 8 bytes -> dst_u; high 8 bytes -> dst_v (dst_u + %2).
    "movlps    %%xmm0," MEMACCESS(1) "     \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           // movhps %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1        \n"
    "jg        1b                          \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
| 1426 |
// Convert 16 BGRA pixels per iteration to 8-bit luma (Y) using the
// kBGRAToY coefficients (%3) and adding the kAddY16 bias (%4).
// Aligned variant: movdqa requires 16-byte aligned src_bgra and dst_y.
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                   \n"
    "movdqa    %3,%%xmm4                   \n"
    LABELALIGN
  "1:                                      \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    // Weighted per-channel multiply-add against the luma coefficients.
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm1               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm4,%%xmm3               \n"
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    "phaddw    %%xmm1,%%xmm0               \n"
    "phaddw    %%xmm3,%%xmm2               \n"
    // Scale the 16-bit sums down by 128, pack to bytes, add the 16 bias.
    "psrlw     $0x7,%%xmm0                 \n"
    "psrlw     $0x7,%%xmm2                 \n"
    "packuswb  %%xmm2,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%2                    \n"
    "movdqa    %%xmm0," MEMACCESS(1) "     \n"
    "lea       " MEMLEA(0x10,1) ",%1       \n"
    "jg        1b                          \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 1463 |
// Convert 16 BGRA pixels per iteration to 8-bit luma (Y) using the
// kBGRAToY coefficients (%3) and adding the kAddY16 bias (%4).
// Unaligned variant: movdqu, so no src/dst alignment is required.
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                   \n"
    "movdqa    %3,%%xmm4                   \n"
    LABELALIGN
  "1:                                      \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    // Weighted per-channel multiply-add against the luma coefficients.
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm1               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm4,%%xmm3               \n"
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    "phaddw    %%xmm1,%%xmm0               \n"
    "phaddw    %%xmm3,%%xmm2               \n"
    // Scale the 16-bit sums down by 128, pack to bytes, add the 16 bias.
    "psrlw     $0x7,%%xmm0                 \n"
    "psrlw     $0x7,%%xmm2                 \n"
    "packuswb  %%xmm2,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%2                    \n"
    "movdqu    %%xmm0," MEMACCESS(1) "     \n"
    "lea       " MEMLEA(0x10,1) ",%1       \n"
    "jg        1b                          \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 1500 |
// Convert 16 BGRA pixels from each of two adjacent rows (src_bgra0 and
// src_bgra0 + src_stride_bgra) into 8 U and 8 V bytes (2x2 subsampled
// chroma) using kBGRAToU/kBGRAToV and the kAddUV128 byte bias.
// Aligned variant: movdqa loads require 16-byte aligned source rows.
void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  // Load U coefficients (xmm4), V coefficients (xmm3) and bias (xmm5).
  asm volatile (
    "movdqa    %0,%%xmm4                   \n"
    "movdqa    %1,%%xmm3                   \n"
    "movdqa    %2,%%xmm5                   \n"
  :
  : "m"(kBGRAToU),  // %0
    "m"(kBGRAToV),  // %1
    "m"(kAddUV128)  // %2
  );
  // NOTE(review): xmm3/xmm4/xmm5 loaded above are assumed to survive into
  // the asm block below; they are deliberately not in its clobber list.
  asm volatile (
    // Address dst_v relative to dst_u: %2 becomes dst_v - dst_u.
    "sub       %1,%2                       \n"
    LABELALIGN
  "1:                                      \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    // Average vertically with the matching 16 pixels of the second row.
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            // pavgb (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            // pavgb 0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            // pavgb 0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            // pavgb 0x30(%0,%4,1),%%xmm6
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    // Average horizontally: gather even/odd pixels and pavgb them together.
    "movdqa    %%xmm0,%%xmm7               \n"
    "shufps    $0x88,%%xmm1,%%xmm0         \n"
    "shufps    $0xdd,%%xmm1,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm0               \n"
    "movdqa    %%xmm2,%%xmm7               \n"
    "shufps    $0x88,%%xmm6,%%xmm2         \n"
    "shufps    $0xdd,%%xmm6,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm2               \n"
    // Apply U (xmm4) and V (xmm3) coefficients to the averaged pixels.
    "movdqa    %%xmm0,%%xmm1               \n"
    "movdqa    %%xmm2,%%xmm6               \n"
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm3,%%xmm1               \n"
    "pmaddubsw %%xmm3,%%xmm6               \n"
    "phaddw    %%xmm2,%%xmm0               \n"
    "phaddw    %%xmm6,%%xmm1               \n"
    "psraw     $0x8,%%xmm0                 \n"
    "psraw     $0x8,%%xmm1                 \n"
    // Pack to signed bytes, then add the 128 bias (kAddUV128).
    "packsswb  %%xmm1,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%3                    \n"
    // Low 8 bytes -> dst_u; high 8 bytes -> dst_v (dst_u + %2).
    "movlps    %%xmm0," MEMACCESS(1) "     \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           // movhps %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1        \n"
    "jg        1b                          \n"
  : "+r"(src_bgra0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_bgra)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
| 1566 |
// Convert 16 BGRA pixels from each of two adjacent rows into 8 U and 8 V
// bytes (2x2 subsampled chroma) using kBGRAToU/kBGRAToV and kAddUV128.
// Unaligned variant: movdqu loads; the second row is loaded into xmm7 and
// pavgb'd (pavgb memory operands would require alignment).
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                                 uint8* dst_u, uint8* dst_v, int width) {
  // Load U coefficients (xmm4), V coefficients (xmm3) and bias (xmm5).
  asm volatile (
    "movdqa    %0,%%xmm4                   \n"
    "movdqa    %1,%%xmm3                   \n"
    "movdqa    %2,%%xmm5                   \n"
  :
  : "m"(kBGRAToU),  // %0
    "m"(kBGRAToV),  // %1
    "m"(kAddUV128)  // %2
  );
  // NOTE(review): xmm3/xmm4/xmm5 loaded above are assumed to survive into
  // the asm block below; they are deliberately not in its clobber list.
  asm volatile (
    // Address dst_v relative to dst_u: %2 becomes dst_v - dst_u.
    "sub       %1,%2                       \n"
    LABELALIGN
  "1:                                      \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    // Average vertically with the matching 16 pixels of the second row.
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           // movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0               \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1               \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2               \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6               \n"
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    // Average horizontally: gather even/odd pixels and pavgb them together.
    "movdqa    %%xmm0,%%xmm7               \n"
    "shufps    $0x88,%%xmm1,%%xmm0         \n"
    "shufps    $0xdd,%%xmm1,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm0               \n"
    "movdqa    %%xmm2,%%xmm7               \n"
    "shufps    $0x88,%%xmm6,%%xmm2         \n"
    "shufps    $0xdd,%%xmm6,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm2               \n"
    // Apply U (xmm4) and V (xmm3) coefficients to the averaged pixels.
    "movdqa    %%xmm0,%%xmm1               \n"
    "movdqa    %%xmm2,%%xmm6               \n"
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm3,%%xmm1               \n"
    "pmaddubsw %%xmm3,%%xmm6               \n"
    "phaddw    %%xmm2,%%xmm0               \n"
    "phaddw    %%xmm6,%%xmm1               \n"
    "psraw     $0x8,%%xmm0                 \n"
    "psraw     $0x8,%%xmm1                 \n"
    // Pack to signed bytes, then add the 128 bias (kAddUV128).
    "packsswb  %%xmm1,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%3                    \n"
    // Low 8 bytes -> dst_u; high 8 bytes -> dst_v (dst_u + %2).
    "movlps    %%xmm0," MEMACCESS(1) "     \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           // movhps %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1        \n"
    "jg        1b                          \n"
  : "+r"(src_bgra0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_bgra)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
| 1636 |
// Convert 16 ABGR pixels per iteration to 8-bit luma (Y) using the
// kABGRToY coefficients (%3) and adding the kAddY16 bias (%4).
// Aligned variant: movdqa requires 16-byte aligned src_abgr and dst_y.
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                   \n"
    "movdqa    %3,%%xmm4                   \n"
    LABELALIGN
  "1:                                      \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    // Weighted per-channel multiply-add against the luma coefficients.
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm1               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm4,%%xmm3               \n"
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    "phaddw    %%xmm1,%%xmm0               \n"
    "phaddw    %%xmm3,%%xmm2               \n"
    // Scale the 16-bit sums down by 128, pack to bytes, add the 16 bias.
    "psrlw     $0x7,%%xmm0                 \n"
    "psrlw     $0x7,%%xmm2                 \n"
    "packuswb  %%xmm2,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%2                    \n"
    "movdqa    %%xmm0," MEMACCESS(1) "     \n"
    "lea       " MEMLEA(0x10,1) ",%1       \n"
    "jg        1b                          \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 1673 |
// Convert 16 ABGR pixels per iteration to 8-bit luma (Y) using the
// kABGRToY coefficients (%3) and adding the kAddY16 bias (%4).
// Unaligned variant: movdqu, so no src/dst alignment is required.
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                   \n"
    "movdqa    %3,%%xmm4                   \n"
    LABELALIGN
  "1:                                      \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    // Weighted per-channel multiply-add against the luma coefficients.
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm1               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm4,%%xmm3               \n"
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    "phaddw    %%xmm1,%%xmm0               \n"
    "phaddw    %%xmm3,%%xmm2               \n"
    // Scale the 16-bit sums down by 128, pack to bytes, add the 16 bias.
    "psrlw     $0x7,%%xmm0                 \n"
    "psrlw     $0x7,%%xmm2                 \n"
    "packuswb  %%xmm2,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%2                    \n"
    "movdqu    %%xmm0," MEMACCESS(1) "     \n"
    "lea       " MEMLEA(0x10,1) ",%1       \n"
    "jg        1b                          \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 1710 |
// Convert 16 RGBA pixels per iteration to 8-bit luma (Y) using the
// kRGBAToY coefficients (%3) and adding the kAddY16 bias (%4).
// Aligned variant: movdqa requires 16-byte aligned src_rgba and dst_y.
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                   \n"
    "movdqa    %3,%%xmm4                   \n"
    LABELALIGN
  "1:                                      \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    // Weighted per-channel multiply-add against the luma coefficients.
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm1               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm4,%%xmm3               \n"
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    "phaddw    %%xmm1,%%xmm0               \n"
    "phaddw    %%xmm3,%%xmm2               \n"
    // Scale the 16-bit sums down by 128, pack to bytes, add the 16 bias.
    "psrlw     $0x7,%%xmm0                 \n"
    "psrlw     $0x7,%%xmm2                 \n"
    "packuswb  %%xmm2,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%2                    \n"
    "movdqa    %%xmm0," MEMACCESS(1) "     \n"
    "lea       " MEMLEA(0x10,1) ",%1       \n"
    "jg        1b                          \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 1747 |
// Convert 16 RGBA pixels per iteration to 8-bit luma (Y) using the
// kRGBAToY coefficients (%3) and adding the kAddY16 bias (%4).
// Unaligned variant: movdqu, so no src/dst alignment is required.
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                   \n"
    "movdqa    %3,%%xmm4                   \n"
    LABELALIGN
  "1:                                      \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    // Weighted per-channel multiply-add against the luma coefficients.
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm1               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm4,%%xmm3               \n"
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    "phaddw    %%xmm1,%%xmm0               \n"
    "phaddw    %%xmm3,%%xmm2               \n"
    // Scale the 16-bit sums down by 128, pack to bytes, add the 16 bias.
    "psrlw     $0x7,%%xmm0                 \n"
    "psrlw     $0x7,%%xmm2                 \n"
    "packuswb  %%xmm2,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%2                    \n"
    "movdqu    %%xmm0," MEMACCESS(1) "     \n"
    "lea       " MEMLEA(0x10,1) ",%1       \n"
    "jg        1b                          \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 1784 |
// Convert 16 ABGR pixels from each of two adjacent rows (src_abgr0 and
// src_abgr0 + src_stride_abgr) into 8 U and 8 V bytes (2x2 subsampled
// chroma) using kABGRToU/kABGRToV and the kAddUV128 byte bias.
// Aligned variant: movdqa loads require 16-byte aligned source rows.
void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  // Load U coefficients (xmm4), V coefficients (xmm3) and bias (xmm5).
  asm volatile (
    "movdqa    %0,%%xmm4                   \n"
    "movdqa    %1,%%xmm3                   \n"
    "movdqa    %2,%%xmm5                   \n"
  :
  : "m"(kABGRToU),  // %0
    "m"(kABGRToV),  // %1
    "m"(kAddUV128)  // %2
  );
  // NOTE(review): xmm3/xmm4/xmm5 loaded above are assumed to survive into
  // the asm block below; they are deliberately not in its clobber list.
  asm volatile (
    // Address dst_v relative to dst_u: %2 becomes dst_v - dst_u.
    "sub       %1,%2                       \n"
    LABELALIGN
  "1:                                      \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    // Average vertically with the matching 16 pixels of the second row.
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            // pavgb (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            // pavgb 0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            // pavgb 0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            // pavgb 0x30(%0,%4,1),%%xmm6
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    // Average horizontally: gather even/odd pixels and pavgb them together.
    "movdqa    %%xmm0,%%xmm7               \n"
    "shufps    $0x88,%%xmm1,%%xmm0         \n"
    "shufps    $0xdd,%%xmm1,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm0               \n"
    "movdqa    %%xmm2,%%xmm7               \n"
    "shufps    $0x88,%%xmm6,%%xmm2         \n"
    "shufps    $0xdd,%%xmm6,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm2               \n"
    // Apply U (xmm4) and V (xmm3) coefficients to the averaged pixels.
    "movdqa    %%xmm0,%%xmm1               \n"
    "movdqa    %%xmm2,%%xmm6               \n"
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm3,%%xmm1               \n"
    "pmaddubsw %%xmm3,%%xmm6               \n"
    "phaddw    %%xmm2,%%xmm0               \n"
    "phaddw    %%xmm6,%%xmm1               \n"
    "psraw     $0x8,%%xmm0                 \n"
    "psraw     $0x8,%%xmm1                 \n"
    // Pack to signed bytes, then add the 128 bias (kAddUV128).
    "packsswb  %%xmm1,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%3                    \n"
    // Low 8 bytes -> dst_u; high 8 bytes -> dst_v (dst_u + %2).
    "movlps    %%xmm0," MEMACCESS(1) "     \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           // movhps %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1        \n"
    "jg        1b                          \n"
  : "+r"(src_abgr0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_abgr)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
| 1850 |
// Convert 16 ABGR pixels from each of two adjacent rows into 8 U and 8 V
// bytes (2x2 subsampled chroma) using kABGRToU/kABGRToV and kAddUV128.
// Unaligned variant: movdqu loads; the second row is loaded into xmm7 and
// pavgb'd (pavgb memory operands would require alignment).
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                                 uint8* dst_u, uint8* dst_v, int width) {
  // Load U coefficients (xmm4), V coefficients (xmm3) and bias (xmm5).
  asm volatile (
    "movdqa    %0,%%xmm4                   \n"
    "movdqa    %1,%%xmm3                   \n"
    "movdqa    %2,%%xmm5                   \n"
  :
  : "m"(kABGRToU),  // %0
    "m"(kABGRToV),  // %1
    "m"(kAddUV128)  // %2
  );
  // NOTE(review): xmm3/xmm4/xmm5 loaded above are assumed to survive into
  // the asm block below; they are deliberately not in its clobber list.
  asm volatile (
    // Address dst_v relative to dst_u: %2 becomes dst_v - dst_u.
    "sub       %1,%2                       \n"
    LABELALIGN
  "1:                                      \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0     \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    // Average vertically with the matching 16 pixels of the second row.
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           // movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0               \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1               \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2               \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6               \n"
    "lea       " MEMLEA(0x40,0) ",%0       \n"
    // Average horizontally: gather even/odd pixels and pavgb them together.
    "movdqa    %%xmm0,%%xmm7               \n"
    "shufps    $0x88,%%xmm1,%%xmm0         \n"
    "shufps    $0xdd,%%xmm1,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm0               \n"
    "movdqa    %%xmm2,%%xmm7               \n"
    "shufps    $0x88,%%xmm6,%%xmm2         \n"
    "shufps    $0xdd,%%xmm6,%%xmm7         \n"
    "pavgb     %%xmm7,%%xmm2               \n"
    // Apply U (xmm4) and V (xmm3) coefficients to the averaged pixels.
    "movdqa    %%xmm0,%%xmm1               \n"
    "movdqa    %%xmm2,%%xmm6               \n"
    "pmaddubsw %%xmm4,%%xmm0               \n"
    "pmaddubsw %%xmm4,%%xmm2               \n"
    "pmaddubsw %%xmm3,%%xmm1               \n"
    "pmaddubsw %%xmm3,%%xmm6               \n"
    "phaddw    %%xmm2,%%xmm0               \n"
    "phaddw    %%xmm6,%%xmm1               \n"
    "psraw     $0x8,%%xmm0                 \n"
    "psraw     $0x8,%%xmm1                 \n"
    // Pack to signed bytes, then add the 128 bias (kAddUV128).
    "packsswb  %%xmm1,%%xmm0               \n"
    "paddb     %%xmm5,%%xmm0               \n"
    "sub       $0x10,%3                    \n"
    // Low 8 bytes -> dst_u; high 8 bytes -> dst_v (dst_u + %2).
    "movlps    %%xmm0," MEMACCESS(1) "     \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           // movhps %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1        \n"
    "jg        1b                          \n"
  : "+r"(src_abgr0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_abgr)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
| 1920 |
// Convert 16 pixels from each of two adjacent RGBA rows (src_rgba0 and
// src_rgba0 + src_stride_rgba) per loop iteration into 8 U and 8 V bytes.
// 2x2 pixel blocks are averaged (pavgb), weighted by kRGBAToU/kRGBAToV via
// pmaddubsw, and biased with kAddUV128.  Aligned variant: movdqa loads and
// pavgb with memory operands require 16-byte aligned source rows.
// width is in pixels and is processed in multiples of 16.
void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
                       uint8* dst_u, uint8* dst_v, int width) {
  // NOTE(review): xmm3/xmm4/xmm5 are loaded here in a separate asm statement
  // with no clobber list; the loop below assumes they survive -- TODO confirm.
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kRGBAToU), // %0
    "m"(kRGBAToV), // %1
    "m"(kAddUV128) // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
    "lea " MEMLEA(0x40,0) ",%0 \n"
    // Average adjacent pixel pairs to finish the 2x2 box filter.
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    // Apply U (xmm4) and V (xmm3) coefficients, pack, and bias to 128.
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_rgba0), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+rm"(width) // %3
  : "r"((intptr_t)(src_stride_rgba)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
| 1986 |
// Convert 16 pixels from each of two adjacent RGBA rows (src_rgba0 and
// src_rgba0 + src_stride_rgba) per loop iteration into 8 U and 8 V bytes.
// Same math as RGBAToUVRow_SSSE3 but uses movdqu so the source rows need
// not be 16-byte aligned.  width is processed in multiples of 16.
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
                                 uint8* dst_u, uint8* dst_v, int width) {
  // NOTE(review): xmm3/xmm4/xmm5 are loaded here in a separate asm statement
  // with no clobber list; the loop below assumes they survive -- TODO confirm.
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kRGBAToU), // %0
    "m"(kRGBAToV), // %1
    "m"(kAddUV128) // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    // Average adjacent pixel pairs to finish the 2x2 box filter.
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    // Apply U (xmm4) and V (xmm3) coefficients, pack, and bias to 128.
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_rgba0), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+rm"(width) // %3
  : "r"((intptr_t)(src_stride_rgba)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
| 2056 #endif // HAS_ARGBTOUVROW_SSSE3 |
| 2057 |
| 2058 #ifdef HAS_I422TOARGBROW_SSSE3 |
| 2059 #define UB 127 /* min(63,(int8)(2.018 * 64)) */ |
| 2060 #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ |
| 2061 #define UR 0 |
| 2062 |
| 2063 #define VB 0 |
| 2064 #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ |
| 2065 #define VR 102 /* (int8)(1.596 * 64 + 0.5) */ |
| 2066 |
| 2067 // Bias |
| 2068 #define BB UB * 128 + VB * 128 |
| 2069 #define BG UG * 128 + VG * 128 |
| 2070 #define BR UR * 128 + VR * 128 |
| 2071 |
| 2072 #define YG 74 /* (int8)(1.164 * 64 + 0.5) */ |
| 2073 |
| 2074 struct { |
| 2075 vec8 kUVToB; // 0 |
| 2076 vec8 kUVToG; // 16 |
| 2077 vec8 kUVToR; // 32 |
| 2078 vec16 kUVBiasB; // 48 |
| 2079 vec16 kUVBiasG; // 64 |
| 2080 vec16 kUVBiasR; // 80 |
| 2081 vec16 kYSub16; // 96 |
| 2082 vec16 kYToRgb; // 112 |
| 2083 vec8 kVUToB; // 128 |
| 2084 vec8 kVUToG; // 144 |
| 2085 vec8 kVUToR; // 160 |
| 2086 } static SIMD_ALIGNED(kYuvConstants) = { |
| 2087 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB }, |
| 2088 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, |
| 2089 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR }, |
| 2090 { BB, BB, BB, BB, BB, BB, BB, BB }, |
| 2091 { BG, BG, BG, BG, BG, BG, BG, BG }, |
| 2092 { BR, BR, BR, BR, BR, BR, BR, BR }, |
| 2093 { 16, 16, 16, 16, 16, 16, 16, 16 }, |
| 2094 { YG, YG, YG, YG, YG, YG, YG, YG }, |
| 2095 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB }, |
| 2096 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, |
| 2097 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR } |
| 2098 }; |
| 2099 |
| 2100 |
| 2101 // Read 8 UV from 411 |
| 2102 #define READYUV444 \ |
| 2103 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
| 2104 BUNDLEALIGN \ |
| 2105 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
| 2106 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
| 2107 "punpcklbw %%xmm1,%%xmm0 \n" |
| 2108 |
| 2109 // Read 4 UV from 422, upsample to 8 UV |
| 2110 #define READYUV422 \ |
| 2111 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
| 2112 BUNDLEALIGN \ |
| 2113 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
| 2114 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ |
| 2115 "punpcklbw %%xmm1,%%xmm0 \n" \ |
| 2116 "punpcklwd %%xmm0,%%xmm0 \n" |
| 2117 |
| 2118 // Read 2 UV from 411, upsample to 8 UV |
| 2119 #define READYUV411 \ |
| 2120 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
| 2121 BUNDLEALIGN \ |
| 2122 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
| 2123 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ |
| 2124 "punpcklbw %%xmm1,%%xmm0 \n" \ |
| 2125 "punpcklwd %%xmm0,%%xmm0 \n" \ |
| 2126 "punpckldq %%xmm0,%%xmm0 \n" |
| 2127 |
| 2128 // Read 4 UV from NV12, upsample to 8 UV |
| 2129 #define READNV12 \ |
| 2130 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
| 2131 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ |
| 2132 "punpcklwd %%xmm0,%%xmm0 \n" |
| 2133 |
| 2134 // Convert 8 pixels: 8 UV and 8 Y |
| 2135 #define YUVTORGB \ |
| 2136 "movdqa %%xmm0,%%xmm1 \n" \ |
| 2137 "movdqa %%xmm0,%%xmm2 \n" \ |
| 2138 "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \ |
| 2139 "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \ |
| 2140 "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \ |
| 2141 "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ |
| 2142 "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ |
| 2143 "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ |
| 2144 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ |
| 2145 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ |
| 2146 "punpcklbw %%xmm4,%%xmm3 \n" \ |
| 2147 "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ |
| 2148 "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ |
| 2149 "paddsw %%xmm3,%%xmm0 \n" \ |
| 2150 "paddsw %%xmm3,%%xmm1 \n" \ |
| 2151 "paddsw %%xmm3,%%xmm2 \n" \ |
| 2152 "psraw $0x6,%%xmm0 \n" \ |
| 2153 "psraw $0x6,%%xmm1 \n" \ |
| 2154 "psraw $0x6,%%xmm2 \n" \ |
| 2155 "packuswb %%xmm0,%%xmm0 \n" \ |
| 2156 "packuswb %%xmm1,%%xmm1 \n" \ |
| 2157 "packuswb %%xmm2,%%xmm2 \n" |
| 2158 |
| 2159 // Convert 8 pixels: 8 VU and 8 Y |
| 2160 #define YVUTORGB \ |
| 2161 "movdqa %%xmm0,%%xmm1 \n" \ |
| 2162 "movdqa %%xmm0,%%xmm2 \n" \ |
| 2163 "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \ |
| 2164 "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \ |
| 2165 "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \ |
| 2166 "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ |
| 2167 "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ |
| 2168 "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ |
| 2169 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ |
| 2170 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ |
| 2171 "punpcklbw %%xmm4,%%xmm3 \n" \ |
| 2172 "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ |
| 2173 "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ |
| 2174 "paddsw %%xmm3,%%xmm0 \n" \ |
| 2175 "paddsw %%xmm3,%%xmm1 \n" \ |
| 2176 "paddsw %%xmm3,%%xmm2 \n" \ |
| 2177 "psraw $0x6,%%xmm0 \n" \ |
| 2178 "psraw $0x6,%%xmm1 \n" \ |
| 2179 "psraw $0x6,%%xmm2 \n" \ |
| 2180 "packuswb %%xmm0,%%xmm0 \n" \ |
| 2181 "packuswb %%xmm1,%%xmm1 \n" \ |
| 2182 "packuswb %%xmm2,%%xmm2 \n" |
| 2183 |
// Convert 8 pixels of planar 4:4:4 YUV to 8 ARGB pixels (32 bytes) per loop
// iteration.  Alpha is forced to 0xff (xmm5 set to all ones via pcmpeqb).
// Aligned variant: dst_argb is written with movdqa and must be 16-byte
// aligned.  width is processed in multiples of 8.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READYUV444
    YUVTORGB
    // Interleave B/G/R/A bytes into two 16-byte ARGB output vectors.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2222 |
// Convert 8 pixels of planar 4:2:2 YUV to 8 RGB24 pixels (24 bytes) per loop
// iteration: ARGB is produced first, then shuffled down to 3 bytes/pixel via
// kShuffleMaskARGBToRGB24_0/kShuffleMaskARGBToRGB24.  width is processed in
// multiples of 8.  Note: the alpha lane is filled by duplicating xmm2 (R)
// since it is discarded by the shuffle.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* dst_rgb24,
                                 int width) {
// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
// So the shuffle masks are preloaded into xmm5/xmm6 in a separate asm
// statement on i386 instead of being passed as "m" operands below.
#if defined(__i386__)
  asm volatile (
    "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
  :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
#endif

  asm volatile (
#if !defined(__i386__)
    "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
#endif
    "sub %[u_buf],%[v_buf] \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
    "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
    "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
#if !defined(__i386__)
    , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
#endif
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
| 2280 |
// Convert 8 pixels of planar 4:2:2 YUV to 8 RAW pixels (24 bytes, reversed
// byte order vs RGB24) per loop iteration, using the ARGB-to-RAW shuffle
// masks kShuffleMaskARGBToRAW_0/kShuffleMaskARGBToRAW.  width is processed
// in multiples of 8.
void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_raw,
                               int width) {
// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
// So the shuffle masks are preloaded into xmm5/xmm6 in a separate asm
// statement on i386 instead of being passed as "m" operands below.
#if defined(__i386__)
  asm volatile (
    "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
  :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
#endif

  asm volatile (
#if !defined(__i386__)
    "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
#endif
    "sub %[u_buf],%[v_buf] \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS([dst_raw]) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
    "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_raw]"+r"(dst_raw), // %[dst_raw]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
#if !defined(__i386__)
    , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
#endif
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
| 2338 |
// Convert 8 pixels of planar 4:2:2 YUV to 8 ARGB pixels (32 bytes) per loop
// iteration.  Alpha is forced to 0xff (xmm5 all ones).  Aligned variant:
// dst_argb is written with movdqa and must be 16-byte aligned.  width is
// processed in multiples of 8.
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2377 |
// Convert 8 pixels of planar 4:1:1 YUV (2 UV pairs upsampled to 8 via
// READYUV411) to 8 ARGB pixels (32 bytes) per loop iteration.  Alpha forced
// to 0xff.  Aligned variant: dst_argb must be 16-byte aligned.  width is
// processed in multiples of 8.
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READYUV411
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2416 |
// Convert 8 pixels of NV12 (Y plane plus interleaved UV plane) to 8 ARGB
// pixels (32 bytes) per loop iteration.  Alpha forced to 0xff.  Aligned
// variant: dst_argb must be 16-byte aligned.  width is processed in
// multiples of 8.
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READNV12
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [uv_buf]"+r"(uv_buf), // %[uv_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
  // Does not use r14.
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2450 |
// Convert 8 pixels of NV21 (Y plane plus interleaved VU plane) to 8 ARGB
// pixels per loop iteration.  Same read pattern as NV12 but the VU byte
// order is handled by YVUTORGB's kVUTo* coefficient rows.  Alpha forced to
// 0xff.  Aligned variant; width is processed in multiples of 8.
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READNV12
    YVUTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [uv_buf]"+r"(uv_buf), // %[uv_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
  // Does not use r14.
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2484 |
// Convert 8 pixels of planar 4:4:4 YUV to 8 ARGB pixels per loop iteration.
// Same as I444ToARGBRow_SSSE3 but dst_argb is written with movdqu, so no
// destination alignment is required.  width is processed in multiples of 8.
void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READYUV444
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2523 |
// Convert 8 pixels of planar 4:2:2 YUV to 8 ARGB pixels per loop iteration.
// Same as I422ToARGBRow_SSSE3 but dst_argb is written with movdqu, so no
// destination alignment is required.  width is processed in multiples of 8.
void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2562 |
// Convert 8 pixels of planar 4:1:1 YUV to 8 ARGB pixels per loop iteration.
// Same as I411ToARGBRow_SSSE3 but dst_argb is written with movdqu, so no
// destination alignment is required.  width is processed in multiples of 8.
void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READYUV411
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2601 |
// Convert 8 pixels of NV12 to 8 ARGB pixels per loop iteration.  Same as
// NV12ToARGBRow_SSSE3 but dst_argb is written with movdqu, so no destination
// alignment is required.  width is processed in multiples of 8.
void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* uv_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READNV12
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [uv_buf]"+r"(uv_buf), // %[uv_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
  // Does not use r14.
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2635 |
// Convert 8 pixels of NV21 (interleaved VU) to 8 ARGB pixels per loop
// iteration.  Same as NV21ToARGBRow_SSSE3 but dst_argb is written with
// movdqu, so no destination alignment is required.  width is processed in
// multiples of 8.
void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* uv_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READNV12
    YVUTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [uv_buf]"+r"(uv_buf), // %[uv_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
  // Does not use r14.
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2669 |
// Convert 8 pixels of planar 4:2:2 YUV to 8 BGRA pixels (32 bytes) per loop
// iteration.  The interleave order differs from the ARGB variant: alpha
// (xmm5, refreshed each iteration since YUVTORGB clobbers nothing in it but
// the shuffles consume it) leads the byte order.  Aligned variant: dst_bgra
// must be 16-byte aligned.  width is processed in multiples of 8.
void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_bgra,
                                int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READYUV422
    YUVTORGB
    // Re-set alpha; the interleaves below consume xmm5 as a data register.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n"
    "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
    "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2709 |
// Convert 8 pixels of planar 4:2:2 YUV to 8 ABGR pixels (32 bytes) per loop
// iteration; the R/G/B/A interleave order is swapped relative to the ARGB
// variant.  Alpha forced to 0xff.  Aligned variant: dst_abgr must be
// 16-byte aligned.  width is processed in multiples of 8.
void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_abgr,
                                int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm2 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n"
    "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
    "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2748 |
// Convert 8 pixels of planar 4:2:2 YUV to 8 RGBA pixels (32 bytes) per loop
// iteration; alpha (0xff) occupies the leading byte position.  Aligned
// variant: dst_rgba must be 16-byte aligned.  width is processed in
// multiples of 8.
void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_rgba,
                                int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READYUV422
    YUVTORGB
    // Re-set alpha; the interleaves below consume xmm5 as a data register.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm2,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n"
    "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
    "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2788 |
// Convert 8 pixels of planar 4:2:2 YUV to 8 BGRA pixels per loop iteration.
// Same as I422ToBGRARow_SSSE3 but dst_bgra is written with movdqu, so no
// destination alignment is required.  width is processed in multiples of 8.
void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_bgra,
                                          int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READYUV422
    YUVTORGB
    // Re-set alpha; the interleaves below consume xmm5 as a data register.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n"
    "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
    "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2828 |
// Converts a row of I422 to ABGR using SSSE3 with unaligned (movdqu)
// stores. Alpha comes from xmm5 (set to all-ones before the loop and not
// clobbered by the macros on this path). 8 pixels per iteration; width
// assumed to be a multiple of 8 -- TODO(review): confirm callers.
void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_abgr,
                                          int width) {
  asm volatile (
    // v_buf is accessed as an offset from u_buf inside READYUV422.
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READYUV422
    YUVTORGB
    // Interleave channel registers into ABGR pixel order.
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm2 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n"
    "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
    "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2867 |
// Converts a row of I422 to RGBA using SSSE3 with unaligned (movdqu)
// stores; otherwise identical to I422ToRGBARow_SSSE3. 8 pixels per
// iteration; width assumed to be a multiple of 8 -- TODO(review): confirm.
void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_rgba,
                                          int width) {
  asm volatile (
    // v_buf is accessed as an offset from u_buf inside READYUV422.
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    READYUV422
    YUVTORGB
    // Rebuild 0xff alpha bytes and interleave channels into RGBA order.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm2,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n"
    "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
    "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 2907 |
| 2908 #endif // HAS_I422TOARGBROW_SSSE3 |
| 2909 |
| 2910 #ifdef HAS_YTOARGBROW_SSE2 |
| 2911 void YToARGBRow_SSE2(const uint8* y_buf, |
| 2912 uint8* dst_argb, |
| 2913 int width) { |
| 2914 asm volatile ( |
| 2915 "pxor %%xmm5,%%xmm5 \n" |
| 2916 "pcmpeqb %%xmm4,%%xmm4 \n" |
| 2917 "pslld $0x18,%%xmm4 \n" |
| 2918 "mov $0x00100010,%%eax \n" |
| 2919 "movd %%eax,%%xmm3 \n" |
| 2920 "pshufd $0x0,%%xmm3,%%xmm3 \n" |
| 2921 "mov $0x004a004a,%%eax \n" |
| 2922 "movd %%eax,%%xmm2 \n" |
| 2923 "pshufd $0x0,%%xmm2,%%xmm2 \n" |
| 2924 LABELALIGN |
| 2925 "1: \n" |
| 2926 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 |
| 2927 "movq " MEMACCESS(0) ",%%xmm0 \n" |
| 2928 "lea " MEMLEA(0x8,0) ",%0 \n" |
| 2929 "punpcklbw %%xmm5,%%xmm0 \n" |
| 2930 "psubusw %%xmm3,%%xmm0 \n" |
| 2931 "pmullw %%xmm2,%%xmm0 \n" |
| 2932 "psrlw $6, %%xmm0 \n" |
| 2933 "packuswb %%xmm0,%%xmm0 \n" |
| 2934 |
| 2935 // Step 2: Weave into ARGB |
| 2936 "punpcklbw %%xmm0,%%xmm0 \n" |
| 2937 "movdqa %%xmm0,%%xmm1 \n" |
| 2938 "punpcklwd %%xmm0,%%xmm0 \n" |
| 2939 "punpckhwd %%xmm1,%%xmm1 \n" |
| 2940 "por %%xmm4,%%xmm0 \n" |
| 2941 "por %%xmm4,%%xmm1 \n" |
| 2942 "movdqa %%xmm0," MEMACCESS(1) " \n" |
| 2943 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" |
| 2944 "lea " MEMLEA(0x20,1) ",%1 \n" |
| 2945 |
| 2946 "sub $0x8,%2 \n" |
| 2947 "jg 1b \n" |
| 2948 : "+r"(y_buf), // %0 |
| 2949 "+r"(dst_argb), // %1 |
| 2950 "+rm"(width) // %2 |
| 2951 : |
| 2952 : "memory", "cc", "eax" |
| 2953 #if defined(__SSE2__) |
| 2954 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
| 2955 #endif |
| 2956 ); |
| 2957 } |
| 2958 #endif // HAS_YTOARGBROW_SSE2 |
| 2959 |
| 2960 #ifdef HAS_MIRRORROW_SSSE3 |
// Shuffle table for reversing the bytes of a 16-byte vector (pshufb
// indices 15..0).
static uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
| 2965 |
// Horizontally mirrors a row of bytes using SSSE3 pshufb. Reads 16 bytes
// per iteration from the tail of src (src is pre-biased by -16 and indexed
// by the remaining count) and writes them reversed to dst with aligned
// stores. Width assumed to be a multiple of 16 -- TODO(review): confirm.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    // Bias src so that (src + temp_width) points at the last 16 bytes.
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    LABELALIGN
  "1: \n"
    MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
    "pshufb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(temp_width) // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
| 2992 #endif // HAS_MIRRORROW_SSSE3 |
| 2993 |
| 2994 #ifdef HAS_MIRRORROW_SSE2 |
// Horizontally mirrors a row of bytes using only SSE2 (no pshufb):
// swaps bytes within words via shifts+por, then reverses word and qword
// order with pshuflw/pshufhw/pshufd. Unaligned loads/stores. Width
// assumed to be a multiple of 16 -- TODO(review): confirm callers.
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    // Bias src so that (src + temp_width) points at the last 16 bytes.
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    LABELALIGN
  "1: \n"
    MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0
    // Swap the two bytes inside each 16-bit lane.
    "movdqa %%xmm0,%%xmm1 \n"
    "psllw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    // Reverse word order within each half, then swap the halves.
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufd $0x4e,%%xmm0,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1)",%1 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(temp_width) // %2
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
| 3026 #endif // HAS_MIRRORROW_SSE2 |
| 3027 |
| 3028 #ifdef HAS_MIRRORROW_UV_SSSE3 |
// Shuffle table for reversing the bytes of UV channels: gathers the U
// bytes (even indices, reversed) into the low 8 bytes and the V bytes
// (odd indices, reversed) into the high 8 bytes.
static uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
// Mirrors an interleaved UV row while splitting it into separate U and V
// planes (SSSE3). Walks src backwards 16 bytes (8 UV pairs) at a time,
// using kShuffleMirrorUV to both reverse and deinterleave, then writes 8
// U bytes and 8 V bytes. Width (in UV pairs) assumed to be a multiple of
// 8 -- TODO(review): confirm callers guarantee this.
void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa %4,%%xmm1 \n"
    // Start at the last 16 bytes of src: src + width*2 - 16.
    "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
    // dst_v is addressed as an offset from dst_u below.
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    "pshufb %%xmm1,%%xmm0 \n"
    "sub $8,%3 \n"
    "movlpd %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(temp_width) // %3
  : "m"(kShuffleMirrorUV) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
| 3065 #endif // HAS_MIRRORROW_UV_SSSE3 |
| 3066 |
| 3067 #ifdef HAS_ARGBMIRRORROW_SSSE3 |
// Shuffle table for reversing the order of the four 32-bit ARGB pixels in
// a 16-byte vector (bytes within each pixel keep their order).
static uvec8 kARGBShuffleMirror = {
  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
};
| 3072 |
// Horizontally mirrors a row of ARGB pixels (SSSE3). Walks src backwards
// 4 pixels (16 bytes) at a time and reverses pixel order with pshufb;
// aligned loads/stores. Width (in pixels) assumed to be a multiple of 4
// and both pointers 16-byte aligned -- TODO(review): confirm callers.
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    // Start at the last 16 bytes of src: src + width*4 - 16.
    "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
    "movdqa %3,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(temp_width) // %2
  : "m"(kARGBShuffleMirror) // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
| 3097 #endif // HAS_ARGBMIRRORROW_SSSE3 |
| 3098 |
| 3099 #ifdef HAS_SPLITUVROW_SSE2 |
// Deinterleaves a UV row into separate U and V planes (SSE2, aligned).
// Even bytes (U) are masked out with 0x00ff and packed; odd bytes (V) are
// shifted down and packed. 16 UV pairs per iteration; pix assumed to be a
// multiple of 16 -- TODO(review): confirm callers guarantee this.
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per 16-bit lane: mask that keeps the even (U) bytes.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    // dst_v is addressed as an offset from dst_u below.
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uv), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
| 3137 |
// Deinterleaves a UV row into separate U and V planes (SSE2). Same
// algorithm as SplitUVRow_SSE2 but uses movdqu so the pointers may be
// unaligned. pix assumed to be a multiple of 16 -- TODO(review): confirm.
void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                               int pix) {
  asm volatile (
    // xmm5 = 0x00ff per 16-bit lane: mask that keeps the even (U) bytes.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    // dst_v is addressed as an offset from dst_u below.
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uv), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
| 3176 #endif // HAS_SPLITUVROW_SSE2 |
| 3177 |
| 3178 #ifdef HAS_MERGEUVROW_SSE2 |
// Interleaves separate U and V planes into a UV row (SSE2, aligned).
// Reads 16 U and 16 V bytes per iteration and writes 32 interleaved
// bytes. width assumed to be a multiple of 16 -- TODO(review): confirm.
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
    // src_v is addressed as an offset from src_u below.
    "sub %0,%1 \n"
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm2 \n"
    "movdqa %%xmm0," MEMACCESS(2) " \n"
    "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
    "lea " MEMLEA(0x20,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_u), // %0
    "+r"(src_v), // %1
    "+r"(dst_uv), // %2
    "+r"(width) // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
| 3210 |
// Interleaves separate U and V planes into a UV row (SSE2). Same as
// MergeUVRow_SSE2 but with movdqu, so pointers may be unaligned. width
// assumed to be a multiple of 16 -- TODO(review): confirm callers.
void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
                               uint8* dst_uv, int width) {
  asm volatile (
    // src_v is addressed as an offset from src_u below.
    "sub %0,%1 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm2 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
    "lea " MEMLEA(0x20,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_u), // %0
    "+r"(src_v), // %1
    "+r"(dst_uv), // %2
    "+r"(width) // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
| 3242 #endif // HAS_MERGEUVROW_SSE2 |
| 3243 |
| 3244 #ifdef HAS_COPYROW_SSE2 |
// Copies a row of bytes 32 at a time with aligned SSE2 loads/stores.
// count assumed to be a multiple of 32 and both pointers 16-byte
// aligned -- TODO(review): confirm callers guarantee this.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(count) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
| 3267 #endif // HAS_COPYROW_SSE2 |
| 3268 |
| 3269 #ifdef HAS_COPYROW_X86 |
// Copies a row of bytes using rep movsl (32 bits at a time). The shr by 2
// converts the byte count to a dword count, so any remainder of width % 4
// is NOT copied -- callers are expected to pass a multiple of 4.
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "shr $0x2,%2 \n"
    "rep movsl " MEMMOVESTRING(0,1) " \n"
  : "+S"(src), // %0
    "+D"(dst), // %1
    "+c"(width_tmp) // %2
  :
  : "memory", "cc"
  );
}
| 3282 #endif // HAS_COPYROW_X86 |
| 3283 |
| 3284 #ifdef HAS_COPYROW_ERMS |
// Unaligned Multiple of 1.
// Copies a row one byte at a time using rep movsb, which is fast on CPUs
// with Enhanced REP MOVSB (ERMS). No alignment or width requirements.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep movsb " MEMMOVESTRING(0,1) " \n"
  : "+S"(src), // %0
    "+D"(dst), // %1
    "+c"(width_tmp) // %2
  :
  : "memory", "cc"
  );
}
| 3297 #endif // HAS_COPYROW_ERMS |
| 3298 |
| 3299 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 |
// width in pixels
// Copies only the alpha channel of src ARGB pixels into dst, preserving
// dst's B, G and R channels. 8 pixels per iteration with aligned
// loads/stores; width assumed to be a multiple of 8 -- TODO(review):
// confirm callers guarantee this.
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // xmm0 = 0xff000000 per pixel (alpha mask), xmm1 = 0x00ffffff (BGR).
    "pcmpeqb %%xmm0,%%xmm0 \n"
    "pslld $0x18,%%xmm0 \n"
    "pcmpeqb %%xmm1,%%xmm1 \n"
    "psrld $0x8,%%xmm1 \n"
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    // Read-modify-write dst: keep dst BGR, take src alpha.
    "movdqa " MEMACCESS(1) ",%%xmm4 \n"
    "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
    "pand %%xmm0,%%xmm2 \n"
    "pand %%xmm0,%%xmm3 \n"
    "pand %%xmm1,%%xmm4 \n"
    "pand %%xmm1,%%xmm5 \n"
    "por %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqa %%xmm2," MEMACCESS(1) " \n"
    "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 3335 #endif // HAS_ARGBCOPYALPHAROW_SSE2 |
| 3336 |
| 3337 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 |
// width in pixels
// Copies only the alpha channel of src ARGB pixels into dst (AVX2),
// preserving dst's B, G and R channels via vpblendvb with a 0x00ffffff
// per-pixel mask. 16 pixels per iteration, unaligned loads/stores; width
// assumed to be a multiple of 16 -- TODO(review): confirm callers.
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // ymm0 = 0x00ffffff per pixel: blend mask selecting dst's BGR bytes.
    "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpsrld $0x8,%%ymm0,%%ymm0 \n"
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
    "vmovdqu %%ymm1," MEMACCESS(1) " \n"
    "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
| 3366 #endif // HAS_ARGBCOPYALPHAROW_AVX2 |
| 3367 |
| 3368 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 |
// width in pixels
// Copies a row of Y (one byte per pixel) into the alpha channel of dst
// ARGB pixels, preserving dst's B, G and R channels. 8 pixels per
// iteration with aligned dst accesses; width assumed to be a multiple of
// 8 -- TODO(review): confirm callers guarantee this.
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // xmm0 = 0xff000000 per pixel (alpha mask), xmm1 = 0x00ffffff (BGR).
    "pcmpeqb %%xmm0,%%xmm0 \n"
    "pslld $0x18,%%xmm0 \n"
    "pcmpeqb %%xmm1,%%xmm1 \n"
    "psrld $0x8,%%xmm1 \n"
    LABELALIGN
  "1: \n"
    "movq " MEMACCESS(0) ",%%xmm2 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    // Expand each Y byte into the top byte of a dword. Note xmm3's prior
    // contents are read here uninitialized, but the stray low bytes it
    // contributes are removed by the "pand %%xmm0" mask below.
    "punpcklbw %%xmm2,%%xmm2 \n"
    "punpckhwd %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm2,%%xmm2 \n"
    "movdqa " MEMACCESS(1) ",%%xmm4 \n"
    "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
    "pand %%xmm0,%%xmm2 \n"
    "pand %%xmm0,%%xmm3 \n"
    "pand %%xmm1,%%xmm4 \n"
    "pand %%xmm1,%%xmm5 \n"
    "por %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqa %%xmm2," MEMACCESS(1) " \n"
    "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 3406 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 |
| 3407 |
| 3408 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 |
// width in pixels
// Copies a row of Y into the alpha channel of dst ARGB pixels (AVX2):
// zero-extends 8 Y bytes to dwords, shifts them into the alpha position,
// and blends with dst keeping dst's BGR bytes. 16 pixels per iteration;
// width assumed to be a multiple of 16 -- TODO(review): confirm callers.
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // ymm0 = 0x00ffffff per pixel: blend mask selecting dst's BGR bytes.
    "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpsrld $0x8,%%ymm0,%%ymm0 \n"
    LABELALIGN
  "1: \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "vpslld $0x18,%%ymm1,%%ymm1 \n"
    "vpslld $0x18,%%ymm2,%%ymm2 \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
    "vmovdqu %%ymm1," MEMACCESS(1) " \n"
    "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
| 3439 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 |
| 3440 |
| 3441 #ifdef HAS_SETROW_X86 |
// Fills a row with the 32-bit pattern v32 using rep stosl. The shr by 2
// converts the byte count to a dword count, so any remainder of width % 4
// is NOT written -- callers are expected to pass a multiple of 4.
void SetRow_X86(uint8* dst, uint32 v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "shr $0x2,%1 \n"
    "rep stosl " MEMSTORESTRING(eax,0) " \n"
  : "+D"(dst), // %0
    "+c"(width_tmp) // %1
  : "a"(v32) // %2
  : "memory", "cc");
}
| 3452 |
// Fills 'height' rows of ARGB pixels with the 32-bit value v32 using
// rep stosl. 'width' is in pixels (dwords), dst_stride is in bytes.
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
                     int dst_stride, int height) {
  for (int y = 0; y < height; ++y) {
    size_t width_tmp = (size_t)(width);
    uint32* d = (uint32*)(dst);
    asm volatile (
      "rep stosl " MEMSTORESTRING(eax,0) " \n"
    : "+D"(d), // %0
      "+c"(width_tmp) // %1
    : "a"(v32) // %2
    : "memory", "cc");
    dst += dst_stride;
  }
}
| 3467 #endif // HAS_SETROW_X86 |
| 3468 |
| 3469 #ifdef HAS_YUY2TOYROW_SSE2 |
// Extracts the Y channel from a YUY2 (Y0 U Y1 V) row (SSE2, aligned).
// Y occupies the even bytes, so masking with 0x00ff and packing yields 16
// Y bytes per iteration. pix assumed to be a multiple of 16 --
// TODO(review): confirm callers guarantee this.
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per 16-bit lane: keeps the even (Y) bytes.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_yuy2), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
| 3496 |
// Extracts U and V from a YUY2 row (SSE2, aligned), averaging each pair
// of vertically adjacent rows (the second row is at src + stride_yuy2)
// for 4:2:0 chroma subsampling. Produces 8 U and 8 V bytes per iteration
// from 16 pixels. pix assumed to be a multiple of 16 -- TODO(review):
// confirm callers guarantee this.
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per 16-bit lane.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    // dst_v is addressed as an offset from dst_u below.
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    // Average the two rows, keep the odd (UV) bytes, then split U from V.
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_yuy2), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
  : "r"((intptr_t)(stride_yuy2)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
| 3541 |
// Extracts U and V from a single YUY2 row (SSE2, aligned) without
// vertical averaging (4:2:2 chroma). Produces 8 U and 8 V bytes per
// iteration from 16 pixels. pix assumed to be a multiple of 16 --
// TODO(review): confirm callers guarantee this.
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per 16-bit lane.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    // dst_v is addressed as an offset from dst_u below.
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    // Keep the odd (UV) bytes, then separate U (even) from V (odd).
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_yuy2), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
| 3581 |
// Extracts the Y channel from a YUY2 row (SSE2). Same as YUY2ToYRow_SSE2
// but with movdqu, so pointers may be unaligned. pix assumed to be a
// multiple of 16 -- TODO(review): confirm callers guarantee this.
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per 16-bit lane: keeps the even (Y) bytes.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_yuy2), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
| 3609 |
// Extracts U and V from a YUY2 row with vertical averaging of two rows
// (SSE2). Same as YUY2ToUVRow_SSE2 but with movdqu, so pointers may be
// unaligned. pix assumed to be a multiple of 16 -- TODO(review): confirm.
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
                                int stride_yuy2,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per 16-bit lane.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    // dst_v is addressed as an offset from dst_u below.
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    // Average the two rows, keep the odd (UV) bytes, then split U from V.
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_yuy2), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
  : "r"((intptr_t)(stride_yuy2)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
| 3655 |
// Extracts U and V from a single YUY2 row without vertical averaging
// (SSE2). Same as YUY2ToUV422Row_SSE2 but with movdqu, so pointers may
// be unaligned. pix assumed to be a multiple of 16 -- TODO(review):
// confirm callers guarantee this.
void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per 16-bit lane.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    // dst_v is addressed as an offset from dst_u below.
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    // Keep the odd (UV) bytes, then separate U (even) from V (odd).
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_yuy2), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
| 3695 |
// Extracts the Y channel from a UYVY (U Y0 V Y1) row (SSE2, aligned).
// Y occupies the odd bytes, so shifting right by 8 and packing yields 16
// Y bytes per iteration. pix assumed to be a multiple of 16 --
// TODO(review): confirm callers guarantee this.
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_uyvy), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
| 3720 |
// Extracts U and V from a UYVY row (SSE2, aligned), averaging each pair
// of vertically adjacent rows (second row at src + stride_uyvy) for
// 4:2:0 chroma. In UYVY the chroma occupies the even bytes, so the pand
// mask (rather than a shift, as in the YUY2 variants) selects them.
// Produces 8 U and 8 V bytes per iteration from 16 pixels. pix assumed
// to be a multiple of 16 -- TODO(review): confirm callers.
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per 16-bit lane.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    // dst_v is addressed as an offset from dst_u below.
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    // Average the two rows, keep the even (UV) bytes, then split U from V.
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uyvy), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
  : "r"((intptr_t)(stride_uyvy)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
| 3765 |
// Extracts U and V from a single UYVY row (SSE2, aligned) without
// vertical averaging (4:2:2 chroma). Chroma occupies the even bytes, so
// pand selects it. Produces 8 U and 8 V bytes per iteration from 16
// pixels. pix assumed to be a multiple of 16 -- TODO(review): confirm.
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per 16-bit lane.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    // dst_v is addressed as an offset from dst_u below.
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    // Keep the even (UV) bytes, then separate U from V.
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uyvy), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
| 3805 |
// Extract the Y (luma) channel from a UYVY row. In UYVY the Y byte is the
// high byte of each 16-bit pair, so "psrlw $0x8" isolates it. Unaligned
// variant (movdqu). Processes 16 pixels (32 bytes) per iteration.
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
| 3831 |
// Extract U and V planes from UYVY, vertically averaging two source rows
// (this row and the row at +stride_uyvy) with pavgb — used for 4:2:0
// chroma subsampling. Unaligned variant (movdqu). Processes 16 pixels
// (32 bytes) per iteration, writing 8 U and 8 V bytes. dst_v is addressed
// relative to dst_u ("sub %1,%2").
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Average this row with the next row, then mask off chroma bytes.
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_uyvy)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
| 3876 |
// Extract U and V planes from a single UYVY row (no vertical averaging).
// Identical to UYVYToUV422Row_SSE2 but uses unaligned loads (movdqu).
// Processes 16 pixels (32 bytes) per iteration, writing 8 U and 8 V bytes.
// dst_v is addressed relative to dst_u ("sub %1,%2").
void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // xmm0 now holds interleaved UV; split into U (even) and V (odd) bytes.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
| 3916 #endif // HAS_YUY2TOYROW_SSE2 |
| 3917 |
| 3918 #ifdef HAS_ARGBBLENDROW_SSE2 |
// Blend 4 pixels at a time in the main loop, with 1-pixel loops for
// alignment and remainder.
// Alpha-blend src_argb0 over src_argb1 into dst_argb:
// roughly dst = src * a + bg * (256 - a) in 8-bit fixed point, with the
// result alpha forced to 0xff ("por %%xmm4"). Register setup:
//   xmm7 = 0x0001 per word (rounding bias for the inverted alpha),
//   xmm6 = 0x00ff word mask (even/low bytes), xmm5 = 0xff00 word mask,
//   xmm4 = 0xff000000 per dword (alpha byte of each pixel).
// Structure: a leading 1-pixel loop (label 10) runs until dst_argb is
// 16-byte aligned, a 4-pixel main loop (label 41) uses aligned stores,
// and a trailing 1-pixel loop (label 91) handles the remainder.
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "sub       $0x1,%3                         \n"
    "je        91f                             \n"
    "jl        99f                             \n"

    // 1 pixel loop until destination pointer is aligned.
  "10:                                         \n"
    "test      $0xf,%2                         \n"
    "je        19f                             \n"
    "movd      " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    // pxor with 0xff000000 inverts the alpha byte: xmm3 holds 255 - a.
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      " MEMACCESS(1) ",%%xmm2         \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "jge       10b                             \n"

  "19:                                         \n"
    "add       $1-4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    LABELALIGN
  "41:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    // Aligned store is safe: the 1-pixel pre-loop aligned dst_argb.
    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jge       41b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"
    "jl        99f                             \n"

    // 1 pixel loop.
  "91:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      " MEMACCESS(1) ",%%xmm2         \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "jge       91b                             \n"
  "99:                                         \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
| 4038 #endif // HAS_ARGBBLENDROW_SSE2 |
| 4039 |
| 4040 #ifdef HAS_ARGBBLENDROW_SSSE3 |
// Shuffle table for isolating alpha.
// pshufb control: replicates each pixel's alpha byte (offsets 3/7/11/15)
// into the low byte of every 16-bit lane; the 0x80 entries zero the high
// bytes (pshufb treats a set high bit as "write zero").
static uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
| 4046 |
// Blend 4 pixels at a time in the main loops, with 1-pixel loops for
// alignment and remainder.
| 4050 // Same as SSE2, but replaces |
| 4051 // psrlw xmm3, 8 // alpha |
| 4052 // pshufhw xmm3, xmm3,0F5h // 8 alpha words |
| 4053 // pshuflw xmm3, xmm3,0F5h |
| 4054 // with.. |
| 4055 // pshufb xmm3, kShuffleAlpha // alpha |
| 4056 |
// Alpha-blend src_argb0 over src_argb1 into dst_argb (SSSE3 version).
// Same math as ARGBBlendRow_SSE2, but the three-instruction alpha
// broadcast (psrlw + pshufhw + pshuflw) is replaced by a single
// "pshufb kShuffleAlpha". Structure: 1-pixel pre-loop (label 10) until
// dst is 16-byte aligned, then — when both sources are also aligned —
// an aligned 4-pixel loop (label 40), otherwise an unaligned 4-pixel
// loop (label 41), then a 1-pixel tail loop (label 91).
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "sub       $0x1,%3                         \n"
    "je        91f                             \n"
    "jl        99f                             \n"

    // 1 pixel loop until destination pointer is aligned.
  "10:                                         \n"
    "test      $0xf,%2                         \n"
    "je        19f                             \n"
    "movd      " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    // pxor with 0xff000000 inverts the alpha byte: xmm3 holds 255 - a.
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "jge       10b                             \n"

  "19:                                         \n"
    "add       $1-4,%3                         \n"
    "jl        49f                             \n"
    "test      $0xf,%0                         \n"
    "jne       41f                             \n"
    "test      $0xf,%1                         \n"
    "jne       41f                             \n"

    // 4 pixel loop.
    LABELALIGN
  "40:                                         \n"
    "movdqa    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqa    " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jge       40b                             \n"
    "jmp       49f                             \n"

    // 4 pixel unaligned loop.
    LABELALIGN
  "41:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    // Aligned store is safe: the 1-pixel pre-loop aligned dst_argb.
    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jge       41b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"
    "jl        99f                             \n"

    // 1 pixel loop.
  "91:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "jge       91b                             \n"
  "99:                                         \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
| 4200 #endif // HAS_ARGBBLENDROW_SSSE3 |
| 4201 |
| 4202 #ifdef HAS_ARGBATTENUATEROW_SSE2 |
| 4203 // Attenuate 4 pixels at a time. |
| 4204 // aligned to 16 bytes |
// Premultiply ARGB by its alpha: color = color * alpha / 255 (approx),
// alpha channel preserved. xmm4 masks the alpha byte (0xff000000);
// xmm5 masks the color bytes (0x00ffffff). Alpha is broadcast to all
// channels via pshufhw/pshuflw $0xff, and punpcklbw/punpckhbw duplicate
// each byte into a 16-bit lane so pmulhuw computes (c*257 * a*257) >> 16.
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x8,%%xmm5                     \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    // Keep the original alpha bytes; merge attenuated color underneath.
    "pand      %%xmm4,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 4247 #endif // HAS_ARGBATTENUATEROW_SSE2 |
| 4248 |
| 4249 #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
// Shuffle tables duplicating alpha.
// pshufb controls that replicate a pixel's alpha byte across its three
// color bytes (two pixels per table); the 128u entries write zero
// (pshufb zeroes lanes whose control byte has the high bit set).
// kShuffleAlpha0 covers pixels 0-1 (alpha offsets 3 and 7);
// kShuffleAlpha1 covers pixels 2-3 (alpha offsets 11 and 15).
static uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
static uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
| 4258 // Attenuate 4 pixels at a time. |
| 4259 // aligned to 16 bytes |
// Premultiply ARGB by its alpha (SSSE3 version). Same result as
// ARGBAttenuateRow_SSE2, but the alpha broadcast uses pshufb with the
// kShuffleAlpha0/kShuffleAlpha1 tables instead of shift/shuffle pairs.
// xmm3 = 0xff000000 mask used to preserve the original alpha bytes.
// Unaligned variant (movdqu loads/stores); processes 4 pixels per loop.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "pslld     $0x18,%%xmm3                    \n"
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "punpcklbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm1,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "punpckhbw %%xmm2,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    // Keep the original alpha bytes; merge attenuated color underneath.
    "pand      %%xmm3,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(dst_argb),       // %1
    "+r"(width)           // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 4302 #endif // HAS_ARGBATTENUATEROW_SSSE3 |
| 4303 |
| 4304 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 |
| 4305 // Unattenuate 4 pixels at a time. |
| 4306 // aligned to 16 bytes |
// Un-premultiply ARGB: color = color * (255 / alpha) (approx), reversing
// ARGBAttenuateRow. Each pixel's alpha byte is loaded with movzb and used
// to index fixed_invtbl8 (a table of fixed-point reciprocals, declared
// elsewhere); pshuflw $0x40 broadcasts the reciprocal to the three color
// lanes, movlhps pairs two pixels, and pmulhuw applies it.
// Processes 4 pixels per loop; unaligned loads/stores.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha = 0;
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width),       // %2
    "+r"(alpha)        // %3
  : "r"(fixed_invtbl8) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 4355 #endif // HAS_ARGBUNATTENUATEROW_SSE2 |
| 4356 |
| 4357 #ifdef HAS_ARGBGRAYROW_SSSE3 |
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// Convert ARGB to gray-scale ARGB: luma Y = dot(BGR, kARGBToYJ) with
// rounding (kAddYJ64) >> 7, replicated into B, G and R; the original
// alpha byte is preserved. Processes 8 pixels (32 bytes) per loop;
// aligned variant (movdqa).
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // Extract the 8 alpha bytes (psrld $0x18 keeps the top byte of each pixel).
    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrld     $0x18,%%xmm2                    \n"
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    // Reassemble pixels as Y,Y,Y,A via byte/word interleaves.
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm3                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm1                   \n"
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 4404 #endif // HAS_ARGBGRAYROW_SSSE3 |
| 4405 |
| 4406 #ifdef HAS_ARGBSEPIAROW_SSSE3 |
// Sepia tone weights, applied via pmaddubsw then >> 7:
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
// Entries are in memory byte order B, G, R, A (A weight 0), one set of
// four per pixel, repeated for 4 pixels.
static vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};
| 4422 |
| 4423 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. |
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place.
// Each output channel is a pmaddubsw dot-product with one of the
// kARGBToSepia{B,G,R} weight tables, shifted right by 7; the alpha byte
// is carried through unchanged. Processes 8 pixels (two 16-byte vectors)
// per loop; aligned variant (movdqa).
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm2                       \n"
    "movdqa    %3,%%xmm3                       \n"
    "movdqa    %4,%%xmm4                       \n"

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Blue channel: dot-product with kARGBToSepiaB.
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // Green channel: dot-product with kARGBToSepiaG, interleave with blue.
    "movdqa    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    // Red channel: dot-product with kARGBToSepiaR.
    "movdqa    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    // Alpha passthrough: psrld $0x18 keeps the original alpha bytes.
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    // Interleave BG pairs with RA pairs into final BGRA pixels.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
| 4481 #endif // HAS_ARGBSEPIAROW_SSSE3 |
| 4482 |
| 4483 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 |
// Transform 8 ARGB pixels (32 bytes) with color matrix.
| 4485 // Same as Sepia except matrix is provided. |
// Transform 8 ARGB pixels (32 bytes) with a caller-supplied 4x4 color
// matrix. Same structure as ARGBSepiaRow_SSSE3 except the four weight
// rows come from matrix_argb (16 signed bytes, one row per output
// channel, broadcast via pshufd) and alpha is also matrix-derived.
// Uses phaddsw/psraw $0x6, i.e. 6-bit fixed-point weights with signed
// saturation. Aligned variant (movdqa); 8 pixels per loop.
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  asm volatile (
    // Broadcast each 4-byte matrix row into its own register.
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
    "pshufd    $0x00,%%xmm5,%%xmm2             \n"
    "pshufd    $0x55,%%xmm5,%%xmm3             \n"
    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
    "pshufd    $0xff,%%xmm5,%%xmm5             \n"

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // B and G channels.
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm7                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm7,%%xmm0                   \n"
    "phaddsw   %%xmm1,%%xmm6                   \n"
    "psraw     $0x6,%%xmm0                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm0                   \n"
    // R and A channels.
    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm1                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm6                   \n"
    "psraw     $0x6,%%xmm1                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm1                   \n"
    // Interleave BG pairs with RA pairs into final BGRA pixels.
    "movdqa    %%xmm0,%%xmm6                   \n"
    "punpcklwd %%xmm1,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm6                   \n"
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "r"(matrix_argb)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
| 4547 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 |
| 4548 |
| 4549 #ifdef HAS_ARGBQUANTIZEROW_SSE2 |
| 4550 // Quantize 4 ARGB pixels (16 bytes). |
| 4551 // aligned to 16 bytes |
// Quantize 4 ARGB pixels (16 bytes) in place:
//   c = (c * scale >> 16) * interval_size + interval_offset
// Alpha is preserved (xmm6 = 0xff000000 mask, merged back with por).
// xmm2/xmm3/xmm4 hold scale / interval_size / interval_offset broadcast
// to all word lanes (pshuflw $0x40 + pshufd $0x44); xmm5 is zero for
// the byte->word unpacks. Aligned variant (movdqa).
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    "pxor      %%xmm5,%%xmm5                   \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "pslld     $0x18,%%xmm6                    \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm7         \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"
    "paddw     %%xmm4,%%xmm0                   \n"
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"
    "sub       $0x4,%1                         \n"
    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
| 4600 #endif // HAS_ARGBQUANTIZEROW_SSE2 |
| 4601 |
| 4602 #ifdef HAS_ARGBSHADEROW_SSE2 |
| 4603 // Shade 4 pixels at a time by specified value. |
| 4604 // Aligned to 16 bytes. |
// Shade 4 pixels at a time by a specified 32-bit ARGB value:
// each channel becomes (c * value_channel) / 256, via the
// duplicate-byte-into-word + pmulhuw trick. The value is byte-doubled
// (punpcklbw) and replicated across the register (punpcklqdq) so the
// same multiplier applies to every pixel. Aligned variant (movdqa).
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "movd      %3,%%xmm2                       \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
| 4639 #endif // HAS_ARGBSHADEROW_SSE2 |
| 4640 |
| 4641 #ifdef HAS_ARGBMULTIPLYROW_SSE2 |
| 4642 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. |
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time:
// dst = (src0 * src1) / 255 (approx). src0 bytes are duplicated into
// words (c*257) while src1 bytes are zero-extended, so pmulhuw yields
// (c0*257 * c1) >> 16 ~= c0*c1/255. Unaligned variant (movdqu).
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqu    %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpckhbw %%xmm5,%%xmm3                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
| 4679 #endif // HAS_ARGBMULTIPLYROW_SSE2 |
| 4680 |
| 4681 #ifdef HAS_ARGBADDROW_SSE2 |
| 4682 // Add 2 rows of ARGB pixels together, 4 pixels at a time. |
// Add 2 rows of ARGB pixels together, 4 pixels at a time, with unsigned
// byte saturation (paddusb clamps each channel at 255). Unaligned variant.
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
| 4709 #endif // HAS_ARGBADDROW_SSE2 |
| 4710 |
| 4711 #ifdef HAS_ARGBSUBTRACTROW_SSE2 |
| 4712 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. |
// Subtract 2 rows of ARGB pixels (src0 - src1), 4 pixels at a time, with
// unsigned byte saturation (psubusb clamps each channel at 0). Unaligned
// variant.
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psubusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
| 4739 #endif // HAS_ARGBSUBTRACTROW_SSE2 |
| 4740 |
| 4741 #ifdef HAS_SOBELXROW_SSE2 |
| 4742 // SobelX as a matrix is |
| 4743 // -1 0 1 |
| 4744 // -2 0 2 |
| 4745 // -1 0 1 |
// Horizontal Sobel filter over three source rows (y0/y1/y2), 8 pixels at
// a time. For each output x it computes
//   |(-1 0 1; -2 0 2; -1 0 1) * window|  via (row0[x]-row0[x+2])
//   + 2*(row1[x]-row1[x+2]) + (row2[x]-row2[x+2]),
// then takes the absolute value (pmaxsw against the negation) and packs
// to bytes. src_y1, src_y2 and dst_sobelx are pre-subtracted against
// src_y0 so one pointer increment advances all four streams.
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "sub       %0,%3                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"
    BUNDLEALIGN
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq    (%0,%1,1),%%xmm1
    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq    0x2(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"
    BUNDLEALIGN
    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq    (%0,%2,1),%%xmm2
    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq    0x2(%0,%2,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"
    // Middle row (xmm1) is added twice: weight 2.
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    // Absolute value: max(x, -x).
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "sub       $0x8,%4                         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq    %%xmm0,(%0,%3,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
| 4801 #endif // HAS_SOBELXROW_SSE2 |
| 4802 |
| 4803 #ifdef HAS_SOBELYROW_SSE2 |
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// For each output pixel the code computes
//   |(y0[x]-y1[x]) + 2*(y0[x+1]-y1[x+1]) + (y0[x+2]-y1[x+2])|
// i.e. the vertical Sobel response using the two outer rows (the middle row
// has zero weight and is never read; the sign flip vs. the matrix is
// irrelevant after the absolute value). 8 pixels per iteration, saturated
// to 8 bits.
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
    // Convert src_y1 and dst to offsets from src_y0 so one advancing
    // pointer (%0) walks all rows in lockstep.
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "pxor %%xmm5,%%xmm5 \n"  // zero, for byte->word unpack

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1)  //  movq      (%0,%1,1),%%xmm1
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"             // column x: y0 - y1
    BUNDLEALIGN
    "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
    MEMOPREG(movq,0x01,0,1,1,xmm2)  //  movq      0x1(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"             // column x+1 difference
    BUNDLEALIGN
    "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
    MEMOPREG(movq,0x02,0,1,1,xmm3)  //  movq      0x2(%0,%1,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"             // column x+2 difference
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"             // middle column added twice = weight 2
    "paddw %%xmm1,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"              // abs value: max(x, -x)
    "psubw %%xmm0,%%xmm1 \n"
    "pmaxsw %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"          // saturate words to bytes
    "sub $0x8,%3 \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm0,0x00,0,2,1)  //  movq      %%xmm0,(%0,%2,1)
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "jg 1b \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
| 4861 #endif // HAS_SOBELYROW_SSE2 |
| 4862 |
| 4863 #ifdef HAS_SOBELROW_SSE2 |
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// s = sobelx + sobely (saturated); output pixel = 0xFFssssss, replicated
// into B/G/R via byte/word unpacks with itself, then OR'd with the alpha
// mask in xmm5. Inputs must be 16-byte aligned (movdqa); 16 pixels per
// iteration (width assumed a multiple of 16).
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    "sub %0,%1 \n"                       // src_sobely as offset from sobelx
    "pcmpeqb %%xmm5,%%xmm5 \n"           // xmm5 = 0xff000000 per pixel
    "pslld $0x18,%%xmm5 \n"              //   (alpha = 255 mask)

    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqa,0x00,0,1,1,xmm1)  //  movdqa    (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "paddusb %%xmm1,%%xmm0 \n"           // s = sobelx + sobely (saturated)
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm2 \n"         // duplicate bytes: s,s pairs
    "punpckhbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm1 \n"         // duplicate words: s,s,s,s per pixel
    "punpckhwd %%xmm2,%%xmm2 \n"
    "por %%xmm5,%%xmm1 \n"               // force alpha to 255
    "por %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklwd %%xmm0,%%xmm3 \n"
    "punpckhwd %%xmm0,%%xmm0 \n"
    "por %%xmm5,%%xmm3 \n"
    "por %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movdqa %%xmm1," MEMACCESS(2) " \n"
    "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
    "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n"
    "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "jg 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
| 4917 #endif // HAS_SOBELROW_SSE2 |
| 4918 |
| 4919 #ifdef HAS_SOBELTOPLANEROW_SSE2 |
| 4920 // Adds Sobel X and Sobel Y and stores Sobel into a plane. |
| 4921 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
| 4922 uint8* dst_y, int width) { |
| 4923 asm volatile ( |
| 4924 "sub %0,%1 \n" |
| 4925 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 4926 "pslld $0x18,%%xmm5 \n" |
| 4927 |
| 4928 // 8 pixel loop. |
| 4929 LABELALIGN |
| 4930 "1: \n" |
| 4931 "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
| 4932 MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 |
| 4933 "lea " MEMLEA(0x10,0) ",%0 \n" |
| 4934 "paddusb %%xmm1,%%xmm0 \n" |
| 4935 "sub $0x10,%3 \n" |
| 4936 "movdqa %%xmm0," MEMACCESS(2) " \n" |
| 4937 "lea " MEMLEA(0x10,2) ",%2 \n" |
| 4938 "jg 1b \n" |
| 4939 : "+r"(src_sobelx), // %0 |
| 4940 "+r"(src_sobely), // %1 |
| 4941 "+r"(dst_y), // %2 |
| 4942 "+r"(width) // %3 |
| 4943 : |
| 4944 : "memory", "cc" |
| 4945 #if defined(__native_client__) && defined(__x86_64__) |
| 4946 , "r14" |
| 4947 #endif |
| 4948 #if defined(__SSE2__) |
| 4949 , "xmm0", "xmm1" |
| 4950 #endif |
| 4951 ); |
| 4952 } |
| 4953 #endif // HAS_SOBELTOPLANEROW_SSE2 |
| 4954 |
| 4955 #ifdef HAS_SOBELXYROW_SSE2 |
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel (saturated sobelx + sobely)
// B = Sobel Y
// Builds each output pixel by interleaving bytes: (Y, G) pairs and (X, 255)
// pairs are formed with punpcklbw/punpckhbw, then combined into 32-bit
// B,G,R,A pixels with punpcklwd/punpckhwd. Inputs must be 16-byte aligned
// (movdqa); 16 pixels per iteration (width assumed a multiple of 16).
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    "sub %0,%1 \n"                       // src_sobely as offset from sobelx
    "pcmpeqb %%xmm5,%%xmm5 \n"           // xmm5 = all 0xff (alpha bytes)

    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"  // xmm0 = sobelx
    MEMOPREG(movdqa,0x00,0,1,1,xmm1)  //  movdqa    (%0,%1,1),%%xmm1 (sobely)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "paddusb %%xmm1,%%xmm2 \n"           // xmm2 = G = x + y (saturated)
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"         // (R, A) byte pairs
    "punpckhbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "punpcklbw %%xmm2,%%xmm4 \n"         // (B, G) byte pairs
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "punpcklwd %%xmm3,%%xmm6 \n"         // B,G,R,A pixels (low half)
    "punpckhwd %%xmm3,%%xmm4 \n"
    "movdqa %%xmm1,%%xmm7 \n"
    "punpcklwd %%xmm0,%%xmm7 \n"         // B,G,R,A pixels (high half)
    "punpckhwd %%xmm0,%%xmm1 \n"
    "sub $0x10,%3 \n"
    "movdqa %%xmm6," MEMACCESS(2) " \n"
    "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n"
    "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "jg 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
| 5008 #endif // HAS_SOBELXYROW_SSE2 |
| 5009 |
| 5010 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 |
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
// row is a row of ARGB bytes; each pixel's 4 channels are widened to int32,
// accumulated into a running row total (xmm0), and added to the matching
// entry of previous_cumsum (the row above) to produce cumsum.
// The fast 4-pixel path requires cumsum to be 16-byte aligned (checked with
// "test $0xf"); previous_cumsum is read with movdqa there, so it is
// presumably also expected to be 16-byte aligned — TODO confirm with callers.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    "pxor %%xmm0,%%xmm0 \n"              // running sums (4 x int32)
    "pxor %%xmm1,%%xmm1 \n"              // zero, for unpacking
    "sub $0x4,%3 \n"
    "jl 49f \n"
    "test $0xf,%1 \n"                    // cumsum aligned? else 1-pixel loop
    "jne 49f \n"

    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"  // 4 ARGB pixels
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm2,%%xmm4 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"         // widen bytes -> words -> dwords
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "punpckhwd %%xmm1,%%xmm3 \n"
    "punpckhbw %%xmm1,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "punpcklwd %%xmm1,%%xmm4 \n"
    "punpckhwd %%xmm1,%%xmm5 \n"
    "paddd %%xmm2,%%xmm0 \n"             // per pixel: extend row sum, then
    "movdqa " MEMACCESS(2) ",%%xmm2 \n"  // add the row-above cumsum entry
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n"
    "paddd %%xmm0,%%xmm3 \n"
    "paddd %%xmm4,%%xmm0 \n"
    "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n"
    "paddd %%xmm0,%%xmm4 \n"
    "paddd %%xmm5,%%xmm0 \n"
    "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "paddd %%xmm0,%%xmm5 \n"
    "movdqa %%xmm2," MEMACCESS(1) " \n"
    "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
    "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n"
    "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

    "49: \n"
    "add $0x3,%3 \n"                     // handle remaining 0-3 pixels
    "jl 19f \n"

    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "movd " MEMACCESS(0) ",%%xmm2 \n"    // one ARGB pixel
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(2) ",%%xmm2 \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "movdqu %%xmm2," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"

    "19: \n"
  : "+r"(row),              // %0
    "+r"(cumsum),           // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)             // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 5090 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 |
| 5091 |
| 5092 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
// Computes box-filter averages from an integral image (see
// ComputeCumulativeSumRow_SSE2). For each output pixel the box sum is
//   topleft[i] - topleft[i+width] - botleft[i] + botleft[i+width]
// (inclusion-exclusion over the four corners), divided by 'area' and
// saturated to 8-bit channels.
// Two paths: for small areas (<= 0x80) the division is done with a 16-bit
// pmulhuw against a precomputed rounded reciprocal; for larger areas the
// sums are converted to float and multiplied by rcpss(area).
// 'width' here is the box width in pixels (corner offset), 'count' is the
// number of output pixels.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  asm volatile (
    "movd %5,%%xmm5 \n"                  // xmm5 = area
    "cvtdq2ps %%xmm5,%%xmm5 \n"
    "rcpss %%xmm5,%%xmm4 \n"             // xmm4 = ~1/area (float)
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"
    "cmpl $0x80,%5 \n"                   // large area -> float path
    "ja 40f \n"

    // Build a 16-bit fixed-point reciprocal: 65536.0 * rcp(area), rounded
    // up by adding area before the multiply, packed to words for pmulhuw.
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrld $0x10,%%xmm6 \n"              // xmm6 = 0xffff per lane
    "cvtdq2ps %%xmm6,%%xmm6 \n"
    "addps %%xmm6,%%xmm5 \n"
    "mulps %%xmm4,%%xmm5 \n"
    "cvtps2dq %%xmm5,%%xmm5 \n"
    "packssdw %%xmm5,%%xmm5 \n"

    // 4 pixel small loop.
    LABELALIGN
    "4: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    BUNDLEALIGN
    MEMOPREG(psubd,0x00,0,4,4,xmm0)  //  psubd     0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1)  //  psubd     0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2)  //  psubd     0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3)  //  psubd     0x30(%0,%4,4),%%xmm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
    "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
    "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
    BUNDLEALIGN
    MEMOPREG(paddd,0x00,1,4,4,xmm0)  //  paddd     0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1)  //  paddd     0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2)  //  paddd     0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3)  //  paddd     0x30(%1,%4,4),%%xmm3
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm0 \n"           // sum * (65536/area) >> 16
    "pmulhuw %%xmm5,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jge 4b \n"
    "jmp 49f \n"

    // 4 pixel loop (float path for large areas).
    LABELALIGN
    "40: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    BUNDLEALIGN
    MEMOPREG(psubd,0x00,0,4,4,xmm0)  //  psubd     0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1)  //  psubd     0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2)  //  psubd     0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3)  //  psubd     0x30(%0,%4,4),%%xmm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
    "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
    "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
    BUNDLEALIGN
    MEMOPREG(paddd,0x00,1,4,4,xmm0)  //  paddd     0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1)  //  paddd     0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2)  //  paddd     0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3)  //  paddd     0x30(%1,%4,4),%%xmm3
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"          // average = sum * (1/area)
    "cvtdq2ps %%xmm1,%%xmm1 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "mulps %%xmm4,%%xmm1 \n"
    "cvtdq2ps %%xmm2,%%xmm2 \n"
    "cvtdq2ps %%xmm3,%%xmm3 \n"
    "mulps %%xmm4,%%xmm2 \n"
    "mulps %%xmm4,%%xmm3 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "cvtps2dq %%xmm1,%%xmm1 \n"
    "cvtps2dq %%xmm2,%%xmm2 \n"
    "cvtps2dq %%xmm3,%%xmm3 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

    "49: \n"
    "add $0x3,%3 \n"                     // handle remaining 0-3 pixels
    "jl 19f \n"

    // 1 pixel loop (float path).
    LABELALIGN
    "10: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)  //  psubd     0x00(%0,%4,4),%%xmm0
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    BUNDLEALIGN
    MEMOPREG(paddd,0x00,1,4,4,xmm0)  //  paddd     0x00(%1,%4,4),%%xmm0
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x4,2) ",%2 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"
    "19: \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"((intptr_t)(width)),  // %4
    "rm"(area)               // %5
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
| 5231 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
| 5232 |
| 5233 #ifdef HAS_ARGBAFFINEROW_SSE2 |
// Copy ARGB pixels from source image with slope to a row of destination.
// src_dudv holds 4 floats: starting (u, v) and the per-pixel step (du, dv).
// For each destination pixel, (u, v) is truncated to integers and converted
// to a byte offset via pmaddwd with the packed constant (4, stride):
//   off = x * 4 + y * stride
// then one ARGB pixel is gathered from src_argb + off. Walks 4 pixels per
// iteration in the main loop (uv in xmm2 for pixels 0-1 and xmm3 for 2-3,
// advanced by 4*dudv in xmm4), with a 1 pixel tail loop.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* src_dudv, int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp = 0;
  asm volatile (
    "movq " MEMACCESS(3) ",%%xmm2 \n"    // xmm2 = (u, v)
    "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"  // xmm7 = (du, dv)
    "shl $0x10,%1 \n"                    // %1 = stride << 16 | 4, so
    "add $0x4,%1 \n"                     // pmaddwd gives x*4 + y*stride
    "movd %1,%%xmm5 \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"

    "pshufd $0x44,%%xmm7,%%xmm7 \n"      // (du, dv, du, dv)
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "addps %%xmm7,%%xmm0 \n"
    "movlhps %%xmm0,%%xmm2 \n"           // xmm2 = uv for pixels 0 and 1
    "movdqa %%xmm7,%%xmm4 \n"
    "addps %%xmm4,%%xmm4 \n"             // xmm4 = 2 * dudv
    "movdqa %%xmm2,%%xmm3 \n"
    "addps %%xmm4,%%xmm3 \n"             // xmm3 = uv for pixels 2 and 3
    "addps %%xmm4,%%xmm4 \n"             // xmm4 = 4 * dudv (loop step)

    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"  // x, y float to int first 2
    "cvttps2dq %%xmm3,%%xmm1 \n"  // x, y float to int next 2
    "packssdw %%xmm1,%%xmm0 \n"  // x, y as 8 shorts
    "pmaddwd %%xmm5,%%xmm0 \n"  // off = x * 4 + y * stride
    "movd %%xmm0,%k1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k5 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,1,xmm1)  //  movd      (%0,%1,1),%%xmm1
    MEMOPREG(movd,0x00,0,5,1,xmm6)  //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm1 \n"         // pixels 0,1
    "addps %%xmm4,%%xmm2 \n"
    "movq %%xmm1," MEMACCESS(2) " \n"
    "movd %%xmm0,%k1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k5 \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,1,xmm0)  //  movd      (%0,%1,1),%%xmm0
    MEMOPREG(movd,0x00,0,5,1,xmm6)  //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm0 \n"         // pixels 2,3
    "addps %%xmm4,%%xmm3 \n"
    "sub $0x4,%4 \n"
    "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jge 40b \n"

    "49: \n"
    "add $0x3,%4 \n"                     // handle remaining 0-3 pixels
    "jl 19f \n"

    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"
    "addps %%xmm7,%%xmm2 \n"
    "movd %%xmm0,%k1 \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,1,xmm0)  //  movd      (%0,%1,1),%%xmm0
    "sub $0x1,%4 \n"
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x04,2) ",%2 \n"
    "jge 10b \n"
    "19: \n"
  : "+r"(src_argb),              // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),              // %2
    "+r"(src_dudv),              // %3
    "+rm"(width),                // %4
    "+r"(temp)                   // %5
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
| 5325 #endif // HAS_ARGBAFFINEROW_SSE2 |
| 5326 |
| 5327 #ifdef HAS_INTERPOLATEROW_SSSE3 |
// Bilinear filter 16x2 -> 16x1
// Blends src_ptr with the row src_stride below it into dst_ptr:
//   dst = src * (1 - f) + src[stride] * f,  f = source_y_fraction / 256.
// The fraction is halved up front (shr) to fit the signed 7-bit weights
// used by pmaddubsw. Fractions of exactly 0, 64, 128 and 192 (i.e. 0%,
// 25%, 50%, 75%) take dedicated pavgb fast paths; the 25/75 paths use two
// rounding averages as an approximation. dst is addressed as an offset
// from src ("sub %1,%0"), so one advancing pointer serves both. Rows must
// be 16-byte aligned (movdqa); 16 pixels per iteration.
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
  asm volatile (
    "sub %1,%0 \n"                       // dst as offset from src
    "shr %3 \n"                          // fraction 0..255 -> 0..127
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "cmp $0x20,%3 \n"
    "je 75f \n"
    "cmp $0x40,%3 \n"
    "je 50f \n"
    "cmp $0x60,%3 \n"
    "je 25f \n"

    // Build interleaved (128-f, f) byte weights for pmaddubsw.
    "movd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x80,%3 \n"
    "movd %3,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "punpcklwd %%xmm5,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"

    // General purpose row blend.
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm2)
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "pmaddubsw %%xmm5,%%xmm0 \n"         // a*(128-f) + b*f
    "pmaddubsw %%xmm5,%%xmm1 \n"
    "psrlw $0x7,%%xmm0 \n"               // /128
    "psrlw $0x7,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Blend 25 / 75.
    LABELALIGN
    "25: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm1)
    "pavgb %%xmm1,%%xmm0 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 25b \n"
    "jmp 99f \n"

    // Blend 50 / 50.
    LABELALIGN
    "50: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm1)
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 50b \n"
    "jmp 99f \n"

    // Blend 75 / 25.
    LABELALIGN
    "75: \n"
    "movdqa " MEMACCESS(1) ",%%xmm1 \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm0)
    "pavgb %%xmm1,%%xmm0 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 75b \n"
    "jmp 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
    "100: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    "sub $0x10,%2 \n"
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 100b \n"

    "99: \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width),   // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm5"
#endif
  );
}
| 5437 #endif // HAS_INTERPOLATEROW_SSSE3 |
| 5438 |
| 5439 #ifdef HAS_INTERPOLATEROW_SSE2 |
// Bilinear filter 16x2 -> 16x1
// SSE2 fallback for InterpolateRow_SSSE3 (no pmaddubsw available):
//   dst = src + ((src[stride] - src) * 2 * f) >> 16,  f = fraction / 256
// computed in 16-bit lanes with pmulhw on zero-extended bytes. Same
// special-case pavgb fast paths for 0%, 25%, 50%, 75% fractions, and the
// same dst-as-offset-from-src addressing. Rows must be 16-byte aligned
// (movdqa); 16 pixels per iteration.
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  asm volatile (
    "sub %1,%0 \n"                       // dst as offset from src
    "shr %3 \n"                          // fraction 0..255 -> 0..127
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "cmp $0x20,%3 \n"
    "je 75f \n"
    "cmp $0x40,%3 \n"
    "je 50f \n"
    "cmp $0x60,%3 \n"
    "je 25f \n"

    // Broadcast interleaved (128-f, f) weights; xmm4 = zero for unpacking.
    "movd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x80,%3 \n"
    "movd %3,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "punpcklwd %%xmm5,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"

    // General purpose row blend.
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm2)  //  movdqa    (%1,%4,1),%%xmm2
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklbw %%xmm4,%%xmm2 \n"         // widen both rows to words
    "punpckhbw %%xmm4,%%xmm3 \n"
    "punpcklbw %%xmm4,%%xmm0 \n"
    "punpckhbw %%xmm4,%%xmm1 \n"
    "psubw %%xmm0,%%xmm2 \n"             // delta = below - src
    "psubw %%xmm1,%%xmm3 \n"
    "paddw %%xmm2,%%xmm2 \n"             // delta*2 so pmulhw >>16 == *f/128
    "paddw %%xmm3,%%xmm3 \n"
    "pmulhw %%xmm5,%%xmm2 \n"
    "pmulhw %%xmm5,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"             // src + delta*f
    "paddw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)  //  movdqa    %%xmm0,(%1,%0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Blend 25 / 75.
    LABELALIGN
    "25: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm1)  //  movdqa    (%1,%4,1),%%xmm1
    "pavgb %%xmm1,%%xmm0 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)  //  movdqa    %%xmm0,(%1,%0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 25b \n"
    "jmp 99f \n"

    // Blend 50 / 50.
    LABELALIGN
    "50: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm1)  //  movdqa    (%1,%4,1),%%xmm1
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)  //  movdqa    %%xmm0,(%1,%0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 50b \n"
    "jmp 99f \n"

    // Blend 75 / 25.
    LABELALIGN
    "75: \n"
    "movdqa " MEMACCESS(1) ",%%xmm1 \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm0)  //  movdqa    (%1,%4,1),%%xmm0
    "pavgb %%xmm1,%%xmm0 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)  //  movdqa    %%xmm0,(%1,%0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 75b \n"
    "jmp 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
    "100: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    "sub $0x10,%2 \n"
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)  //  movdqa    %%xmm0,(%1,%0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 100b \n"

    "99: \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width),   // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 5557 #endif // HAS_INTERPOLATEROW_SSE2 |
| 5558 |
| 5559 #ifdef HAS_INTERPOLATEROW_SSSE3 |
// Bilinear filter 16x2 -> 16x1
// Unaligned variant of InterpolateRow_SSSE3: identical blend logic
// (dst = src*(1-f) + src[stride]*f via pmaddubsw, with pavgb fast paths
// for 0%, 25%, 50%, 75%), but all row loads/stores use movdqu so no
// 16-byte alignment is required. 16 pixels per iteration.
// NOTE(review): the register-to-register copy uses "movdqu %%xmm0,%%xmm1"
// where the aligned variant uses movdqa; both are equivalent for
// reg-to-reg moves.
void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
  asm volatile (
    "sub %1,%0 \n"                       // dst as offset from src
    "shr %3 \n"                          // fraction 0..255 -> 0..127
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "cmp $0x20,%3 \n"
    "je 75f \n"
    "cmp $0x40,%3 \n"
    "je 50f \n"
    "cmp $0x60,%3 \n"
    "je 25f \n"

    // Build interleaved (128-f, f) byte weights for pmaddubsw.
    "movd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x80,%3 \n"
    "movd %3,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "punpcklwd %%xmm5,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"

    // General purpose row blend.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
    "movdqu %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "pmaddubsw %%xmm5,%%xmm0 \n"         // a*(128-f) + b*f
    "pmaddubsw %%xmm5,%%xmm1 \n"
    "psrlw $0x7,%%xmm0 \n"               // /128
    "psrlw $0x7,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Blend 25 / 75.
    LABELALIGN
    "25: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb %%xmm1,%%xmm0 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 25b \n"
    "jmp 99f \n"

    // Blend 50 / 50.
    LABELALIGN
    "50: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 50b \n"
    "jmp 99f \n"

    // Blend 75 / 25.
    LABELALIGN
    "75: \n"
    "movdqu " MEMACCESS(1) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm0)
    "pavgb %%xmm1,%%xmm0 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 75b \n"
    "jmp 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
    "100: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    "sub $0x10,%2 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 100b \n"

    "99: \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width),   // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm5"
#endif
  );
}
| 5669 #endif // HAS_INTERPOLATEROW_SSSE3 |
| 5670 |
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
// Blends the row at src_ptr with the row at src_ptr + src_stride into
// dst_ptr, 16 bytes per iteration.  source_y_fraction weights the second
// row; it is halved on entry (shr), so the compares against 0x20/0x40/0x60
// select fast pavgb paths for 25%/50%/75% blends and 0 selects a plain
// copy.  "Unaligned" variant: row loads/stores use movdqu.
void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                                   ptrdiff_t src_stride, int dst_width,
                                   int source_y_fraction) {
  asm volatile (
    "sub %1,%0 \n"                 // %0 = dst - src; stores go to (%1,%0).
    "shr %3 \n"                    // fraction now in 0..128.
    "cmp $0x0,%3 \n"
    "je 100f \n"                   // 0: copy first row unchanged.
    "cmp $0x20,%3 \n"
    "je 75f \n"                    // 32/128: 75% first row, 25% second.
    "cmp $0x40,%3 \n"
    "je 50f \n"                    // 64/128: 50/50 average.
    "cmp $0x60,%3 \n"
    "je 25f \n"                    // 96/128: 25% first row, 75% second.

    // General path: build xmm5 with replicated word (f << 8) | (128 - f)
    // for pmulhw fixed-point blending; xmm4 = 0 for byte->word unpacks.
    "movd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x80,%3 \n"
    "movd %3,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "punpcklwd %%xmm5,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"

    // General purpose row blend.
    // Widens both rows to words, forms 2*(row2 - row1), scales by the
    // fraction via pmulhw, and adds back to row1 before repacking.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
    "movdqu %%xmm0,%%xmm1 \n"
    "movdqu %%xmm2,%%xmm3 \n"
    "punpcklbw %%xmm4,%%xmm2 \n"
    "punpckhbw %%xmm4,%%xmm3 \n"
    "punpcklbw %%xmm4,%%xmm0 \n"
    "punpckhbw %%xmm4,%%xmm1 \n"
    "psubw %%xmm0,%%xmm2 \n"
    "psubw %%xmm1,%%xmm3 \n"
    "paddw %%xmm2,%%xmm2 \n"
    "paddw %%xmm3,%%xmm3 \n"
    "pmulhw %%xmm5,%%xmm2 \n"
    "pmulhw %%xmm5,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Blend 25 / 75.
    // Double pavgb against the second row: avg(avg(r1,r2),r2).
    LABELALIGN
    "25: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
    "pavgb %%xmm1,%%xmm0 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 25b \n"
    "jmp 99f \n"

    // Blend 50 / 50.
    LABELALIGN
    "50: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 50b \n"
    "jmp 99f \n"

    // Blend 75 / 25.
    // Same as the 25 case with the row roles swapped.
    LABELALIGN
    "75: \n"
    "movdqu " MEMACCESS(1) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
    "pavgb %%xmm1,%%xmm0 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 75b \n"
    "jmp 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
    "100: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    "sub $0x10,%2 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 100b \n"

    "99: \n"
    : "+r"(dst_ptr), // %0
      "+r"(src_ptr), // %1
      "+r"(dst_width), // %2
      "+r"(source_y_fraction) // %3
    : "r"((intptr_t)(src_stride)) // %4
    : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif // HAS_INTERPOLATEROW_SSE2
| 5790 |
#ifdef HAS_HALFROW_SSE2
// Averages two adjacent rows of interleaved UV bytes into one output row,
// 16 bytes per iteration using pavgb (rounded byte average).
// Requires 16-byte aligned src/dst (movdqa) and pix a multiple of 16 —
// NOTE(review): alignment/width contract inferred from movdqa and the
// 0x10 step; confirm against callers.
void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                  uint8* dst_uv, int pix) {
  asm volatile (
    "sub %0,%1 \n"                 // %1 = dst - src; store via (%0,%1).
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0
    "sub $0x10,%2 \n"
    MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "jg 1b \n"
    : "+r"(src_uv), // %0
      "+r"(dst_uv), // %1
      "+r"(pix) // %2
    : "r"((intptr_t)(src_uv_stride)) // %3
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0"
#endif
  );
}
#endif // HAS_HALFROW_SSE2
| 5815 |
#ifdef HAS_ARGBTOBAYERROW_SSSE3
// Selects one byte from each ARGB pixel according to 'selector' (a packed
// set of four pshufb byte indices, broadcast across the register) and
// writes the selected bytes as a Bayer row, 8 pixels per iteration.
// Uses aligned (movdqa) loads from src_argb.
void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                          uint32 selector, int pix) {
  asm volatile (
    // NaCL caveat - assumes movd is from GPR
    "movd %3,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n" // broadcast selector to all 4 lanes.
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"      // gather 4 selected bytes per register.
    "pshufb %%xmm5,%%xmm1 \n"
    "punpckldq %%xmm1,%%xmm0 \n"   // combine into 8 contiguous bytes.
    "sub $0x8,%2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_bayer), // %1
      "+r"(pix) // %2
    : "g"(selector) // %3
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif // HAS_ARGBTOBAYERROW_SSSE3
| 5846 |
#ifdef HAS_ARGBTOBAYERGGROW_SSE2
// Extracts the G channel of each ARGB pixel (shift right 8, mask low byte)
// and writes it as a Bayer row, 8 pixels per iteration.  The 'selector'
// parameter is unused in this specialized GG variant; aligned loads.
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"     // xmm5 = 0x000000FF per dword mask.
    "psrld $0x18,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrld $0x8,%%xmm0 \n"         // move G into the low byte of each dword.
    "psrld $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packssdw %%xmm1,%%xmm0 \n"    // dwords -> words (8 G values).
    "packuswb %%xmm1,%%xmm0 \n"    // words -> bytes; low 8 bytes are valid.
    "sub $0x8,%2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_bayer), // %1
      "+r"(pix) // %2
    :
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif // HAS_ARGBTOBAYERGGROW_SSE2
| 5879 |
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the 4 bytes of each ARGB pixel with pshufb using the 16-byte
// control mask at 'shuffler', 8 pixels per iteration.  Aligned variant:
// movdqa loads/stores; shuffler must also be 16-byte aligned (movdqa).
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
  asm volatile (
    "movdqa " MEMACCESS(3) ",%%xmm5 \n" // load shuffle control once.
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "sub $0x8,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "jg 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(pix) // %2
    : "r"(shuffler) // %3
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
| 5908 |
// Same per-pixel byte reorder as ARGBShuffleRow_SSSE3 but with movdqu for
// the pixel loads/stores, so src/dst need not be 16-byte aligned.  The
// shuffler mask itself is still loaded with movdqa (aligned).
void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                    const uint8* shuffler, int pix) {
  asm volatile (
    "movdqa " MEMACCESS(3) ",%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "sub $0x8,%2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "jg 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(pix) // %2
    : "r"(shuffler) // %3
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif // HAS_ARGBSHUFFLEROW_SSSE3
| 5936 |
#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// AVX2 version: broadcasts the 16-byte shuffler mask to both ymm lanes and
// reorders 16 pixels per iteration with vpshufb.  NOTE(review): no
// vzeroupper before return here — presumably handled by callers or the
// ABI in this build; confirm against upstream.
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
    "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
    "sub $0x10,%2 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "jg 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(pix) // %2
    : "r"(shuffler) // %3
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif // HAS_ARGBSHUFFLEROW_AVX2
| 5966 |
#ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// SSE2 fallback (no pshufb): reads the first 4 bytes of 'shuffler' and
// compares against four recognized byte orders; each match jumps to a
// specialized path that widens bytes to words and reorders them with
// pshuflw/pshufhw (4 pixels per iteration).  Any other selector falls
// through to a scalar byte-gather loop (1 pixel per iteration) using
// pixel_temp ("d" constraint so %b2 addresses its low byte).
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"        // zero register for byte->word unpacks.
    "mov " MEMACCESS(4) ",%k2 \n"  // load first 4 shuffler bytes.
    "cmp $0x3000102,%k2 \n"
    "je 3012f \n"
    "cmp $0x10203,%k2 \n"
    "je 123f \n"
    "cmp $0x30201,%k2 \n"
    "je 321f \n"
    "cmp $0x2010003,%k2 \n"
    "je 2103f \n"

    // Generic path: look up each of the 4 source byte indices and copy
    // that byte of the pixel, one output byte at a time.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS(1) " \n"
    "movzb " MEMACCESS2(0x1,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x1,1) " \n"
    BUNDLEALIGN
    "movzb " MEMACCESS2(0x2,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x2,1) " \n"
    "movzb " MEMACCESS2(0x3,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x3,1) " \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "sub $0x1,%3 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Selector 0,1,2,3 reversed per pixel: word shuffle $0x1b (3,2,1,0).
    LABELALIGN
    "123: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
    "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 123b \n"
    "jmp 99f \n"

    // Rotate-left byte order: word shuffle $0x39 (0,3,2,1).
    LABELALIGN
    "321: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x39,%%xmm0,%%xmm0 \n"
    "pshuflw $0x39,%%xmm0,%%xmm0 \n"
    "pshufhw $0x39,%%xmm1,%%xmm1 \n"
    "pshuflw $0x39,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 321b \n"
    "jmp 99f \n"

    // Rotate-right byte order: word shuffle $0x93 (2,1,0,3).
    LABELALIGN
    "2103: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x93,%%xmm0,%%xmm0 \n"
    "pshuflw $0x93,%%xmm0,%%xmm0 \n"
    "pshufhw $0x93,%%xmm1,%%xmm1 \n"
    "pshuflw $0x93,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 2103b \n"
    "jmp 99f \n"

    // Swap middle pair order: word shuffle $0xc6 (3,0,1,2).
    LABELALIGN
    "3012: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
    "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
    "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
    "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 3012b \n"

    "99: \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+d"(pixel_temp), // %2
      "+r"(pix) // %3
    : "r"(shuffler) // %4
    : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif // HAS_ARGBSHUFFLEROW_SSE2
| 6092 |
#ifdef HAS_I422TOYUY2ROW_SSE2
// Packs planar I422 (Y + half-width U and V) into interleaved YUY2
// (Y0 U0 Y1 V0 ...), 16 Y values (32 output bytes) per iteration.
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  asm volatile (
    "sub %1,%2 \n"                 // %2 = src_v - src_u for indexed load.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"  // 8 U bytes.
    MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"   // xmm2 = U0 V0 U1 V1 ...
    "movdqu " MEMACCESS(0) ",%%xmm0 \n" // 16 Y bytes.
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"   // interleave Y with UV -> YUY2.
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(3) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
    : "+r"(src_y), // %0
      "+r"(src_u), // %1
      "+r"(src_v), // %2
      "+r"(dst_frame), // %3
      "+rm"(width) // %4
    :
    : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
#endif // HAS_I422TOYUY2ROW_SSE2
| 6132 |
#ifdef HAS_I422TOUYVYROW_SSE2
// Packs planar I422 into interleaved UYVY (U0 Y0 V0 Y1 ...), 16 Y values
// (32 output bytes) per iteration.  Same structure as I422ToYUY2Row_SSE2
// with the unpack operand order swapped to put chroma first.
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  asm volatile (
    "sub %1,%2 \n"                 // %2 = src_v - src_u for indexed load.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"  // 8 U bytes.
    MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"   // xmm2 = U0 V0 U1 V1 ...
    "movdqu " MEMACCESS(0) ",%%xmm0 \n" // 16 Y bytes.
    "movdqa %%xmm2,%%xmm1 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"   // interleave UV with Y -> UYVY.
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS(3) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
    : "+r"(src_y), // %0
      "+r"(src_u), // %1
      "+r"(src_v), // %2
      "+r"(dst_frame), // %3
      "+rm"(width) // %4
    :
    : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
#endif // HAS_I422TOUYVYROW_SSE2
| 6172 |
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a cubic polynomial to every channel of each ARGB pixel:
// out = clamp(C0 + C1*x + C2*x^2 + C3*x^3), where poly points at four
// 4-float coefficient vectors (C0 at +0x00, C1 at +0x10, C2 at +0x20,
// C3 at +0x30, one float per channel).  Processes 2 pixels per iteration
// by widening bytes to floats, evaluating in parallel, and repacking
// with unsigned saturation.
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "pxor %%xmm3,%%xmm3 \n"        // zero for byte/word widening.

    // 2 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"  // load 2 ARGB pixels (8 bytes).
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm3,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"   // pixel 0 -> 4 dwords.
    "punpckhwd %%xmm3,%%xmm4 \n"   // pixel 1 -> 4 dwords.
    "cvtdq2ps %%xmm0,%%xmm0 \n"    // x as floats (xmm0/xmm4).
    "cvtdq2ps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm0,%%xmm1 \n"      // keep x for higher-order terms.
    "movdqa %%xmm4,%%xmm5 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n" // C1 * x
    "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
    "addps " MEMACCESS(3) ",%%xmm0 \n"       // + C0
    "addps " MEMACCESS(3) ",%%xmm4 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm6 \n"
    "mulps %%xmm1,%%xmm2 \n"       // x^2
    "mulps %%xmm5,%%xmm6 \n"
    "mulps %%xmm2,%%xmm1 \n"       // x^3
    "mulps %%xmm6,%%xmm5 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n" // C2 * x^2
    "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n" // C3 * x^3
    "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
    "addps %%xmm2,%%xmm0 \n"
    "addps %%xmm6,%%xmm4 \n"
    "addps %%xmm1,%%xmm0 \n"
    "addps %%xmm5,%%xmm4 \n"
    "cvttps2dq %%xmm0,%%xmm0 \n"   // truncate to int and saturate to bytes.
    "cvttps2dq %%xmm4,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "sub $0x2,%2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(width) // %2
    : "r"(poly) // %3
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
| 6230 |
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2 + FMA3 version of ARGBPolynomialRow: same cubic-per-channel
// contract as the SSE2 version, evaluating both pixels' 8 channels in one
// ymm register with vfmadd.  Requires FMA3 support in addition to AVX2
// (vfmadd132ps/vfmadd231ps).
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"       // C0
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" // C1
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" // C2
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" // C3

    // 2 pixel loop.
    LABELALIGN
    "1: \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
    "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
    "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
    "vcvttps2dq %%ymm0,%%ymm0 \n"
    "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n" // fix lane order after in-lane pack.
    "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
    "sub $0x2,%2 \n"
    "vmovq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
    "vzeroupper \n"               // avoid AVX->SSE transition penalty.
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(width) // %2
    : "r"(poly) // %3
    : "memory", "cc"
#if defined(__SSE2__)
    // TODO(fbarchard): declare ymm usage when applicable.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
| 6273 |
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Tranform ARGB pixels with color table.
// In-place: replaces each of the 4 bytes of every pixel in dst_argb with
// table_argb[value * 4 + channel].  Scalar loop, 1 pixel per iteration;
// pixel_temp uses the "d" constraint so %b1 can address its low byte.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"     // B channel lookup.
    "lea " MEMLEA(0x4,0) ",%0 \n"      // advance first; write back at -4..-1.
    MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n" // G channel lookup.
    MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n" // R channel lookup.
    MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "movzb " MEMACCESS2(-0x1,0) ",%1 \n" // A channel lookup.
    MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x1,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
    : "+r"(dst_argb), // %0
      "+d"(pixel_temp), // %1
      "+r"(width) // %2
    : "r"(table_argb) // %3
    : "memory", "cc");
}
#endif // HAS_ARGBCOLORTABLEROW_X86
| 6305 |
#ifdef HAS_RGBCOLORTABLEROW_X86
// Tranform RGB pixels with color table.
// Same as ARGBColorTableRow_X86 but leaves the 4th (alpha) byte of each
// pixel untouched: only the first three channels are remapped through
// table_argb.  In-place on dst_argb, 1 pixel per iteration.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"     // B channel lookup.
    "lea " MEMLEA(0x4,0) ",%0 \n"      // advance first; write back at -4..-2.
    MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n" // G channel lookup.
    MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n" // R channel lookup; alpha skipped.
    MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
    : "+r"(dst_argb), // %0
      "+d"(pixel_temp), // %1
      "+r"(width) // %2
    : "r"(table_argb) // %3
    : "memory", "cc");
}
#endif // HAS_RGBCOLORTABLEROW_X86
| 6333 |
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Tranform RGB pixels with luma table.
// For each pair of pixels, computes a luma value with pmaddubsw against
// the broadcast 'lumacoeff' weights, masks it to a multiple of 256
// (pand with 0xFF00 words), and uses luma + that offset as the base of a
// 256-entry lookup table; each pixel's B, G and R bytes are then remapped
// through that table while the A byte is copied unchanged.  Processes
// 4 pixels per iteration; pixel_temp/table_temp are pinned to edx/eax
// ("d"/"a") so the byte-register forms %b0/%b1 are available.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  uintptr_t pixel_temp = 0u;
  uintptr_t table_temp = 0u;
  asm volatile (
    "movd %6,%%xmm3 \n"            // broadcast luma coefficients.
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"     // xmm4 = words of 0xFF00 (luma mask).
    "psllw $0x8,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"        // zero for word->dword widening.

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(2) ",%%xmm0 \n" // 4 ARGB pixels.
    "pmaddubsw %%xmm3,%%xmm0 \n"   // weighted channel sums.
    "phaddw %%xmm0,%%xmm0 \n"      // one luma word per pixel.
    "pand %%xmm4,%%xmm0 \n"        // round down to table-segment boundary.
    "punpcklwd %%xmm5,%%xmm0 \n"   // luma offsets as dwords.
    "movd %%xmm0,%k1 \n" // 32 bit offset
    "add %5,%1 \n"                 // %1 = luma table for pixel 0.
    "pshufd $0x39,%%xmm0,%%xmm0 \n" // rotate next offset into lane 0.

    // Pixel 0: remap B, G, R through the table; copy A directly.
    "movzb " MEMACCESS(2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS(3) " \n"
    "movzb " MEMACCESS2(0x1,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x1,3) " \n"
    "movzb " MEMACCESS2(0x2,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x2,3) " \n"
    "movzb " MEMACCESS2(0x3,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x3,3) " \n"

    "movd %%xmm0,%k1 \n" // 32 bit offset
    "add %5,%1 \n"                 // %1 = luma table for pixel 1.
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    // Pixel 1.
    "movzb " MEMACCESS2(0x4,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x4,3) " \n"
    BUNDLEALIGN
    "movzb " MEMACCESS2(0x5,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x5,3) " \n"
    "movzb " MEMACCESS2(0x6,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x6,3) " \n"
    "movzb " MEMACCESS2(0x7,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x7,3) " \n"

    "movd %%xmm0,%k1 \n" // 32 bit offset
    "add %5,%1 \n"                 // %1 = luma table for pixel 2.
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    // Pixel 2.
    "movzb " MEMACCESS2(0x8,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x8,3) " \n"
    "movzb " MEMACCESS2(0x9,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x9,3) " \n"
    "movzb " MEMACCESS2(0xa,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xa,3) " \n"
    "movzb " MEMACCESS2(0xb,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xb,3) " \n"

    "movd %%xmm0,%k1 \n" // 32 bit offset
    "add %5,%1 \n"                 // %1 = luma table for pixel 3.

    // Pixel 3, then advance both pointers by 16 bytes.
    "movzb " MEMACCESS2(0xc,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xc,3) " \n"
    "movzb " MEMACCESS2(0xd,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xd,3) " \n"
    "movzb " MEMACCESS2(0xe,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xe,3) " \n"
    "movzb " MEMACCESS2(0xf,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xf,3) " \n"
    "sub $0x4,%4 \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "lea " MEMLEA(0x10,3) ",%3 \n"
    "jg 1b \n"
    : "+d"(pixel_temp), // %0
      "+a"(table_temp), // %1
      "+r"(src_argb), // %2
      "+r"(dst_argb), // %3
      "+rm"(width) // %4
    : "r"(luma), // %5
      "rm"(lumacoeff) // %6
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
| 6437 |
| 6438 #endif // defined(__x86_64__) || defined(__i386__) |
| 6439 |
| 6440 #ifdef __cplusplus |
| 6441 } // extern "C" |
| 6442 } // namespace libyuv |
| 6443 #endif |
OLD | NEW |