// VERSION 2
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
static vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPEG full range.
static vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
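
// For reference, these coefficients implement a 7-bit fixed-point dot
// product over the B,G,R bytes of each pixel (ARGB is B,G,R,A in memory).
// A scalar sketch of the studio-range path, guarded by a hypothetical
// LIBYUV_SCALAR_SKETCH macro (illustrative only, not part of the library):
#ifdef LIBYUV_SCALAR_SKETCH
static uint8 RGBToY_Reference(uint8 r, uint8 g, uint8 b) {
  // BT.601: Y = 0.257R + 0.504G + 0.098B, scaled by 128, then +16 offset.
  return (uint8)(((33 * r + 65 * g + 13 * b) >> 7) + 16);
}
#endif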

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

static vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// Constants for BGRA
static vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
static vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// 7 bit fixed point 0.5.
static vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
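
// For reference, the U/V coefficients above are scaled by 256 and biased to
// 128 after an arithmetic shift; a scalar sketch under the same hypothetical
// LIBYUV_SCALAR_SKETCH guard (illustrative only):
#ifdef LIBYUV_SCALAR_SKETCH
static uint8 RGBToU_Reference(uint8 r, uint8 g, uint8 b) {
  // BT.601: U = -0.148R - 0.291G + 0.439B.
  return (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static uint8 RGBToV_Reference(uint8 r, uint8 g, uint8 b) {
  // BT.601: V = 0.439R - 0.368G - 0.071B.
  return (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}
#endif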

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RGB24 for I422ToRGB24. First 8 pixels,
// then next 4.
static uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RAW for I422ToRAW. First 8 pixels,
// then next 4.
static uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
#endif  // HAS_RGB24TOARGBROW_SSSE3
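
// For reference, pshufb is a byte gather: output byte i is zero when the
// mask byte has its high bit set (the 128u entries above), otherwise
// src[mask[i] & 15]. A scalar model (illustrative, hypothetical guard):
#ifdef LIBYUV_SCALAR_SKETCH
static void Pshufb_Reference(const uint8 src[16], const uint8 mask[16],
                             uint8 dst[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0u : src[mask[i] & 0x0f];
  }
}
#endif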

#if defined(TESTING) && defined(__x86_64__)
void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    ".p2align 5 \n"
    "mov %%eax,%%eax \n"
    "mov %%ebx,%%ebx \n"
    "mov %%ecx,%%ecx \n"
    "mov %%edx,%%edx \n"
    "mov %%esi,%%esi \n"
    "mov %%edi,%%edi \n"
    "mov %%ebp,%%ebp \n"
    "mov %%esp,%%esp \n"
    ".p2align 5 \n"
    "mov %%r8d,%%r8d \n"
    "mov %%r9d,%%r9d \n"
    "mov %%r10d,%%r10d \n"
    "mov %%r11d,%%r11d \n"
    "mov %%r12d,%%r12d \n"
    "mov %%r13d,%%r13d \n"
    "mov %%r14d,%%r14d \n"
    "mov %%r15d,%%r15d \n"
    ".p2align 5 \n"
    "lea (%%rax),%%eax \n"
    "lea (%%rbx),%%ebx \n"
    "lea (%%rcx),%%ecx \n"
    "lea (%%rdx),%%edx \n"
    "lea (%%rsi),%%esi \n"
    "lea (%%rdi),%%edi \n"
    "lea (%%rbp),%%ebp \n"
    "lea (%%rsp),%%esp \n"
    ".p2align 5 \n"
    "lea (%%r8),%%r8d \n"
    "lea (%%r9),%%r9d \n"
    "lea (%%r10),%%r10d \n"
    "lea (%%r11),%%r11d \n"
    "lea (%%r12),%%r12d \n"
    "lea (%%r13),%%r13d \n"
    "lea (%%r14),%%r14d \n"
    "lea (%%r15),%%r15d \n"

    ".p2align 5 \n"
    "lea 0x10(%%rax),%%eax \n"
    "lea 0x10(%%rbx),%%ebx \n"
    "lea 0x10(%%rcx),%%ecx \n"
    "lea 0x10(%%rdx),%%edx \n"
    "lea 0x10(%%rsi),%%esi \n"
    "lea 0x10(%%rdi),%%edi \n"
    "lea 0x10(%%rbp),%%ebp \n"
    "lea 0x10(%%rsp),%%esp \n"
    ".p2align 5 \n"
    "lea 0x10(%%r8),%%r8d \n"
    "lea 0x10(%%r9),%%r9d \n"
    "lea 0x10(%%r10),%%r10d \n"
    "lea 0x10(%%r11),%%r11d \n"
    "lea 0x10(%%r12),%%r12d \n"
    "lea 0x10(%%r13),%%r13d \n"
    "lea 0x10(%%r14),%%r14d \n"
    "lea 0x10(%%r15),%%r15d \n"

    ".p2align 5 \n"
    "add 0x10,%%eax \n"
    "add 0x10,%%ebx \n"
    "add 0x10,%%ecx \n"
    "add 0x10,%%edx \n"
    "add 0x10,%%esi \n"
    "add 0x10,%%edi \n"
    "add 0x10,%%ebp \n"
    "add 0x10,%%esp \n"
    ".p2align 5 \n"
    "add 0x10,%%r8d \n"
    "add 0x10,%%r9d \n"
    "add 0x10,%%r10d \n"
    "add 0x10,%%r11d \n"
    "add 0x10,%%r12d \n"
    "add 0x10,%%r13d \n"
    "add 0x10,%%r14d \n"
    "add 0x10,%%r15d \n"

    ".p2align 2 \n"
  "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)  // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
#endif  // TESTING

#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_I400TOARGBROW_SSE2
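
// For reference, the punpck sequence above replicates each gray byte into
// B, G and R, with alpha forced to 0xff by the 0xff000000 mask in xmm5.
// Scalar sketch (illustrative, hypothetical guard):
#ifdef LIBYUV_SCALAR_SKETCH
static void I400ToARGBRow_Reference(const uint8* src_y, uint8* dst_argb,
                                    int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    dst_argb[4 * i + 0] = src_y[i];  // B
    dst_argb[4 * i + 1] = src_y[i];  // G
    dst_argb[4 * i + 2] = src_y[i];  // R
    dst_argb[4 * i + 3] = 255u;      // A
  }
}
#endif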

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)  // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_raw),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)  // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20802080,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)  //  movdqu  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)  //  movdqu  %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)  // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
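
// For reference, the magic multipliers above perform bit replication in one
// pmulhuw: with a 5-bit field v placed at the top of a 16-bit word,
// ((v << 11) * 0x0108) >> 16 == (v << 3) | (v >> 2), and for the 6-bit green
// field ((v << 5) * 0x2080) >> 16 == (v << 2) | (v >> 4). Scalar sketch of
// one pixel (illustrative, hypothetical guard):
#ifdef LIBYUV_SCALAR_SKETCH
static void RGB565ToARGBPixel_Reference(uint16 p, uint8 argb[4]) {
  uint8 b = (uint8)(p & 0x1f);
  uint8 g = (uint8)((p >> 5) & 0x3f);
  uint8 r = (uint8)((p >> 11) & 0x1f);
  argb[0] = (uint8)((b << 3) | (b >> 2));
  argb[1] = (uint8)((g << 2) | (g >> 4));
  argb[2] = (uint8)((r << 3) | (r >> 2));
  argb[3] = 255u;
}
#endif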

void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)  //  movdqu  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)  //  movdqu  %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)  // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)  //  movdqu  %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)  //  movdqu  %%xmm1,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)  // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
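
// For reference, ARGB1555 and ARGB4444 expand the same way: each 5-bit field
// becomes (v << 3) | (v >> 2), the 1-bit alpha is sign-extended to 0x00 or
// 0xff (the psraw $0x8 above), and each 4-bit nibble becomes (v << 4) | v,
// which is what the mask/shift/por pairs in ARGB4444ToARGBRow_SSE2 compute.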

void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)  // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)  // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
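
// For reference, the scalar form of the RGB565 pack above: keep the top
// 5/6/5 bits of B/G/R and pack as B | G << 5 | R << 11 (illustrative,
// hypothetical guard):
#ifdef LIBYUV_SCALAR_SKETCH
static uint16 ARGBToRGB565Pixel_Reference(const uint8 argb[4]) {
  return (uint16)((argb[0] >> 3) | ((argb[1] >> 2) << 5) |
                  ((argb[2] >> 3) << 11));
}
#endif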

void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)  // %2
  :: "memory", "cc",
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm3 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),  // %1
    "+r"(pix)  // %2
  : "m"(kARGBToY),  // %3
    "m"(kAddY16)  // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),  // %1
    "+r"(pix)  // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)  // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBTOYJROW_SSSE3
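
// For reference, the two Y paths differ only in range and rounding:
// ARGBToYRow truncates the 7-bit product and adds the studio-range offset of
// 16, while ARGBToYJRow adds 64 (0.5 in 7-bit fixed point) before the shift
// and keeps full range. For white (255,255,255) this gives
// ((13 + 65 + 33) * 255 >> 7) + 16 = 237 versus
// ((15 + 75 + 38) * 255 + 64) >> 7 = 255.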

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd to restore pixel order after vphaddw + vpackuswb, which operate
// within 128-bit lanes.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    "vbroadcastf128 %4,%%ymm5 \n"
    "vmovdqu %5,%%ymm6 \n"
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"  // mutates.
    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
    "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n"  // mutates.
    "vpermd %%ymm0,%%ymm6,%%ymm0 \n"  // unmutate.
    "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"  // add 16 for Y
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),  // %1
    "+r"(pix)  // %2
  : "m"(kARGBToY),  // %3
    "m"(kAddY16),  // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    "vbroadcastf128 %4,%%ymm5 \n"
    "vmovdqu %5,%%ymm6 \n"
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"  // mutates.
    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
    "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"  // Add .5 for rounding.
    "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
    "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n"  // mutates.
    "vpermd %%ymm0,%%ymm6,%%ymm0 \n"  // unmutate.
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),  // %1
    "+r"(pix)  // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64),  // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBTOYJROW_AVX2

#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)  //  movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)  //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  //  movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),  // %1
    "+r"(dst_v),  // %2
    "+rm"(width)  // %3
  : "r"((intptr_t)(src_stride_argb)),  // %4
    "m"(kARGBToV),  // %5
    "m"(kARGBToU),  // %6
    "m"(kAddUV128)  // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVROW_SSSE3
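
// For reference, the pavgb/shufps block above averages each 2x2 pixel block
// before the U/V dot products: rows first, then horizontally adjacent pixels
// (pavgb rounds up at each stage, so the staged result can differ from a
// single rounded average by 1). Scalar sketch for one output U/V pair from a
// 2x2 ARGB block (illustrative, hypothetical guard):
#ifdef LIBYUV_SCALAR_SKETCH
static void ARGBBlockToUV_Reference(const uint8* row0, const uint8* row1,
                                    uint8* u, uint8* v) {
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
  *u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}
#endif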

#ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb mask to restore byte order after vphaddw + vpacksswb, treating
// each pair of bytes as a short.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5 \n"
    "vbroadcastf128 %6,%%ymm6 \n"
    "vbroadcastf128 %7,%%ymm7 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)  // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
    "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
    "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
    "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
    "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
    "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
    "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpsraw $0x8,%%ymm1,%%ymm1 \n"
    "vpsraw $0x8,%%ymm0,%%ymm0 \n"
    "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpshufb %8,%%ymm0,%%ymm0 \n"
    "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1)  // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x20,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),  // %1
    "+r"(dst_v),  // %2
    "+rm"(width)  // %3
  : "r"((intptr_t)(src_stride_argb)),  // %4
    "m"(kAddUV128),  // %5
    "m"(kARGBToV),  // %6
    "m"(kARGBToU),  // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVROW_AVX2

#ifdef HAS_ARGBTOUVJROW_SSSE3
// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)  //  movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)  //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  //  movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),  // %1
    "+r"(dst_v),  // %2
    "+rm"(width)  // %3
  : "r"((intptr_t)(src_stride_argb)),  // %4
    "m"(kARGBToVJ),  // %5
    "m"(kARGBToUJ),  // %6
    "m"(kAddUVJ128)  // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVJROW_SSSE3

#ifdef HAS_ARGBTOUV444ROW_SSSE3
void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int width) {
  asm volatile (
    "movdqa %4,%%xmm3 \n"
    "movdqa %5,%%xmm4 \n"
    "movdqa %6,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm6 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm2 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "packsswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm3,%%xmm0 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm2 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "packsswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)  //  movdqu %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_u),  // %1
    "+r"(dst_v),  // %2
    "+rm"(width)  // %3
  : "m"(kARGBToV),  // %4
    "m"(kARGBToU),  // %5
    "m"(kAddUV128)  // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6"
  );
}
#endif  // HAS_ARGBTOUV444ROW_SSSE3

#ifdef HAS_ARGBTOUV422ROW_SSSE3
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %4,%%xmm3 \n"
    "movdqa %5,%%xmm4 \n"
    "movdqa %6,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  //  movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),  // %1
    "+r"(dst_v),  // %2
    "+rm"(width)  // %3
  : "m"(kARGBToV),  // %4
    "m"(kARGBToU),  // %5
    "m"(kAddUV128)  // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUV422ROW_SSSE3

void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),  // %1
    "+r"(pix)  // %2
  : "m"(kBGRAToY),  // %3
    "m"(kAddY16)  // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)  //  movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)  //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  //  movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),  // %1
    "+r"(dst_v),  // %2
    "+rm"(width)  // %3
  : "r"((intptr_t)(src_stride_bgra)),  // %4
    "m"(kBGRAToV),  // %5
    "m"(kBGRAToU),  // %6
    "m"(kAddUV128)  // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),  // %1
    "+r"(pix)  // %2
  : "m"(kABGRToY),  // %3
    "m"(kAddY16)  // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),  // %1
    "+r"(pix)  // %2
  : "m"(kRGBAToY),  // %3
    "m"(kAddY16)  // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)  //  movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)  //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  //  movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),  // %1
    "+r"(dst_v),  // %2
    "+rm"(width)  // %3
  : "r"((intptr_t)(src_stride_abgr)),  // %4
    "m"(kABGRToV),  // %5
    "m"(kABGRToU),  // %6
    "m"(kAddUV128)  // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)  //  movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)  //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  //  movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_rgba0),  // %0
    "+r"(dst_u),  // %1
    "+r"(dst_v),  // %2
    "+rm"(width)  // %3
  : "r"((intptr_t)(src_stride_rgba)),  // %4
    "m"(kRGBAToV),  // %5
    "m"(kRGBAToU),  // %6
    "m"(kAddUV128)  // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}
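
// For reference, the BGRA/ABGR/RGBA variants above are the same kernels as
// the ARGB versions; only the coefficient vectors change, permuted to match
// each format's in-memory byte order (ARGB = B,G,R,A; BGRA = A,R,G,B;
// ABGR = R,G,B,A; RGBA = A,B,G,R) with zeros kept in the alpha slot.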

#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

// YUV to RGB conversion constants.
// Y contribution to R,G,B. Scale and bias.
#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB 1160  /* 1.164 * 64 * 16 - adjusted for even error distribution */

// U and V contributions to R,G,B.
#define UB -128  /* -min(128, round(2.018 * 64)) */
#define UG 25  /* -round(-0.391 * 64) */
#define VG 52  /* -round(-0.813 * 64) */
#define VR -102  /* -round(1.596 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 - YGB)
#define BG (UG * 128 + VG * 128 - YGB)
#define BR (VR * 128 - YGB)

struct YuvConstants {
  lvec8 kUVToB;  // 0
  lvec8 kUVToG;  // 32
  lvec8 kUVToR;  // 64
  lvec16 kUVBiasB;  // 96
  lvec16 kUVBiasG;  // 128
  lvec16 kUVBiasR;  // 160
  lvec16 kYToRgb;  // 192
};

// BT601 constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};

// BT601 constants for NV21 where chroma plane is VU instead of UV.
static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
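
// For reference, a scalar model of the fixed-point math these constants
// encode; the SIMD path reaches the same result via pmaddubsw/pmulhuw and
// saturating packs. Illustrative only, hypothetical guard:
#ifdef LIBYUV_SCALAR_SKETCH
static void YuvPixel_Reference(uint8 y, uint8 u, uint8 v,
                               uint8* b, uint8* g, uint8* r) {
  // punpcklbw y,y makes y * 0x0101; pmulhuw by YG keeps the high 16 bits,
  // approximating 1.164 * 64 * y.
  int y1 = (int)(((uint32)(y * 0x0101) * YG) >> 16);
  int b1 = (y1 + BB - UB * u) >> 6;  // ~ 1.164(Y-16) + 2.0(U-128)
  int g1 = (y1 + BG - (UG * u + VG * v)) >> 6;
  int r1 = (y1 + BR - VR * v) >> 6;
  *b = (uint8)(b1 < 0 ? 0 : (b1 > 255 ? 255 : b1));
  *g = (uint8)(g1 < 0 ? 0 : (g1 > 255 ? 255 : g1));
  *r = (uint8)(r1 < 0 ? 0 : (r1 > 255 ? 255 : r1));
}
#endif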

// Read 8 UV from 444
#define READYUV444                                                             \
    "movq " MEMACCESS([u_buf]) ",%%xmm0 \n"                                    \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n"                                 \
    "punpcklbw %%xmm1,%%xmm0 \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422                                                             \
    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n"                                    \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n"                                 \
    "punpcklbw %%xmm1,%%xmm0 \n"                                               \
    "punpcklwd %%xmm0,%%xmm0 \n"

// Read 2 UV from 411, upsample to 8 UV
#define READYUV411                                                             \
    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n"                                    \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n"                                 \
    "punpcklbw %%xmm1,%%xmm0 \n"                                               \
    "punpcklwd %%xmm0,%%xmm0 \n"                                               \
    "punpckldq %%xmm0,%%xmm0 \n"

// Read 4 UV from NV12, upsample to 8 UV
#define READNV12                                                               \
    "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n"                                   \
    "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n"                               \
    "punpcklwd %%xmm0,%%xmm0 \n"

// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB(YuvConstants)                                                 \
    "movdqa %%xmm0,%%xmm1 \n"                                                  \
    "movdqa %%xmm0,%%xmm2 \n"                                                  \
    "movdqa %%xmm0,%%xmm3 \n"                                                  \
    "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n"                      \
    "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n"                        \
    "psubw %%xmm1,%%xmm0 \n"                                                   \
    "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n"                     \
    "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n"                   \
    "psubw %%xmm2,%%xmm1 \n"                                                   \
    "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n"                     \
    "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n"                   \
    "psubw %%xmm3,%%xmm2 \n"                                                   \
    "movq " MEMACCESS([y_buf]) ",%%xmm3 \n"                                    \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"                                 \
    "punpcklbw %%xmm3,%%xmm3 \n"                                               \
    "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n"                    \
    "paddsw %%xmm3,%%xmm0 \n"                                                  \
    "paddsw %%xmm3,%%xmm1 \n"                                                  \
    "paddsw %%xmm3,%%xmm2 \n"                                                  \
    "psraw $0x6,%%xmm0 \n"                                                     \
    "psraw $0x6,%%xmm1 \n"                                                     \
    "psraw $0x6,%%xmm2 \n"                                                     \
    "packuswb %%xmm0,%%xmm0 \n"                                                \
    "packuswb %%xmm1,%%xmm1 \n"                                                \
    "packuswb %%xmm2,%%xmm2 \n"
| 1528 | |
| 1529 // Store 8 ARGB values. Assumes XMM5 holds 0xff bytes (opaque alpha). | |
| 1530 #define STOREARGB \ | |
| 1531 "punpcklbw %%xmm1,%%xmm0 \n" \ | |
| 1532 "punpcklbw %%xmm5,%%xmm2 \n" \ | |
| 1533 "movdqa %%xmm0,%%xmm1 \n" \ | |
| 1534 "punpcklwd %%xmm2,%%xmm0 \n" \ | |
| 1535 "punpckhwd %%xmm2,%%xmm1 \n" \ | |
| 1536 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ | |
| 1537 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \ | |
| 1538 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" | |
| 1539 | |
| 1540 // Store 8 BGRA values. Sets XMM5 to 0xff bytes for the alpha channel. | |
| 1541 #define STOREBGRA \ | |
| 1542 "pcmpeqb %%xmm5,%%xmm5 \n" \ | |
| 1543 "punpcklbw %%xmm0,%%xmm1 \n" \ | |
| 1544 "punpcklbw %%xmm2,%%xmm5 \n" \ | |
| 1545 "movdqa %%xmm5,%%xmm0 \n" \ | |
| 1546 "punpcklwd %%xmm1,%%xmm5 \n" \ | |
| 1547 "punpckhwd %%xmm1,%%xmm0 \n" \ | |
| 1548 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ | |
| 1549 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \ | |
| 1550 "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" | |
| 1551 | |
| 1552 // Store 8 ABGR values. Assumes XMM5 holds 0xff bytes (opaque alpha). | |
| 1553 #define STOREABGR \ | |
| 1554 "punpcklbw %%xmm1,%%xmm2 \n" \ | |
| 1555 "punpcklbw %%xmm5,%%xmm0 \n" \ | |
| 1556 "movdqa %%xmm2,%%xmm1 \n" \ | |
| 1557 "punpcklwd %%xmm0,%%xmm2 \n" \ | |
| 1558 "punpckhwd %%xmm0,%%xmm1 \n" \ | |
| 1559 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ | |
| 1560 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \ | |
| 1561 "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" | |
| 1562 | |
| 1563 // Store 8 RGBA values. Sets XMM5 to 0xff bytes for the alpha channel. | |
| 1564 #define STORERGBA \ | |
| 1565 "pcmpeqb %%xmm5,%%xmm5 \n" \ | |
| 1566 "punpcklbw %%xmm2,%%xmm1 \n" \ | |
| 1567 "punpcklbw %%xmm0,%%xmm5 \n" \ | |
| 1568 "movdqa %%xmm5,%%xmm0 \n" \ | |
| 1569 "punpcklwd %%xmm1,%%xmm5 \n" \ | |
| 1570 "punpckhwd %%xmm1,%%xmm0 \n" \ | |
| 1571 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ | |
| 1572 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \ | |
| 1573 "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" | |
| 1574 | |
| 1575 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, | |
| 1576 const uint8* u_buf, | |
| 1577 const uint8* v_buf, | |
| 1578 uint8* dst_argb, | |
| 1579 int width) { | |
| 1580 asm volatile ( | |
| 1581 "sub %[u_buf],%[v_buf] \n" | |
| 1582 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 1583 LABELALIGN | |
| 1584 "1: \n" | |
| 1585 READYUV444 | |
| 1586 YUVTORGB(kYuvConstants) | |
| 1587 STOREARGB | |
| 1588 "sub $0x8,%[width] \n" | |
| 1589 "jg 1b \n" | |
| 1590 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 1591 [u_buf]"+r"(u_buf), // %[u_buf] | |
| 1592 [v_buf]"+r"(v_buf), // %[v_buf] | |
| 1593 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
| 1594 [width]"+rm"(width) // %[width] | |
| 1595 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
| 1596 : "memory", "cc", NACL_R14 | |
| 1597 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 1598 ); | |
| 1599 } | |
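| | |
| // Hypothetical caller sketch (assumed, not part of this file): planar | |
| // converters invoke a row kernel once per scanline. Width is assumed to be | |
| // a multiple of 8 here; real callers fall back to C for remainders. | |
| static void I444ToARGBPlane_Sketch(const uint8* src_y, int stride_y, | |
|                                    const uint8* src_u, int stride_u, | |
|                                    const uint8* src_v, int stride_v, | |
|                                    uint8* dst_argb, int stride_argb, | |
|                                    int width, int height) { | |
|   int y; | |
|   for (y = 0; y < height; ++y) { | |
|     I444ToARGBRow_SSSE3(src_y, src_u, src_v, dst_argb, width); | |
|     src_y += stride_y; | |
|     src_u += stride_u; | |
|     src_v += stride_v; | |
|     dst_argb += stride_argb; | |
|   } | |
| } | |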
| 1600 | |
| 1601 // TODO(fbarchard): Consider putting masks into constants. | |
| 1602 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, | |
| 1603 const uint8* u_buf, | |
| 1604 const uint8* v_buf, | |
| 1605 uint8* dst_rgb24, | |
| 1606 int width) { | |
| 1607 asm volatile ( | |
| 1608 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" | |
| 1609 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" | |
| 1610 "sub %[u_buf],%[v_buf] \n" | |
| 1611 LABELALIGN | |
| 1612 "1: \n" | |
| 1613 READYUV422 | |
| 1614 YUVTORGB(kYuvConstants) | |
| 1615 "punpcklbw %%xmm1,%%xmm0 \n" | |
| 1616 "punpcklbw %%xmm2,%%xmm2 \n" | |
| 1617 "movdqa %%xmm0,%%xmm1 \n" | |
| 1618 "punpcklwd %%xmm2,%%xmm0 \n" | |
| 1619 "punpckhwd %%xmm2,%%xmm1 \n" | |
| 1620 "pshufb %%xmm5,%%xmm0 \n" | |
| 1621 "pshufb %%xmm6,%%xmm1 \n" | |
| 1622 "palignr $0xc,%%xmm0,%%xmm1 \n" | |
| 1623 "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" | |
| 1624 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" | |
| 1625 "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" | |
| 1626 "subl $0x8,%[width] \n" | |
| 1627 "jg 1b \n" | |
| 1628 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 1629 [u_buf]"+r"(u_buf), // %[u_buf] | |
| 1630 [v_buf]"+r"(v_buf), // %[v_buf] | |
| 1631 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] | |
| 1632 // TODO(fbarchard): Make width a register for 32 bit. | |
| 1633 #if defined(__i386__) && defined(__pic__) | |
| 1634 [width]"+m"(width) // %[width] | |
| 1635 #else | |
| 1636 [width]"+rm"(width) // %[width] | |
| 1637 #endif | |
| 1638 : [kYuvConstants]"r"(&kYuvConstants.kUVToB), | |
| 1639 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), | |
| 1640 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) | |
| 1641 : "memory", "cc", NACL_R14 | |
| 1642 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" | |
| 1643 ); | |
| 1644 } | |
| 1645 | |
| 1646 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, | |
| 1647 const uint8* u_buf, | |
| 1648 const uint8* v_buf, | |
| 1649 uint8* dst_raw, | |
| 1650 int width) { | |
| 1651 asm volatile ( | |
| 1652 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" | |
| 1653 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" | |
| 1654 "sub %[u_buf],%[v_buf] \n" | |
| 1655 LABELALIGN | |
| 1656 "1: \n" | |
| 1657 READYUV422 | |
| 1658 YUVTORGB(kYuvConstants) | |
| 1659 "punpcklbw %%xmm1,%%xmm0 \n" | |
| 1660 "punpcklbw %%xmm2,%%xmm2 \n" | |
| 1661 "movdqa %%xmm0,%%xmm1 \n" | |
| 1662 "punpcklwd %%xmm2,%%xmm0 \n" | |
| 1663 "punpckhwd %%xmm2,%%xmm1 \n" | |
| 1664 "pshufb %%xmm5,%%xmm0 \n" | |
| 1665 "pshufb %%xmm6,%%xmm1 \n" | |
| 1666 "palignr $0xc,%%xmm0,%%xmm1 \n" | |
| 1667 "movq %%xmm0," MEMACCESS([dst_raw]) " \n" | |
| 1668 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" | |
| 1669 "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" | |
| 1670 "subl $0x8,%[width] \n" | |
| 1671 "jg 1b \n" | |
| 1672 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 1673 [u_buf]"+r"(u_buf), // %[u_buf] | |
| 1674 [v_buf]"+r"(v_buf), // %[v_buf] | |
| 1675 [dst_raw]"+r"(dst_raw), // %[dst_raw] | |
| 1676 // TODO(fbarchard): Make width a register for 32 bit. | |
| 1677 #if defined(__i386__) && defined(__pic__) | |
| 1678 [width]"+m"(width) // %[width] | |
| 1679 #else | |
| 1680 [width]"+rm"(width) // %[width] | |
| 1681 #endif | |
| 1682 : [kYuvConstants]"r"(&kYuvConstants.kUVToB), | |
| 1683 [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), | |
| 1684 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) | |
| 1685 : "memory", "cc", NACL_R14 | |
| 1686 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" | |
| 1687 ); | |
| 1688 } | |
| 1689 | |
| 1690 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, | |
| 1691 const uint8* u_buf, | |
| 1692 const uint8* v_buf, | |
| 1693 uint8* dst_argb, | |
| 1694 int width) { | |
| 1695 asm volatile ( | |
| 1696 "sub %[u_buf],%[v_buf] \n" | |
| 1697 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 1698 LABELALIGN | |
| 1699 "1: \n" | |
| 1700 READYUV422 | |
| 1701 YUVTORGB(kYuvConstants) | |
| 1702 STOREARGB | |
| 1703 "sub $0x8,%[width] \n" | |
| 1704 "jg 1b \n" | |
| 1705 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 1706 [u_buf]"+r"(u_buf), // %[u_buf] | |
| 1707 [v_buf]"+r"(v_buf), // %[v_buf] | |
| 1708 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
| 1709 [width]"+rm"(width) // %[width] | |
| 1710 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
| 1711 : "memory", "cc", NACL_R14 | |
| 1712 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 1713 ); | |
| 1714 } | |
| 1715 | |
| 1716 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, | |
| 1717 const uint8* u_buf, | |
| 1718 const uint8* v_buf, | |
| 1719 uint8* dst_argb, | |
| 1720 int width) { | |
| 1721 asm volatile ( | |
| 1722 "sub %[u_buf],%[v_buf] \n" | |
| 1723 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 1724 LABELALIGN | |
| 1725 "1: \n" | |
| 1726 READYUV411 | |
| 1727 YUVTORGB(kYuvConstants) | |
| 1728 STOREARGB | |
| 1729 "sub $0x8,%[width] \n" | |
| 1730 "jg 1b \n" | |
| 1731 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 1732 [u_buf]"+r"(u_buf), // %[u_buf] | |
| 1733 [v_buf]"+r"(v_buf), // %[v_buf] | |
| 1734 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
| 1735 [width]"+rm"(width) // %[width] | |
| 1736 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
| 1737 : "memory", "cc", NACL_R14 | |
| 1738 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 1739 ); | |
| 1740 } | |
| 1741 | |
| 1742 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, | |
| 1743 const uint8* uv_buf, | |
| 1744 uint8* dst_argb, | |
| 1745 int width) { | |
| 1746 asm volatile ( | |
| 1747 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 1748 LABELALIGN | |
| 1749 "1: \n" | |
| 1750 READNV12 | |
| 1751 YUVTORGB(kYuvConstants) | |
| 1752 STOREARGB | |
| 1753 "sub $0x8,%[width] \n" | |
| 1754 "jg 1b \n" | |
| 1755 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 1756 [uv_buf]"+r"(uv_buf), // %[uv_buf] | |
| 1757 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
| 1758 [width]"+rm"(width) // %[width] | |
| 1759 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
| 1760 // Does not use r14. | |
| 1761 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 1762 ); | |
| 1763 } | |
| 1764 | |
| 1765 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, | |
| 1766 const uint8* uv_buf, | |
| 1767 uint8* dst_argb, | |
| 1768 int width) { | |
| 1769 asm volatile ( | |
| 1770 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 1771 LABELALIGN | |
| 1772 "1: \n" | |
| 1773 READNV12 | |
| 1774 YUVTORGB(kYuvConstants) | |
| 1775 STOREARGB | |
| 1776 "sub $0x8,%[width] \n" | |
| 1777 "jg 1b \n" | |
| 1778 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 1779 [uv_buf]"+r"(uv_buf), // %[uv_buf] | |
| 1780 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
| 1781 [width]"+rm"(width) // %[width] | |
| 1782 : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants] | |
| 1783 // Does not use r14. | |
| 1784 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 1785 ); | |
| 1786 } | |
| 1787 | |
| 1788 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, | |
| 1789 const uint8* u_buf, | |
| 1790 const uint8* v_buf, | |
| 1791 uint8* dst_bgra, | |
| 1792 int width) { | |
| 1793 asm volatile ( | |
| 1794 "sub %[u_buf],%[v_buf] \n" | |
| 1795 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 1796 LABELALIGN | |
| 1797 "1: \n" | |
| 1798 READYUV422 | |
| 1799 YUVTORGB(kYuvConstants) | |
| 1800 STOREBGRA | |
| 1801 "sub $0x8,%[width] \n" | |
| 1802 "jg 1b \n" | |
| 1803 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 1804 [u_buf]"+r"(u_buf), // %[u_buf] | |
| 1805 [v_buf]"+r"(v_buf), // %[v_buf] | |
| 1806 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] | |
| 1807 [width]"+rm"(width) // %[width] | |
| 1808 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
| 1809 : "memory", "cc", NACL_R14 | |
| 1810 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 1811 ); | |
| 1812 } | |
| 1813 | |
| 1814 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, | |
| 1815 const uint8* u_buf, | |
| 1816 const uint8* v_buf, | |
| 1817 uint8* dst_abgr, | |
| 1818 int width) { | |
| 1819 asm volatile ( | |
| 1820 "sub %[u_buf],%[v_buf] \n" | |
| 1821 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 1822 LABELALIGN | |
| 1823 "1: \n" | |
| 1824 READYUV422 | |
| 1825 YUVTORGB(kYuvConstants) | |
| 1826 STOREABGR | |
| 1827 "sub $0x8,%[width] \n" | |
| 1828 "jg 1b \n" | |
| 1829 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 1830 [u_buf]"+r"(u_buf), // %[u_buf] | |
| 1831 [v_buf]"+r"(v_buf), // %[v_buf] | |
| 1832 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] | |
| 1833 [width]"+rm"(width) // %[width] | |
| 1834 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
| 1835 : "memory", "cc", NACL_R14 | |
| 1836 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 1837 ); | |
| 1838 } | |
| 1839 | |
| 1840 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, | |
| 1841 const uint8* u_buf, | |
| 1842 const uint8* v_buf, | |
| 1843 uint8* dst_rgba, | |
| 1844 int width) { | |
| 1845 asm volatile ( | |
| 1846 "sub %[u_buf],%[v_buf] \n" | |
| 1847 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 1848 LABELALIGN | |
| 1849 "1: \n" | |
| 1850 READYUV422 | |
| 1851 YUVTORGB(kYuvConstants) | |
| 1852 STORERGBA | |
| 1853 "sub $0x8,%[width] \n" | |
| 1854 "jg 1b \n" | |
| 1855 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 1856 [u_buf]"+r"(u_buf), // %[u_buf] | |
| 1857 [v_buf]"+r"(v_buf), // %[v_buf] | |
| 1858 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] | |
| 1859 [width]"+rm"(width) // %[width] | |
| 1860 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
| 1861 : "memory", "cc", NACL_R14 | |
| 1862 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 1863 ); | |
| 1864 } | |
| 1865 | |
| 1866 #endif // HAS_I422TOARGBROW_SSSE3 | |
| 1867 | |
| 1868 // Read 8 UV from 422, upsample to 16 UV. | |
| 1869 #define READYUV422_AVX2 \ | |
| 1870 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | |
| 1871 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | |
| 1872 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | |
| 1873 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | |
| 1874 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | |
| 1875 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" | |
| 1876 | |
| 1877 // Convert 16 pixels: 16 UV and 16 Y. | |
| 1878 #define YUVTORGB_AVX2(YuvConstants) \ | |
| 1879 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ | |
| 1880 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ | |
| 1881 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ | |
| 1882 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ | |
| 1883 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ | |
| 1884 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm2 \n" \ | |
| 1885 "vpsubw %%ymm1,%%ymm2,%%ymm1 \n" \ | |
| 1886 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm1 \n" \ | |
| 1887 "vpsubw %%ymm0,%%ymm1,%%ymm0 \n" \ | |
| 1888 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ | |
| 1889 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ | |
| 1890 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ | |
| 1891 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \ | |
| 1892 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \ | |
| 1893 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \ | |
| 1894 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \ | |
| 1895 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \ | |
| 1896 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ | |
| 1897 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ | |
| 1898 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ | |
| 1899 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ | |
| 1900 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ | |
| 1901 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" | |
| 1902 | |
| 1903 #if defined(HAS_I422TOBGRAROW_AVX2) | |
| 1904 // 16 pixels | |
| 1905 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | |
| 1906 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, | |
| 1907 const uint8* u_buf, | |
| 1908 const uint8* v_buf, | |
| 1909 uint8* dst_bgra, | |
| 1910 int width) { | |
| 1911 asm volatile ( | |
| 1912 "sub %[u_buf],%[v_buf] \n" | |
| 1913 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
| 1914 LABELALIGN | |
| 1915 "1: \n" | |
| 1916 READYUV422_AVX2 | |
| 1917 YUVTORGB_AVX2(kYuvConstants) | |
| 1918 | |
| 1919 // Step 3: Weave into BGRA | |
| 1920 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB | |
| 1921 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
| 1922 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR | |
| 1923 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | |
| 1924 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels | |
| 1925 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels | |
| 1926 | |
| 1927 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" | |
| 1928 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" | |
| 1929 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" | |
| 1930 "sub $0x10,%[width] \n" | |
| 1931 "jg 1b \n" | |
| 1932 "vzeroupper \n" | |
| 1933 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 1934 [u_buf]"+r"(u_buf), // %[u_buf] | |
| 1935 [v_buf]"+r"(v_buf), // %[v_buf] | |
| 1936 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] | |
| 1937 [width]"+rm"(width) // %[width] | |
| 1938 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
| 1939 : "memory", "cc", NACL_R14 | |
| 1940 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 1941 ); | |
| 1942 } | |
| 1943 #endif // HAS_I422TOBGRAROW_AVX2 | |
| 1944 | |
| 1945 #if defined(HAS_I422TOARGBROW_AVX2) | |
| 1946 // 16 pixels | |
| 1947 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | |
| 1948 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, | |
| 1949 const uint8* u_buf, | |
| 1950 const uint8* v_buf, | |
| 1951 uint8* dst_argb, | |
| 1952 int width) { | |
| 1953 asm volatile ( | |
| 1954 "sub %[u_buf],%[v_buf] \n" | |
| 1955 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
| 1956 LABELALIGN | |
| 1957 "1: \n" | |
| 1958 READYUV422_AVX2 | |
| 1959 YUVTORGB_AVX2(kYuvConstants) | |
| 1960 | |
| 1961 // Step 3: Weave into ARGB | |
| 1962 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG | |
| 1963 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
| 1964 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA | |
| 1965 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | |
| 1966 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels | |
| 1967 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels | |
| 1968 | |
| 1969 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" | |
| 1970 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" | |
| 1971 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | |
| 1972 "sub $0x10,%[width] \n" | |
| 1973 "jg 1b \n" | |
| 1974 "vzeroupper \n" | |
| 1975 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 1976 [u_buf]"+r"(u_buf), // %[u_buf] | |
| 1977 [v_buf]"+r"(v_buf), // %[v_buf] | |
| 1978 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
| 1979 [width]"+rm"(width) // %[width] | |
| 1980 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
| 1981 : "memory", "cc", NACL_R14 | |
| 1982 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 1983 ); | |
| 1984 } | |
| 1985 #endif // HAS_I422TOARGBROW_AVX2 | |
| 1986 | |
| 1987 #if defined(HAS_I422TOABGRROW_AVX2) | |
| 1988 // 16 pixels | |
| 1989 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | |
| 1990 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, | |
| 1991 const uint8* u_buf, | |
| 1992 const uint8* v_buf, | |
| 1993 uint8* dst_argb, | |
| 1994 int width) { | |
| 1995 asm volatile ( | |
| 1996 "sub %[u_buf],%[v_buf] \n" | |
| 1997 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
| 1998 LABELALIGN | |
| 1999 "1: \n" | |
| 2000 READYUV422_AVX2 | |
| 2001 YUVTORGB_AVX2(kYuvConstants) | |
| 2002 | |
| 2003 // Step 3: Weave into ABGR | |
| 2004 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG | |
| 2005 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
| 2006 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA | |
| 2007 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | |
| 2008 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels | |
| 2009 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels | |
| 2010 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" | |
| 2011 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" | |
| 2012 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | |
| 2013 "sub $0x10,%[width] \n" | |
| 2014 "jg 1b \n" | |
| 2015 "vzeroupper \n" | |
| 2016 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 2017 [u_buf]"+r"(u_buf), // %[u_buf] | |
| 2018 [v_buf]"+r"(v_buf), // %[v_buf] | |
| 2019 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
| 2020 [width]"+rm"(width) // %[width] | |
| 2021 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
| 2022 : "memory", "cc", NACL_R14 | |
| 2023 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 2024 ); | |
| 2025 } | |
| 2026 #endif // HAS_I422TOABGRROW_AVX2 | |
| 2027 | |
| 2028 #if defined(HAS_I422TORGBAROW_AVX2) | |
| 2029 // 16 pixels | |
| 2030 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). | |
| 2031 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, | |
| 2032 const uint8* u_buf, | |
| 2033 const uint8* v_buf, | |
| 2034 uint8* dst_argb, | |
| 2035 int width) { | |
| 2036 asm volatile ( | |
| 2037 "sub %[u_buf],%[v_buf] \n" | |
| 2038 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
| 2039 LABELALIGN | |
| 2040 "1: \n" | |
| 2041 READYUV422_AVX2 | |
| 2042 YUVTORGB_AVX2(kYuvConstants) | |
| 2043 | |
| 2044 // Step 3: Weave into RGBA | |
| 2045 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" | |
| 2046 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
| 2047 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" | |
| 2048 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | |
| 2049 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" | |
| 2050 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" | |
| 2051 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" | |
| 2052 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" | |
| 2053 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | |
| 2054 "sub $0x10,%[width] \n" | |
| 2055 "jg 1b \n" | |
| 2056 "vzeroupper \n" | |
| 2057 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 2058 [u_buf]"+r"(u_buf), // %[u_buf] | |
| 2059 [v_buf]"+r"(v_buf), // %[v_buf] | |
| 2060 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
| 2061 [width]"+rm"(width) // %[width] | |
| 2062 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
| 2063 : "memory", "cc", NACL_R14 | |
| 2064 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 2065 ); | |
| 2066 } | |
| 2067 #endif // HAS_I422TORGBAROW_AVX2 | |
| 2068 | |
| 2069 #ifdef HAS_YTOARGBROW_SSE2 | |
| 2070 void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { | |
| 2071 asm volatile ( | |
| 2072 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 | |
| 2073 "movd %%eax,%%xmm2 \n" | |
| 2074 "pshufd $0x0,%%xmm2,%%xmm2 \n" | |
| 2075 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 | |
| 2076 "movd %%eax,%%xmm3 \n" | |
| 2077 "pshufd $0x0,%%xmm3,%%xmm3 \n" | |
| 2078 "pcmpeqb %%xmm4,%%xmm4 \n" | |
| 2079 "pslld $0x18,%%xmm4 \n" | |
| 2080 LABELALIGN | |
| 2081 "1: \n" | |
| 2082 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 | |
| 2083 "movq " MEMACCESS(0) ",%%xmm0 \n" | |
| 2084 "lea " MEMLEA(0x8,0) ",%0 \n" | |
| 2085 "punpcklbw %%xmm0,%%xmm0 \n" | |
| 2086 "pmulhuw %%xmm2,%%xmm0 \n" | |
| 2087 "psubusw %%xmm3,%%xmm0 \n" | |
| 2088 "psrlw $6, %%xmm0 \n" | |
| 2089 "packuswb %%xmm0,%%xmm0 \n" | |
| 2090 | |
| 2091 // Step 2: Weave into ARGB | |
| 2092 "punpcklbw %%xmm0,%%xmm0 \n" | |
| 2093 "movdqa %%xmm0,%%xmm1 \n" | |
| 2094 "punpcklwd %%xmm0,%%xmm0 \n" | |
| 2095 "punpckhwd %%xmm1,%%xmm1 \n" | |
| 2096 "por %%xmm4,%%xmm0 \n" | |
| 2097 "por %%xmm4,%%xmm1 \n" | |
| 2098 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 2099 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | |
| 2100 "lea " MEMLEA(0x20,1) ",%1 \n" | |
| 2101 | |
| 2102 "sub $0x8,%2 \n" | |
| 2103 "jg 1b \n" | |
| 2104 : "+r"(y_buf), // %0 | |
| 2105 "+r"(dst_argb), // %1 | |
| 2106 "+rm"(width) // %2 | |
| 2107 : | |
| 2108 : "memory", "cc", "eax" | |
| 2109 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | |
| 2110 ); | |
| 2111 } | |
| 2112 #endif // HAS_YTOARGBROW_SSE2 | |
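| | |
| // Scalar sketch of the YToARGB math above (illustration only): | |
| static __inline void YToARGBPixel_Sketch(uint8 y, uint8* dst_argb) { | |
|   uint32 g = ((uint32)(y * 0x0101) * 0x4a35) >> 16;  // pmulhuw by 1.164 | |
|   g = (g > 1160 ? g - 1160 : 0) >> 6;                // psubusw, psrlw $6 | |
|   if (g > 255) g = 255;                              // packuswb saturation | |
|   dst_argb[0] = dst_argb[1] = dst_argb[2] = (uint8)g; | |
|   dst_argb[3] = 255;                                 // por with alpha mask | |
| } | |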
| 2113 | |
| 2114 #ifdef HAS_YTOARGBROW_AVX2 | |
| 2115 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). | |
| 2116 // Note: vpunpcklbw mutates the lane order and vpackuswb restores it. | |
| 2117 void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { | |
| 2118 asm volatile ( | |
| 2119 "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16 | |
| 2120 "vmovd %%eax,%%xmm2 \n" | |
| 2121 "vbroadcastss %%xmm2,%%ymm2 \n" | |
| 2122 "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 | |
| 2123 "vmovd %%eax,%%xmm3 \n" | |
| 2124 "vbroadcastss %%xmm3,%%ymm3 \n" | |
| 2125 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" | |
| 2126 "vpslld $0x18,%%ymm4,%%ymm4 \n" | |
| 2127 | |
| 2128 LABELALIGN | |
| 2129 "1: \n" | |
| 2130 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 | |
| 2131 "vmovdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 2132 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 2133 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
| 2134 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" | |
| 2135 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" | |
| 2136 "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" | |
| 2137 "vpsrlw $0x6,%%ymm0,%%ymm0 \n" | |
| 2138 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" | |
| 2139 "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" | |
| 2140 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
| 2141 "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" | |
| 2142 "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" | |
| 2143 "vpor %%ymm4,%%ymm0,%%ymm0 \n" | |
| 2144 "vpor %%ymm4,%%ymm1,%%ymm1 \n" | |
| 2145 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
| 2146 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" | |
| 2147 "lea " MEMLEA(0x40,1) ",%1 \n" | |
| 2148 "sub $0x10,%2 \n" | |
| 2149 "jg 1b \n" | |
| 2150 "vzeroupper \n" | |
| 2151 : "+r"(y_buf), // %0 | |
| 2152 "+r"(dst_argb), // %1 | |
| 2153 "+rm"(width) // %2 | |
| 2154 : | |
| 2155 : "memory", "cc", "eax" | |
| 2156 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | |
| 2157 ); | |
| 2158 } | |
| 2159 #endif // HAS_YTOARGBROW_AVX2 | |
| 2160 | |
| 2161 #ifdef HAS_MIRRORROW_SSSE3 | |
| 2162 // Shuffle table for reversing the bytes. | |
| 2163 static uvec8 kShuffleMirror = { | |
| 2164 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | |
| 2165 }; | |
| 2166 | |
| 2167 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { | |
| 2168 intptr_t temp_width = (intptr_t)(width); | |
| 2169 asm volatile ( | |
| 2170 "movdqa %3,%%xmm5 \n" | |
| 2171 LABELALIGN | |
| 2172 "1: \n" | |
| 2173 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 | |
| 2174 "pshufb %%xmm5,%%xmm0 \n" | |
| 2175 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 2176 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 2177 "sub $0x10,%2 \n" | |
| 2178 "jg 1b \n" | |
| 2179 : "+r"(src), // %0 | |
| 2180 "+r"(dst), // %1 | |
| 2181 "+r"(temp_width) // %2 | |
| 2182 : "m"(kShuffleMirror) // %3 | |
| 2183 : "memory", "cc", NACL_R14 | |
| 2184 "xmm0", "xmm5" | |
| 2185 ); | |
| 2186 } | |
| 2187 #endif // HAS_MIRRORROW_SSSE3 | |
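| | |
| // Scalar equivalent of the mirror rows (a sketch; the SIMD versions do the | |
| // same reversal 16 or 32 bytes at a time with pshufb/vpermq): | |
| static __inline void MirrorRow_Sketch(const uint8* src, uint8* dst, | |
|                                       int width) { | |
|   int i; | |
|   for (i = 0; i < width; ++i) { | |
|     dst[i] = src[width - 1 - i]; | |
|   } | |
| } | |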
| 2188 | |
| 2189 #ifdef HAS_MIRRORROW_AVX2 | |
| 2190 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { | |
| 2191 intptr_t temp_width = (intptr_t)(width); | |
| 2192 asm volatile ( | |
| 2193 "vbroadcastf128 %3,%%ymm5 \n" | |
| 2194 LABELALIGN | |
| 2195 "1: \n" | |
| 2196 MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0 | |
| 2197 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" | |
| 2198 "vpermq $0x4e,%%ymm0,%%ymm0 \n" | |
| 2199 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
| 2200 "lea " MEMLEA(0x20,1) ",%1 \n" | |
| 2201 "sub $0x20,%2 \n" | |
| 2202 "jg 1b \n" | |
| 2203 "vzeroupper \n" | |
| 2204 : "+r"(src), // %0 | |
| 2205 "+r"(dst), // %1 | |
| 2206 "+r"(temp_width) // %2 | |
| 2207 : "m"(kShuffleMirror) // %3 | |
| 2208 : "memory", "cc", NACL_R14 | |
| 2209 "xmm0", "xmm5" | |
| 2210 ); | |
| 2211 } | |
| 2212 #endif // HAS_MIRRORROW_AVX2 | |
| 2213 | |
| 2214 #ifdef HAS_MIRRORROW_SSE2 | |
| 2215 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | |
| 2216 intptr_t temp_width = (intptr_t)(width); | |
| 2217 asm volatile ( | |
| 2218 LABELALIGN | |
| 2219 "1: \n" | |
| 2220 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 | |
| 2221 "movdqa %%xmm0,%%xmm1 \n" | |
| 2222 "psllw $0x8,%%xmm0 \n" | |
| 2223 "psrlw $0x8,%%xmm1 \n" | |
| 2224 "por %%xmm1,%%xmm0 \n" | |
| 2225 "pshuflw $0x1b,%%xmm0,%%xmm0 \n" | |
| 2226 "pshufhw $0x1b,%%xmm0,%%xmm0 \n" | |
| 2227 "pshufd $0x4e,%%xmm0,%%xmm0 \n" | |
| 2228 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 2229 "lea " MEMLEA(0x10,1)",%1 \n" | |
| 2230 "sub $0x10,%2 \n" | |
| 2231 "jg 1b \n" | |
| 2232 : "+r"(src), // %0 | |
| 2233 "+r"(dst), // %1 | |
| 2234 "+r"(temp_width) // %2 | |
| 2235 : | |
| 2236 : "memory", "cc", NACL_R14 | |
| 2237 "xmm0", "xmm1" | |
| 2238 ); | |
| 2239 } | |
| 2240 #endif // HAS_MIRRORROW_SSE2 | |
| 2241 | |
| 2242 #ifdef HAS_MIRRORROW_UV_SSSE3 | |
| 2243 // Shuffle table for reversing the bytes of UV channels. | |
| 2244 static uvec8 kShuffleMirrorUV = { | |
| 2245 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u | |
| 2246 }; | |
| 2247 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, | |
| 2248 int width) { | |
| 2249 intptr_t temp_width = (intptr_t)(width); | |
| 2250 asm volatile ( | |
| 2251 "movdqa %4,%%xmm1 \n" | |
| 2252 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" | |
| 2253 "sub %1,%2 \n" | |
| 2254 LABELALIGN | |
| 2255 "1: \n" | |
| 2256 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 2257 "lea " MEMLEA(-0x10,0) ",%0 \n" | |
| 2258 "pshufb %%xmm1,%%xmm0 \n" | |
| 2259 "movlpd %%xmm0," MEMACCESS(1) " \n" | |
| 2260 MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) | |
| 2261 "lea " MEMLEA(0x8,1) ",%1 \n" | |
| 2262 "sub $8,%3 \n" | |
| 2263 "jg 1b \n" | |
| 2264 : "+r"(src), // %0 | |
| 2265 "+r"(dst_u), // %1 | |
| 2266 "+r"(dst_v), // %2 | |
| 2267 "+r"(temp_width) // %3 | |
| 2268 : "m"(kShuffleMirrorUV) // %4 | |
| 2269 : "memory", "cc", NACL_R14 | |
| 2270 "xmm0", "xmm1" | |
| 2271 ); | |
| 2272 } | |
| 2273 #endif // HAS_MIRRORROW_UV_SSSE3 | |
| 2274 | |
| 2275 #ifdef HAS_ARGBMIRRORROW_SSE2 | |
| 2276 | |
| 2277 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | |
| 2278 intptr_t temp_width = (intptr_t)(width); | |
| 2279 asm volatile ( | |
| 2280 "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" | |
| 2281 LABELALIGN | |
| 2282 "1: \n" | |
| 2283 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 2284 "pshufd $0x1b,%%xmm0,%%xmm0 \n" | |
| 2285 "lea " MEMLEA(-0x10,0) ",%0 \n" | |
| 2286 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 2287 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 2288 "sub $0x4,%2 \n" | |
| 2289 "jg 1b \n" | |
| 2290 : "+r"(src), // %0 | |
| 2291 "+r"(dst), // %1 | |
| 2292 "+r"(temp_width) // %2 | |
| 2293 : | |
| 2294 : "memory", "cc" | |
| 2295 , "xmm0" | |
| 2296 ); | |
| 2297 } | |
| 2298 #endif // HAS_ARGBMIRRORROW_SSE2 | |
| 2299 | |
| 2300 #ifdef HAS_ARGBMIRRORROW_AVX2 | |
| 2301 // Permute table for reversing the pixels (32-bit lanes) with vpermd. | |
| 2302 static const ulvec32 kARGBShuffleMirror_AVX2 = { | |
| 2303 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | |
| 2304 }; | |
| 2305 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { | |
| 2306 intptr_t temp_width = (intptr_t)(width); | |
| 2307 asm volatile ( | |
| 2308 "vmovdqu %3,%%ymm5 \n" | |
| 2309 LABELALIGN | |
| 2310 "1: \n" | |
| 2311 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 | |
| 2312 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
| 2313 "lea " MEMLEA(0x20,1) ",%1 \n" | |
| 2314 "sub $0x8,%2 \n" | |
| 2315 "jg 1b \n" | |
| 2316 "vzeroupper \n" | |
| 2317 : "+r"(src), // %0 | |
| 2318 "+r"(dst), // %1 | |
| 2319 "+r"(temp_width) // %2 | |
| 2320 : "m"(kARGBShuffleMirror_AVX2) // %3 | |
| 2321 : "memory", "cc", NACL_R14 | |
| 2322 "xmm0", "xmm5" | |
| 2323 ); | |
| 2324 } | |
| 2325 #endif // HAS_ARGBMIRRORROW_AVX2 | |
| 2326 | |
| 2327 #ifdef HAS_SPLITUVROW_AVX2 | |
| 2328 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | |
| 2329 asm volatile ( | |
| 2330 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
| 2331 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | |
| 2332 "sub %1,%2 \n" | |
| 2333 LABELALIGN | |
| 2334 "1: \n" | |
| 2335 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
| 2336 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
| 2337 "lea " MEMLEA(0x40,0) ",%0 \n" | |
| 2338 "vpsrlw $0x8,%%ymm0,%%ymm2 \n" | |
| 2339 "vpsrlw $0x8,%%ymm1,%%ymm3 \n" | |
| 2340 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | |
| 2341 "vpand %%ymm5,%%ymm1,%%ymm1 \n" | |
| 2342 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
| 2343 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" | |
| 2344 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
| 2345 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | |
| 2346 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
| 2347 MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) | |
| 2348 "lea " MEMLEA(0x20,1) ",%1 \n" | |
| 2349 "sub $0x20,%3 \n" | |
| 2350 "jg 1b \n" | |
| 2351 "vzeroupper \n" | |
| 2352 : "+r"(src_uv), // %0 | |
| 2353 "+r"(dst_u), // %1 | |
| 2354 "+r"(dst_v), // %2 | |
| 2355 "+r"(pix) // %3 | |
| 2356 : | |
| 2357 : "memory", "cc", NACL_R14 | |
| 2358 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 2359 ); | |
| 2360 } | |
| 2361 #endif // HAS_SPLITUVROW_AVX2 | |
| 2362 | |
| 2363 #ifdef HAS_SPLITUVROW_SSE2 | |
| 2364 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | |
| 2365 asm volatile ( | |
| 2366 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 2367 "psrlw $0x8,%%xmm5 \n" | |
| 2368 "sub %1,%2 \n" | |
| 2369 LABELALIGN | |
| 2370 "1: \n" | |
| 2371 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 2372 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 2373 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 2374 "movdqa %%xmm0,%%xmm2 \n" | |
| 2375 "movdqa %%xmm1,%%xmm3 \n" | |
| 2376 "pand %%xmm5,%%xmm0 \n" | |
| 2377 "pand %%xmm5,%%xmm1 \n" | |
| 2378 "packuswb %%xmm1,%%xmm0 \n" | |
| 2379 "psrlw $0x8,%%xmm2 \n" | |
| 2380 "psrlw $0x8,%%xmm3 \n" | |
| 2381 "packuswb %%xmm3,%%xmm2 \n" | |
| 2382 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 2383 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) | |
| 2384 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 2385 "sub $0x10,%3 \n" | |
| 2386 "jg 1b \n" | |
| 2387 : "+r"(src_uv), // %0 | |
| 2388 "+r"(dst_u), // %1 | |
| 2389 "+r"(dst_v), // %2 | |
| 2390 "+r"(pix) // %3 | |
| 2391 : | |
| 2392 : "memory", "cc", NACL_R14 | |
| 2393 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 2394 ); | |
| 2395 } | |
| 2396 #endif // HAS_SPLITUVROW_SSE2 | |
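| | |
| // What SplitUVRow computes, in scalar form (sketch): deinterleave a packed | |
| // UV plane into separate U and V planes. | |
| static __inline void SplitUVRow_Sketch(const uint8* src_uv, uint8* dst_u, | |
|                                        uint8* dst_v, int pix) { | |
|   int i; | |
|   for (i = 0; i < pix; ++i) { | |
|     dst_u[i] = src_uv[2 * i + 0]; | |
|     dst_v[i] = src_uv[2 * i + 1]; | |
|   } | |
| } | |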
| 2397 | |
| 2398 #ifdef HAS_MERGEUVROW_AVX2 | |
| 2399 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | |
| 2400 int width) { | |
| 2401 asm volatile ( | |
| 2402 "sub %0,%1 \n" | |
| 2403 LABELALIGN | |
| 2404 "1: \n" | |
| 2405 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
| 2406 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 | |
| 2407 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 2408 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" | |
| 2409 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" | |
| 2410 "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" | |
| 2411 "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n" | |
| 2412 "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n" | |
| 2413 "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n" | |
| 2414 "lea " MEMLEA(0x40,2) ",%2 \n" | |
| 2415 "sub $0x20,%3 \n" | |
| 2416 "jg 1b \n" | |
| 2417 "vzeroupper \n" | |
| 2418 : "+r"(src_u), // %0 | |
| 2419 "+r"(src_v), // %1 | |
| 2420 "+r"(dst_uv), // %2 | |
| 2421 "+r"(width) // %3 | |
| 2422 : | |
| 2423 : "memory", "cc", NACL_R14 | |
| 2424 "xmm0", "xmm1", "xmm2" | |
| 2425 ); | |
| 2426 } | |
| 2427 #endif // HAS_MERGEUVROW_AVX2 | |
| 2428 | |
| 2429 #ifdef HAS_MERGEUVROW_SSE2 | |
| 2430 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | |
| 2431 int width) { | |
| 2432 asm volatile ( | |
| 2433 "sub %0,%1 \n" | |
| 2434 LABELALIGN | |
| 2435 "1: \n" | |
| 2436 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 2437 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | |
| 2438 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 2439 "movdqa %%xmm0,%%xmm2 \n" | |
| 2440 "punpcklbw %%xmm1,%%xmm0 \n" | |
| 2441 "punpckhbw %%xmm1,%%xmm2 \n" | |
| 2442 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
| 2443 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" | |
| 2444 "lea " MEMLEA(0x20,2) ",%2 \n" | |
| 2445 "sub $0x10,%3 \n" | |
| 2446 "jg 1b \n" | |
| 2447 : "+r"(src_u), // %0 | |
| 2448 "+r"(src_v), // %1 | |
| 2449 "+r"(dst_uv), // %2 | |
| 2450 "+r"(width) // %3 | |
| 2451 : | |
| 2452 : "memory", "cc", NACL_R14 | |
| 2453 "xmm0", "xmm1", "xmm2" | |
| 2454 ); | |
| 2455 } | |
| 2456 #endif // HAS_MERGEUVROW_SSE2 | |
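| | |
| // MergeUVRow is the inverse weave (scalar sketch): | |
| static __inline void MergeUVRow_Sketch(const uint8* src_u, const uint8* src_v, | |
|                                        uint8* dst_uv, int width) { | |
|   int i; | |
|   for (i = 0; i < width; ++i) { | |
|     dst_uv[2 * i + 0] = src_u[i];  // punpcklbw covers the low half | |
|     dst_uv[2 * i + 1] = src_v[i];  // punpckhbw covers the high half | |
|   } | |
| } | |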
| 2457 | |
| 2458 #ifdef HAS_COPYROW_SSE2 | |
| 2459 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { | |
| 2460 asm volatile ( | |
| 2461 LABELALIGN | |
| 2462 "1: \n" | |
| 2463 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 2464 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 2465 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 2466 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 2467 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | |
| 2468 "lea " MEMLEA(0x20,1) ",%1 \n" | |
| 2469 "sub $0x20,%2 \n" | |
| 2470 "jg 1b \n" | |
| 2471 : "+r"(src), // %0 | |
| 2472 "+r"(dst), // %1 | |
| 2473 "+r"(count) // %2 | |
| 2474 : | |
| 2475 : "memory", "cc" | |
| 2476 , "xmm0", "xmm1" | |
| 2477 ); | |
| 2478 } | |
| 2479 #endif // HAS_COPYROW_SSE2 | |
| 2480 | |
| 2481 #ifdef HAS_COPYROW_AVX | |
| 2482 void CopyRow_AVX(const uint8* src, uint8* dst, int count) { | |
| 2483 asm volatile ( | |
| 2484 LABELALIGN | |
| 2485 "1: \n" | |
| 2486 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
| 2487 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
| 2488 "lea " MEMLEA(0x40,0) ",%0 \n" | |
| 2489 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
| 2490 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" | |
| 2491 "lea " MEMLEA(0x40,1) ",%1 \n" | |
| 2492 "sub $0x40,%2 \n" | |
| 2493 "jg 1b \n" | |
| 2494 : "+r"(src), // %0 | |
| 2495 "+r"(dst), // %1 | |
| 2496 "+r"(count) // %2 | |
| 2497 : | |
| 2498 : "memory", "cc" | |
| 2499 , "xmm0", "xmm1" | |
| 2500 ); | |
| 2501 } | |
| 2502 #endif // HAS_COPYROW_AVX | |
| 2503 | |
| 2504 #ifdef HAS_COPYROW_ERMS | |
| 2505 // No width alignment requirement (handles any multiple of 1). | |
| 2506 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { | |
| 2507 size_t width_tmp = (size_t)(width); | |
| 2508 asm volatile ( | |
| 2509 "rep movsb " MEMMOVESTRING(0,1) " \n" | |
| 2510 : "+S"(src), // %0 | |
| 2511 "+D"(dst), // %1 | |
| 2512 "+c"(width_tmp) // %2 | |
| 2513 : | |
| 2514 : "memory", "cc" | |
| 2515 ); | |
| 2516 } | |
| 2517 #endif // HAS_COPYROW_ERMS | |
| 2518 | |
| 2519 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 | |
| 2520 // width in pixels | |
| 2521 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { | |
| 2522 asm volatile ( | |
| 2523 "pcmpeqb %%xmm0,%%xmm0 \n" | |
| 2524 "pslld $0x18,%%xmm0 \n" | |
| 2525 "pcmpeqb %%xmm1,%%xmm1 \n" | |
| 2526 "psrld $0x8,%%xmm1 \n" | |
| 2527 LABELALIGN | |
| 2528 "1: \n" | |
| 2529 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
| 2530 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" | |
| 2531 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 2532 "movdqu " MEMACCESS(1) ",%%xmm4 \n" | |
| 2533 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" | |
| 2534 "pand %%xmm0,%%xmm2 \n" | |
| 2535 "pand %%xmm0,%%xmm3 \n" | |
| 2536 "pand %%xmm1,%%xmm4 \n" | |
| 2537 "pand %%xmm1,%%xmm5 \n" | |
| 2538 "por %%xmm4,%%xmm2 \n" | |
| 2539 "por %%xmm5,%%xmm3 \n" | |
| 2540 "movdqu %%xmm2," MEMACCESS(1) " \n" | |
| 2541 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" | |
| 2542 "lea " MEMLEA(0x20,1) ",%1 \n" | |
| 2543 "sub $0x8,%2 \n" | |
| 2544 "jg 1b \n" | |
| 2545 : "+r"(src), // %0 | |
| 2546 "+r"(dst), // %1 | |
| 2547 "+r"(width) // %2 | |
| 2548 : | |
| 2549 : "memory", "cc" | |
| 2550 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
| 2551 ); | |
| 2552 } | |
| 2553 #endif // HAS_ARGBCOPYALPHAROW_SSE2 | |
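| | |
| // Per-pixel effect of ARGBCopyAlphaRow (sketch): keep BGR from dst, take | |
| // alpha from src. The two masks built with pslld/psrld above are: | |
| static __inline uint32 CopyAlpha_Sketch(uint32 src_pixel, uint32 dst_pixel) { | |
|   return (dst_pixel & 0x00ffffffu) |  // psrld $0x8 mask keeps B,G,R | |
|          (src_pixel & 0xff000000u);   // pslld $0x18 mask keeps A | |
| } | |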
| 2554 | |
| 2555 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 | |
| 2556 // width in pixels | |
| 2557 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { | |
| 2558 asm volatile ( | |
| 2559 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" | |
| 2560 "vpsrld $0x8,%%ymm0,%%ymm0 \n" | |
| 2561 LABELALIGN | |
| 2562 "1: \n" | |
| 2563 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" | |
| 2564 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" | |
| 2565 "lea " MEMLEA(0x40,0) ",%0 \n" | |
| 2566 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" | |
| 2567 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" | |
| 2568 "vmovdqu %%ymm1," MEMACCESS(1) " \n" | |
| 2569 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" | |
| 2570 "lea " MEMLEA(0x40,1) ",%1 \n" | |
| 2571 "sub $0x10,%2 \n" | |
| 2572 "jg 1b \n" | |
| 2573 "vzeroupper \n" | |
| 2574 : "+r"(src), // %0 | |
| 2575 "+r"(dst), // %1 | |
| 2576 "+r"(width) // %2 | |
| 2577 : | |
| 2578 : "memory", "cc" | |
| 2579 , "xmm0", "xmm1", "xmm2" | |
| 2580 ); | |
| 2581 } | |
| 2582 #endif // HAS_ARGBCOPYALPHAROW_AVX2 | |
| 2583 | |
| 2584 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 | |
| 2585 // width in pixels | |
| 2586 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { | |
| 2587 asm volatile ( | |
| 2588 "pcmpeqb %%xmm0,%%xmm0 \n" | |
| 2589 "pslld $0x18,%%xmm0 \n" | |
| 2590 "pcmpeqb %%xmm1,%%xmm1 \n" | |
| 2591 "psrld $0x8,%%xmm1 \n" | |
| 2592 LABELALIGN | |
| 2593 "1: \n" | |
| 2594 "movq " MEMACCESS(0) ",%%xmm2 \n" | |
| 2595 "lea " MEMLEA(0x8,0) ",%0 \n" | |
| 2596 "punpcklbw %%xmm2,%%xmm2 \n" | |
| 2597 "punpckhwd %%xmm2,%%xmm3 \n" | |
| 2598 "punpcklwd %%xmm2,%%xmm2 \n" | |
| 2599 "movdqu " MEMACCESS(1) ",%%xmm4 \n" | |
| 2600 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" | |
| 2601 "pand %%xmm0,%%xmm2 \n" | |
| 2602 "pand %%xmm0,%%xmm3 \n" | |
| 2603 "pand %%xmm1,%%xmm4 \n" | |
| 2604 "pand %%xmm1,%%xmm5 \n" | |
| 2605 "por %%xmm4,%%xmm2 \n" | |
| 2606 "por %%xmm5,%%xmm3 \n" | |
| 2607 "movdqu %%xmm2," MEMACCESS(1) " \n" | |
| 2608 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" | |
| 2609 "lea " MEMLEA(0x20,1) ",%1 \n" | |
| 2610 "sub $0x8,%2 \n" | |
| 2611 "jg 1b \n" | |
| 2612 : "+r"(src), // %0 | |
| 2613 "+r"(dst), // %1 | |
| 2614 "+r"(width) // %2 | |
| 2615 : | |
| 2616 : "memory", "cc" | |
| 2617 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
| 2618 ); | |
| 2619 } | |
| 2620 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 | |
| 2621 | |
| 2622 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 | |
| 2623 // width in pixels | |
| 2624 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { | |
| 2625 asm volatile ( | |
| 2626 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" | |
| 2627 "vpsrld $0x8,%%ymm0,%%ymm0 \n" | |
| 2628 LABELALIGN | |
| 2629 "1: \n" | |
| 2630 "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" | |
| 2631 "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" | |
| 2632 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 2633 "vpslld $0x18,%%ymm1,%%ymm1 \n" | |
| 2634 "vpslld $0x18,%%ymm2,%%ymm2 \n" | |
| 2635 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" | |
| 2636 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" | |
| 2637 "vmovdqu %%ymm1," MEMACCESS(1) " \n" | |
| 2638 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" | |
| 2639 "lea " MEMLEA(0x40,1) ",%1 \n" | |
| 2640 "sub $0x10,%2 \n" | |
| 2641 "jg 1b \n" | |
| 2642 "vzeroupper \n" | |
| 2643 : "+r"(src), // %0 | |
| 2644 "+r"(dst), // %1 | |
| 2645 "+r"(width) // %2 | |
| 2646 : | |
| 2647 : "memory", "cc" | |
| 2648 , "xmm0", "xmm1", "xmm2" | |
| 2649 ); | |
| 2650 } | |
| 2651 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 | |
| 2652 | |
| 2653 #ifdef HAS_SETROW_X86 | |
| 2654 void SetRow_X86(uint8* dst, uint8 v8, int width) { | |
| 2655 size_t width_tmp = (size_t)(width >> 2); | |
| 2656 const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes. | |
| 2657 asm volatile ( | |
| 2658 "rep stosl " MEMSTORESTRING(eax,0) " \n" | |
| 2659 : "+D"(dst), // %0 | |
| 2660 "+c"(width_tmp) // %1 | |
| 2661 : "a"(v32) // %2 | |
| 2662 : "memory", "cc"); | |
| 2663 } | |
| 2664 | |
| 2665 void SetRow_ERMS(uint8* dst, uint8 v8, int width) { | |
| 2666 size_t width_tmp = (size_t)(width); | |
| 2667 asm volatile ( | |
| 2668 "rep stosb " MEMSTORESTRING(al,0) " \n" | |
| 2669 : "+D"(dst), // %0 | |
| 2670 "+c"(width_tmp) // %1 | |
| 2671 : "a"(v8) // %2 | |
| 2672 : "memory", "cc"); | |
| 2673 } | |
| 2674 | |
| 2675 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { | |
| 2676 size_t width_tmp = (size_t)(width); | |
| 2677 asm volatile ( | |
| 2678 "rep stosl " MEMSTORESTRING(eax,0) " \n" | |
| 2679 : "+D"(dst_argb), // %0 | |
| 2680 "+c"(width_tmp) // %1 | |
| 2681 : "a"(v32) // %2 | |
| 2682 : "memory", "cc"); | |
| 2683 } | |
| 2684 #endif // HAS_SETROW_X86 | |
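| | |
| // The v8 * 0x01010101 above is byte replication, so each rep stosl stores | |
| // four copies of v8 (sketch): | |
| static __inline uint32 ReplicateByte_Sketch(uint8 v8) { | |
|   return v8 * 0x01010101u;  // 0xAB -> 0xABABABAB | |
| } | |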
| 2685 | |
| 2686 #ifdef HAS_YUY2TOYROW_SSE2 | |
| 2687 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { | |
| 2688 asm volatile ( | |
| 2689 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 2690 "psrlw $0x8,%%xmm5 \n" | |
| 2691 LABELALIGN | |
| 2692 "1: \n" | |
| 2693 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 2694 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 2695 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 2696 "pand %%xmm5,%%xmm0 \n" | |
| 2697 "pand %%xmm5,%%xmm1 \n" | |
| 2698 "packuswb %%xmm1,%%xmm0 \n" | |
| 2699 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 2700 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 2701 "sub $0x10,%2 \n" | |
| 2702 "jg 1b \n" | |
| 2703 : "+r"(src_yuy2), // %0 | |
| 2704 "+r"(dst_y), // %1 | |
| 2705 "+r"(pix) // %2 | |
| 2706 : | |
| 2707 : "memory", "cc" | |
| 2708 , "xmm0", "xmm1", "xmm5" | |
| 2709 ); | |
| 2710 } | |
| 2711 | |
| 2712 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, | |
| 2713 uint8* dst_u, uint8* dst_v, int pix) { | |
| 2714 asm volatile ( | |
| 2715 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 2716 "psrlw $0x8,%%xmm5 \n" | |
| 2717 "sub %1,%2 \n" | |
| 2718 LABELALIGN | |
| 2719 "1: \n" | |
| 2720 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 2721 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 2722 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 | |
| 2723 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 | |
| 2724 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 2725 "pavgb %%xmm2,%%xmm0 \n" | |
| 2726 "pavgb %%xmm3,%%xmm1 \n" | |
| 2727 "psrlw $0x8,%%xmm0 \n" | |
| 2728 "psrlw $0x8,%%xmm1 \n" | |
| 2729 "packuswb %%xmm1,%%xmm0 \n" | |
| 2730 "movdqa %%xmm0,%%xmm1 \n" | |
| 2731 "pand %%xmm5,%%xmm0 \n" | |
| 2732 "packuswb %%xmm0,%%xmm0 \n" | |
| 2733 "psrlw $0x8,%%xmm1 \n" | |
| 2734 "packuswb %%xmm1,%%xmm1 \n" | |
| 2735 "movq %%xmm0," MEMACCESS(1) " \n" | |
| 2736 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) | |
| 2737 "lea " MEMLEA(0x8,1) ",%1 \n" | |
| 2738 "sub $0x10,%3 \n" | |
| 2739 "jg 1b \n" | |
| 2740 : "+r"(src_yuy2), // %0 | |
| 2741 "+r"(dst_u), // %1 | |
| 2742 "+r"(dst_v), // %2 | |
| 2743 "+r"(pix) // %3 | |
| 2744 : "r"((intptr_t)(stride_yuy2)) // %4 | |
| 2745 : "memory", "cc", NACL_R14 | |
| 2746 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 2747 ); | |
| 2748 } | |
| 2749 | |
| 2750 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, | |
| 2751 uint8* dst_u, uint8* dst_v, int pix) { | |
| 2752 asm volatile ( | |
| 2753 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 2754 "psrlw $0x8,%%xmm5 \n" | |
| 2755 "sub %1,%2 \n" | |
| 2756 LABELALIGN | |
| 2757 "1: \n" | |
| 2758 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 2759 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 2760 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 2761 "psrlw $0x8,%%xmm0 \n" | |
| 2762 "psrlw $0x8,%%xmm1 \n" | |
| 2763 "packuswb %%xmm1,%%xmm0 \n" | |
| 2764 "movdqa %%xmm0,%%xmm1 \n" | |
| 2765 "pand %%xmm5,%%xmm0 \n" | |
| 2766 "packuswb %%xmm0,%%xmm0 \n" | |
| 2767 "psrlw $0x8,%%xmm1 \n" | |
| 2768 "packuswb %%xmm1,%%xmm1 \n" | |
| 2769 "movq %%xmm0," MEMACCESS(1) " \n" | |
| 2770 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) | |
| 2771 "lea " MEMLEA(0x8,1) ",%1 \n" | |
| 2772 "sub $0x10,%3 \n" | |
| 2773 "jg 1b \n" | |
| 2774 : "+r"(src_yuy2), // %0 | |
| 2775 "+r"(dst_u), // %1 | |
| 2776 "+r"(dst_v), // %2 | |
| 2777 "+r"(pix) // %3 | |
| 2778 : | |
| 2779 : "memory", "cc", NACL_R14 | |
| 2780 "xmm0", "xmm1", "xmm5" | |
| 2781 ); | |
| 2782 } | |
| 2783 | |
| 2784 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { | |
| 2785 asm volatile ( | |
| 2786 LABELALIGN | |
| 2787 "1: \n" | |
| 2788 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 2789 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 2790 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 2791 "psrlw $0x8,%%xmm0 \n" | |
| 2792 "psrlw $0x8,%%xmm1 \n" | |
| 2793 "packuswb %%xmm1,%%xmm0 \n" | |
| 2794 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 2795 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 2796 "sub $0x10,%2 \n" | |
| 2797 "jg 1b \n" | |
| 2798 : "+r"(src_uyvy), // %0 | |
| 2799 "+r"(dst_y), // %1 | |
| 2800 "+r"(pix) // %2 | |
| 2801 : | |
| 2802 : "memory", "cc" | |
| 2803 , "xmm0", "xmm1" | |
| 2804 ); | |
| 2805 } | |
| 2806 | |
| 2807 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, | |
| 2808 uint8* dst_u, uint8* dst_v, int pix) { | |
| 2809 asm volatile ( | |
| 2810 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 2811 "psrlw $0x8,%%xmm5 \n" | |
| 2812 "sub %1,%2 \n" | |
| 2813 LABELALIGN | |
| 2814 "1: \n" | |
| 2815 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 2816 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 2817 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 | |
| 2818 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 | |
| 2819 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 2820 "pavgb %%xmm2,%%xmm0 \n" | |
| 2821 "pavgb %%xmm3,%%xmm1 \n" | |
| 2822 "pand %%xmm5,%%xmm0 \n" | |
| 2823 "pand %%xmm5,%%xmm1 \n" | |
| 2824 "packuswb %%xmm1,%%xmm0 \n" | |
| 2825 "movdqa %%xmm0,%%xmm1 \n" | |
| 2826 "pand %%xmm5,%%xmm0 \n" | |
| 2827 "packuswb %%xmm0,%%xmm0 \n" | |
| 2828 "psrlw $0x8,%%xmm1 \n" | |
| 2829 "packuswb %%xmm1,%%xmm1 \n" | |
| 2830 "movq %%xmm0," MEMACCESS(1) " \n" | |
| 2831 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) | |
| 2832 "lea " MEMLEA(0x8,1) ",%1 \n" | |
| 2833 "sub $0x10,%3 \n" | |
| 2834 "jg 1b \n" | |
| 2835 : "+r"(src_uyvy), // %0 | |
| 2836 "+r"(dst_u), // %1 | |
| 2837 "+r"(dst_v), // %2 | |
| 2838 "+r"(pix) // %3 | |
| 2839 : "r"((intptr_t)(stride_uyvy)) // %4 | |
| 2840 : "memory", "cc", NACL_R14 | |
| 2841 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 2842 ); | |
| 2843 } | |
| 2844 | |
| 2845 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, | |
| 2846 uint8* dst_u, uint8* dst_v, int pix) { | |
| 2847 asm volatile ( | |
| 2848 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 2849 "psrlw $0x8,%%xmm5 \n" | |
| 2850 "sub %1,%2 \n" | |
| 2851 LABELALIGN | |
| 2852 "1: \n" | |
| 2853 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 2854 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 2855 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 2856 "pand %%xmm5,%%xmm0 \n" | |
| 2857 "pand %%xmm5,%%xmm1 \n" | |
| 2858 "packuswb %%xmm1,%%xmm0 \n" | |
| 2859 "movdqa %%xmm0,%%xmm1 \n" | |
| 2860 "pand %%xmm5,%%xmm0 \n" | |
| 2861 "packuswb %%xmm0,%%xmm0 \n" | |
| 2862 "psrlw $0x8,%%xmm1 \n" | |
| 2863 "packuswb %%xmm1,%%xmm1 \n" | |
| 2864 "movq %%xmm0," MEMACCESS(1) " \n" | |
| 2865 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) | |
| 2866 "lea " MEMLEA(0x8,1) ",%1 \n" | |
| 2867 "sub $0x10,%3 \n" | |
| 2868 "jg 1b \n" | |
| 2869 : "+r"(src_uyvy), // %0 | |
| 2870 "+r"(dst_u), // %1 | |
| 2871 "+r"(dst_v), // %2 | |
| 2872 "+r"(pix) // %3 | |
| 2873 : | |
| 2874 : "memory", "cc", NACL_R14 | |
| 2875 "xmm0", "xmm1", "xmm5" | |
| 2876 ); | |
| 2877 } | |
| 2878 #endif // HAS_YUY2TOYROW_SSE2 | |
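| | |
| // YUY2 packs two pixels as Y0 U Y1 V; the rows above mask (pand) or shift | |
| // (psrlw) that apart, and YUY2ToUVRow additionally averages two rows of | |
| // chroma with pavgb. Scalar sketch of the luma extraction: | |
| static __inline void YUY2ToY_Sketch(const uint8* src_yuy2, uint8* dst_y, | |
|                                     int pix) { | |
|   int i; | |
|   for (i = 0; i < pix; ++i) { | |
|     dst_y[i] = src_yuy2[2 * i];  // even bytes are luma | |
|   } | |
| } | |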
| 2879 | |
| 2880 #ifdef HAS_YUY2TOYROW_AVX2 | |
| 2881 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { | |
| 2882 asm volatile ( | |
| 2883 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
| 2884 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | |
| 2885 LABELALIGN | |
| 2886 "1: \n" | |
| 2887 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
| 2888 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
| 2889 "lea " MEMLEA(0x40,0) ",%0 \n" | |
| 2890 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | |
| 2891 "vpand %%ymm5,%%ymm1,%%ymm1 \n" | |
| 2892 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
| 2893 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
| 2894 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
| 2895 "lea " MEMLEA(0x20,1) ",%1 \n" | |
| 2896 "sub $0x20,%2 \n" | |
| 2897 "jg 1b \n" | |
| 2898 "vzeroupper \n" | |
| 2899 : "+r"(src_yuy2), // %0 | |
| 2900 "+r"(dst_y), // %1 | |
| 2901 "+r"(pix) // %2 | |
| 2902 : | |
| 2903 : "memory", "cc" | |
| 2904 , "xmm0", "xmm1", "xmm5" | |
| 2905 ); | |
| 2906 } | |
| 2907 | |
| 2908 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, | |
| 2909 uint8* dst_u, uint8* dst_v, int pix) { | |
| 2910 asm volatile ( | |
| 2911 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
| 2912 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | |
| 2913 "sub %1,%2 \n" | |
| 2914 LABELALIGN | |
| 2915 "1: \n" | |
| 2916 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
| 2917 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
| 2918 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 | |
| 2919 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) | |
| 2920 "lea " MEMLEA(0x40,0) ",%0 \n" | |
| 2921 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
| 2922 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" | |
| 2923 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
| 2924 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
| 2925 "vpand %%ymm5,%%ymm0,%%ymm1 \n" | |
| 2926 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
| 2927 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" | |
| 2928 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" | |
| 2929 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
| 2930 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
| 2931 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" | |
| 2932 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) | |
| 2933 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 2934 "sub $0x20,%3 \n" | |
| 2935 "jg 1b \n" | |
| 2936 "vzeroupper \n" | |
| 2937 : "+r"(src_yuy2), // %0 | |
| 2938 "+r"(dst_u), // %1 | |
| 2939 "+r"(dst_v), // %2 | |
| 2940 "+r"(pix) // %3 | |
| 2941 : "r"((intptr_t)(stride_yuy2)) // %4 | |
| 2942 : "memory", "cc", NACL_R14 | |
| 2943 "xmm0", "xmm1", "xmm5" | |
| 2944 ); | |
| 2945 } | |
| 2946 | |
| 2947 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, | |
| 2948 uint8* dst_u, uint8* dst_v, int pix) { | |
| 2949 asm volatile ( | |
| 2950 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
| 2951 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | |
| 2952 "sub %1,%2 \n" | |
| 2953 LABELALIGN | |
| 2954 "1: \n" | |
| 2955 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
| 2956 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
| 2957 "lea " MEMLEA(0x40,0) ",%0 \n" | |
| 2958 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
| 2959 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" | |
| 2960 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
| 2961 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
| 2962 "vpand %%ymm5,%%ymm0,%%ymm1 \n" | |
| 2963 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
| 2964 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" | |
| 2965 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" | |
| 2966 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
| 2967 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
| 2968 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" | |
| 2969 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) | |
| 2970 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 2971 "sub $0x20,%3 \n" | |
| 2972 "jg 1b \n" | |
| 2973 "vzeroupper \n" | |
| 2974 : "+r"(src_yuy2), // %0 | |
| 2975 "+r"(dst_u), // %1 | |
| 2976 "+r"(dst_v), // %2 | |
| 2977 "+r"(pix) // %3 | |
| 2978 : | |
| 2979 : "memory", "cc", NACL_R14 | |
| 2980 "xmm0", "xmm1", "xmm5" | |
| 2981 ); | |
| 2982 } | |
| 2983 | |
| 2984 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) { | |
| 2985 asm volatile ( | |
| 2986 LABELALIGN | |
| 2987 "1: \n" | |
| 2988 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
| 2989 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
| 2990 "lea " MEMLEA(0x40,0) ",%0 \n" | |
| 2991 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
| 2992 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" | |
| 2993 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
| 2994 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
| 2995 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
| 2996 "lea " MEMLEA(0x20,1) ",%1 \n" | |
| 2997 "sub $0x20,%2 \n" | |
| 2998 "jg 1b \n" | |
| 2999 "vzeroupper \n" | |
| 3000 : "+r"(src_uyvy), // %0 | |
| 3001 "+r"(dst_y), // %1 | |
| 3002 "+r"(pix) // %2 | |
| 3003 : | |
| 3004 : "memory", "cc" | |
| 3005 , "xmm0", "xmm1", "xmm5" | |
| 3006 ); | |
| 3007 } | |
| 3008 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, | |
| 3009 uint8* dst_u, uint8* dst_v, int pix) { | |
| 3010 asm volatile ( | |
| 3011 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
| 3012 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | |
| 3013 "sub %1,%2 \n" | |
| 3014 | |
| 3015 LABELALIGN | |
| 3016 "1: \n" | |
| 3017 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
| 3018 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
| 3019 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 | |
| 3020 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) | |
| 3021 "lea " MEMLEA(0x40,0) ",%0 \n" | |
| 3022 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | |
| 3023 "vpand %%ymm5,%%ymm1,%%ymm1 \n" | |
| 3024 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
| 3025 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
| 3026 "vpand %%ymm5,%%ymm0,%%ymm1 \n" | |
| 3027 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
| 3028 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" | |
| 3029 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" | |
| 3030 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
| 3031 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
| 3032 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" | |
| 3033 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) | |
| 3034 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 3035 "sub $0x20,%3 \n" | |
| 3036 "jg 1b \n" | |
| 3037 "vzeroupper \n" | |
| 3038 : "+r"(src_uyvy), // %0 | |
| 3039 "+r"(dst_u), // %1 | |
| 3040 "+r"(dst_v), // %2 | |
| 3041 "+r"(pix) // %3 | |
| 3042 : "r"((intptr_t)(stride_uyvy)) // %4 | |
| 3043 : "memory", "cc", NACL_R14 | |
| 3044 "xmm0", "xmm1", "xmm5" | |
| 3045 ); | |
| 3046 } | |
| 3047 | |
| 3048 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, | |
| 3049 uint8* dst_u, uint8* dst_v, int pix) { | |
| 3050 asm volatile ( | |
| 3051 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
| 3052 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | |
| 3053 "sub %1,%2 \n" | |
| 3054 LABELALIGN | |
| 3055 "1: \n" | |
| 3056 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
| 3057 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
| 3058 "lea " MEMLEA(0x40,0) ",%0 \n" | |
| 3059 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | |
| 3060 "vpand %%ymm5,%%ymm1,%%ymm1 \n" | |
| 3061 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
| 3062 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
| 3063 "vpand %%ymm5,%%ymm0,%%ymm1 \n" | |
| 3064 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
| 3065 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" | |
| 3066 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" | |
| 3067 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
| 3068 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
| 3069 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" | |
| 3070 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) | |
| 3071 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 3072 "sub $0x20,%3 \n" | |
| 3073 "jg 1b \n" | |
| 3074 "vzeroupper \n" | |
| 3075 : "+r"(src_uyvy), // %0 | |
| 3076 "+r"(dst_u), // %1 | |
| 3077 "+r"(dst_v), // %2 | |
| 3078 "+r"(pix) // %3 | |
| 3079 : | |
| 3080 : "memory", "cc", NACL_R14 | |
| 3081 "xmm0", "xmm1", "xmm5" | |
| 3082 ); | |
| 3083 } | |
| 3084 #endif // HAS_YUY2TOYROW_AVX2 | |
| 3085 | |
| 3086 #ifdef HAS_ARGBBLENDROW_SSE2 | |
| 3087 // Blend 4 pixels at a time. | |
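| // Scalar equivalent of each loop, per channel (a sketch for reference): | |
| //   dst = min(255, src0 + (((256 - src0_alpha) * src1) >> 8)) | |
| // with the destination alpha forced to 255. | |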
| 3088 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | |
| 3089 uint8* dst_argb, int width) { | |
| 3090 asm volatile ( | |
| 3091 "pcmpeqb %%xmm7,%%xmm7 \n" | |
| 3092 "psrlw $0xf,%%xmm7 \n" | |
| 3093 "pcmpeqb %%xmm6,%%xmm6 \n" | |
| 3094 "psrlw $0x8,%%xmm6 \n" | |
| 3095 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 3096 "psllw $0x8,%%xmm5 \n" | |
| 3097 "pcmpeqb %%xmm4,%%xmm4 \n" | |
| 3098 "pslld $0x18,%%xmm4 \n" | |
| 3099 "sub $0x1,%3 \n" | |
| 3100 "je 91f \n" | |
| 3101 "jl 99f \n" | |
| 3102 | |
| 3103 // 1 pixel loop until destination pointer is aligned. | |
| 3104 "10: \n" | |
| 3105 "test $0xf,%2 \n" | |
| 3106 "je 19f \n" | |
| 3107 "movd " MEMACCESS(0) ",%%xmm3 \n" | |
| 3108 "lea " MEMLEA(0x4,0) ",%0 \n" | |
| 3109 "movdqa %%xmm3,%%xmm0 \n" | |
| 3110 "pxor %%xmm4,%%xmm3 \n" | |
| 3111 "movd " MEMACCESS(1) ",%%xmm2 \n" | |
| 3112 "psrlw $0x8,%%xmm3 \n" | |
| 3113 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" | |
| 3114 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" | |
| 3115 "pand %%xmm6,%%xmm2 \n" | |
| 3116 "paddw %%xmm7,%%xmm3 \n" | |
| 3117 "pmullw %%xmm3,%%xmm2 \n" | |
| 3118 "movd " MEMACCESS(1) ",%%xmm1 \n" | |
| 3119 "lea " MEMLEA(0x4,1) ",%1 \n" | |
| 3120 "psrlw $0x8,%%xmm1 \n" | |
| 3121 "por %%xmm4,%%xmm0 \n" | |
| 3122 "pmullw %%xmm3,%%xmm1 \n" | |
| 3123 "psrlw $0x8,%%xmm2 \n" | |
| 3124 "paddusb %%xmm2,%%xmm0 \n" | |
| 3125 "pand %%xmm5,%%xmm1 \n" | |
| 3126 "paddusb %%xmm1,%%xmm0 \n" | |
| 3127 "movd %%xmm0," MEMACCESS(2) " \n" | |
| 3128 "lea " MEMLEA(0x4,2) ",%2 \n" | |
| 3129 "sub $0x1,%3 \n" | |
| 3130 "jge 10b \n" | |
| 3131 | |
| 3132 "19: \n" | |
| 3133 "add $1-4,%3 \n" | |
| 3134 "jl 49f \n" | |
| 3135 | |
| 3136 // 4 pixel loop. | |
| 3137 LABELALIGN | |
| 3138 "41: \n" | |
| 3139 "movdqu " MEMACCESS(0) ",%%xmm3 \n" | |
| 3140 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 3141 "movdqa %%xmm3,%%xmm0 \n" | |
| 3142 "pxor %%xmm4,%%xmm3 \n" | |
| 3143 "movdqu " MEMACCESS(1) ",%%xmm2 \n" | |
| 3144 "psrlw $0x8,%%xmm3 \n" | |
| 3145 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" | |
| 3146 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" | |
| 3147 "pand %%xmm6,%%xmm2 \n" | |
| 3148 "paddw %%xmm7,%%xmm3 \n" | |
| 3149 "pmullw %%xmm3,%%xmm2 \n" | |
| 3150 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | |
| 3151 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 3152 "psrlw $0x8,%%xmm1 \n" | |
| 3153 "por %%xmm4,%%xmm0 \n" | |
| 3154 "pmullw %%xmm3,%%xmm1 \n" | |
| 3155 "psrlw $0x8,%%xmm2 \n" | |
| 3156 "paddusb %%xmm2,%%xmm0 \n" | |
| 3157 "pand %%xmm5,%%xmm1 \n" | |
| 3158 "paddusb %%xmm1,%%xmm0 \n" | |
| 3159 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
| 3160 "lea " MEMLEA(0x10,2) ",%2 \n" | |
| 3161 "sub $0x4,%3 \n" | |
| 3162 "jge 41b \n" | |
| 3163 | |
| 3164 "49: \n" | |
| 3165 "add $0x3,%3 \n" | |
| 3166 "jl 99f \n" | |
| 3167 | |
| 3168 // 1 pixel loop. | |
| 3169 "91: \n" | |
| 3170 "movd " MEMACCESS(0) ",%%xmm3 \n" | |
| 3171 "lea " MEMLEA(0x4,0) ",%0 \n" | |
| 3172 "movdqa %%xmm3,%%xmm0 \n" | |
| 3173 "pxor %%xmm4,%%xmm3 \n" | |
| 3174 "movd " MEMACCESS(1) ",%%xmm2 \n" | |
| 3175 "psrlw $0x8,%%xmm3 \n" | |
| 3176 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" | |
| 3177 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" | |
| 3178 "pand %%xmm6,%%xmm2 \n" | |
| 3179 "paddw %%xmm7,%%xmm3 \n" | |
| 3180 "pmullw %%xmm3,%%xmm2 \n" | |
| 3181 "movd " MEMACCESS(1) ",%%xmm1 \n" | |
| 3182 "lea " MEMLEA(0x4,1) ",%1 \n" | |
| 3183 "psrlw $0x8,%%xmm1 \n" | |
| 3184 "por %%xmm4,%%xmm0 \n" | |
| 3185 "pmullw %%xmm3,%%xmm1 \n" | |
| 3186 "psrlw $0x8,%%xmm2 \n" | |
| 3187 "paddusb %%xmm2,%%xmm0 \n" | |
| 3188 "pand %%xmm5,%%xmm1 \n" | |
| 3189 "paddusb %%xmm1,%%xmm0 \n" | |
| 3190 "movd %%xmm0," MEMACCESS(2) " \n" | |
| 3191 "lea " MEMLEA(0x4,2) ",%2 \n" | |
| 3192 "sub $0x1,%3 \n" | |
| 3193 "jge 91b \n" | |
| 3194 "99: \n" | |
| 3195 : "+r"(src_argb0), // %0 | |
| 3196 "+r"(src_argb1), // %1 | |
| 3197 "+r"(dst_argb), // %2 | |
| 3198 "+r"(width) // %3 | |
| 3199 : | |
| 3200 : "memory", "cc" | |
| 3201 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
| 3202 ); | |
| 3203 } | |
| 3204 #endif // HAS_ARGBBLENDROW_SSE2 | |
| 3205 | |
| 3206 #ifdef HAS_ARGBBLENDROW_SSSE3 | |
| 3207 // Shuffle table for isolating alpha. | |
| 3208 static uvec8 kShuffleAlpha = { | |
| 3209 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, | |
| 3210 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 | |
| 3211 }; | |
| 3212 | |
| 3213 // Blend 4 pixels at a time. | |
| 3214 | |
| 3216 // Same as SSE2, but replaces: | |
| 3217 // psrlw xmm3, 8 // alpha | |
| 3218 // pshufhw xmm3, xmm3,0F5h // 8 alpha words | |
| 3219 // pshuflw xmm3, xmm3,0F5h | |
| 3220 // with: | |
| 3221 // pshufb xmm3, kShuffleAlpha // alpha | |
| 3222 | |
| 3223 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, | |
| 3224 uint8* dst_argb, int width) { | |
| 3225 asm volatile ( | |
| 3226 "pcmpeqb %%xmm7,%%xmm7 \n" | |
| 3227 "psrlw $0xf,%%xmm7 \n" | |
| 3228 "pcmpeqb %%xmm6,%%xmm6 \n" | |
| 3229 "psrlw $0x8,%%xmm6 \n" | |
| 3230 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 3231 "psllw $0x8,%%xmm5 \n" | |
| 3232 "pcmpeqb %%xmm4,%%xmm4 \n" | |
| 3233 "pslld $0x18,%%xmm4 \n" | |
| 3234 "sub $0x1,%3 \n" | |
| 3235 "je 91f \n" | |
| 3236 "jl 99f \n" | |
| 3237 | |
| 3238 // 1 pixel loop until destination pointer is aligned. | |
| 3239 "10: \n" | |
| 3240 "test $0xf,%2 \n" | |
| 3241 "je 19f \n" | |
| 3242 "movd " MEMACCESS(0) ",%%xmm3 \n" | |
| 3243 "lea " MEMLEA(0x4,0) ",%0 \n" | |
| 3244 "movdqa %%xmm3,%%xmm0 \n" | |
| 3245 "pxor %%xmm4,%%xmm3 \n" | |
| 3246 "movd " MEMACCESS(1) ",%%xmm2 \n" | |
| 3247 "pshufb %4,%%xmm3 \n" | |
| 3248 "pand %%xmm6,%%xmm2 \n" | |
| 3249 "paddw %%xmm7,%%xmm3 \n" | |
| 3250 "pmullw %%xmm3,%%xmm2 \n" | |
| 3251 "movd " MEMACCESS(1) ",%%xmm1 \n" | |
| 3252 "lea " MEMLEA(0x4,1) ",%1 \n" | |
| 3253 "psrlw $0x8,%%xmm1 \n" | |
| 3254 "por %%xmm4,%%xmm0 \n" | |
| 3255 "pmullw %%xmm3,%%xmm1 \n" | |
| 3256 "psrlw $0x8,%%xmm2 \n" | |
| 3257 "paddusb %%xmm2,%%xmm0 \n" | |
| 3258 "pand %%xmm5,%%xmm1 \n" | |
| 3259 "paddusb %%xmm1,%%xmm0 \n" | |
| 3260 "movd %%xmm0," MEMACCESS(2) " \n" | |
| 3261 "lea " MEMLEA(0x4,2) ",%2 \n" | |
| 3262 "sub $0x1,%3 \n" | |
| 3263 "jge 10b \n" | |
| 3264 | |
| 3265 "19: \n" | |
| 3266 "add $1-4,%3 \n" | |
| 3267 "jl 49f \n" | |
| 3268 | |
| 3269 // 4 pixel loop. | |
| 3270 LABELALIGN | |
| 3271 "40: \n" | |
| 3272 "movdqu " MEMACCESS(0) ",%%xmm3 \n" | |
| 3273 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 3274 "movdqa %%xmm3,%%xmm0 \n" | |
| 3275 "pxor %%xmm4,%%xmm3 \n" | |
| 3276 "movdqu " MEMACCESS(1) ",%%xmm2 \n" | |
| 3277 "pshufb %4,%%xmm3 \n" | |
| 3278 "pand %%xmm6,%%xmm2 \n" | |
| 3279 "paddw %%xmm7,%%xmm3 \n" | |
| 3280 "pmullw %%xmm3,%%xmm2 \n" | |
| 3281 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | |
| 3282 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 3283 "psrlw $0x8,%%xmm1 \n" | |
| 3284 "por %%xmm4,%%xmm0 \n" | |
| 3285 "pmullw %%xmm3,%%xmm1 \n" | |
| 3286 "psrlw $0x8,%%xmm2 \n" | |
| 3287 "paddusb %%xmm2,%%xmm0 \n" | |
| 3288 "pand %%xmm5,%%xmm1 \n" | |
| 3289 "paddusb %%xmm1,%%xmm0 \n" | |
| 3290 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
| 3291 "lea " MEMLEA(0x10,2) ",%2 \n" | |
| 3292 "sub $0x4,%3 \n" | |
| 3293 "jge 40b \n" | |
| 3294 | |
| 3295 "49: \n" | |
| 3296 "add $0x3,%3 \n" | |
| 3297 "jl 99f \n" | |
| 3298 | |
| 3299 // 1 pixel loop. | |
| 3300 "91: \n" | |
| 3301 "movd " MEMACCESS(0) ",%%xmm3 \n" | |
| 3302 "lea " MEMLEA(0x4,0) ",%0 \n" | |
| 3303 "movdqa %%xmm3,%%xmm0 \n" | |
| 3304 "pxor %%xmm4,%%xmm3 \n" | |
| 3305 "movd " MEMACCESS(1) ",%%xmm2 \n" | |
| 3306 "pshufb %4,%%xmm3 \n" | |
| 3307 "pand %%xmm6,%%xmm2 \n" | |
| 3308 "paddw %%xmm7,%%xmm3 \n" | |
| 3309 "pmullw %%xmm3,%%xmm2 \n" | |
| 3310 "movd " MEMACCESS(1) ",%%xmm1 \n" | |
| 3311 "lea " MEMLEA(0x4,1) ",%1 \n" | |
| 3312 "psrlw $0x8,%%xmm1 \n" | |
| 3313 "por %%xmm4,%%xmm0 \n" | |
| 3314 "pmullw %%xmm3,%%xmm1 \n" | |
| 3315 "psrlw $0x8,%%xmm2 \n" | |
| 3316 "paddusb %%xmm2,%%xmm0 \n" | |
| 3317 "pand %%xmm5,%%xmm1 \n" | |
| 3318 "paddusb %%xmm1,%%xmm0 \n" | |
| 3319 "movd %%xmm0," MEMACCESS(2) " \n" | |
| 3320 "lea " MEMLEA(0x4,2) ",%2 \n" | |
| 3321 "sub $0x1,%3 \n" | |
| 3322 "jge 91b \n" | |
| 3323 "99: \n" | |
| 3324 : "+r"(src_argb0), // %0 | |
| 3325 "+r"(src_argb1), // %1 | |
| 3326 "+r"(dst_argb), // %2 | |
| 3327 "+r"(width) // %3 | |
| 3328 : "m"(kShuffleAlpha) // %4 | |
| 3329 : "memory", "cc" | |
| 3330 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
| 3331 ); | |
| 3332 } | |
| 3333 #endif // HAS_ARGBBLENDROW_SSSE3 | |
| 3334 | |
| 3335 #ifdef HAS_ARGBATTENUATEROW_SSE2 | |
| 3336 // Attenuate 4 pixels at a time. | |
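| // The pmulhuw trick used below (a sketch of the math): punpcklbw with | |
| // itself turns byte f into word f * 257, alpha is broadcast the same way, | |
| // and ((f * 257) * (a * 257)) >> 24 is close to f * a / 255 per channel. | |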
| 3337 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { | |
| 3338 asm volatile ( | |
| 3339 "pcmpeqb %%xmm4,%%xmm4 \n" | |
| 3340 "pslld $0x18,%%xmm4 \n" | |
| 3341 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 3342 "psrld $0x8,%%xmm5 \n" | |
| 3343 | |
| 3344 // 4 pixel loop. | |
| 3345 LABELALIGN | |
| 3346 "1: \n" | |
| 3347 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 3348 "punpcklbw %%xmm0,%%xmm0 \n" | |
| 3349 "pshufhw $0xff,%%xmm0,%%xmm2 \n" | |
| 3350 "pshuflw $0xff,%%xmm2,%%xmm2 \n" | |
| 3351 "pmulhuw %%xmm2,%%xmm0 \n" | |
| 3352 "movdqu " MEMACCESS(0) ",%%xmm1 \n" | |
| 3353 "punpckhbw %%xmm1,%%xmm1 \n" | |
| 3354 "pshufhw $0xff,%%xmm1,%%xmm2 \n" | |
| 3355 "pshuflw $0xff,%%xmm2,%%xmm2 \n" | |
| 3356 "pmulhuw %%xmm2,%%xmm1 \n" | |
| 3357 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
| 3358 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 3359 "psrlw $0x8,%%xmm0 \n" | |
| 3360 "pand %%xmm4,%%xmm2 \n" | |
| 3361 "psrlw $0x8,%%xmm1 \n" | |
| 3362 "packuswb %%xmm1,%%xmm0 \n" | |
| 3363 "pand %%xmm5,%%xmm0 \n" | |
| 3364 "por %%xmm2,%%xmm0 \n" | |
| 3365 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 3366 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 3367 "sub $0x4,%2 \n" | |
| 3368 "jg 1b \n" | |
| 3369 : "+r"(src_argb), // %0 | |
| 3370 "+r"(dst_argb), // %1 | |
| 3371 "+r"(width) // %2 | |
| 3372 : | |
| 3373 : "memory", "cc" | |
| 3374 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
| 3375 ); | |
| 3376 } | |
| 3377 #endif // HAS_ARGBATTENUATEROW_SSE2 | |
| 3378 | |
| 3379 #ifdef HAS_ARGBATTENUATEROW_SSSE3 | |
| 3380 // Shuffle table duplicating alpha | |
| 3381 static uvec8 kShuffleAlpha0 = { | |
| 3382 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u | |
| 3383 }; | |
| 3384 static uvec8 kShuffleAlpha1 = { | |
| 3385 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, | |
| 3386 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u | |
| 3387 }; | |
| 3388 // Attenuate 4 pixels at a time. | |
| 3389 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { | |
| 3390 asm volatile ( | |
| 3391 "pcmpeqb %%xmm3,%%xmm3 \n" | |
| 3392 "pslld $0x18,%%xmm3 \n" | |
| 3393 "movdqa %3,%%xmm4 \n" | |
| 3394 "movdqa %4,%%xmm5 \n" | |
| 3395 | |
| 3396 // 4 pixel loop. | |
| 3397 LABELALIGN | |
| 3398 "1: \n" | |
| 3399 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 3400 "pshufb %%xmm4,%%xmm0 \n" | |
| 3401 "movdqu " MEMACCESS(0) ",%%xmm1 \n" | |
| 3402 "punpcklbw %%xmm1,%%xmm1 \n" | |
| 3403 "pmulhuw %%xmm1,%%xmm0 \n" | |
| 3404 "movdqu " MEMACCESS(0) ",%%xmm1 \n" | |
| 3405 "pshufb %%xmm5,%%xmm1 \n" | |
| 3406 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
| 3407 "punpckhbw %%xmm2,%%xmm2 \n" | |
| 3408 "pmulhuw %%xmm2,%%xmm1 \n" | |
| 3409 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
| 3410 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 3411 "pand %%xmm3,%%xmm2 \n" | |
| 3412 "psrlw $0x8,%%xmm0 \n" | |
| 3413 "psrlw $0x8,%%xmm1 \n" | |
| 3414 "packuswb %%xmm1,%%xmm0 \n" | |
| 3415 "por %%xmm2,%%xmm0 \n" | |
| 3416 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 3417 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 3418 "sub $0x4,%2 \n" | |
| 3419 "jg 1b \n" | |
| 3420 : "+r"(src_argb), // %0 | |
| 3421 "+r"(dst_argb), // %1 | |
| 3422 "+r"(width) // %2 | |
| 3423 : "m"(kShuffleAlpha0), // %3 | |
| 3424 "m"(kShuffleAlpha1) // %4 | |
| 3425 : "memory", "cc" | |
| 3426 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
| 3427 ); | |
| 3428 } | |
| 3429 #endif // HAS_ARGBATTENUATEROW_SSSE3 | |
| 3430 | |
| 3431 #ifdef HAS_ARGBATTENUATEROW_AVX2 | |
| 3432 // Shuffle table duplicating alpha. | |
| 3433 static const uvec8 kShuffleAlpha_AVX2 = { | |
| 3434 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u | |
| 3435 }; | |
| 3436 // Attenuate 8 pixels at a time. | |
| 3437 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { | |
| 3438 asm volatile ( | |
| 3439 "vbroadcastf128 %3,%%ymm4 \n" | |
| 3440 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
| 3441 "vpslld $0x18,%%ymm5,%%ymm5 \n" | |
| 3442 "sub %0,%1 \n" | |
| 3443 | |
| 3444 // 8 pixel loop. | |
| 3445 LABELALIGN | |
| 3446 "1: \n" | |
| 3447 "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" | |
| 3448 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" | |
| 3449 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" | |
| 3450 "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" | |
| 3451 "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" | |
| 3452 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" | |
| 3453 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" | |
| 3454 "vpand %%ymm5,%%ymm6,%%ymm6 \n" | |
| 3455 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
| 3456 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" | |
| 3457 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
| 3458 "vpor %%ymm6,%%ymm0,%%ymm0 \n" | |
| 3459 MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) | |
| 3460 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 3461 "sub $0x8,%2 \n" | |
| 3462 "jg 1b \n" | |
| 3463 "vzeroupper \n" | |
| 3464 : "+r"(src_argb), // %0 | |
| 3465 "+r"(dst_argb), // %1 | |
| 3466 "+r"(width) // %2 | |
| 3467 : "m"(kShuffleAlpha_AVX2) // %3 | |
| 3468 : "memory", "cc" | |
| 3469 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | |
| 3470 ); | |
| 3471 } | |
| 3472 #endif // HAS_ARGBATTENUATEROW_AVX2 | |
| 3473 | |
| 3474 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 | |
| 3475 // Unattenuate 4 pixels at a time. | |
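| // fixed_invtbl8 is a table of fixed-point reciprocals of alpha, so each | |
| // color channel is multiplied by roughly 255 / a instead of divided by a | |
| // (a sketch of the idea; see the table definition for the exact format). | |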
| 3476 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, | |
| 3477 int width) { | |
| 3478 uintptr_t alpha = 0; | |
| 3479 asm volatile ( | |
| 3480 // 4 pixel loop. | |
| 3481 LABELALIGN | |
| 3482 "1: \n" | |
| 3483 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 3484 "movzb " MEMACCESS2(0x03,0) ",%3 \n" | |
| 3485 "punpcklbw %%xmm0,%%xmm0 \n" | |
| 3486 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 | |
| 3487 "movzb " MEMACCESS2(0x07,0) ",%3 \n" | |
| 3488 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 | |
| 3489 "pshuflw $0x40,%%xmm2,%%xmm2 \n" | |
| 3490 "pshuflw $0x40,%%xmm3,%%xmm3 \n" | |
| 3491 "movlhps %%xmm3,%%xmm2 \n" | |
| 3492 "pmulhuw %%xmm2,%%xmm0 \n" | |
| 3493 "movdqu " MEMACCESS(0) ",%%xmm1 \n" | |
| 3494 "movzb " MEMACCESS2(0x0b,0) ",%3 \n" | |
| 3495 "punpckhbw %%xmm1,%%xmm1 \n" | |
| 3496 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 | |
| 3497 "movzb " MEMACCESS2(0x0f,0) ",%3 \n" | |
| 3498 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 | |
| 3499 "pshuflw $0x40,%%xmm2,%%xmm2 \n" | |
| 3500 "pshuflw $0x40,%%xmm3,%%xmm3 \n" | |
| 3501 "movlhps %%xmm3,%%xmm2 \n" | |
| 3502 "pmulhuw %%xmm2,%%xmm1 \n" | |
| 3503 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 3504 "packuswb %%xmm1,%%xmm0 \n" | |
| 3505 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 3506 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 3507 "sub $0x4,%2 \n" | |
| 3508 "jg 1b \n" | |
| 3509 : "+r"(src_argb), // %0 | |
| 3510 "+r"(dst_argb), // %1 | |
| 3511 "+r"(width), // %2 | |
| 3512 "+r"(alpha) // %3 | |
| 3513 : "r"(fixed_invtbl8) // %4 | |
| 3514 : "memory", "cc", NACL_R14 | |
| 3515 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
| 3516 ); | |
| 3517 } | |
| 3518 #endif // HAS_ARGBUNATTENUATEROW_SSE2 | |
| 3519 | |
| 3520 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 | |
| 3521 // Shuffle table duplicating alpha. | |
| 3522 static const uvec8 kUnattenShuffleAlpha_AVX2 = { | |
| 3523 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u | |
| 3524 }; | |
| 3525 // Unattenuate 8 pixels at a time. | |
| 3526 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, | |
| 3527 int width) { | |
| 3528 uintptr_t alpha = 0; | |
| 3529 asm volatile ( | |
| 3530 "sub %0,%1 \n" | |
| 3531 "vbroadcastf128 %5,%%ymm5 \n" | |
| 3532 | |
| 3533 // 8 pixel loop. | |
| 3534 LABELALIGN | |
| 3535 "1: \n" | |
| 3536 // replace VPGATHER | |
| 3537 "movzb " MEMACCESS2(0x03,0) ",%3 \n" | |
| 3538 MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 | |
| 3539 "movzb " MEMACCESS2(0x07,0) ",%3 \n" | |
| 3540 MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 | |
| 3541 "movzb " MEMACCESS2(0x0b,0) ",%3 \n" | |
| 3542 "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" | |
| 3543 MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 | |
| 3544 "movzb " MEMACCESS2(0x0f,0) ",%3 \n" | |
| 3545 MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 | |
| 3546 "movzb " MEMACCESS2(0x13,0) ",%3 \n" | |
| 3547 "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" | |
| 3548 MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 | |
| 3549 "movzb " MEMACCESS2(0x17,0) ",%3 \n" | |
| 3550 MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 | |
| 3551 "movzb " MEMACCESS2(0x1b,0) ",%3 \n" | |
| 3552 "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" | |
| 3553 MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 | |
| 3554 "movzb " MEMACCESS2(0x1f,0) ",%3 \n" | |
| 3555 MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 | |
| 3556 "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" | |
| 3557 "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" | |
| 3558 "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" | |
| 3559 "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" | |
| 3560 // end of VPGATHER | |
| 3561 | |
| 3562 "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" | |
| 3563 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" | |
| 3564 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" | |
| 3565 "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" | |
| 3566 "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" | |
| 3567 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" | |
| 3568 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" | |
| 3569 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" | |
| 3570 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" | |
| 3571 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
| 3572 MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) | |
| 3573 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 3574 "sub $0x8,%2 \n" | |
| 3575 "jg 1b \n" | |
| 3576 "vzeroupper \n" | |
| 3577 : "+r"(src_argb), // %0 | |
| 3578 "+r"(dst_argb), // %1 | |
| 3579 "+r"(width), // %2 | |
| 3580 "+r"(alpha) // %3 | |
| 3581 : "r"(fixed_invtbl8), // %4 | |
| 3582 "m"(kUnattenShuffleAlpha_AVX2) // %5 | |
| 3583 : "memory", "cc", NACL_R14 | |
| 3584 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
| 3585 ); | |
| 3586 } | |
| 3587 #endif // HAS_ARGBUNATTENUATEROW_AVX2 | |
| 3588 | |
| 3589 #ifdef HAS_ARGBGRAYROW_SSSE3 | |
| 3590 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels. | |
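| // Scalar equivalent per pixel (a sketch; the coefficients are kARGBToYJ | |
| // plus the kAddYJ64 rounding term): | |
| //   y = (b * 15 + g * 75 + r * 38 + 64) >> 7;  b = g = r = y; a is kept. | |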
| 3591 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { | |
| 3592 asm volatile ( | |
| 3593 "movdqa %3,%%xmm4 \n" | |
| 3594 "movdqa %4,%%xmm5 \n" | |
| 3595 | |
| 3596 // 8 pixel loop. | |
| 3597 LABELALIGN | |
| 3598 "1: \n" | |
| 3599 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 3600 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 3601 "pmaddubsw %%xmm4,%%xmm0 \n" | |
| 3602 "pmaddubsw %%xmm4,%%xmm1 \n" | |
| 3603 "phaddw %%xmm1,%%xmm0 \n" | |
| 3604 "paddw %%xmm5,%%xmm0 \n" | |
| 3605 "psrlw $0x7,%%xmm0 \n" | |
| 3606 "packuswb %%xmm0,%%xmm0 \n" | |
| 3607 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
| 3608 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" | |
| 3609 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 3610 "psrld $0x18,%%xmm2 \n" | |
| 3611 "psrld $0x18,%%xmm3 \n" | |
| 3612 "packuswb %%xmm3,%%xmm2 \n" | |
| 3613 "packuswb %%xmm2,%%xmm2 \n" | |
| 3614 "movdqa %%xmm0,%%xmm3 \n" | |
| 3615 "punpcklbw %%xmm0,%%xmm0 \n" | |
| 3616 "punpcklbw %%xmm2,%%xmm3 \n" | |
| 3617 "movdqa %%xmm0,%%xmm1 \n" | |
| 3618 "punpcklwd %%xmm3,%%xmm0 \n" | |
| 3619 "punpckhwd %%xmm3,%%xmm1 \n" | |
| 3620 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 3621 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | |
| 3622 "lea " MEMLEA(0x20,1) ",%1 \n" | |
| 3623 "sub $0x8,%2 \n" | |
| 3624 "jg 1b \n" | |
| 3625 : "+r"(src_argb), // %0 | |
| 3626 "+r"(dst_argb), // %1 | |
| 3627 "+r"(width) // %2 | |
| 3628 : "m"(kARGBToYJ), // %3 | |
| 3629 "m"(kAddYJ64) // %4 | |
| 3630 : "memory", "cc" | |
| 3631 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
| 3632 ); | |
| 3633 } | |
| 3634 #endif // HAS_ARGBGRAYROW_SSSE3 | |
| 3635 | |
| 3636 #ifdef HAS_ARGBSEPIAROW_SSSE3 | |
| 3637 // b = (r * 35 + g * 68 + b * 17) >> 7 | |
| 3638 // g = (r * 45 + g * 88 + b * 22) >> 7 | |
| 3639 // r = (r * 50 + g * 98 + b * 24) >> 7 | |
| 3640 // Constant for ARGB color to sepia tone | |
| 3641 static vec8 kARGBToSepiaB = { | |
| 3642 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 | |
| 3643 }; | |
| 3644 | |
| 3645 static vec8 kARGBToSepiaG = { | |
| 3646 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 | |
| 3647 }; | |
| 3648 | |
| 3649 static vec8 kARGBToSepiaR = { | |
| 3650 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 | |
| 3651 }; | |
| 3652 | |
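| // A scalar sketch of the same fixed-point math (illustrative only; the | |
| // names are ours and not part of the library): | |
| static __inline uint8 SepiaClamp_Sketch(int v) { | |
|   return (uint8)(v > 255 ? 255 : v); | |
| } | |
| static __inline void SepiaPixel_Sketch(uint8* p) { | |
|   int b = p[0], g = p[1], r = p[2];  // alpha (p[3]) is left unchanged. | |
|   p[0] = SepiaClamp_Sketch((b * 17 + g * 68 + r * 35) >> 7);  // kARGBToSepiaB | |
|   p[1] = SepiaClamp_Sketch((b * 22 + g * 88 + r * 45) >> 7);  // kARGBToSepiaG | |
|   p[2] = SepiaClamp_Sketch((b * 24 + g * 98 + r * 50) >> 7);  // kARGBToSepiaR | |
| } | |
| | |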
| 3653 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. | |
| 3654 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { | |
| 3655 asm volatile ( | |
| 3656 "movdqa %2,%%xmm2 \n" | |
| 3657 "movdqa %3,%%xmm3 \n" | |
| 3658 "movdqa %4,%%xmm4 \n" | |
| 3659 | |
| 3660 // 8 pixel loop. | |
| 3661 LABELALIGN | |
| 3662 "1: \n" | |
| 3663 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 3664 "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" | |
| 3665 "pmaddubsw %%xmm2,%%xmm0 \n" | |
| 3666 "pmaddubsw %%xmm2,%%xmm6 \n" | |
| 3667 "phaddw %%xmm6,%%xmm0 \n" | |
| 3668 "psrlw $0x7,%%xmm0 \n" | |
| 3669 "packuswb %%xmm0,%%xmm0 \n" | |
| 3670 "movdqu " MEMACCESS(0) ",%%xmm5 \n" | |
| 3671 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 3672 "pmaddubsw %%xmm3,%%xmm5 \n" | |
| 3673 "pmaddubsw %%xmm3,%%xmm1 \n" | |
| 3674 "phaddw %%xmm1,%%xmm5 \n" | |
| 3675 "psrlw $0x7,%%xmm5 \n" | |
| 3676 "packuswb %%xmm5,%%xmm5 \n" | |
| 3677 "punpcklbw %%xmm5,%%xmm0 \n" | |
| 3678 "movdqu " MEMACCESS(0) ",%%xmm5 \n" | |
| 3679 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 3680 "pmaddubsw %%xmm4,%%xmm5 \n" | |
| 3681 "pmaddubsw %%xmm4,%%xmm1 \n" | |
| 3682 "phaddw %%xmm1,%%xmm5 \n" | |
| 3683 "psrlw $0x7,%%xmm5 \n" | |
| 3684 "packuswb %%xmm5,%%xmm5 \n" | |
| 3685 "movdqu " MEMACCESS(0) ",%%xmm6 \n" | |
| 3686 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 3687 "psrld $0x18,%%xmm6 \n" | |
| 3688 "psrld $0x18,%%xmm1 \n" | |
| 3689 "packuswb %%xmm1,%%xmm6 \n" | |
| 3690 "packuswb %%xmm6,%%xmm6 \n" | |
| 3691 "punpcklbw %%xmm6,%%xmm5 \n" | |
| 3692 "movdqa %%xmm0,%%xmm1 \n" | |
| 3693 "punpcklwd %%xmm5,%%xmm0 \n" | |
| 3694 "punpckhwd %%xmm5,%%xmm1 \n" | |
| 3695 "movdqu %%xmm0," MEMACCESS(0) " \n" | |
| 3696 "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" | |
| 3697 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 3698 "sub $0x8,%1 \n" | |
| 3699 "jg 1b \n" | |
| 3700 : "+r"(dst_argb), // %0 | |
| 3701 "+r"(width) // %1 | |
| 3702 : "m"(kARGBToSepiaB), // %2 | |
| 3703 "m"(kARGBToSepiaG), // %3 | |
| 3704 "m"(kARGBToSepiaR) // %4 | |
| 3705 : "memory", "cc" | |
| 3706 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | |
| 3707 ); | |
| 3708 } | |
| 3709 #endif // HAS_ARGBSEPIAROW_SSSE3 | |
| 3710 | |
| 3711 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 | |
| 3712 // Transform 8 ARGB pixels (32 bytes) with color matrix. | |
| 3713 // Same as Sepia except matrix is provided. | |
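| // Scalar sketch per output channel: with m pointing at that channel's four | |
| // signed coefficients (applied to b, g, r, a in that order), | |
| //   out = clamp255((b * m[0] + g * m[1] + r * m[2] + a * m[3]) >> 6). | |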
| 3714 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | |
| 3715 const int8* matrix_argb, int width) { | |
| 3716 asm volatile ( | |
| 3717 "movdqu " MEMACCESS(3) ",%%xmm5 \n" | |
| 3718 "pshufd $0x00,%%xmm5,%%xmm2 \n" | |
| 3719 "pshufd $0x55,%%xmm5,%%xmm3 \n" | |
| 3720 "pshufd $0xaa,%%xmm5,%%xmm4 \n" | |
| 3721 "pshufd $0xff,%%xmm5,%%xmm5 \n" | |
| 3722 | |
| 3723 // 8 pixel loop. | |
| 3724 LABELALIGN | |
| 3725 "1: \n" | |
| 3726 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 3727 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" | |
| 3728 "pmaddubsw %%xmm2,%%xmm0 \n" | |
| 3729 "pmaddubsw %%xmm2,%%xmm7 \n" | |
| 3730 "movdqu " MEMACCESS(0) ",%%xmm6 \n" | |
| 3731 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 3732 "pmaddubsw %%xmm3,%%xmm6 \n" | |
| 3733 "pmaddubsw %%xmm3,%%xmm1 \n" | |
| 3734 "phaddsw %%xmm7,%%xmm0 \n" | |
| 3735 "phaddsw %%xmm1,%%xmm6 \n" | |
| 3736 "psraw $0x6,%%xmm0 \n" | |
| 3737 "psraw $0x6,%%xmm6 \n" | |
| 3738 "packuswb %%xmm0,%%xmm0 \n" | |
| 3739 "packuswb %%xmm6,%%xmm6 \n" | |
| 3740 "punpcklbw %%xmm6,%%xmm0 \n" | |
| 3741 "movdqu " MEMACCESS(0) ",%%xmm1 \n" | |
| 3742 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" | |
| 3743 "pmaddubsw %%xmm4,%%xmm1 \n" | |
| 3744 "pmaddubsw %%xmm4,%%xmm7 \n" | |
| 3745 "phaddsw %%xmm7,%%xmm1 \n" | |
| 3746 "movdqu " MEMACCESS(0) ",%%xmm6 \n" | |
| 3747 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" | |
| 3748 "pmaddubsw %%xmm5,%%xmm6 \n" | |
| 3749 "pmaddubsw %%xmm5,%%xmm7 \n" | |
| 3750 "phaddsw %%xmm7,%%xmm6 \n" | |
| 3751 "psraw $0x6,%%xmm1 \n" | |
| 3752 "psraw $0x6,%%xmm6 \n" | |
| 3753 "packuswb %%xmm1,%%xmm1 \n" | |
| 3754 "packuswb %%xmm6,%%xmm6 \n" | |
| 3755 "punpcklbw %%xmm6,%%xmm1 \n" | |
| 3756 "movdqa %%xmm0,%%xmm6 \n" | |
| 3757 "punpcklwd %%xmm1,%%xmm0 \n" | |
| 3758 "punpckhwd %%xmm1,%%xmm6 \n" | |
| 3759 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 3760 "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n" | |
| 3761 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 3762 "lea " MEMLEA(0x20,1) ",%1 \n" | |
| 3763 "sub $0x8,%2 \n" | |
| 3764 "jg 1b \n" | |
| 3765 : "+r"(src_argb), // %0 | |
| 3766 "+r"(dst_argb), // %1 | |
| 3767 "+r"(width) // %2 | |
| 3768 : "r"(matrix_argb) // %3 | |
| 3769 : "memory", "cc" | |
| 3770 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
| 3771 ); | |
| 3772 } | |
| 3773 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 | |
| 3774 | |
| 3775 #ifdef HAS_ARGBQUANTIZEROW_SSE2 | |
| 3776 // Quantize 4 ARGB pixels (16 bytes). | |
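| // Scalar equivalent per color channel (a sketch; alpha is preserved): | |
| //   v = (v * scale >> 16) * interval_size + interval_offset. | |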
| 3777 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, | |
| 3778 int interval_offset, int width) { | |
| 3779 asm volatile ( | |
| 3780 "movd %2,%%xmm2 \n" | |
| 3781 "movd %3,%%xmm3 \n" | |
| 3782 "movd %4,%%xmm4 \n" | |
| 3783 "pshuflw $0x40,%%xmm2,%%xmm2 \n" | |
| 3784 "pshufd $0x44,%%xmm2,%%xmm2 \n" | |
| 3785 "pshuflw $0x40,%%xmm3,%%xmm3 \n" | |
| 3786 "pshufd $0x44,%%xmm3,%%xmm3 \n" | |
| 3787 "pshuflw $0x40,%%xmm4,%%xmm4 \n" | |
| 3788 "pshufd $0x44,%%xmm4,%%xmm4 \n" | |
| 3789 "pxor %%xmm5,%%xmm5 \n" | |
| 3790 "pcmpeqb %%xmm6,%%xmm6 \n" | |
| 3791 "pslld $0x18,%%xmm6 \n" | |
| 3792 | |
| 3793 // 4 pixel loop. | |
| 3794 LABELALIGN | |
| 3795 "1: \n" | |
| 3796 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 3797 "punpcklbw %%xmm5,%%xmm0 \n" | |
| 3798 "pmulhuw %%xmm2,%%xmm0 \n" | |
| 3799 "movdqu " MEMACCESS(0) ",%%xmm1 \n" | |
| 3800 "punpckhbw %%xmm5,%%xmm1 \n" | |
| 3801 "pmulhuw %%xmm2,%%xmm1 \n" | |
| 3802 "pmullw %%xmm3,%%xmm0 \n" | |
| 3803 "movdqu " MEMACCESS(0) ",%%xmm7 \n" | |
| 3804 "pmullw %%xmm3,%%xmm1 \n" | |
| 3805 "pand %%xmm6,%%xmm7 \n" | |
| 3806 "paddw %%xmm4,%%xmm0 \n" | |
| 3807 "paddw %%xmm4,%%xmm1 \n" | |
| 3808 "packuswb %%xmm1,%%xmm0 \n" | |
| 3809 "por %%xmm7,%%xmm0 \n" | |
| 3810 "movdqu %%xmm0," MEMACCESS(0) " \n" | |
| 3811 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 3812 "sub $0x4,%1 \n" | |
| 3813 "jg 1b \n" | |
| 3814 : "+r"(dst_argb), // %0 | |
| 3815 "+r"(width) // %1 | |
| 3816 : "r"(scale), // %2 | |
| 3817 "r"(interval_size), // %3 | |
| 3818 "r"(interval_offset) // %4 | |
| 3819 : "memory", "cc" | |
| 3820 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
| 3821 ); | |
| 3822 } | |
| 3823 #endif // HAS_ARGBQUANTIZEROW_SSE2 | |
| 3824 | |
| 3825 #ifdef HAS_ARGBSHADEROW_SSE2 | |
| 3826 // Shade 4 pixels at a time by specified value. | |
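| // Scalar equivalent (a sketch): each channel is scaled by the matching | |
| // byte of 'value', roughly dst = src * value_byte / 255, via the same | |
| // byte-doubling pmulhuw trick as ARGBAttenuateRow. | |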
| 3827 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, | |
| 3828 uint32 value) { | |
| 3829 asm volatile ( | |
| 3830 "movd %3,%%xmm2 \n" | |
| 3831 "punpcklbw %%xmm2,%%xmm2 \n" | |
| 3832 "punpcklqdq %%xmm2,%%xmm2 \n" | |
| 3833 | |
| 3834 // 4 pixel loop. | |
| 3835 LABELALIGN | |
| 3836 "1: \n" | |
| 3837 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 3838 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 3839 "movdqa %%xmm0,%%xmm1 \n" | |
| 3840 "punpcklbw %%xmm0,%%xmm0 \n" | |
| 3841 "punpckhbw %%xmm1,%%xmm1 \n" | |
| 3842 "pmulhuw %%xmm2,%%xmm0 \n" | |
| 3843 "pmulhuw %%xmm2,%%xmm1 \n" | |
| 3844 "psrlw $0x8,%%xmm0 \n" | |
| 3845 "psrlw $0x8,%%xmm1 \n" | |
| 3846 "packuswb %%xmm1,%%xmm0 \n" | |
| 3847 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 3848 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 3849 "sub $0x4,%2 \n" | |
| 3850 "jg 1b \n" | |
| 3851 : "+r"(src_argb), // %0 | |
| 3852 "+r"(dst_argb), // %1 | |
| 3853 "+r"(width) // %2 | |
| 3854 : "r"(value) // %3 | |
| 3855 : "memory", "cc" | |
| 3856 , "xmm0", "xmm1", "xmm2" | |
| 3857 ); | |
| 3858 } | |
| 3859 #endif // HAS_ARGBSHADEROW_SSE2 | |
| 3860 | |
| 3861 #ifdef HAS_ARGBMULTIPLYROW_SSE2 | |
| 3862 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. | |
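| // Scalar equivalent per channel (a sketch): dst is roughly | |
| //   src0 * src1 / 255, computed as ((src0 * 257) * src1) >> 16. | |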
| 3863 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | |
| 3864 uint8* dst_argb, int width) { | |
| 3865 asm volatile ( | |
| 3866 "pxor %%xmm5,%%xmm5 \n" | |
| 3867 | |
| 3868 // 4 pixel loop. | |
| 3869 LABELALIGN | |
| 3870 "1: \n" | |
| 3871 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 3872 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 3873 "movdqu " MEMACCESS(1) ",%%xmm2 \n" | |
| 3874 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 3875 "movdqu %%xmm0,%%xmm1 \n" | |
| 3876 "movdqu %%xmm2,%%xmm3 \n" | |
| 3877 "punpcklbw %%xmm0,%%xmm0 \n" | |
| 3878 "punpckhbw %%xmm1,%%xmm1 \n" | |
| 3879 "punpcklbw %%xmm5,%%xmm2 \n" | |
| 3880 "punpckhbw %%xmm5,%%xmm3 \n" | |
| 3881 "pmulhuw %%xmm2,%%xmm0 \n" | |
| 3882 "pmulhuw %%xmm3,%%xmm1 \n" | |
| 3883 "packuswb %%xmm1,%%xmm0 \n" | |
| 3884 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
| 3885 "lea " MEMLEA(0x10,2) ",%2 \n" | |
| 3886 "sub $0x4,%3 \n" | |
| 3887 "jg 1b \n" | |
| 3888 : "+r"(src_argb0), // %0 | |
| 3889 "+r"(src_argb1), // %1 | |
| 3890 "+r"(dst_argb), // %2 | |
| 3891 "+r"(width) // %3 | |
| 3892 : | |
| 3893 : "memory", "cc" | |
| 3894 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 3895 ); | |
| 3896 } | |
| 3897 #endif // HAS_ARGBMULTIPLYROW_SSE2 | |
| 3898 | |
| 3899 #ifdef HAS_ARGBMULTIPLYROW_AVX2 | |
| 3900 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. | |
| 3901 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | |
| 3902 uint8* dst_argb, int width) { | |
| 3903 asm volatile ( | |
| 3904 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" | |
| 3905 | |
| 3906 // 8 pixel loop. | |
| 3907 LABELALIGN | |
| 3908 "1: \n" | |
| 3909 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" | |
| 3910 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 3911 "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" | |
| 3912 "lea " MEMLEA(0x20,1) ",%1 \n" | |
| 3913 "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" | |
| 3914 "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" | |
| 3915 "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" | |
| 3916 "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" | |
| 3917 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" | |
| 3918 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" | |
| 3919 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
| 3920 "vmovdqu %%ymm0," MEMACCESS(2) " \n" | |
| 3921 "lea " MEMLEA(0x20,2) ",%2 \n" | |
| 3922 "sub $0x8,%3 \n" | |
| 3923 "jg 1b \n" | |
| 3924 "vzeroupper \n" | |
| 3925 : "+r"(src_argb0), // %0 | |
| 3926 "+r"(src_argb1), // %1 | |
| 3927 "+r"(dst_argb), // %2 | |
| 3928 "+r"(width) // %3 | |
| 3929 : | |
| 3930 : "memory", "cc" | |
| 3931 #if defined(__AVX2__) | |
| 3932 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 3933 #endif | |
| 3934 ); | |
| 3935 } | |
| 3936 #endif // HAS_ARGBMULTIPLYROW_AVX2 | |
| 3937 | |
| 3938 #ifdef HAS_ARGBADDROW_SSE2 | |
| 3939 // Add 2 rows of ARGB pixels together, 4 pixels at a time. | |
| 3940 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | |
| 3941 uint8* dst_argb, int width) { | |
| 3942 asm volatile ( | |
| 3943 // 4 pixel loop. | |
| 3944 LABELALIGN | |
| 3945 "1: \n" | |
| 3946 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 3947 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 3948 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | |
| 3949 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 3950 "paddusb %%xmm1,%%xmm0 \n" | |
| 3951 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
| 3952 "lea " MEMLEA(0x10,2) ",%2 \n" | |
| 3953 "sub $0x4,%3 \n" | |
| 3954 "jg 1b \n" | |
| 3955 : "+r"(src_argb0), // %0 | |
| 3956 "+r"(src_argb1), // %1 | |
| 3957 "+r"(dst_argb), // %2 | |
| 3958 "+r"(width) // %3 | |
| 3959 : | |
| 3960 : "memory", "cc" | |
| 3961 , "xmm0", "xmm1" | |
| 3962 ); | |
| 3963 } | |
| 3964 #endif // HAS_ARGBADDROW_SSE2 | |
| 3965 | |
| 3966 #ifdef HAS_ARGBADDROW_AVX2 | |
| 3967 // Add 2 rows of ARGB pixels together, 8 pixels at a time. | |
| 3968 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | |
| 3969 uint8* dst_argb, int width) { | |
| 3970 asm volatile ( | |
| 3971 // 8 pixel loop. | |
| 3972 LABELALIGN | |
| 3973 "1: \n" | |
| 3974 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
| 3975 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 3976 "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" | |
| 3977 "lea " MEMLEA(0x20,1) ",%1 \n" | |
| 3978 "vmovdqu %%ymm0," MEMACCESS(2) " \n" | |
| 3979 "lea " MEMLEA(0x20,2) ",%2 \n" | |
| 3980 "sub $0x8,%3 \n" | |
| 3981 "jg 1b \n" | |
| 3982 "vzeroupper \n" | |
| 3983 : "+r"(src_argb0), // %0 | |
| 3984 "+r"(src_argb1), // %1 | |
| 3985 "+r"(dst_argb), // %2 | |
| 3986 "+r"(width) // %3 | |
| 3987 : | |
| 3988 : "memory", "cc" | |
| 3989 , "xmm0" | |
| 3990 ); | |
| 3991 } | |
| 3992 #endif // HAS_ARGBADDROW_AVX2 | |
| 3993 | |
| 3994 #ifdef HAS_ARGBSUBTRACTROW_SSE2 | |
| 3995 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. | |
| 3996 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | |
| 3997 uint8* dst_argb, int width) { | |
| 3998 asm volatile ( | |
| 3999 // 4 pixel loop. | |
| 4000 LABELALIGN | |
| 4001 "1: \n" | |
| 4002 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 4003 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 4004 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | |
| 4005 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 4006 "psubusb %%xmm1,%%xmm0 \n" | |
| 4007 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
| 4008 "lea " MEMLEA(0x10,2) ",%2 \n" | |
| 4009 "sub $0x4,%3 \n" | |
| 4010 "jg 1b \n" | |
| 4011 : "+r"(src_argb0), // %0 | |
| 4012 "+r"(src_argb1), // %1 | |
| 4013 "+r"(dst_argb), // %2 | |
| 4014 "+r"(width) // %3 | |
| 4015 : | |
| 4016 : "memory", "cc" | |
| 4017 , "xmm0", "xmm1" | |
| 4018 ); | |
| 4019 } | |
| 4020 #endif // HAS_ARGBSUBTRACTROW_SSE2 | |
| 4021 | |
| 4022 #ifdef HAS_ARGBSUBTRACTROW_AVX2 | |
| 4023 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. | |
| 4024 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | |
| 4025 uint8* dst_argb, int width) { | |
| 4026 asm volatile ( | |
| 4027 // 8 pixel loop. | |
| 4028 LABELALIGN | |
| 4029 "1: \n" | |
| 4030 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
| 4031 "lea " MEMLEA(0x20,0) ",%0 \n" | |
| 4032 "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" | |
| 4033 "lea " MEMLEA(0x20,1) ",%1 \n" | |
| 4034 "vmovdqu %%ymm0," MEMACCESS(2) " \n" | |
| 4035 "lea " MEMLEA(0x20,2) ",%2 \n" | |
| 4036 "sub $0x8,%3 \n" | |
| 4037 "jg 1b \n" | |
| 4038 "vzeroupper \n" | |
| 4039 : "+r"(src_argb0), // %0 | |
| 4040 "+r"(src_argb1), // %1 | |
| 4041 "+r"(dst_argb), // %2 | |
| 4042 "+r"(width) // %3 | |
| 4043 : | |
| 4044 : "memory", "cc" | |
| 4045 , "xmm0" | |
| 4046 ); | |
| 4047 } | |
| 4048 #endif // HAS_ARGBSUBTRACTROW_AVX2 | |
| 4049 | |
| 4050 #ifdef HAS_SOBELXROW_SSE2 | |
| 4051 // SobelX as a matrix is | |
| 4052 // -1 0 1 | |
| 4053 // -2 0 2 | |
| 4054 // -1 0 1 | |
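| // Scalar sketch of one output byte (illustrative): | |
| //   sobel = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) + | |
| //           (y2[i] - y2[i + 2]); | |
| //   dst[i] = min(255, abs(sobel)); | |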
| 4055 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, | |
| 4056 const uint8* src_y2, uint8* dst_sobelx, int width) { | |
| 4057 asm volatile ( | |
| 4058 "sub %0,%1 \n" | |
| 4059 "sub %0,%2 \n" | |
| 4060 "sub %0,%3 \n" | |
| 4061 "pxor %%xmm5,%%xmm5 \n" | |
| 4062 | |
| 4063 // 8 pixel loop. | |
| 4064 LABELALIGN | |
| 4065 "1: \n" | |
| 4066 "movq " MEMACCESS(0) ",%%xmm0 \n" | |
| 4067 "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" | |
| 4068 "punpcklbw %%xmm5,%%xmm0 \n" | |
| 4069 "punpcklbw %%xmm5,%%xmm1 \n" | |
| 4070 "psubw %%xmm1,%%xmm0 \n" | |
| 4071 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 | |
| 4072 MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 | |
| 4073 "punpcklbw %%xmm5,%%xmm1 \n" | |
| 4074 "punpcklbw %%xmm5,%%xmm2 \n" | |
| 4075 "psubw %%xmm2,%%xmm1 \n" | |
| 4076 MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 | |
| 4077 MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 | |
| 4078 "punpcklbw %%xmm5,%%xmm2 \n" | |
| 4079 "punpcklbw %%xmm5,%%xmm3 \n" | |
| 4080 "psubw %%xmm3,%%xmm2 \n" | |
| 4081 "paddw %%xmm2,%%xmm0 \n" | |
| 4082 "paddw %%xmm1,%%xmm0 \n" | |
| 4083 "paddw %%xmm1,%%xmm0 \n" | |
| 4084 "pxor %%xmm1,%%xmm1 \n" | |
| 4085 "psubw %%xmm0,%%xmm1 \n" | |
| 4086 "pmaxsw %%xmm1,%%xmm0 \n" | |
| 4087 "packuswb %%xmm0,%%xmm0 \n" | |
| 4088 MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) | |
| 4089 "lea " MEMLEA(0x8,0) ",%0 \n" | |
| 4090 "sub $0x8,%4 \n" | |
| 4091 "jg 1b \n" | |
| 4092 : "+r"(src_y0), // %0 | |
| 4093 "+r"(src_y1), // %1 | |
| 4094 "+r"(src_y2), // %2 | |
| 4095 "+r"(dst_sobelx), // %3 | |
| 4096 "+r"(width) // %4 | |
| 4097 : | |
| 4098 : "memory", "cc", NACL_R14 | |
| 4099 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 4100 ); | |
| 4101 } | |
| 4102 #endif // HAS_SOBELXROW_SSE2 | |
| 4103 | |
| 4104 #ifdef HAS_SOBELYROW_SSE2 | |
| 4105 // SobelY as a matrix is | |
| 4106 // -1 -2 -1 | |
| 4107 // 0 0 0 | |
| 4108 // 1 2 1 | |
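| // Scalar sketch (illustrative), with y0 the row above and y1 the row below: | |
| //   sobel = (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) + | |
| //           (y0[i + 2] - y1[i + 2]); | |
| //   dst[i] = min(255, abs(sobel)); | |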
| 4109 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, | |
| 4110 uint8* dst_sobely, int width) { | |
| 4111 asm volatile ( | |
| 4112 "sub %0,%1 \n" | |
| 4113 "sub %0,%2 \n" | |
| 4114 "pxor %%xmm5,%%xmm5 \n" | |
| 4115 | |
| 4116 // 8 pixel loop. | |
| 4117 LABELALIGN | |
| 4118 "1: \n" | |
| 4119 "movq " MEMACCESS(0) ",%%xmm0 \n" | |
| 4120 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 | |
| 4121 "punpcklbw %%xmm5,%%xmm0 \n" | |
| 4122 "punpcklbw %%xmm5,%%xmm1 \n" | |
| 4123 "psubw %%xmm1,%%xmm0 \n" | |
| 4124 "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" | |
| 4125 MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 | |
| 4126 "punpcklbw %%xmm5,%%xmm1 \n" | |
| 4127 "punpcklbw %%xmm5,%%xmm2 \n" | |
| 4128 "psubw %%xmm2,%%xmm1 \n" | |
| 4129 "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" | |
| 4130 MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 | |
| 4131 "punpcklbw %%xmm5,%%xmm2 \n" | |
| 4132 "punpcklbw %%xmm5,%%xmm3 \n" | |
| 4133 "psubw %%xmm3,%%xmm2 \n" | |
| 4134 "paddw %%xmm2,%%xmm0 \n" | |
| 4135 "paddw %%xmm1,%%xmm0 \n" | |
| 4136 "paddw %%xmm1,%%xmm0 \n" | |
| 4137 "pxor %%xmm1,%%xmm1 \n" | |
| 4138 "psubw %%xmm0,%%xmm1 \n" | |
| 4139 "pmaxsw %%xmm1,%%xmm0 \n" | |
| 4140 "packuswb %%xmm0,%%xmm0 \n" | |
| 4141 MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) | |
| 4142 "lea " MEMLEA(0x8,0) ",%0 \n" | |
| 4143 "sub $0x8,%3 \n" | |
| 4144 "jg 1b \n" | |
| 4145 : "+r"(src_y0), // %0 | |
| 4146 "+r"(src_y1), // %1 | |
| 4147 "+r"(dst_sobely), // %2 | |
| 4148 "+r"(width) // %3 | |
| 4149 : | |
| 4150 : "memory", "cc", NACL_R14 | |
| 4151 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 4152 ); | |
| 4153 } | |
| 4154 #endif // HAS_SOBELYROW_SSE2 | |
| 4155 | |
| 4156 #ifdef HAS_SOBELROW_SSE2 | |
| 4157 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. | |
| 4158 // A = 255 | |
| 4159 // R = Sobel | |
| 4160 // G = Sobel | |
| 4161 // B = Sobel | |
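| // Scalar sketch of the packing (illustrative): with s = min(255, sx + sy), | |
| //   dst pixel = 0xff000000 | (s << 16) | (s << 8) | s. | |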
| 4162 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | |
| 4163 uint8* dst_argb, int width) { | |
| 4164 asm volatile ( | |
| 4165 "sub %0,%1 \n" | |
| 4166 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 4167 "pslld $0x18,%%xmm5 \n" | |
| 4168 | |
| 4169 // 16 pixel loop. | |
| 4170 LABELALIGN | |
| 4171 "1: \n" | |
| 4172 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 4173 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | |
| 4174 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 4175 "paddusb %%xmm1,%%xmm0 \n" | |
| 4176 "movdqa %%xmm0,%%xmm2 \n" | |
| 4177 "punpcklbw %%xmm0,%%xmm2 \n" | |
| 4178 "punpckhbw %%xmm0,%%xmm0 \n" | |
| 4179 "movdqa %%xmm2,%%xmm1 \n" | |
| 4180 "punpcklwd %%xmm2,%%xmm1 \n" | |
| 4181 "punpckhwd %%xmm2,%%xmm2 \n" | |
| 4182 "por %%xmm5,%%xmm1 \n" | |
| 4183 "por %%xmm5,%%xmm2 \n" | |
| 4184 "movdqa %%xmm0,%%xmm3 \n" | |
| 4185 "punpcklwd %%xmm0,%%xmm3 \n" | |
| 4186 "punpckhwd %%xmm0,%%xmm0 \n" | |
| 4187 "por %%xmm5,%%xmm3 \n" | |
| 4188 "por %%xmm5,%%xmm0 \n" | |
| 4189 "movdqu %%xmm1," MEMACCESS(2) " \n" | |
| 4190 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" | |
| 4191 "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n" | |
| 4192 "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n" | |
| 4193 "lea " MEMLEA(0x40,2) ",%2 \n" | |
| 4194 "sub $0x10,%3 \n" | |
| 4195 "jg 1b \n" | |
| 4196 : "+r"(src_sobelx), // %0 | |
| 4197 "+r"(src_sobely), // %1 | |
| 4198 "+r"(dst_argb), // %2 | |
| 4199 "+r"(width) // %3 | |
| 4200 : | |
| 4201 : "memory", "cc", NACL_R14 | |
| 4202 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
| 4203 ); | |
| 4204 } | |
| 4205 #endif // HAS_SOBELROW_SSE2 | |
| 4206 | |
| 4207 #ifdef HAS_SOBELTOPLANEROW_SSE2 | |
| 4208 // Adds Sobel X and Sobel Y and stores Sobel into a plane. | |
| 4209 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | |
| 4210 uint8* dst_y, int width) { | |
| 4211 asm volatile ( | |
| 4212 "sub %0,%1 \n" | |
| 4213 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 4214 "pslld $0x18,%%xmm5 \n" | |
| 4215 | |
| 4216 // 8 pixel loop. | |
| 4217 LABELALIGN | |
| 4218 "1: \n" | |
| 4219 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 4220 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | |
| 4221 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 4222 "paddusb %%xmm1,%%xmm0 \n" | |
| 4223 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
| 4224 "lea " MEMLEA(0x10,2) ",%2 \n" | |
| 4225 "sub $0x10,%3 \n" | |
| 4226 "jg 1b \n" | |
| 4227 : "+r"(src_sobelx), // %0 | |
| 4228 "+r"(src_sobely), // %1 | |
| 4229 "+r"(dst_y), // %2 | |
| 4230 "+r"(width) // %3 | |
| 4231 : | |
| 4232 : "memory", "cc", NACL_R14 | |
| 4233 "xmm0", "xmm1" | |
| 4234 ); | |
| 4235 } | |
| 4236 #endif // HAS_SOBELTOPLANEROW_SSE2 | |
| 4237 | |
| 4238 #ifdef HAS_SOBELXYROW_SSE2 | |
| 4239 // Mixes Sobel X, Sobel Y and Sobel into ARGB. | |
| 4240 // A = 255 | |
| 4241 // R = Sobel X | |
| 4242 // G = Sobel | |
| 4243 // B = Sobel Y | |
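| // Scalar sketch (illustrative): with s = min(255, sx + sy), | |
| //   dst pixel = 0xff000000 | (sx << 16) | (s << 8) | sy. | |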
| 4244 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | |
| 4245 uint8* dst_argb, int width) { | |
| 4246 asm volatile ( | |
| 4247 "sub %0,%1 \n" | |
| 4248 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 4249 | |
| 4250 // 16 pixel loop. | |
| 4251 LABELALIGN | |
| 4252 "1: \n" | |
| 4253 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 4254 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | |
| 4255 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 4256 "movdqa %%xmm0,%%xmm2 \n" | |
| 4257 "paddusb %%xmm1,%%xmm2 \n" | |
| 4258 "movdqa %%xmm0,%%xmm3 \n" | |
| 4259 "punpcklbw %%xmm5,%%xmm3 \n" | |
| 4260 "punpckhbw %%xmm5,%%xmm0 \n" | |
| 4261 "movdqa %%xmm1,%%xmm4 \n" | |
| 4262 "punpcklbw %%xmm2,%%xmm4 \n" | |
| 4263 "punpckhbw %%xmm2,%%xmm1 \n" | |
| 4264 "movdqa %%xmm4,%%xmm6 \n" | |
| 4265 "punpcklwd %%xmm3,%%xmm6 \n" | |
| 4266 "punpckhwd %%xmm3,%%xmm4 \n" | |
| 4267 "movdqa %%xmm1,%%xmm7 \n" | |
| 4268 "punpcklwd %%xmm0,%%xmm7 \n" | |
| 4269 "punpckhwd %%xmm0,%%xmm1 \n" | |
| 4270 "movdqu %%xmm6," MEMACCESS(2) " \n" | |
| 4271 "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n" | |
| 4272 "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n" | |
| 4273 "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n" | |
| 4274 "lea " MEMLEA(0x40,2) ",%2 \n" | |
| 4275 "sub $0x10,%3 \n" | |
| 4276 "jg 1b \n" | |
| 4277 : "+r"(src_sobelx), // %0 | |
| 4278 "+r"(src_sobely), // %1 | |
| 4279 "+r"(dst_argb), // %2 | |
| 4280 "+r"(width) // %3 | |
| 4281 : | |
| 4282 : "memory", "cc", NACL_R14 | |
| 4283 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
| 4284 ); | |
| 4285 } | |
| 4286 #endif // HAS_SOBELXYROW_SSE2 | |
| 4287 | |
| 4288 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 | |
| 4289 // Creates a table of cumulative sums where each value is a sum of all values | |
| 4290 // above and to the left of the value, inclusive of the value. | |
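| // A scalar sketch of that recurrence (illustrative; the name is ours): | |
| static __inline void CumulativeSumRow_Sketch(const uint8* row, int32* cumsum, | |
|                                              const int32* previous_cumsum, | |
|                                              int width) { | |
|   int32 sum[4] = {0, 0, 0, 0};  // running B, G, R, A sums for this row. | |
|   int x, i; | |
|   for (x = 0; x < width; ++x) { | |
|     for (i = 0; i < 4; ++i) { | |
|       sum[i] += row[x * 4 + i]; | |
|       cumsum[x * 4 + i] = sum[i] + previous_cumsum[x * 4 + i]; | |
|     } | |
|   } | |
| } | |
| | |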
| 4291 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, | |
| 4292 const int32* previous_cumsum, int width) { | |
| 4293 asm volatile ( | |
| 4294 "pxor %%xmm0,%%xmm0 \n" | |
| 4295 "pxor %%xmm1,%%xmm1 \n" | |
| 4296 "sub $0x4,%3 \n" | |
| 4297 "jl 49f \n" | |
| 4298 "test $0xf,%1 \n" | |
| 4299 "jne 49f \n" | |
| 4300 | |
| 4301 // 4 pixel loop. | |
| 4302 LABELALIGN | |
| 4303 "40: \n" | |
| 4304 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
| 4305 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 4306 "movdqa %%xmm2,%%xmm4 \n" | |
| 4307 "punpcklbw %%xmm1,%%xmm2 \n" | |
| 4308 "movdqa %%xmm2,%%xmm3 \n" | |
| 4309 "punpcklwd %%xmm1,%%xmm2 \n" | |
| 4310 "punpckhwd %%xmm1,%%xmm3 \n" | |
| 4311 "punpckhbw %%xmm1,%%xmm4 \n" | |
| 4312 "movdqa %%xmm4,%%xmm5 \n" | |
| 4313 "punpcklwd %%xmm1,%%xmm4 \n" | |
| 4314 "punpckhwd %%xmm1,%%xmm5 \n" | |
| 4315 "paddd %%xmm2,%%xmm0 \n" | |
| 4316 "movdqu " MEMACCESS(2) ",%%xmm2 \n" | |
| 4317 "paddd %%xmm0,%%xmm2 \n" | |
| 4318 "paddd %%xmm3,%%xmm0 \n" | |
| 4319 "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n" | |
| 4320 "paddd %%xmm0,%%xmm3 \n" | |
| 4321 "paddd %%xmm4,%%xmm0 \n" | |
| 4322 "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n" | |
| 4323 "paddd %%xmm0,%%xmm4 \n" | |
| 4324 "paddd %%xmm5,%%xmm0 \n" | |
| 4325 "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n" | |
| 4326 "lea " MEMLEA(0x40,2) ",%2 \n" | |
| 4327 "paddd %%xmm0,%%xmm5 \n" | |
| 4328 "movdqu %%xmm2," MEMACCESS(1) " \n" | |
| 4329 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" | |
| 4330 "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n" | |
| 4331 "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n" | |
| 4332 "lea " MEMLEA(0x40,1) ",%1 \n" | |
| 4333 "sub $0x4,%3 \n" | |
| 4334 "jge 40b \n" | |
| 4335 | |
| 4336 "49: \n" | |
| 4337 "add $0x3,%3 \n" | |
| 4338 "jl 19f \n" | |
| 4339 | |
| 4340 // 1 pixel loop. | |
| 4341 LABELALIGN | |
| 4342 "10: \n" | |
| 4343 "movd " MEMACCESS(0) ",%%xmm2 \n" | |
| 4344 "lea " MEMLEA(0x4,0) ",%0 \n" | |
| 4345 "punpcklbw %%xmm1,%%xmm2 \n" | |
| 4346 "punpcklwd %%xmm1,%%xmm2 \n" | |
| 4347 "paddd %%xmm2,%%xmm0 \n" | |
| 4348 "movdqu " MEMACCESS(2) ",%%xmm2 \n" | |
| 4349 "lea " MEMLEA(0x10,2) ",%2 \n" | |
| 4350 "paddd %%xmm0,%%xmm2 \n" | |
| 4351 "movdqu %%xmm2," MEMACCESS(1) " \n" | |
| 4352 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 4353 "sub $0x1,%3 \n" | |
| 4354 "jge 10b \n" | |
| 4355 | |
| 4356 "19: \n" | |
| 4357 : "+r"(row), // %0 | |
| 4358 "+r"(cumsum), // %1 | |
| 4359 "+r"(previous_cumsum), // %2 | |
| 4360 "+r"(width) // %3 | |
| 4361 : | |
| 4362 : "memory", "cc" | |
| 4363 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
| 4364 ); | |
| 4365 } | |
| 4366 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 | |
| 4367 | |
| 4368 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 | |
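| // Box-filter average from the cumulative-sum table (a sketch): each output | |
| //   value is (topleft - topright - botleft + botright) / area, with the | |
| //   divide done as a reciprocal multiply (rcpss). | |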
| 4369 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, | |
| 4370 int width, int area, uint8* dst, | |
| 4371 int count) { | |
| 4372 asm volatile ( | |
| 4373 "movd %5,%%xmm5 \n" | |
| 4374 "cvtdq2ps %%xmm5,%%xmm5 \n" | |
| 4375 "rcpss %%xmm5,%%xmm4 \n" | |
| 4376 "pshufd $0x0,%%xmm4,%%xmm4 \n" | |
| 4377 "sub $0x4,%3 \n" | |
| 4378 "jl 49f \n" | |
| 4379 "cmpl $0x80,%5 \n" | |
| 4380 "ja 40f \n" | |
| 4381 | |
| 4382 "pshufd $0x0,%%xmm5,%%xmm5 \n" | |
| 4383 "pcmpeqb %%xmm6,%%xmm6 \n" | |
| 4384 "psrld $0x10,%%xmm6 \n" | |
| 4385 "cvtdq2ps %%xmm6,%%xmm6 \n" | |
| 4386 "addps %%xmm6,%%xmm5 \n" | |
| 4387 "mulps %%xmm4,%%xmm5 \n" | |
| 4388 "cvtps2dq %%xmm5,%%xmm5 \n" | |
| 4389 "packssdw %%xmm5,%%xmm5 \n" | |
| 4390 | |
| 4391 // 4 pixel small loop. | |
| 4392 LABELALIGN | |
| 4393 "4: \n" | |
| 4394 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 4395 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 4396 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
| 4397 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | |
| 4398 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 | |
| 4399 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 | |
| 4400 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 | |
| 4401 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 | |
| 4402 "lea " MEMLEA(0x40,0) ",%0 \n" | |
| 4403 "psubd " MEMACCESS(1) ",%%xmm0 \n" | |
| 4404 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" | |
| 4405 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" | |
| 4406 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" | |
| 4407 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 | |
| 4408 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 | |
| 4409 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 | |
| 4410 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 | |
| 4411 "lea " MEMLEA(0x40,1) ",%1 \n" | |
| 4412 "packssdw %%xmm1,%%xmm0 \n" | |
| 4413 "packssdw %%xmm3,%%xmm2 \n" | |
| 4414 "pmulhuw %%xmm5,%%xmm0 \n" | |
| 4415 "pmulhuw %%xmm5,%%xmm2 \n" | |
| 4416 "packuswb %%xmm2,%%xmm0 \n" | |
| 4417 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
| 4418 "lea " MEMLEA(0x10,2) ",%2 \n" | |
| 4419 "sub $0x4,%3 \n" | |
| 4420 "jge 4b \n" | |
| 4421 "jmp 49f \n" | |
| 4422 | |
| 4423 // 4 pixel loop. | |
| 4424 LABELALIGN | |
| 4425 "40: \n" | |
| 4426 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 4427 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
| 4428 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
| 4429 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | |
| 4430 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 | |
| 4431 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 | |
| 4432 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 | |
| 4433 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 | |
| 4434 "lea " MEMLEA(0x40,0) ",%0 \n" | |
| 4435 "psubd " MEMACCESS(1) ",%%xmm0 \n" | |
| 4436 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" | |
| 4437 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" | |
| 4438 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" | |
| 4439 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 | |
| 4440 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 | |
| 4441 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 | |
| 4442 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 | |
| 4443 "lea " MEMLEA(0x40,1) ",%1 \n" | |
| 4444 "cvtdq2ps %%xmm0,%%xmm0 \n" | |
| 4445 "cvtdq2ps %%xmm1,%%xmm1 \n" | |
| 4446 "mulps %%xmm4,%%xmm0 \n" | |
| 4447 "mulps %%xmm4,%%xmm1 \n" | |
| 4448 "cvtdq2ps %%xmm2,%%xmm2 \n" | |
| 4449 "cvtdq2ps %%xmm3,%%xmm3 \n" | |
| 4450 "mulps %%xmm4,%%xmm2 \n" | |
| 4451 "mulps %%xmm4,%%xmm3 \n" | |
| 4452 "cvtps2dq %%xmm0,%%xmm0 \n" | |
| 4453 "cvtps2dq %%xmm1,%%xmm1 \n" | |
| 4454 "cvtps2dq %%xmm2,%%xmm2 \n" | |
| 4455 "cvtps2dq %%xmm3,%%xmm3 \n" | |
| 4456 "packssdw %%xmm1,%%xmm0 \n" | |
| 4457 "packssdw %%xmm3,%%xmm2 \n" | |
| 4458 "packuswb %%xmm2,%%xmm0 \n" | |
| 4459 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
| 4460 "lea " MEMLEA(0x10,2) ",%2 \n" | |
| 4461 "sub $0x4,%3 \n" | |
| 4462 "jge 40b \n" | |
| 4463 | |
| 4464 "49: \n" | |
| 4465 "add $0x3,%3 \n" | |
| 4466 "jl 19f \n" | |
| 4467 | |
| 4468 // 1 pixel loop. | |
| 4469 LABELALIGN | |
| 4470 "10: \n" | |
| 4471 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 4472 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 | |
| 4473 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 4474 "psubd " MEMACCESS(1) ",%%xmm0 \n" | |
| 4475 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 | |
| 4476 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 4477 "cvtdq2ps %%xmm0,%%xmm0 \n" | |
| 4478 "mulps %%xmm4,%%xmm0 \n" | |
| 4479 "cvtps2dq %%xmm0,%%xmm0 \n" | |
| 4480 "packssdw %%xmm0,%%xmm0 \n" | |
| 4481 "packuswb %%xmm0,%%xmm0 \n" | |
| 4482 "movd %%xmm0," MEMACCESS(2) " \n" | |
| 4483 "lea " MEMLEA(0x4,2) ",%2 \n" | |
| 4484 "sub $0x1,%3 \n" | |
| 4485 "jge 10b \n" | |
| 4486 "19: \n" | |
| 4487 : "+r"(topleft), // %0 | |
| 4488 "+r"(botleft), // %1 | |
| 4489 "+r"(dst), // %2 | |
| 4490 "+rm"(count) // %3 | |
| 4491 : "r"((intptr_t)(width)), // %4 | |
| 4492 "rm"(area) // %5 | |
| 4493 : "memory", "cc", NACL_R14 | |
| 4494 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | |
| 4495 ); | |
| 4496 } | |
| 4497 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 | |
| 4498 | |
| 4499 #ifdef HAS_ARGBAFFINEROW_SSE2 | |
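| // Scalar sketch of the sampling below (illustrative): src_dudv holds | |
| // (u, v, du, dv); each output pixel is fetched at the truncated (u, v), | |
| // then the coordinates step by (du, dv): | |
| //   *(uint32*)(dst) = *(const uint32*)(src + (int)v * stride + (int)u * 4); | |
| //   u += du;  v += dv; | |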
| 4500 // Copy ARGB pixels from source image with slope to a row of destination. | |
| 4501 LIBYUV_API | |
| 4502 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, | |
| 4503 uint8* dst_argb, const float* src_dudv, int width) { | |
| 4504 intptr_t src_argb_stride_temp = src_argb_stride; | |
| 4505 intptr_t temp = 0; | |
| 4506 asm volatile ( | |
| 4507 "movq " MEMACCESS(3) ",%%xmm2 \n" | |
| 4508 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" | |
| 4509 "shl $0x10,%1 \n" | |
| 4510 "add $0x4,%1 \n" | |
| 4511 "movd %1,%%xmm5 \n" | |
| 4512 "sub $0x4,%4 \n" | |
| 4513 "jl 49f \n" | |
| 4514 | |
| 4515 "pshufd $0x44,%%xmm7,%%xmm7 \n" | |
| 4516 "pshufd $0x0,%%xmm5,%%xmm5 \n" | |
| 4517 "movdqa %%xmm2,%%xmm0 \n" | |
| 4518 "addps %%xmm7,%%xmm0 \n" | |
| 4519 "movlhps %%xmm0,%%xmm2 \n" | |
| 4520 "movdqa %%xmm7,%%xmm4 \n" | |
| 4521 "addps %%xmm4,%%xmm4 \n" | |
| 4522 "movdqa %%xmm2,%%xmm3 \n" | |
| 4523 "addps %%xmm4,%%xmm3 \n" | |
| 4524 "addps %%xmm4,%%xmm4 \n" | |
| 4525 | |
| 4526 // 4 pixel loop. | |
| 4527 LABELALIGN | |
| 4528 "40: \n" | |
| 4529 "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 | |
| 4530 "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 | |
| 4531 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts | |
| 4532 "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride | |
| 4533 "movd %%xmm0,%k1 \n" | |
| 4534 "pshufd $0x39,%%xmm0,%%xmm0 \n" | |
| 4535 "movd %%xmm0,%k5 \n" | |
| 4536 "pshufd $0x39,%%xmm0,%%xmm0 \n" | |
| 4537 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 | |
| 4538 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 | |
| 4539 "punpckldq %%xmm6,%%xmm1 \n" | |
| 4540 "addps %%xmm4,%%xmm2 \n" | |
| 4541 "movq %%xmm1," MEMACCESS(2) " \n" | |
| 4542 "movd %%xmm0,%k1 \n" | |
| 4543 "pshufd $0x39,%%xmm0,%%xmm0 \n" | |
| 4544 "movd %%xmm0,%k5 \n" | |
| 4545 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 | |
| 4546 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 | |
| 4547 "punpckldq %%xmm6,%%xmm0 \n" | |
| 4548 "addps %%xmm4,%%xmm3 \n" | |
| 4549 "movq %%xmm0," MEMACCESS2(0x08,2) " \n" | |
| 4550 "lea " MEMLEA(0x10,2) ",%2 \n" | |
| 4551 "sub $0x4,%4 \n" | |
| 4552 "jge 40b \n" | |
| 4553 | |
| 4554 "49: \n" | |
| 4555 "add $0x3,%4 \n" | |
| 4556 "jl 19f \n" | |
| 4557 | |
| 4558 // 1 pixel loop. | |
| 4559 LABELALIGN | |
| 4560 "10: \n" | |
| 4561 "cvttps2dq %%xmm2,%%xmm0 \n" | |
| 4562 "packssdw %%xmm0,%%xmm0 \n" | |
| 4563 "pmaddwd %%xmm5,%%xmm0 \n" | |
| 4564 "addps %%xmm7,%%xmm2 \n" | |
| 4565 "movd %%xmm0,%k1 \n" | |
| 4566 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 | |
| 4567 "movd %%xmm0," MEMACCESS(2) " \n" | |
| 4568 "lea " MEMLEA(0x04,2) ",%2 \n" | |
| 4569 "sub $0x1,%4 \n" | |
| 4570 "jge 10b \n" | |
| 4571 "19: \n" | |
| 4572 : "+r"(src_argb), // %0 | |
| 4573 "+r"(src_argb_stride_temp), // %1 | |
| 4574 "+r"(dst_argb), // %2 | |
| 4575 "+r"(src_dudv), // %3 | |
| 4576 "+rm"(width), // %4 | |
| 4577 "+r"(temp) // %5 | |
| 4578 : | |
| 4579 : "memory", "cc", NACL_R14 | |
| 4580 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
| 4581 ); | |
| 4582 } | |
| 4583 #endif // HAS_ARGBAFFINEROW_SSE2 | |
| 4584 | |
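// A scalar sketch of the affine sampling above, for reference. src_dudv holds
// the starting (u, v) followed by the per-pixel (du, dv) step; every output
// pixel copies the 32 bit ARGB value at the truncated source coordinate,
// which is exactly the offset the pmaddwd above computes (x * 4 + y * stride).
// The name is illustrative; libyuv's own C fallback is ARGBAffineRow_C.
static void ARGBAffine_Sketch_C(const uint8* src_argb, int src_argb_stride,
                                uint8* dst_argb, const float* src_dudv,
                                int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int x = (int)(u);  // Truncate, as cvttps2dq does.
    int y = (int)(v);
    *(uint32*)(dst_argb) =
        *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
    dst_argb += 4;
    u += src_dudv[2];
    v += src_dudv[3];
  }
}
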
#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"
    "shr       %3                              \n"
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"

    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"

    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "pmaddubsw %%xmm5,%%xmm0                   \n"
    "pmaddubsw %%xmm5,%%xmm1                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 25 / 75.
    LABELALIGN
  "25:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 75 / 25.
    LABELALIGN
  "75:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm0)
    "pavgb     %%xmm1,%%xmm0                   \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        100b                            \n"

  "99:                                         \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm5"
  );
}
#endif  // HAS_INTERPOLATEROW_SSSE3

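// A scalar sketch of the general blend case above, for reference. The
// fraction is halved on entry ("shr %3"), so the kernel works with 7 bit
// weights: out = (row0 * (128 - f) + row1 * f) >> 7, which is what the
// pmaddubsw/psrlw pair computes. The name is illustrative; libyuv's own C
// fallback is InterpolateRow_C.
static void Interpolate_Sketch_C(uint8* dst_ptr, const uint8* src_ptr,
                                 ptrdiff_t src_stride, int dst_width,
                                 int source_y_fraction) {
  int f = source_y_fraction >> 1;  // 0..128, 7 bit fraction.
  const uint8* src_ptr1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * (128 - f) + src_ptr1[x] * f) >> 7);
  }
}
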
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  asm volatile (
    "shr       %3                              \n"
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "sub       %1,%0                           \n"
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"

    "vmovd      %3,%%xmm0                      \n"
    "neg        %3                             \n"
    "add        $0x80,%3                       \n"
    "vmovd      %3,%%xmm5                      \n"
    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
    "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermd     %%ymm5,%%ymm0,%%ymm5           \n"

    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(1) ",%%ymm0         \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm5,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm5,%%ymm1,%%ymm1           \n"
    "vpsrlw    $0x7,%%ymm0,%%ymm0              \n"
    "vpsrlw    $0x7,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 25 / 75.
    LABELALIGN
  "25:                                         \n"
    "vmovdqu   " MEMACCESS(1) ",%%ymm0         \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
    "vpavgb    %%ymm1,%%ymm0,%%ymm0            \n"
    "vpavgb    %%ymm1,%%ymm0,%%ymm0            \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "vmovdqu   " MEMACCESS(1) ",%%ymm0         \n"
    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 75 / 25.
    LABELALIGN
  "75:                                         \n"
    "vmovdqu   " MEMACCESS(1) ",%%ymm1         \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
    "vpavgb    %%ymm1,%%ymm0,%%ymm0            \n"
    "vpavgb    %%ymm1,%%ymm0,%%ymm0            \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "rep movsb " MEMMOVESTRING(1,0) "          \n"
    "jmp       999f                            \n"

  "99:                                         \n"
    "vzeroupper                                \n"
  "999:                                        \n"
  : "+D"(dst_ptr),    // %0 - rdi/edi: destination required by rep movsb.
    "+S"(src_ptr),    // %1 - rsi/esi: source required by rep movsb.
    "+c"(dst_width),  // %2 - rcx/ecx: rep movsb byte count.
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm5"
  );
}
#endif  // HAS_INTERPOLATEROW_AVX2

#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"
    "shr       %3                              \n"
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"

    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "pxor      %%xmm4,%%xmm4                   \n"

    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm2                   \n"
    "punpckhbw %%xmm4,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm0                   \n"
    "punpckhbw %%xmm4,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm2                   \n"
    "psubw     %%xmm1,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm2                   \n"
    "paddw     %%xmm3,%%xmm3                   \n"
    "pmulhw    %%xmm5,%%xmm2                   \n"
    "pmulhw    %%xmm5,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 25 / 75.
    LABELALIGN
  "25:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
    "pavgb     %%xmm1,%%xmm0                   \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 75 / 25.
    LABELALIGN
  "75:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
    "pavgb     %%xmm1,%%xmm0                   \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        100b                            \n"

  "99:                                         \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_INTERPOLATEROW_SSE2

#ifdef HAS_ARGBTOBAYERGGROW_SSE2
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrld     $0x8,%%xmm0                     \n"
    "psrld     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_bayer),  // %1
    "+r"(pix)         // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBTOBAYERGGROW_SSE2

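// A scalar sketch of the extraction above, for reference. The psrld/pand
// sequence keeps byte 1 (G) of each ARGB pixel, producing one Bayer GG byte
// per pixel; the selector argument is unused in this GG variant. The name is
// illustrative of the behavior, not a shipping function.
static void ARGBToBayerGG_Sketch_C(const uint8* src_argb, uint8* dst_bayer,
                                   int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_bayer[x] = src_argb[x * 4 + 1];  // G of B,G,R,A.
  }
}
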
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
  asm volatile (
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSSE3

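// A scalar sketch of the pshufb above, for reference. The first four bytes
// of the 16 byte shuffler give the within-pixel source positions (0..3) for
// output bytes 0..3, and the pattern repeats for each pixel. The name is
// illustrative; libyuv's own C fallback is ARGBShuffleRow_C.
static void ARGBShuffle_Sketch_C(const uint8* src_argb, uint8* dst_argb,
                                 const uint8* shuffler, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_argb[x * 4 + 0] = src_argb[x * 4 + shuffler[0]];
    dst_argb[x * 4 + 1] = src_argb[x * 4 + shuffler[1]];
    dst_argb[x * 4 + 2] = src_argb[x * 4 + shuffler[2]];
    dst_argb[x * 4 + 3] = src_argb[x * 4 + shuffler[3]];
  }
}
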
#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

#ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // Check for four common shuffle orders and use a pshuflw/pshufhw fast
    // path; any other shuffler falls through to the 1 pixel table loop.
    "pxor      %%xmm5,%%xmm5                   \n"
    "mov       " MEMACCESS(4) ",%k2            \n"
    "cmp       $0x3000102,%k2                  \n"
    "je        3012f                           \n"
    "cmp       $0x10203,%k2                    \n"
    "je        123f                            \n"
    "cmp       $0x30201,%k2                    \n"
    "je        321f                            \n"
    "cmp       $0x2010003,%k2                  \n"
    "je        2103f                           \n"

    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(4) ",%2             \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS(1) "            \n"
    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "sub       $0x1,%3                         \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    LABELALIGN
  "123:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        123b                            \n"
    "jmp       99f                             \n"

    LABELALIGN
  "321:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        321b                            \n"
    "jmp       99f                             \n"

    LABELALIGN
  "2103:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        2103b                           \n"
    "jmp       99f                             \n"

    LABELALIGN
  "3012:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        3012b                           \n"

  "99:                                         \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+d"(pixel_temp),  // %2
    "+r"(pix)          // %3
  : "r"(shuffler)      // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSE2

#ifdef HAS_I422TOYUY2ROW_SSE2
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  asm volatile (
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(1) ",%%xmm2         \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)             //  movq      (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "punpcklbw %%xmm3,%%xmm2                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(3) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "   \n"
    "lea       " MEMLEA(0x20,3) ",%3           \n"
    "sub       $0x10,%4                        \n"
    "jg        1b                              \n"
  : "+r"(src_y),      // %0
    "+r"(src_u),      // %1
    "+r"(src_v),      // %2
    "+r"(dst_frame),  // %3
    "+rm"(width)      // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOYUY2ROW_SSE2

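// A scalar sketch of the packing above, for reference. YUY2 interleaves two
// luma samples with one shared chroma pair as Y0 U Y1 V (the UYVY variant
// below emits U Y0 V Y1 instead). The name is illustrative; libyuv's own C
// fallback is I422ToYUY2Row_C.
static void I422ToYUY2_Sketch_C(const uint8* src_y, const uint8* src_u,
                                const uint8* src_v, uint8* dst_frame,
                                int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}
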
#ifdef HAS_I422TOUYVYROW_SSE2
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  asm volatile (
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(1) ",%%xmm2         \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)             //  movq      (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "punpcklbw %%xmm3,%%xmm2                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"
    "punpckhbw %%xmm0,%%xmm2                   \n"
    "movdqu    %%xmm1," MEMACCESS(3) "         \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "   \n"
    "lea       " MEMLEA(0x20,3) ",%3           \n"
    "sub       $0x10,%4                        \n"
    "jg        1b                              \n"
  : "+r"(src_y),      // %0
    "+r"(src_u),      // %1
    "+r"(src_v),      // %2
    "+r"(dst_frame),  // %3
    "+rm"(width)      // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOUYVYROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "pxor      %%xmm3,%%xmm3                   \n"

    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm3,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm4                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
    "addps     " MEMACCESS(3) ",%%xmm0         \n"
    "addps     " MEMACCESS(3) ",%%xmm4         \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm1,%%xmm2                   \n"
    "mulps     %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm2,%%xmm1                   \n"
    "mulps     %%xmm6,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
    "addps     %%xmm2,%%xmm0                   \n"
    "addps     %%xmm6,%%xmm4                   \n"
    "addps     %%xmm1,%%xmm0                   \n"
    "addps     %%xmm5,%%xmm4                   \n"
    "cvttps2dq %%xmm0,%%xmm0                   \n"
    "cvttps2dq %%xmm4,%%xmm4                   \n"
    "packuswb  %%xmm4,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x2,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

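// A scalar sketch of the cubic polynomial above, for reference. poly holds
// four coefficient vectors of four floats each (C0..C3, one float per
// channel), and each channel value v maps to C0 + C1*v + C2*v*v + C3*v*v*v,
// clamped to 0..255 by the pack instructions. The name is illustrative;
// libyuv's own C fallback is ARGBPolynomialRow_C.
static void ARGBPolynomial_Sketch_C(const uint8* src_argb, uint8* dst_argb,
                                    const float* poly, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int c = i & 3;  // Channel within the pixel.
    float v = (float)(src_argb[i]);
    float r = poly[c] + poly[4 + c] * v + poly[8 + c] * v * v +
              poly[12 + c] * v * v * v;
    if (r < 0.f) r = 0.f;
    if (r > 255.f) r = 255.f;
    dst_argb[i] = (uint8)(r);
  }
}
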
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4       \n"
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"

    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
    "lea         " MEMLEA(0x8,0) ",%0          \n"
    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
    "lea         " MEMLEA(0x8,1) ",%1          \n"
    "sub         $0x2,%2                       \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  //  movzb     (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  //  movzb     0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  //  movzb     0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  //  movzb     0x3(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),    // %0
    "+d"(pixel_temp),  // %1
    "+r"(width)        // %2
  : "r"(table_argb)    // %3
  : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

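// A scalar sketch of the in-place lookup above, for reference. The table
// stores 256 entries per channel, interleaved four wide, so channel c of a
// pixel with value v is replaced by table_argb[v * 4 + c]. The name is
// illustrative; libyuv's own C fallback is ARGBColorTableRow_C.
static void ARGBColorTable_Sketch_C(uint8* dst_argb, const uint8* table_argb,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}
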
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform the RGB channels of ARGB pixels with color table; alpha is
// left unchanged.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  //  movzb     (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  //  movzb     0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  //  movzb     0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),    // %0
    "+d"(pixel_temp),  // %1
    "+r"(width)        // %2
  : "r"(table_argb)    // %3
  : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform the RGB channels of ARGB pixels with a luma-indexed table;
// alpha is copied unchanged.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  uintptr_t pixel_temp = 0u;
  uintptr_t table_temp = 0u;
  asm volatile (
    "movd      %6,%%xmm3                       \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0x8,%%xmm4                     \n"
    "pxor      %%xmm5,%%xmm5                   \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
    "pmaddubsw %%xmm3,%%xmm0                   \n"
    "phaddw    %%xmm0,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm0                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS(2) ",%0             \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS(3) "            \n"
    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x3,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x7,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xb,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"

    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "lea       " MEMLEA(0x10,3) ",%3           \n"
    "sub       $0x4,%4                         \n"
    "jg        1b                              \n"
  : "+d"(pixel_temp),  // %0
    "+a"(table_temp),  // %1
    "+r"(src_argb),    // %2
    "+r"(dst_argb),    // %3
    "+rm"(width)       // %4
  : "r"(luma),         // %5
    "rm"(lumacoeff)    // %6
  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

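// A scalar sketch of the luma lookup above, for reference. The pmaddubsw /
// phaddw / pand sequence forms a weighted sum of the four channel bytes with
// the four lumacoeff bytes, masks it to a multiple of 256, and uses it to
// select one 256 byte row of the luma table; B, G and R are then remapped
// through that row while alpha is copied through. Names are illustrative;
// compare libyuv's C fallback ARGBLumaColorTableRow_C, which uses the same
// row-selection idea.
static void ARGBLumaColorTable_Sketch_C(const uint8* src_argb,
                                        uint8* dst_argb, int width,
                                        const uint8* luma, uint32 lumacoeff) {
  const uint32 bc = lumacoeff & 0xff;
  const uint32 gc = (lumacoeff >> 8) & 0xff;
  const uint32 rc = (lumacoeff >> 16) & 0xff;
  const uint32 ac = (lumacoeff >> 24) & 0xff;  // Typically 0.
  int x;
  for (x = 0; x < width; ++x) {
    const uint8* row = luma + ((src_argb[0] * bc + src_argb[1] * gc +
                                src_argb[2] * rc + src_argb[3] * ac) & 0xff00);
    dst_argb[0] = row[src_argb[0]];
    dst_argb[1] = row[src_argb[1]];
    dst_argb[2] = row[src_argb[2]];
    dst_argb[3] = src_argb[3];  // Alpha is not remapped.
    src_argb += 4;
    dst_argb += 4;
  }
}
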
#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif