OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 155 matching lines...)
166 }; | 166 }; |
167 | 167 |
168 // NV21 shuf 8 VU to 16 UV. | 168 // NV21 shuf 8 VU to 16 UV. |
169 static const lvec8 kShuffleNV21 = { | 169 static const lvec8 kShuffleNV21 = { |
170 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, | 170 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, |
171 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, | 171 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, |
172 }; | 172 }; |
173 #endif // HAS_RGB24TOARGBROW_SSSE3 | 173 #endif // HAS_RGB24TOARGBROW_SSSE3 |
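
Reviewer note on the table above, for anyone not fluent in pshufb: per 16-byte lane, each output byte selects an input byte by the index in the mask, or becomes zero when the index byte is negative. A minimal C sketch of that semantics (names here are illustrative, not libyuv API):

    #include <stdint.h>

    // pshufb per 16-byte lane: out[i] = mask[i] < 0 ? 0 : in[mask[i] & 15].
    // With kShuffleNV21, input bytes (V,U) at 2i/2i+1 come out as U,V,U,V,
    // so 8 VU source bytes become 16 UV output bytes per lane.
    static void Shuffle16(const uint8_t in[16], const int8_t mask[16],
                          uint8_t out[16]) {
      for (int i = 0; i < 16; ++i) {
        out[i] = (mask[i] < 0) ? 0 : (uint8_t)in[mask[i] & 15];
      }
    }
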
174 | 174 |
175 #ifdef HAS_J400TOARGBROW_SSE2 | 175 #ifdef HAS_J400TOARGBROW_SSE2 |
176 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | 176 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { |
177 asm volatile ( | 177 asm volatile ( |
178 "pcmpeqb %%xmm5,%%xmm5 \n" | 178 "pcmpeqb %%xmm5,%%xmm5 \n" |
179 "pslld $0x18,%%xmm5 \n" | 179 "pslld $0x18,%%xmm5 \n" |
180 LABELALIGN | 180 LABELALIGN |
181 "1: \n" | 181 "1: \n" |
182 "movq " MEMACCESS(0) ",%%xmm0 \n" | 182 "movq " MEMACCESS(0) ",%%xmm0 \n" |
183 "lea " MEMLEA(0x8,0) ",%0 \n" | 183 "lea " MEMLEA(0x8,0) ",%0 \n" |
184 "punpcklbw %%xmm0,%%xmm0 \n" | 184 "punpcklbw %%xmm0,%%xmm0 \n" |
185 "movdqa %%xmm0,%%xmm1 \n" | 185 "movdqa %%xmm0,%%xmm1 \n" |
186 "punpcklwd %%xmm0,%%xmm0 \n" | 186 "punpcklwd %%xmm0,%%xmm0 \n" |
187 "punpckhwd %%xmm1,%%xmm1 \n" | 187 "punpckhwd %%xmm1,%%xmm1 \n" |
188 "por %%xmm5,%%xmm0 \n" | 188 "por %%xmm5,%%xmm0 \n" |
189 "por %%xmm5,%%xmm1 \n" | 189 "por %%xmm5,%%xmm1 \n" |
190 "movdqu %%xmm0," MEMACCESS(1) " \n" | 190 "movdqu %%xmm0," MEMACCESS(1) " \n" |
191 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | 191 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
192 "lea " MEMLEA(0x20,1) ",%1 \n" | 192 "lea " MEMLEA(0x20,1) ",%1 \n" |
193 "sub $0x8,%2 \n" | 193 "sub $0x8,%2 \n" |
194 "jg 1b \n" | 194 "jg 1b \n" |
195 : "+r"(src_y), // %0 | 195 : "+r"(src_y), // %0 |
196 "+r"(dst_argb), // %1 | 196 "+r"(dst_argb), // %1 |
197 "+r"(pix) // %2 | 197 "+r"(width) // %2 |
198 :: "memory", "cc", "xmm0", "xmm1", "xmm5" | 198 :: "memory", "cc", "xmm0", "xmm1", "xmm5" |
199 ); | 199 ); |
200 } | 200 } |
201 #endif // HAS_J400TOARGBROW_SSE2 | 201 #endif // HAS_J400TOARGBROW_SSE2 |
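
For readers following the asm: xmm5 holds the 0xff000000 alpha mask (pcmpeqb then pslld $0x18), and the punpcklbw/punpcklwd/punpckhwd sequence fans each gray byte out to four. A scalar sketch of the same row contract (illustrative, not libyuv's actual C fallback):

    #include <stdint.h>

    // J400 (gray) to ARGB: replicate Y into B/G/R, force alpha opaque.
    static void J400ToARGBRow_C_sketch(const uint8_t* src_y, uint8_t* dst_argb,
                                       int width) {
      for (int x = 0; x < width; ++x) {
        uint8_t y = src_y[x];
        dst_argb[0] = y;     // B
        dst_argb[1] = y;     // G
        dst_argb[2] = y;     // R
        dst_argb[3] = 0xff;  // A, the pslld $0x18 mask
        dst_argb += 4;
      }
    }
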
202 | 202 |
203 #ifdef HAS_RGB24TOARGBROW_SSSE3 | 203 #ifdef HAS_RGB24TOARGBROW_SSSE3 |
204 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 204 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { |
205 asm volatile ( | 205 asm volatile ( |
206 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 | 206 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 |
207 "pslld $0x18,%%xmm5 \n" | 207 "pslld $0x18,%%xmm5 \n" |
208 "movdqa %3,%%xmm4 \n" | 208 "movdqa %3,%%xmm4 \n" |
209 LABELALIGN | 209 LABELALIGN |
210 "1: \n" | 210 "1: \n" |
211 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 211 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
212 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 212 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
213 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" | 213 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" |
214 "lea " MEMLEA(0x30,0) ",%0 \n" | 214 "lea " MEMLEA(0x30,0) ",%0 \n" |
(...skipping 11 matching lines...)
226 "palignr $0x4,%%xmm3,%%xmm3 \n" | 226 "palignr $0x4,%%xmm3,%%xmm3 \n" |
227 "pshufb %%xmm4,%%xmm3 \n" | 227 "pshufb %%xmm4,%%xmm3 \n" |
228 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | 228 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
229 "por %%xmm5,%%xmm3 \n" | 229 "por %%xmm5,%%xmm3 \n" |
230 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" | 230 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" |
231 "lea " MEMLEA(0x40,1) ",%1 \n" | 231 "lea " MEMLEA(0x40,1) ",%1 \n" |
232 "sub $0x10,%2 \n" | 232 "sub $0x10,%2 \n" |
233 "jg 1b \n" | 233 "jg 1b \n" |
234 : "+r"(src_rgb24), // %0 | 234 : "+r"(src_rgb24), // %0 |
235 "+r"(dst_argb), // %1 | 235 "+r"(dst_argb), // %1 |
236 "+r"(pix) // %2 | 236 "+r"(width) // %2 |
237 : "m"(kShuffleMaskRGB24ToARGB) // %3 | 237 : "m"(kShuffleMaskRGB24ToARGB) // %3 |
238 : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 238 : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
239 ); | 239 ); |
240 } | 240 } |
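
The pshufb mask (kShuffleMaskRGB24ToARGB, defined above this hunk) expands each group of 12 RGB24 bytes to 16 ARGB bytes, and the por with xmm5 fills in alpha. Scalar sketch (illustrative; RAWToARGBRow below is the same kernel with R and B swapped in the source):

    #include <stdint.h>

    // RGB24 (B,G,R packed 3 bytes/pixel) to ARGB (B,G,R,A 4 bytes/pixel).
    static void RGB24ToARGBRow_C_sketch(const uint8_t* src_rgb24,
                                        uint8_t* dst_argb, int width) {
      for (int x = 0; x < width; ++x) {
        dst_argb[0] = src_rgb24[0];  // B
        dst_argb[1] = src_rgb24[1];  // G
        dst_argb[2] = src_rgb24[2];  // R
        dst_argb[3] = 0xff;          // A
        src_rgb24 += 3;
        dst_argb += 4;
      }
    }
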
241 | 241 |
242 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { | 242 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) { |
243 asm volatile ( | 243 asm volatile ( |
244 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 | 244 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 |
245 "pslld $0x18,%%xmm5 \n" | 245 "pslld $0x18,%%xmm5 \n" |
246 "movdqa %3,%%xmm4 \n" | 246 "movdqa %3,%%xmm4 \n" |
247 LABELALIGN | 247 LABELALIGN |
248 "1: \n" | 248 "1: \n" |
249 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 249 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
250 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 250 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
251 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" | 251 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" |
252 "lea " MEMLEA(0x30,0) ",%0 \n" | 252 "lea " MEMLEA(0x30,0) ",%0 \n" |
(...skipping 11 matching lines...)
264 "palignr $0x4,%%xmm3,%%xmm3 \n" | 264 "palignr $0x4,%%xmm3,%%xmm3 \n" |
265 "pshufb %%xmm4,%%xmm3 \n" | 265 "pshufb %%xmm4,%%xmm3 \n" |
266 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | 266 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
267 "por %%xmm5,%%xmm3 \n" | 267 "por %%xmm5,%%xmm3 \n" |
268 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" | 268 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" |
269 "lea " MEMLEA(0x40,1) ",%1 \n" | 269 "lea " MEMLEA(0x40,1) ",%1 \n" |
270 "sub $0x10,%2 \n" | 270 "sub $0x10,%2 \n" |
271 "jg 1b \n" | 271 "jg 1b \n" |
272 : "+r"(src_raw), // %0 | 272 : "+r"(src_raw), // %0 |
273 "+r"(dst_argb), // %1 | 273 "+r"(dst_argb), // %1 |
274 "+r"(pix) // %2 | 274 "+r"(width) // %2 |
275 : "m"(kShuffleMaskRAWToARGB) // %3 | 275 : "m"(kShuffleMaskRAWToARGB) // %3 |
276 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 276 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
277 ); | 277 ); |
278 } | 278 } |
279 | 279 |
280 void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { | 280 void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { |
281 asm volatile ( | 281 asm volatile ( |
282 "mov $0x1080108,%%eax \n" | 282 "mov $0x1080108,%%eax \n" |
283 "movd %%eax,%%xmm5 \n" | 283 "movd %%eax,%%xmm5 \n" |
284 "pshufd $0x0,%%xmm5,%%xmm5 \n" | 284 "pshufd $0x0,%%xmm5,%%xmm5 \n" |
285 "mov $0x20802080,%%eax \n" | 285 "mov $0x20802080,%%eax \n" |
286 "movd %%eax,%%xmm6 \n" | 286 "movd %%eax,%%xmm6 \n" |
287 "pshufd $0x0,%%xmm6,%%xmm6 \n" | 287 "pshufd $0x0,%%xmm6,%%xmm6 \n" |
288 "pcmpeqb %%xmm3,%%xmm3 \n" | 288 "pcmpeqb %%xmm3,%%xmm3 \n" |
289 "psllw $0xb,%%xmm3 \n" | 289 "psllw $0xb,%%xmm3 \n" |
290 "pcmpeqb %%xmm4,%%xmm4 \n" | 290 "pcmpeqb %%xmm4,%%xmm4 \n" |
(...skipping 20 matching lines...)
311 "movdqa %%xmm1,%%xmm2 \n" | 311 "movdqa %%xmm1,%%xmm2 \n" |
312 "punpcklbw %%xmm0,%%xmm1 \n" | 312 "punpcklbw %%xmm0,%%xmm1 \n" |
313 "punpckhbw %%xmm0,%%xmm2 \n" | 313 "punpckhbw %%xmm0,%%xmm2 \n" |
314 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) | 314 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) |
315 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) | 315 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) |
316 "lea " MEMLEA(0x10,0) ",%0 \n" | 316 "lea " MEMLEA(0x10,0) ",%0 \n" |
317 "sub $0x8,%2 \n" | 317 "sub $0x8,%2 \n" |
318 "jg 1b \n" | 318 "jg 1b \n" |
319 : "+r"(src), // %0 | 319 : "+r"(src), // %0 |
320 "+r"(dst), // %1 | 320 "+r"(dst), // %1 |
321 "+r"(pix) // %2 | 321 "+r"(width) // %2 |
322 : | 322 : |
323 : "memory", "cc", "eax", NACL_R14 | 323 : "memory", "cc", "eax", NACL_R14 |
324 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 324 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
325 ); | 325 ); |
326 } | 326 } |
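
The 0x1080108 and 0x20802080 immediates splatted into xmm5/xmm6 appear to implement bit replication by multiply (for a 5-bit field v, v * 0x108 = (v << 8) | (v << 3), from which (v << 3) | (v >> 2) falls out after the shifts in the elided body). The net per-pixel effect, as a scalar sketch (illustrative):

    #include <stdint.h>

    // RGB565 to ARGB: widen 5/6-bit fields to 8 bits by bit replication,
    // e.g. 5-bit v -> (v << 3) | (v >> 2), so 0 maps to 0 and 31 to 255.
    static void RGB565ToARGBRow_C_sketch(const uint8_t* src, uint8_t* dst,
                                         int width) {
      for (int x = 0; x < width; ++x) {
        uint16_t p = (uint16_t)(src[0] | (src[1] << 8));
        uint8_t b = p & 0x1f;
        uint8_t g = (p >> 5) & 0x3f;
        uint8_t r = (p >> 11) & 0x1f;
        dst[0] = (uint8_t)((b << 3) | (b >> 2));
        dst[1] = (uint8_t)((g << 2) | (g >> 4));
        dst[2] = (uint8_t)((r << 3) | (r >> 2));
        dst[3] = 0xff;
        src += 2;
        dst += 4;
      }
    }

ARGB1555ToARGBRow below follows the same pattern, additionally replicating the 1-bit alpha to 0x00 or 0xff.
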
327 | 327 |
328 void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { | 328 void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { |
329 asm volatile ( | 329 asm volatile ( |
330 "mov $0x1080108,%%eax \n" | 330 "mov $0x1080108,%%eax \n" |
331 "movd %%eax,%%xmm5 \n" | 331 "movd %%eax,%%xmm5 \n" |
332 "pshufd $0x0,%%xmm5,%%xmm5 \n" | 332 "pshufd $0x0,%%xmm5,%%xmm5 \n" |
333 "mov $0x42004200,%%eax \n" | 333 "mov $0x42004200,%%eax \n" |
334 "movd %%eax,%%xmm6 \n" | 334 "movd %%eax,%%xmm6 \n" |
335 "pshufd $0x0,%%xmm6,%%xmm6 \n" | 335 "pshufd $0x0,%%xmm6,%%xmm6 \n" |
336 "pcmpeqb %%xmm3,%%xmm3 \n" | 336 "pcmpeqb %%xmm3,%%xmm3 \n" |
337 "psllw $0xb,%%xmm3 \n" | 337 "psllw $0xb,%%xmm3 \n" |
338 "movdqa %%xmm3,%%xmm4 \n" | 338 "movdqa %%xmm3,%%xmm4 \n" |
(...skipping 23 matching lines...)
362 "movdqa %%xmm1,%%xmm2 \n" | 362 "movdqa %%xmm1,%%xmm2 \n" |
363 "punpcklbw %%xmm0,%%xmm1 \n" | 363 "punpcklbw %%xmm0,%%xmm1 \n" |
364 "punpckhbw %%xmm0,%%xmm2 \n" | 364 "punpckhbw %%xmm0,%%xmm2 \n" |
365 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) | 365 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) |
366 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) | 366 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) |
367 "lea " MEMLEA(0x10,0) ",%0 \n" | 367 "lea " MEMLEA(0x10,0) ",%0 \n" |
368 "sub $0x8,%2 \n" | 368 "sub $0x8,%2 \n" |
369 "jg 1b \n" | 369 "jg 1b \n" |
370 : "+r"(src), // %0 | 370 : "+r"(src), // %0 |
371 "+r"(dst), // %1 | 371 "+r"(dst), // %1 |
372 "+r"(pix) // %2 | 372 "+r"(width) // %2 |
373 : | 373 : |
374 : "memory", "cc", "eax", NACL_R14 | 374 : "memory", "cc", "eax", NACL_R14 |
375 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 375 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
376 ); | 376 ); |
377 } | 377 } |
378 | 378 |
379 void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { | 379 void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { |
380 asm volatile ( | 380 asm volatile ( |
381 "mov $0xf0f0f0f,%%eax \n" | 381 "mov $0xf0f0f0f,%%eax \n" |
382 "movd %%eax,%%xmm4 \n" | 382 "movd %%eax,%%xmm4 \n" |
383 "pshufd $0x0,%%xmm4,%%xmm4 \n" | 383 "pshufd $0x0,%%xmm4,%%xmm4 \n" |
384 "movdqa %%xmm4,%%xmm5 \n" | 384 "movdqa %%xmm4,%%xmm5 \n" |
385 "pslld $0x4,%%xmm5 \n" | 385 "pslld $0x4,%%xmm5 \n" |
386 "sub %0,%1 \n" | 386 "sub %0,%1 \n" |
387 "sub %0,%1 \n" | 387 "sub %0,%1 \n" |
388 LABELALIGN | 388 LABELALIGN |
389 "1: \n" | 389 "1: \n" |
(...skipping 10 matching lines...)
400 "movdqa %%xmm0,%%xmm1 \n" | 400 "movdqa %%xmm0,%%xmm1 \n" |
401 "punpcklbw %%xmm2,%%xmm0 \n" | 401 "punpcklbw %%xmm2,%%xmm0 \n" |
402 "punpckhbw %%xmm2,%%xmm1 \n" | 402 "punpckhbw %%xmm2,%%xmm1 \n" |
403 MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2) | 403 MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2) |
404 MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2) | 404 MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2) |
405 "lea " MEMLEA(0x10,0) ",%0 \n" | 405 "lea " MEMLEA(0x10,0) ",%0 \n" |
406 "sub $0x8,%2 \n" | 406 "sub $0x8,%2 \n" |
407 "jg 1b \n" | 407 "jg 1b \n" |
408 : "+r"(src), // %0 | 408 : "+r"(src), // %0 |
409 "+r"(dst), // %1 | 409 "+r"(dst), // %1 |
410 "+r"(pix) // %2 | 410 "+r"(width) // %2 |
411 : | 411 : |
412 : "memory", "cc", "eax", NACL_R14 | 412 : "memory", "cc", "eax", NACL_R14 |
413 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 413 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
414 ); | 414 ); |
415 } | 415 } |
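
Here xmm4/xmm5 mask the low and high nibbles (0x0f0f0f0f and its <<4), and the elided body replicates each nibble into a full byte, i.e. v -> (v << 4) | v, which is v * 17. Scalar sketch (illustrative):

    #include <stdint.h>

    // ARGB4444 to ARGB: widen each 4-bit field by nibble replication,
    // v -> (v << 4) | v == v * 17, so 0xf maps to 0xff.
    static void ARGB4444ToARGBRow_C_sketch(const uint8_t* src, uint8_t* dst,
                                           int width) {
      for (int x = 0; x < width; ++x) {
        uint16_t p = (uint16_t)(src[0] | (src[1] << 8));
        dst[0] = (uint8_t)((p & 0x0f) * 17);          // B
        dst[1] = (uint8_t)(((p >> 4) & 0x0f) * 17);   // G
        dst[2] = (uint8_t)(((p >> 8) & 0x0f) * 17);   // R
        dst[3] = (uint8_t)(((p >> 12) & 0x0f) * 17);  // A
        src += 2;
        dst += 4;
      }
    }
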
416 | 416 |
417 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { | 417 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) { |
418 asm volatile ( | 418 asm volatile ( |
419 "movdqa %3,%%xmm6 \n" | 419 "movdqa %3,%%xmm6 \n" |
420 LABELALIGN | 420 LABELALIGN |
421 "1: \n" | 421 "1: \n" |
422 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 422 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
423 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 423 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
424 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | 424 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
425 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | 425 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
426 "lea " MEMLEA(0x40,0) ",%0 \n" | 426 "lea " MEMLEA(0x40,0) ",%0 \n" |
427 "pshufb %%xmm6,%%xmm0 \n" | 427 "pshufb %%xmm6,%%xmm0 \n" |
(...skipping 11 matching lines...)
439 "psrldq $0x8,%%xmm2 \n" | 439 "psrldq $0x8,%%xmm2 \n" |
440 "pslldq $0x4,%%xmm3 \n" | 440 "pslldq $0x4,%%xmm3 \n" |
441 "por %%xmm3,%%xmm2 \n" | 441 "por %%xmm3,%%xmm2 \n" |
442 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | 442 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
443 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" | 443 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" |
444 "lea " MEMLEA(0x30,1) ",%1 \n" | 444 "lea " MEMLEA(0x30,1) ",%1 \n" |
445 "sub $0x10,%2 \n" | 445 "sub $0x10,%2 \n" |
446 "jg 1b \n" | 446 "jg 1b \n" |
447 : "+r"(src), // %0 | 447 : "+r"(src), // %0 |
448 "+r"(dst), // %1 | 448 "+r"(dst), // %1 |
449 "+r"(pix) // %2 | 449 "+r"(width) // %2 |
450 : "m"(kShuffleMaskARGBToRGB24) // %3 | 450 : "m"(kShuffleMaskARGBToRGB24) // %3 |
451 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 451 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
452 ); | 452 ); |
453 } | 453 } |
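
kShuffleMaskARGBToRGB24 drops every fourth (alpha) byte, and the psrldq/pslldq/por sequence stitches the three 12-byte shuffle results into 48 contiguous output bytes. Scalar sketch (illustrative; ARGBToRAWRow below is identical except kShuffleMaskARGBToRAW swaps R and B):

    #include <stdint.h>

    // ARGB to RGB24: copy B,G,R and drop the alpha byte.
    static void ARGBToRGB24Row_C_sketch(const uint8_t* src, uint8_t* dst,
                                        int width) {
      for (int x = 0; x < width; ++x) {
        dst[0] = src[0];  // B
        dst[1] = src[1];  // G
        dst[2] = src[2];  // R
        dst += 3;
        src += 4;  // skip A
      }
    }
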
454 | 454 |
455 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { | 455 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) { |
456 asm volatile ( | 456 asm volatile ( |
457 "movdqa %3,%%xmm6 \n" | 457 "movdqa %3,%%xmm6 \n" |
458 LABELALIGN | 458 LABELALIGN |
459 "1: \n" | 459 "1: \n" |
460 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 460 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
461 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 461 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
462 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | 462 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
463 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | 463 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
464 "lea " MEMLEA(0x40,0) ",%0 \n" | 464 "lea " MEMLEA(0x40,0) ",%0 \n" |
465 "pshufb %%xmm6,%%xmm0 \n" | 465 "pshufb %%xmm6,%%xmm0 \n" |
(...skipping 11 matching lines...)
477 "psrldq $0x8,%%xmm2 \n" | 477 "psrldq $0x8,%%xmm2 \n" |
478 "pslldq $0x4,%%xmm3 \n" | 478 "pslldq $0x4,%%xmm3 \n" |
479 "por %%xmm3,%%xmm2 \n" | 479 "por %%xmm3,%%xmm2 \n" |
480 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | 480 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
481 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" | 481 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" |
482 "lea " MEMLEA(0x30,1) ",%1 \n" | 482 "lea " MEMLEA(0x30,1) ",%1 \n" |
483 "sub $0x10,%2 \n" | 483 "sub $0x10,%2 \n" |
484 "jg 1b \n" | 484 "jg 1b \n" |
485 : "+r"(src), // %0 | 485 : "+r"(src), // %0 |
486 "+r"(dst), // %1 | 486 "+r"(dst), // %1 |
487 "+r"(pix) // %2 | 487 "+r"(width) // %2 |
488 : "m"(kShuffleMaskARGBToRAW) // %3 | 488 : "m"(kShuffleMaskARGBToRAW) // %3 |
489 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 489 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
490 ); | 490 ); |
491 } | 491 } |
492 | 492 |
493 void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { | 493 void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) { |
494 asm volatile ( | 494 asm volatile ( |
495 "pcmpeqb %%xmm3,%%xmm3 \n" | 495 "pcmpeqb %%xmm3,%%xmm3 \n" |
496 "psrld $0x1b,%%xmm3 \n" | 496 "psrld $0x1b,%%xmm3 \n" |
497 "pcmpeqb %%xmm4,%%xmm4 \n" | 497 "pcmpeqb %%xmm4,%%xmm4 \n" |
498 "psrld $0x1a,%%xmm4 \n" | 498 "psrld $0x1a,%%xmm4 \n" |
499 "pslld $0x5,%%xmm4 \n" | 499 "pslld $0x5,%%xmm4 \n" |
500 "pcmpeqb %%xmm5,%%xmm5 \n" | 500 "pcmpeqb %%xmm5,%%xmm5 \n" |
501 "pslld $0xb,%%xmm5 \n" | 501 "pslld $0xb,%%xmm5 \n" |
502 LABELALIGN | 502 LABELALIGN |
503 "1: \n" | 503 "1: \n" |
(...skipping 10 matching lines...)
514 "por %%xmm2,%%xmm1 \n" | 514 "por %%xmm2,%%xmm1 \n" |
515 "por %%xmm1,%%xmm0 \n" | 515 "por %%xmm1,%%xmm0 \n" |
516 "packssdw %%xmm0,%%xmm0 \n" | 516 "packssdw %%xmm0,%%xmm0 \n" |
517 "lea " MEMLEA(0x10,0) ",%0 \n" | 517 "lea " MEMLEA(0x10,0) ",%0 \n" |
518 "movq %%xmm0," MEMACCESS(1) " \n" | 518 "movq %%xmm0," MEMACCESS(1) " \n" |
519 "lea " MEMLEA(0x8,1) ",%1 \n" | 519 "lea " MEMLEA(0x8,1) ",%1 \n" |
520 "sub $0x4,%2 \n" | 520 "sub $0x4,%2 \n" |
521 "jg 1b \n" | 521 "jg 1b \n" |
522 : "+r"(src), // %0 | 522 : "+r"(src), // %0 |
523 "+r"(dst), // %1 | 523 "+r"(dst), // %1 |
524 "+r"(pix) // %2 | 524 "+r"(width) // %2 |
525 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 525 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
526 ); | 526 ); |
527 } | 527 } |
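
The three masks built via pcmpeqb/psrld/pslld select 5 bits of blue, 6 of green and 5 of red; the elided body shifts each channel into place and packssdw narrows 4 pixels to 8 output bytes. Scalar sketch (illustrative):

    #include <stdint.h>

    // ARGB to RGB565: truncate channels to 5/6/5 bits, pack little-endian.
    static void ARGBToRGB565Row_C_sketch(const uint8_t* src, uint8_t* dst,
                                         int width) {
      for (int x = 0; x < width; ++x) {
        uint16_t b = src[0] >> 3;
        uint16_t g = src[1] >> 2;
        uint16_t r = src[2] >> 3;
        uint16_t p = (uint16_t)(b | (g << 5) | (r << 11));
        dst[0] = (uint8_t)(p & 0xff);
        dst[1] = (uint8_t)(p >> 8);
        src += 4;
        dst += 2;
      }
    }
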
528 | 528 |
529 void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst, | 529 void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst, |
530 const uint32 dither4, int pix) { | 530 const uint32 dither4, int width) { |
531 asm volatile ( | 531 asm volatile ( |
532 "movd %3,%%xmm6 \n" | 532 "movd %3,%%xmm6 \n" |
533 "punpcklbw %%xmm6,%%xmm6 \n" | 533 "punpcklbw %%xmm6,%%xmm6 \n" |
534 "movdqa %%xmm6,%%xmm7 \n" | 534 "movdqa %%xmm6,%%xmm7 \n" |
535 "punpcklwd %%xmm6,%%xmm6 \n" | 535 "punpcklwd %%xmm6,%%xmm6 \n" |
536 "punpckhwd %%xmm7,%%xmm7 \n" | 536 "punpckhwd %%xmm7,%%xmm7 \n" |
537 "pcmpeqb %%xmm3,%%xmm3 \n" | 537 "pcmpeqb %%xmm3,%%xmm3 \n" |
538 "psrld $0x1b,%%xmm3 \n" | 538 "psrld $0x1b,%%xmm3 \n" |
539 "pcmpeqb %%xmm4,%%xmm4 \n" | 539 "pcmpeqb %%xmm4,%%xmm4 \n" |
540 "psrld $0x1a,%%xmm4 \n" | 540 "psrld $0x1a,%%xmm4 \n" |
(...skipping 17 matching lines...)
558 "por %%xmm2,%%xmm1 \n" | 558 "por %%xmm2,%%xmm1 \n" |
559 "por %%xmm1,%%xmm0 \n" | 559 "por %%xmm1,%%xmm0 \n" |
560 "packssdw %%xmm0,%%xmm0 \n" | 560 "packssdw %%xmm0,%%xmm0 \n" |
561 "lea 0x10(%0),%0 \n" | 561 "lea 0x10(%0),%0 \n" |
562 "movq %%xmm0,(%1) \n" | 562 "movq %%xmm0,(%1) \n" |
563 "lea 0x8(%1),%1 \n" | 563 "lea 0x8(%1),%1 \n" |
564 "sub $0x4,%2 \n" | 564 "sub $0x4,%2 \n" |
565 "jg 1b \n" | 565 "jg 1b \n" |
566 : "+r"(src), // %0 | 566 : "+r"(src), // %0 |
567 "+r"(dst), // %1 | 567 "+r"(dst), // %1 |
568 "+r"(pix) // %2 | 568 "+r"(width) // %2 |
569 : "m"(dither4) // %3 | 569 : "m"(dither4) // %3 |
570 : "memory", "cc", | 570 : "memory", "cc", |
571 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 571 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
572 ); | 572 ); |
573 } | 573 } |
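
dither4 packs four per-column dither bytes; the punpcklbw/punpcklwd/punpckhwd sequence splats them across xmm6/xmm7 so each pixel gets its column's offset added before truncation. Scalar sketch (illustrative; the saturating add mirrors what I take the elided body's unsigned add to be doing):

    #include <stdint.h>

    // ARGB to RGB565 with ordered dither: add the per-column dither byte
    // (dither4 repeats every 4 pixels) with saturation, then truncate.
    static void ARGBToRGB565DitherRow_C_sketch(const uint8_t* src, uint8_t* dst,
                                               uint32_t dither4, int width) {
      for (int x = 0; x < width; ++x) {
        int d = (int)((dither4 >> ((x & 3) * 8)) & 0xff);
        int b = src[0] + d, g = src[1] + d, r = src[2] + d;
        if (b > 255) b = 255;
        if (g > 255) g = 255;
        if (r > 255) r = 255;
        uint16_t p = (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
        dst[0] = (uint8_t)(p & 0xff);
        dst[1] = (uint8_t)(p >> 8);
        src += 4;
        dst += 2;
      }
    }
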
574 | 574 |
575 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 | 575 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 |
576 void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst, | 576 void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst, |
577 const uint32 dither4, int pix) { | 577 const uint32 dither4, int width) { |
578 asm volatile ( | 578 asm volatile ( |
579 "vbroadcastss %3,%%xmm6 \n" | 579 "vbroadcastss %3,%%xmm6 \n" |
580 "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" | 580 "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" |
581 "vpermq $0xd8,%%ymm6,%%ymm6 \n" | 581 "vpermq $0xd8,%%ymm6,%%ymm6 \n" |
582 "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" | 582 "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" |
583 "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" | 583 "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" |
584 "vpsrld $0x1b,%%ymm3,%%ymm3 \n" | 584 "vpsrld $0x1b,%%ymm3,%%ymm3 \n" |
585 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" | 585 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" |
586 "vpsrld $0x1a,%%ymm4,%%ymm4 \n" | 586 "vpsrld $0x1a,%%ymm4,%%ymm4 \n" |
587 "vpslld $0x5,%%ymm4,%%ymm4 \n" | 587 "vpslld $0x5,%%ymm4,%%ymm4 \n" |
(...skipping 14 matching lines...)
602 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" | 602 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" |
603 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 603 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
604 "lea 0x20(%0),%0 \n" | 604 "lea 0x20(%0),%0 \n" |
605 "vmovdqu %%xmm0,(%1) \n" | 605 "vmovdqu %%xmm0,(%1) \n" |
606 "lea 0x10(%1),%1 \n" | 606 "lea 0x10(%1),%1 \n" |
607 "sub $0x8,%2 \n" | 607 "sub $0x8,%2 \n" |
608 "jg 1b \n" | 608 "jg 1b \n" |
609 "vzeroupper \n" | 609 "vzeroupper \n" |
610 : "+r"(src), // %0 | 610 : "+r"(src), // %0 |
611 "+r"(dst), // %1 | 611 "+r"(dst), // %1 |
612 "+r"(pix) // %2 | 612 "+r"(width) // %2 |
613 : "m"(dither4) // %3 | 613 : "m"(dither4) // %3 |
614 : "memory", "cc", | 614 : "memory", "cc", |
615 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 615 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
616 ); | 616 ); |
617 } | 617 } |
618 #endif // HAS_ARGBTORGB565DITHERROW_AVX2 | 618 #endif // HAS_ARGBTORGB565DITHERROW_AVX2 |
619 | 619 |
620 | 620 |
621 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { | 621 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) { |
622 asm volatile ( | 622 asm volatile ( |
623 "pcmpeqb %%xmm4,%%xmm4 \n" | 623 "pcmpeqb %%xmm4,%%xmm4 \n" |
624 "psrld $0x1b,%%xmm4 \n" | 624 "psrld $0x1b,%%xmm4 \n" |
625 "movdqa %%xmm4,%%xmm5 \n" | 625 "movdqa %%xmm4,%%xmm5 \n" |
626 "pslld $0x5,%%xmm5 \n" | 626 "pslld $0x5,%%xmm5 \n" |
627 "movdqa %%xmm4,%%xmm6 \n" | 627 "movdqa %%xmm4,%%xmm6 \n" |
628 "pslld $0xa,%%xmm6 \n" | 628 "pslld $0xa,%%xmm6 \n" |
629 "pcmpeqb %%xmm7,%%xmm7 \n" | 629 "pcmpeqb %%xmm7,%%xmm7 \n" |
630 "pslld $0xf,%%xmm7 \n" | 630 "pslld $0xf,%%xmm7 \n" |
631 LABELALIGN | 631 LABELALIGN |
(...skipping 14 matching lines...)
646 "por %%xmm3,%%xmm2 \n" | 646 "por %%xmm3,%%xmm2 \n" |
647 "por %%xmm2,%%xmm0 \n" | 647 "por %%xmm2,%%xmm0 \n" |
648 "packssdw %%xmm0,%%xmm0 \n" | 648 "packssdw %%xmm0,%%xmm0 \n" |
649 "lea " MEMLEA(0x10,0) ",%0 \n" | 649 "lea " MEMLEA(0x10,0) ",%0 \n" |
650 "movq %%xmm0," MEMACCESS(1) " \n" | 650 "movq %%xmm0," MEMACCESS(1) " \n" |
651 "lea " MEMLEA(0x8,1) ",%1 \n" | 651 "lea " MEMLEA(0x8,1) ",%1 \n" |
652 "sub $0x4,%2 \n" | 652 "sub $0x4,%2 \n" |
653 "jg 1b \n" | 653 "jg 1b \n" |
654 : "+r"(src), // %0 | 654 : "+r"(src), // %0 |
655 "+r"(dst), // %1 | 655 "+r"(dst), // %1 |
656 "+r"(pix) // %2 | 656 "+r"(width) // %2 |
657 :: "memory", "cc", | 657 :: "memory", "cc", |
658 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 658 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
659 ); | 659 ); |
660 } | 660 } |
661 | 661 |
662 void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { | 662 void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) { |
663 asm volatile ( | 663 asm volatile ( |
664 "pcmpeqb %%xmm4,%%xmm4 \n" | 664 "pcmpeqb %%xmm4,%%xmm4 \n" |
665 "psllw $0xc,%%xmm4 \n" | 665 "psllw $0xc,%%xmm4 \n" |
666 "movdqa %%xmm4,%%xmm3 \n" | 666 "movdqa %%xmm4,%%xmm3 \n" |
667 "psrlw $0x8,%%xmm3 \n" | 667 "psrlw $0x8,%%xmm3 \n" |
668 LABELALIGN | 668 LABELALIGN |
669 "1: \n" | 669 "1: \n" |
670 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 670 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
671 "movdqa %%xmm0,%%xmm1 \n" | 671 "movdqa %%xmm0,%%xmm1 \n" |
672 "pand %%xmm3,%%xmm0 \n" | 672 "pand %%xmm3,%%xmm0 \n" |
673 "pand %%xmm4,%%xmm1 \n" | 673 "pand %%xmm4,%%xmm1 \n" |
674 "psrlq $0x4,%%xmm0 \n" | 674 "psrlq $0x4,%%xmm0 \n" |
675 "psrlq $0x8,%%xmm1 \n" | 675 "psrlq $0x8,%%xmm1 \n" |
676 "por %%xmm1,%%xmm0 \n" | 676 "por %%xmm1,%%xmm0 \n" |
677 "packuswb %%xmm0,%%xmm0 \n" | 677 "packuswb %%xmm0,%%xmm0 \n" |
678 "lea " MEMLEA(0x10,0) ",%0 \n" | 678 "lea " MEMLEA(0x10,0) ",%0 \n" |
679 "movq %%xmm0," MEMACCESS(1) " \n" | 679 "movq %%xmm0," MEMACCESS(1) " \n" |
680 "lea " MEMLEA(0x8,1) ",%1 \n" | 680 "lea " MEMLEA(0x8,1) ",%1 \n" |
681 "sub $0x4,%2 \n" | 681 "sub $0x4,%2 \n" |
682 "jg 1b \n" | 682 "jg 1b \n" |
683 : "+r"(src), // %0 | 683 : "+r"(src), // %0 |
684 "+r"(dst), // %1 | 684 "+r"(dst), // %1 |
685 "+r"(pix) // %2 | 685 "+r"(width) // %2 |
686 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | 686 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
687 ); | 687 ); |
688 } | 688 } |
689 #endif // HAS_RGB24TOARGBROW_SSSE3 | 689 #endif // HAS_RGB24TOARGBROW_SSSE3 |
690 | 690 |
691 #ifdef HAS_ARGBTOYROW_SSSE3 | 691 #ifdef HAS_ARGBTOYROW_SSSE3 |
692 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. | 692 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. |
693 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 693 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { |
694 asm volatile ( | 694 asm volatile ( |
695 "movdqa %3,%%xmm4 \n" | 695 "movdqa %3,%%xmm4 \n" |
696 "movdqa %4,%%xmm5 \n" | 696 "movdqa %4,%%xmm5 \n" |
697 LABELALIGN | 697 LABELALIGN |
698 "1: \n" | 698 "1: \n" |
699 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 699 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
700 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 700 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
701 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | 701 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
702 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | 702 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
703 "pmaddubsw %%xmm4,%%xmm0 \n" | 703 "pmaddubsw %%xmm4,%%xmm0 \n" |
704 "pmaddubsw %%xmm4,%%xmm1 \n" | 704 "pmaddubsw %%xmm4,%%xmm1 \n" |
705 "pmaddubsw %%xmm4,%%xmm2 \n" | 705 "pmaddubsw %%xmm4,%%xmm2 \n" |
706 "pmaddubsw %%xmm4,%%xmm3 \n" | 706 "pmaddubsw %%xmm4,%%xmm3 \n" |
707 "lea " MEMLEA(0x40,0) ",%0 \n" | 707 "lea " MEMLEA(0x40,0) ",%0 \n" |
708 "phaddw %%xmm1,%%xmm0 \n" | 708 "phaddw %%xmm1,%%xmm0 \n" |
709 "phaddw %%xmm3,%%xmm2 \n" | 709 "phaddw %%xmm3,%%xmm2 \n" |
710 "psrlw $0x7,%%xmm0 \n" | 710 "psrlw $0x7,%%xmm0 \n" |
711 "psrlw $0x7,%%xmm2 \n" | 711 "psrlw $0x7,%%xmm2 \n" |
712 "packuswb %%xmm2,%%xmm0 \n" | 712 "packuswb %%xmm2,%%xmm0 \n" |
713 "paddb %%xmm5,%%xmm0 \n" | 713 "paddb %%xmm5,%%xmm0 \n" |
714 "movdqu %%xmm0," MEMACCESS(1) " \n" | 714 "movdqu %%xmm0," MEMACCESS(1) " \n" |
715 "lea " MEMLEA(0x10,1) ",%1 \n" | 715 "lea " MEMLEA(0x10,1) ",%1 \n" |
716 "sub $0x10,%2 \n" | 716 "sub $0x10,%2 \n" |
717 "jg 1b \n" | 717 "jg 1b \n" |
718 : "+r"(src_argb), // %0 | 718 : "+r"(src_argb), // %0 |
719 "+r"(dst_y), // %1 | 719 "+r"(dst_y), // %1 |
720 "+r"(pix) // %2 | 720 "+r"(width) // %2 |
721 : "m"(kARGBToY), // %3 | 721 : "m"(kARGBToY), // %3 |
722 "m"(kAddY16) // %4 | 722 "m"(kAddY16) // %4 |
723 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 723 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
724 ); | 724 ); |
725 } | 725 } |
726 #endif // HAS_ARGBTOYROW_SSSE3 | 726 #endif // HAS_ARGBTOYROW_SSSE3 |
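
The pmaddubsw/phaddw pair computes the weighted channel sum with 7-bit fixed-point coefficients, psrlw $0x7 rescales, and kAddY16 applies the limited-range +16 offset. Per-pixel scalar sketch (the BT.601 coefficient values are my reading of libyuv's tables, which sit outside this hunk):

    #include <stdint.h>

    // Per-pixel BT.601 luma as the SSSE3 kernel computes it:
    // 7-bit coefficients, truncating shift, then the +16 limited-range bias.
    static uint8_t ARGBToY_sketch(uint8_t b, uint8_t g, uint8_t r) {
      return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
    }

BGRAToYRow/ABGRToYRow/RGBAToYRow further down are the same kernel with reordered coefficient tables.
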
727 | 727 |
728 #ifdef HAS_ARGBTOYJROW_SSSE3 | 728 #ifdef HAS_ARGBTOYJROW_SSSE3 |
729 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. | 729 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. |
730 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. | 730 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. |
731 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 731 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { |
732 asm volatile ( | 732 asm volatile ( |
733 "movdqa %3,%%xmm4 \n" | 733 "movdqa %3,%%xmm4 \n" |
734 "movdqa %4,%%xmm5 \n" | 734 "movdqa %4,%%xmm5 \n" |
735 LABELALIGN | 735 LABELALIGN |
736 "1: \n" | 736 "1: \n" |
737 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 737 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
738 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 738 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
739 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | 739 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
740 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | 740 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
741 "pmaddubsw %%xmm4,%%xmm0 \n" | 741 "pmaddubsw %%xmm4,%%xmm0 \n" |
742 "pmaddubsw %%xmm4,%%xmm1 \n" | 742 "pmaddubsw %%xmm4,%%xmm1 \n" |
743 "pmaddubsw %%xmm4,%%xmm2 \n" | 743 "pmaddubsw %%xmm4,%%xmm2 \n" |
744 "pmaddubsw %%xmm4,%%xmm3 \n" | 744 "pmaddubsw %%xmm4,%%xmm3 \n" |
745 "lea " MEMLEA(0x40,0) ",%0 \n" | 745 "lea " MEMLEA(0x40,0) ",%0 \n" |
746 "phaddw %%xmm1,%%xmm0 \n" | 746 "phaddw %%xmm1,%%xmm0 \n" |
747 "phaddw %%xmm3,%%xmm2 \n" | 747 "phaddw %%xmm3,%%xmm2 \n" |
748 "paddw %%xmm5,%%xmm0 \n" | 748 "paddw %%xmm5,%%xmm0 \n" |
749 "paddw %%xmm5,%%xmm2 \n" | 749 "paddw %%xmm5,%%xmm2 \n" |
750 "psrlw $0x7,%%xmm0 \n" | 750 "psrlw $0x7,%%xmm0 \n" |
751 "psrlw $0x7,%%xmm2 \n" | 751 "psrlw $0x7,%%xmm2 \n" |
752 "packuswb %%xmm2,%%xmm0 \n" | 752 "packuswb %%xmm2,%%xmm0 \n" |
753 "movdqu %%xmm0," MEMACCESS(1) " \n" | 753 "movdqu %%xmm0," MEMACCESS(1) " \n" |
754 "lea " MEMLEA(0x10,1) ",%1 \n" | 754 "lea " MEMLEA(0x10,1) ",%1 \n" |
755 "sub $0x10,%2 \n" | 755 "sub $0x10,%2 \n" |
756 "jg 1b \n" | 756 "jg 1b \n" |
757 : "+r"(src_argb), // %0 | 757 : "+r"(src_argb), // %0 |
758 "+r"(dst_y), // %1 | 758 "+r"(dst_y), // %1 |
759 "+r"(pix) // %2 | 759 "+r"(width) // %2 |
760 : "m"(kARGBToYJ), // %3 | 760 : "m"(kARGBToYJ), // %3 |
761 "m"(kAddYJ64) // %4 | 761 "m"(kAddYJ64) // %4 |
762 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 762 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
763 ); | 763 ); |
764 } | 764 } |
765 #endif // HAS_ARGBTOYJROW_SSSE3 | 765 #endif // HAS_ARGBTOYJROW_SSSE3 |
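
As the comment above says: different coefficients, no +16, but rounding; kAddYJ64 adds 64 before the >>7 so the shift rounds to nearest instead of truncating. Scalar sketch (coefficients again my reading of the full-range tables outside this hunk):

    #include <stdint.h>

    // Per-pixel full-range (JPEG) luma, rounded to nearest via the +64 bias.
    static uint8_t ARGBToYJ_sketch(uint8_t b, uint8_t g, uint8_t r) {
      return (uint8_t)((15 * b + 75 * g + 38 * r + 64) >> 7);
    }
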
766 | 766 |
767 #ifdef HAS_ARGBTOYROW_AVX2 | 767 #ifdef HAS_ARGBTOYROW_AVX2 |
768 // vpermd for vphaddw + vpackuswb vpermd. | 768 // vpermd for vphaddw + vpackuswb vpermd. |
769 static const lvec32 kPermdARGBToY_AVX = { | 769 static const lvec32 kPermdARGBToY_AVX = { |
770 0, 4, 1, 5, 2, 6, 3, 7 | 770 0, 4, 1, 5, 2, 6, 3, 7 |
771 }; | 771 }; |
772 | 772 |
773 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 773 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
774 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | 774 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { |
775 asm volatile ( | 775 asm volatile ( |
776 "vbroadcastf128 %3,%%ymm4 \n" | 776 "vbroadcastf128 %3,%%ymm4 \n" |
777 "vbroadcastf128 %4,%%ymm5 \n" | 777 "vbroadcastf128 %4,%%ymm5 \n" |
778 "vmovdqu %5,%%ymm6 \n" | 778 "vmovdqu %5,%%ymm6 \n" |
779 LABELALIGN | 779 LABELALIGN |
780 "1: \n" | 780 "1: \n" |
781 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 781 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
782 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 782 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
783 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" | 783 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" |
784 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" | 784 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" |
785 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" | 785 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" |
786 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" | 786 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" |
787 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" | 787 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" |
788 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" | 788 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" |
789 "lea " MEMLEA(0x80,0) ",%0 \n" | 789 "lea " MEMLEA(0x80,0) ",%0 \n" |
790 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. | 790 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. |
791 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" | 791 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" |
792 "vpsrlw $0x7,%%ymm0,%%ymm0 \n" | 792 "vpsrlw $0x7,%%ymm0,%%ymm0 \n" |
793 "vpsrlw $0x7,%%ymm2,%%ymm2 \n" | 793 "vpsrlw $0x7,%%ymm2,%%ymm2 \n" |
794 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. | 794 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. |
795 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. | 795 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. |
796 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y | 796 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y |
797 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | 797 "vmovdqu %%ymm0," MEMACCESS(1) " \n" |
798 "lea " MEMLEA(0x20,1) ",%1 \n" | 798 "lea " MEMLEA(0x20,1) ",%1 \n" |
799 "sub $0x20,%2 \n" | 799 "sub $0x20,%2 \n" |
800 "jg 1b \n" | 800 "jg 1b \n" |
801 "vzeroupper \n" | 801 "vzeroupper \n" |
802 : "+r"(src_argb), // %0 | 802 : "+r"(src_argb), // %0 |
803 "+r"(dst_y), // %1 | 803 "+r"(dst_y), // %1 |
804 "+r"(pix) // %2 | 804 "+r"(width) // %2 |
805 : "m"(kARGBToY), // %3 | 805 : "m"(kARGBToY), // %3 |
806 "m"(kAddY16), // %4 | 806 "m"(kAddY16), // %4 |
807 "m"(kPermdARGBToY_AVX) // %5 | 807 "m"(kPermdARGBToY_AVX) // %5 |
808 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 808 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
809 ); | 809 ); |
810 } | 810 } |
811 #endif // HAS_ARGBTOYROW_AVX2 | 811 #endif // HAS_ARGBTOYROW_AVX2 |
812 | 812 |
813 #ifdef HAS_ARGBTOYJROW_AVX2 | 813 #ifdef HAS_ARGBTOYJROW_AVX2 |
814 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 814 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
815 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | 815 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { |
816 asm volatile ( | 816 asm volatile ( |
817 "vbroadcastf128 %3,%%ymm4 \n" | 817 "vbroadcastf128 %3,%%ymm4 \n" |
818 "vbroadcastf128 %4,%%ymm5 \n" | 818 "vbroadcastf128 %4,%%ymm5 \n" |
819 "vmovdqu %5,%%ymm6 \n" | 819 "vmovdqu %5,%%ymm6 \n" |
820 LABELALIGN | 820 LABELALIGN |
821 "1: \n" | 821 "1: \n" |
822 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 822 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
823 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 823 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
824 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" | 824 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" |
825 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" | 825 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" |
(...skipping 10 matching lines...)
836 "vpsrlw $0x7,%%ymm2,%%ymm2 \n" | 836 "vpsrlw $0x7,%%ymm2,%%ymm2 \n" |
837 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. | 837 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. |
838 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. | 838 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. |
839 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | 839 "vmovdqu %%ymm0," MEMACCESS(1) " \n" |
840 "lea " MEMLEA(0x20,1) ",%1 \n" | 840 "lea " MEMLEA(0x20,1) ",%1 \n" |
841 "sub $0x20,%2 \n" | 841 "sub $0x20,%2 \n" |
842 "jg 1b \n" | 842 "jg 1b \n" |
843 "vzeroupper \n" | 843 "vzeroupper \n" |
844 : "+r"(src_argb), // %0 | 844 : "+r"(src_argb), // %0 |
845 "+r"(dst_y), // %1 | 845 "+r"(dst_y), // %1 |
846 "+r"(pix) // %2 | 846 "+r"(width) // %2 |
847 : "m"(kARGBToYJ), // %3 | 847 : "m"(kARGBToYJ), // %3 |
848 "m"(kAddYJ64), // %4 | 848 "m"(kAddYJ64), // %4 |
849 "m"(kPermdARGBToY_AVX) // %5 | 849 "m"(kPermdARGBToY_AVX) // %5 |
850 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 850 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
851 ); | 851 ); |
852 } | 852 } |
853 #endif // HAS_ARGBTOYJROW_AVX2 | 853 #endif // HAS_ARGBTOYJROW_AVX2 |
854 | 854 |
855 #ifdef HAS_ARGBTOUVROW_SSSE3 | 855 #ifdef HAS_ARGBTOUVROW_SSSE3 |
856 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 856 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
(...skipping 290 matching lines...)
1147 "+rm"(width) // %3 | 1147 "+rm"(width) // %3 |
1148 : "m"(kARGBToV), // %4 | 1148 : "m"(kARGBToV), // %4 |
1149 "m"(kARGBToU), // %5 | 1149 "m"(kARGBToU), // %5 |
1150 "m"(kAddUV128) // %6 | 1150 "m"(kAddUV128) // %6 |
1151 : "memory", "cc", NACL_R14 | 1151 : "memory", "cc", NACL_R14 |
1152 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | 1152 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
1153 ); | 1153 ); |
1154 } | 1154 } |
1155 #endif // HAS_ARGBTOUV422ROW_SSSE3 | 1155 #endif // HAS_ARGBTOUV422ROW_SSSE3 |
1156 | 1156 |
1157 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { | 1157 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) { |
1158 asm volatile ( | 1158 asm volatile ( |
1159 "movdqa %4,%%xmm5 \n" | 1159 "movdqa %4,%%xmm5 \n" |
1160 "movdqa %3,%%xmm4 \n" | 1160 "movdqa %3,%%xmm4 \n" |
1161 LABELALIGN | 1161 LABELALIGN |
1162 "1: \n" | 1162 "1: \n" |
1163 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 1163 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
1164 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 1164 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
1165 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | 1165 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
1166 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | 1166 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
1167 "pmaddubsw %%xmm4,%%xmm0 \n" | 1167 "pmaddubsw %%xmm4,%%xmm0 \n" |
1168 "pmaddubsw %%xmm4,%%xmm1 \n" | 1168 "pmaddubsw %%xmm4,%%xmm1 \n" |
1169 "pmaddubsw %%xmm4,%%xmm2 \n" | 1169 "pmaddubsw %%xmm4,%%xmm2 \n" |
1170 "pmaddubsw %%xmm4,%%xmm3 \n" | 1170 "pmaddubsw %%xmm4,%%xmm3 \n" |
1171 "lea " MEMLEA(0x40,0) ",%0 \n" | 1171 "lea " MEMLEA(0x40,0) ",%0 \n" |
1172 "phaddw %%xmm1,%%xmm0 \n" | 1172 "phaddw %%xmm1,%%xmm0 \n" |
1173 "phaddw %%xmm3,%%xmm2 \n" | 1173 "phaddw %%xmm3,%%xmm2 \n" |
1174 "psrlw $0x7,%%xmm0 \n" | 1174 "psrlw $0x7,%%xmm0 \n" |
1175 "psrlw $0x7,%%xmm2 \n" | 1175 "psrlw $0x7,%%xmm2 \n" |
1176 "packuswb %%xmm2,%%xmm0 \n" | 1176 "packuswb %%xmm2,%%xmm0 \n" |
1177 "paddb %%xmm5,%%xmm0 \n" | 1177 "paddb %%xmm5,%%xmm0 \n" |
1178 "movdqu %%xmm0," MEMACCESS(1) " \n" | 1178 "movdqu %%xmm0," MEMACCESS(1) " \n" |
1179 "lea " MEMLEA(0x10,1) ",%1 \n" | 1179 "lea " MEMLEA(0x10,1) ",%1 \n" |
1180 "sub $0x10,%2 \n" | 1180 "sub $0x10,%2 \n" |
1181 "jg 1b \n" | 1181 "jg 1b \n" |
1182 : "+r"(src_bgra), // %0 | 1182 : "+r"(src_bgra), // %0 |
1183 "+r"(dst_y), // %1 | 1183 "+r"(dst_y), // %1 |
1184 "+r"(pix) // %2 | 1184 "+r"(width) // %2 |
1185 : "m"(kBGRAToY), // %3 | 1185 : "m"(kBGRAToY), // %3 |
1186 "m"(kAddY16) // %4 | 1186 "m"(kAddY16) // %4 |
1187 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1187 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1188 ); | 1188 ); |
1189 } | 1189 } |
1190 | 1190 |
1191 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, | 1191 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, |
1192 uint8* dst_u, uint8* dst_v, int width) { | 1192 uint8* dst_u, uint8* dst_v, int width) { |
1193 asm volatile ( | 1193 asm volatile ( |
1194 "movdqa %5,%%xmm3 \n" | 1194 "movdqa %5,%%xmm3 \n" |
(...skipping 47 matching lines...)
1242 "+rm"(width) // %3 | 1242 "+rm"(width) // %3 |
1243 : "r"((intptr_t)(src_stride_bgra)), // %4 | 1243 : "r"((intptr_t)(src_stride_bgra)), // %4 |
1244 "m"(kBGRAToV), // %5 | 1244 "m"(kBGRAToV), // %5 |
1245 "m"(kBGRAToU), // %6 | 1245 "m"(kBGRAToU), // %6 |
1246 "m"(kAddUV128) // %7 | 1246 "m"(kAddUV128) // %7 |
1247 : "memory", "cc", NACL_R14 | 1247 : "memory", "cc", NACL_R14 |
1248 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | 1248 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
1249 ); | 1249 ); |
1250 } | 1250 } |
1251 | 1251 |
1252 void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { | 1252 void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) { |
1253 asm volatile ( | 1253 asm volatile ( |
1254 "movdqa %4,%%xmm5 \n" | 1254 "movdqa %4,%%xmm5 \n" |
1255 "movdqa %3,%%xmm4 \n" | 1255 "movdqa %3,%%xmm4 \n" |
1256 LABELALIGN | 1256 LABELALIGN |
1257 "1: \n" | 1257 "1: \n" |
1258 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 1258 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
1259 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 1259 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
1260 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | 1260 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
1261 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | 1261 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
1262 "pmaddubsw %%xmm4,%%xmm0 \n" | 1262 "pmaddubsw %%xmm4,%%xmm0 \n" |
1263 "pmaddubsw %%xmm4,%%xmm1 \n" | 1263 "pmaddubsw %%xmm4,%%xmm1 \n" |
1264 "pmaddubsw %%xmm4,%%xmm2 \n" | 1264 "pmaddubsw %%xmm4,%%xmm2 \n" |
1265 "pmaddubsw %%xmm4,%%xmm3 \n" | 1265 "pmaddubsw %%xmm4,%%xmm3 \n" |
1266 "lea " MEMLEA(0x40,0) ",%0 \n" | 1266 "lea " MEMLEA(0x40,0) ",%0 \n" |
1267 "phaddw %%xmm1,%%xmm0 \n" | 1267 "phaddw %%xmm1,%%xmm0 \n" |
1268 "phaddw %%xmm3,%%xmm2 \n" | 1268 "phaddw %%xmm3,%%xmm2 \n" |
1269 "psrlw $0x7,%%xmm0 \n" | 1269 "psrlw $0x7,%%xmm0 \n" |
1270 "psrlw $0x7,%%xmm2 \n" | 1270 "psrlw $0x7,%%xmm2 \n" |
1271 "packuswb %%xmm2,%%xmm0 \n" | 1271 "packuswb %%xmm2,%%xmm0 \n" |
1272 "paddb %%xmm5,%%xmm0 \n" | 1272 "paddb %%xmm5,%%xmm0 \n" |
1273 "movdqu %%xmm0," MEMACCESS(1) " \n" | 1273 "movdqu %%xmm0," MEMACCESS(1) " \n" |
1274 "lea " MEMLEA(0x10,1) ",%1 \n" | 1274 "lea " MEMLEA(0x10,1) ",%1 \n" |
1275 "sub $0x10,%2 \n" | 1275 "sub $0x10,%2 \n" |
1276 "jg 1b \n" | 1276 "jg 1b \n" |
1277 : "+r"(src_abgr), // %0 | 1277 : "+r"(src_abgr), // %0 |
1278 "+r"(dst_y), // %1 | 1278 "+r"(dst_y), // %1 |
1279 "+r"(pix) // %2 | 1279 "+r"(width) // %2 |
1280 : "m"(kABGRToY), // %3 | 1280 : "m"(kABGRToY), // %3 |
1281 "m"(kAddY16) // %4 | 1281 "m"(kAddY16) // %4 |
1282 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1282 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1283 ); | 1283 ); |
1284 } | 1284 } |
1285 | 1285 |
1286 void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { | 1286 void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) { |
1287 asm volatile ( | 1287 asm volatile ( |
1288 "movdqa %4,%%xmm5 \n" | 1288 "movdqa %4,%%xmm5 \n" |
1289 "movdqa %3,%%xmm4 \n" | 1289 "movdqa %3,%%xmm4 \n" |
1290 LABELALIGN | 1290 LABELALIGN |
1291 "1: \n" | 1291 "1: \n" |
1292 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 1292 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
1293 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 1293 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
1294 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | 1294 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
1295 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | 1295 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
1296 "pmaddubsw %%xmm4,%%xmm0 \n" | 1296 "pmaddubsw %%xmm4,%%xmm0 \n" |
1297 "pmaddubsw %%xmm4,%%xmm1 \n" | 1297 "pmaddubsw %%xmm4,%%xmm1 \n" |
1298 "pmaddubsw %%xmm4,%%xmm2 \n" | 1298 "pmaddubsw %%xmm4,%%xmm2 \n" |
1299 "pmaddubsw %%xmm4,%%xmm3 \n" | 1299 "pmaddubsw %%xmm4,%%xmm3 \n" |
1300 "lea " MEMLEA(0x40,0) ",%0 \n" | 1300 "lea " MEMLEA(0x40,0) ",%0 \n" |
1301 "phaddw %%xmm1,%%xmm0 \n" | 1301 "phaddw %%xmm1,%%xmm0 \n" |
1302 "phaddw %%xmm3,%%xmm2 \n" | 1302 "phaddw %%xmm3,%%xmm2 \n" |
1303 "psrlw $0x7,%%xmm0 \n" | 1303 "psrlw $0x7,%%xmm0 \n" |
1304 "psrlw $0x7,%%xmm2 \n" | 1304 "psrlw $0x7,%%xmm2 \n" |
1305 "packuswb %%xmm2,%%xmm0 \n" | 1305 "packuswb %%xmm2,%%xmm0 \n" |
1306 "paddb %%xmm5,%%xmm0 \n" | 1306 "paddb %%xmm5,%%xmm0 \n" |
1307 "movdqu %%xmm0," MEMACCESS(1) " \n" | 1307 "movdqu %%xmm0," MEMACCESS(1) " \n" |
1308 "lea " MEMLEA(0x10,1) ",%1 \n" | 1308 "lea " MEMLEA(0x10,1) ",%1 \n" |
1309 "sub $0x10,%2 \n" | 1309 "sub $0x10,%2 \n" |
1310 "jg 1b \n" | 1310 "jg 1b \n" |
1311 : "+r"(src_rgba), // %0 | 1311 : "+r"(src_rgba), // %0 |
1312 "+r"(dst_y), // %1 | 1312 "+r"(dst_y), // %1 |
1313 "+r"(pix) // %2 | 1313 "+r"(width) // %2 |
1314 : "m"(kRGBAToY), // %3 | 1314 : "m"(kRGBAToY), // %3 |
1315 "m"(kAddY16) // %4 | 1315 "m"(kAddY16) // %4 |
1316 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1316 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1317 ); | 1317 ); |
1318 } | 1318 } |
1319 | 1319 |
1320 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, | 1320 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, |
1321 uint8* dst_u, uint8* dst_v, int width) { | 1321 uint8* dst_u, uint8* dst_v, int width) { |
1322 asm volatile ( | 1322 asm volatile ( |
1323 "movdqa %5,%%xmm3 \n" | 1323 "movdqa %5,%%xmm3 \n" |
(...skipping 1384 matching lines...)
2708 "+r"(dst), // %1 | 2708 "+r"(dst), // %1 |
2709 "+r"(temp_width) // %2 | 2709 "+r"(temp_width) // %2 |
2710 : "m"(kARGBShuffleMirror_AVX2) // %3 | 2710 : "m"(kARGBShuffleMirror_AVX2) // %3 |
2711 : "memory", "cc", NACL_R14 | 2711 : "memory", "cc", NACL_R14 |
2712 "xmm0", "xmm5" | 2712 "xmm0", "xmm5" |
2713 ); | 2713 ); |
2714 } | 2714 } |
2715 #endif // HAS_ARGBMIRRORROW_AVX2 | 2715 #endif // HAS_ARGBMIRRORROW_AVX2 |
2716 | 2716 |
2717 #ifdef HAS_SPLITUVROW_AVX2 | 2717 #ifdef HAS_SPLITUVROW_AVX2 |
2718 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | 2718 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
2719 asm volatile ( | 2719 asm volatile ( |
2720 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2720 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2721 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 2721 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
2722 "sub %1,%2 \n" | 2722 "sub %1,%2 \n" |
2723 LABELALIGN | 2723 LABELALIGN |
2724 "1: \n" | 2724 "1: \n" |
2725 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 2725 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
2726 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 2726 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
2727 "lea " MEMLEA(0x40,0) ",%0 \n" | 2727 "lea " MEMLEA(0x40,0) ",%0 \n" |
2728 "vpsrlw $0x8,%%ymm0,%%ymm2 \n" | 2728 "vpsrlw $0x8,%%ymm0,%%ymm2 \n" |
2729 "vpsrlw $0x8,%%ymm1,%%ymm3 \n" | 2729 "vpsrlw $0x8,%%ymm1,%%ymm3 \n" |
2730 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | 2730 "vpand %%ymm5,%%ymm0,%%ymm0 \n" |
2731 "vpand %%ymm5,%%ymm1,%%ymm1 \n" | 2731 "vpand %%ymm5,%%ymm1,%%ymm1 \n" |
2732 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | 2732 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" |
2733 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" | 2733 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" |
2734 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 2734 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
2735 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | 2735 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
2736 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | 2736 "vmovdqu %%ymm0," MEMACCESS(1) " \n" |
2737 MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) | 2737 MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) |
2738 "lea " MEMLEA(0x20,1) ",%1 \n" | 2738 "lea " MEMLEA(0x20,1) ",%1 \n" |
2739 "sub $0x20,%3 \n" | 2739 "sub $0x20,%3 \n" |
2740 "jg 1b \n" | 2740 "jg 1b \n" |
2741 "vzeroupper \n" | 2741 "vzeroupper \n" |
2742 : "+r"(src_uv), // %0 | 2742 : "+r"(src_uv), // %0 |
2743 "+r"(dst_u), // %1 | 2743 "+r"(dst_u), // %1 |
2744 "+r"(dst_v), // %2 | 2744 "+r"(dst_v), // %2 |
2745 "+r"(pix) // %3 | 2745 "+r"(width) // %3 |
2746 : | 2746 : |
2747 : "memory", "cc", NACL_R14 | 2747 : "memory", "cc", NACL_R14 |
2748 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 2748 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
2749 ); | 2749 ); |
2750 } | 2750 } |
2751 #endif // HAS_SPLITUVROW_AVX2 | 2751 #endif // HAS_SPLITUVROW_AVX2 |
2752 | 2752 |
2753 #ifdef HAS_SPLITUVROW_SSE2 | 2753 #ifdef HAS_SPLITUVROW_SSE2 |
2754 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | 2754 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
2755 asm volatile ( | 2755 asm volatile ( |
2756 "pcmpeqb %%xmm5,%%xmm5 \n" | 2756 "pcmpeqb %%xmm5,%%xmm5 \n" |
2757 "psrlw $0x8,%%xmm5 \n" | 2757 "psrlw $0x8,%%xmm5 \n" |
2758 "sub %1,%2 \n" | 2758 "sub %1,%2 \n" |
2759 LABELALIGN | 2759 LABELALIGN |
2760 "1: \n" | 2760 "1: \n" |
2761 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 2761 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
2762 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 2762 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
2763 "lea " MEMLEA(0x20,0) ",%0 \n" | 2763 "lea " MEMLEA(0x20,0) ",%0 \n" |
2764 "movdqa %%xmm0,%%xmm2 \n" | 2764 "movdqa %%xmm0,%%xmm2 \n" |
2765 "movdqa %%xmm1,%%xmm3 \n" | 2765 "movdqa %%xmm1,%%xmm3 \n" |
2766 "pand %%xmm5,%%xmm0 \n" | 2766 "pand %%xmm5,%%xmm0 \n" |
2767 "pand %%xmm5,%%xmm1 \n" | 2767 "pand %%xmm5,%%xmm1 \n" |
2768 "packuswb %%xmm1,%%xmm0 \n" | 2768 "packuswb %%xmm1,%%xmm0 \n" |
2769 "psrlw $0x8,%%xmm2 \n" | 2769 "psrlw $0x8,%%xmm2 \n" |
2770 "psrlw $0x8,%%xmm3 \n" | 2770 "psrlw $0x8,%%xmm3 \n" |
2771 "packuswb %%xmm3,%%xmm2 \n" | 2771 "packuswb %%xmm3,%%xmm2 \n" |
2772 "movdqu %%xmm0," MEMACCESS(1) " \n" | 2772 "movdqu %%xmm0," MEMACCESS(1) " \n" |
2773 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) | 2773 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) |
2774 "lea " MEMLEA(0x10,1) ",%1 \n" | 2774 "lea " MEMLEA(0x10,1) ",%1 \n" |
2775 "sub $0x10,%3 \n" | 2775 "sub $0x10,%3 \n" |
2776 "jg 1b \n" | 2776 "jg 1b \n" |
2777 : "+r"(src_uv), // %0 | 2777 : "+r"(src_uv), // %0 |
2778 "+r"(dst_u), // %1 | 2778 "+r"(dst_u), // %1 |
2779 "+r"(dst_v), // %2 | 2779 "+r"(dst_v), // %2 |
2780 "+r"(pix) // %3 | 2780 "+r"(width) // %3 |
2781 : | 2781 : |
2782 : "memory", "cc", NACL_R14 | 2782 : "memory", "cc", NACL_R14 |
2783 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 2783 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
2784 ); | 2784 ); |
2785 } | 2785 } |
2786 #endif // HAS_SPLITUVROW_SSE2 | 2786 #endif // HAS_SPLITUVROW_SSE2 |
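
Both variants de-interleave the same way: pand against a 0x00ff word mask keeps the U (even) bytes, psrlw $0x8 exposes the V (odd) bytes, and packuswb narrows each back to bytes. Scalar sketch (illustrative):

    #include <stdint.h>

    // Split packed UVUV... into separate U and V planes (e.g. NV12 chroma).
    static void SplitUVRow_C_sketch(const uint8_t* src_uv, uint8_t* dst_u,
                                    uint8_t* dst_v, int width) {
      for (int x = 0; x < width; ++x) {
        dst_u[x] = src_uv[2 * x + 0];
        dst_v[x] = src_uv[2 * x + 1];
      }
    }
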
2787 | 2787 |
2788 #ifdef HAS_MERGEUVROW_AVX2 | 2788 #ifdef HAS_MERGEUVROW_AVX2 |
2789 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 2789 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
2790 int width) { | 2790 int width) { |
(...skipping 276 matching lines...)
3067 asm volatile ( | 3067 asm volatile ( |
3068 "rep stosl " MEMSTORESTRING(eax,0) " \n" | 3068 "rep stosl " MEMSTORESTRING(eax,0) " \n" |
3069 : "+D"(dst_argb), // %0 | 3069 : "+D"(dst_argb), // %0 |
3070 "+c"(width_tmp) // %1 | 3070 "+c"(width_tmp) // %1 |
3071 : "a"(v32) // %2 | 3071 : "a"(v32) // %2 |
3072 : "memory", "cc"); | 3072 : "memory", "cc"); |
3073 } | 3073 } |
3074 #endif // HAS_SETROW_X86 | 3074 #endif // HAS_SETROW_X86 |
3075 | 3075 |
3076 #ifdef HAS_YUY2TOYROW_SSE2 | 3076 #ifdef HAS_YUY2TOYROW_SSE2 |
3077 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { | 3077 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) { |
3078 asm volatile ( | 3078 asm volatile ( |
3079 "pcmpeqb %%xmm5,%%xmm5 \n" | 3079 "pcmpeqb %%xmm5,%%xmm5 \n" |
3080 "psrlw $0x8,%%xmm5 \n" | 3080 "psrlw $0x8,%%xmm5 \n" |
3081 LABELALIGN | 3081 LABELALIGN |
3082 "1: \n" | 3082 "1: \n" |
3083 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3083 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
3084 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3084 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
3085 "lea " MEMLEA(0x20,0) ",%0 \n" | 3085 "lea " MEMLEA(0x20,0) ",%0 \n" |
3086 "pand %%xmm5,%%xmm0 \n" | 3086 "pand %%xmm5,%%xmm0 \n" |
3087 "pand %%xmm5,%%xmm1 \n" | 3087 "pand %%xmm5,%%xmm1 \n" |
3088 "packuswb %%xmm1,%%xmm0 \n" | 3088 "packuswb %%xmm1,%%xmm0 \n" |
3089 "movdqu %%xmm0," MEMACCESS(1) " \n" | 3089 "movdqu %%xmm0," MEMACCESS(1) " \n" |
3090 "lea " MEMLEA(0x10,1) ",%1 \n" | 3090 "lea " MEMLEA(0x10,1) ",%1 \n" |
3091 "sub $0x10,%2 \n" | 3091 "sub $0x10,%2 \n" |
3092 "jg 1b \n" | 3092 "jg 1b \n" |
3093 : "+r"(src_yuy2), // %0 | 3093 : "+r"(src_yuy2), // %0 |
3094 "+r"(dst_y), // %1 | 3094 "+r"(dst_y), // %1 |
3095 "+r"(pix) // %2 | 3095 "+r"(width) // %2 |
3096 : | 3096 : |
3097 : "memory", "cc" | 3097 : "memory", "cc" |
3098 , "xmm0", "xmm1", "xmm5" | 3098 , "xmm0", "xmm1", "xmm5" |
3099 ); | 3099 ); |
3100 } | 3100 } |
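
YUY2 packs pixels as Y0 U Y1 V, so luma is every even byte; the 0x00ff word mask plus packuswb extracts 16 pixels per iteration. Scalar sketch (illustrative):

    #include <stdint.h>

    // YUY2 to Y: keep the even bytes (Y0 U Y1 V -> Y0 Y1).
    static void YUY2ToYRow_C_sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                                    int width) {
      for (int x = 0; x < width; ++x) {
        dst_y[x] = src_yuy2[2 * x];
      }
    }
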
3101 | 3101 |
3102 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, | 3102 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, |
3103 uint8* dst_u, uint8* dst_v, int pix) { | 3103 uint8* dst_u, uint8* dst_v, int width) { |
3104 asm volatile ( | 3104 asm volatile ( |
3105 "pcmpeqb %%xmm5,%%xmm5 \n" | 3105 "pcmpeqb %%xmm5,%%xmm5 \n" |
3106 "psrlw $0x8,%%xmm5 \n" | 3106 "psrlw $0x8,%%xmm5 \n" |
3107 "sub %1,%2 \n" | 3107 "sub %1,%2 \n" |
3108 LABELALIGN | 3108 LABELALIGN |
3109 "1: \n" | 3109 "1: \n" |
3110 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3110 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
3111 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3111 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
3112 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 | 3112 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 |
3113 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 | 3113 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 |
3114 "lea " MEMLEA(0x20,0) ",%0 \n" | 3114 "lea " MEMLEA(0x20,0) ",%0 \n" |
3115 "pavgb %%xmm2,%%xmm0 \n" | 3115 "pavgb %%xmm2,%%xmm0 \n" |
3116 "pavgb %%xmm3,%%xmm1 \n" | 3116 "pavgb %%xmm3,%%xmm1 \n" |
3117 "psrlw $0x8,%%xmm0 \n" | 3117 "psrlw $0x8,%%xmm0 \n" |
3118 "psrlw $0x8,%%xmm1 \n" | 3118 "psrlw $0x8,%%xmm1 \n" |
3119 "packuswb %%xmm1,%%xmm0 \n" | 3119 "packuswb %%xmm1,%%xmm0 \n" |
3120 "movdqa %%xmm0,%%xmm1 \n" | 3120 "movdqa %%xmm0,%%xmm1 \n" |
3121 "pand %%xmm5,%%xmm0 \n" | 3121 "pand %%xmm5,%%xmm0 \n" |
3122 "packuswb %%xmm0,%%xmm0 \n" | 3122 "packuswb %%xmm0,%%xmm0 \n" |
3123 "psrlw $0x8,%%xmm1 \n" | 3123 "psrlw $0x8,%%xmm1 \n" |
3124 "packuswb %%xmm1,%%xmm1 \n" | 3124 "packuswb %%xmm1,%%xmm1 \n" |
3125 "movq %%xmm0," MEMACCESS(1) " \n" | 3125 "movq %%xmm0," MEMACCESS(1) " \n" |
3126 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) | 3126 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) |
3127 "lea " MEMLEA(0x8,1) ",%1 \n" | 3127 "lea " MEMLEA(0x8,1) ",%1 \n" |
3128 "sub $0x10,%3 \n" | 3128 "sub $0x10,%3 \n" |
3129 "jg 1b \n" | 3129 "jg 1b \n" |
3130 : "+r"(src_yuy2), // %0 | 3130 : "+r"(src_yuy2), // %0 |
3131 "+r"(dst_u), // %1 | 3131 "+r"(dst_u), // %1 |
3132 "+r"(dst_v), // %2 | 3132 "+r"(dst_v), // %2 |
3133 "+r"(pix) // %3 | 3133 "+r"(width) // %3 |
3134 : "r"((intptr_t)(stride_yuy2)) // %4 | 3134 : "r"((intptr_t)(stride_yuy2)) // %4 |
3135 : "memory", "cc", NACL_R14 | 3135 : "memory", "cc", NACL_R14 |
3136 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 3136 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
3137 ); | 3137 ); |
3138 } | 3138 } |
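
Chroma sits at the odd bytes (U at 4x+1, V at 4x+3); the pavgb against the row at %4 (stride_yuy2) averages two rows with rounding, yielding one UV pair per 2x2 block. Scalar sketch (illustrative; YUY2ToUV422Row below is identical minus the vertical average):

    #include <stdint.h>

    // YUY2 to U and V planes, averaging two rows like pavgb: (a + b + 1) >> 1.
    static void YUY2ToUVRow_C_sketch(const uint8_t* src_yuy2, int stride_yuy2,
                                     uint8_t* dst_u, uint8_t* dst_v, int width) {
      const uint8_t* next = src_yuy2 + stride_yuy2;
      for (int x = 0; x < width; x += 2) {
        *dst_u++ = (uint8_t)((src_yuy2[1] + next[1] + 1) >> 1);
        *dst_v++ = (uint8_t)((src_yuy2[3] + next[3] + 1) >> 1);
        src_yuy2 += 4;
        next += 4;
      }
    }
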
3139 | 3139 |
3140 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, | 3140 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, |
3141 uint8* dst_u, uint8* dst_v, int pix) { | 3141 uint8* dst_u, uint8* dst_v, int width) { |
3142 asm volatile ( | 3142 asm volatile ( |
3143 "pcmpeqb %%xmm5,%%xmm5 \n" | 3143 "pcmpeqb %%xmm5,%%xmm5 \n" |
3144 "psrlw $0x8,%%xmm5 \n" | 3144 "psrlw $0x8,%%xmm5 \n" |
3145 "sub %1,%2 \n" | 3145 "sub %1,%2 \n" |
3146 LABELALIGN | 3146 LABELALIGN |
3147 "1: \n" | 3147 "1: \n" |
3148 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3148 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
3149 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3149 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
3150 "lea " MEMLEA(0x20,0) ",%0 \n" | 3150 "lea " MEMLEA(0x20,0) ",%0 \n" |
3151 "psrlw $0x8,%%xmm0 \n" | 3151 "psrlw $0x8,%%xmm0 \n" |
3152 "psrlw $0x8,%%xmm1 \n" | 3152 "psrlw $0x8,%%xmm1 \n" |
3153 "packuswb %%xmm1,%%xmm0 \n" | 3153 "packuswb %%xmm1,%%xmm0 \n" |
3154 "movdqa %%xmm0,%%xmm1 \n" | 3154 "movdqa %%xmm0,%%xmm1 \n" |
3155 "pand %%xmm5,%%xmm0 \n" | 3155 "pand %%xmm5,%%xmm0 \n" |
3156 "packuswb %%xmm0,%%xmm0 \n" | 3156 "packuswb %%xmm0,%%xmm0 \n" |
3157 "psrlw $0x8,%%xmm1 \n" | 3157 "psrlw $0x8,%%xmm1 \n" |
3158 "packuswb %%xmm1,%%xmm1 \n" | 3158 "packuswb %%xmm1,%%xmm1 \n" |
3159 "movq %%xmm0," MEMACCESS(1) " \n" | 3159 "movq %%xmm0," MEMACCESS(1) " \n" |
3160 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) | 3160 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) |
3161 "lea " MEMLEA(0x8,1) ",%1 \n" | 3161 "lea " MEMLEA(0x8,1) ",%1 \n" |
3162 "sub $0x10,%3 \n" | 3162 "sub $0x10,%3 \n" |
3163 "jg 1b \n" | 3163 "jg 1b \n" |
3164 : "+r"(src_yuy2), // %0 | 3164 : "+r"(src_yuy2), // %0 |
3165 "+r"(dst_u), // %1 | 3165 "+r"(dst_u), // %1 |
3166 "+r"(dst_v), // %2 | 3166 "+r"(dst_v), // %2 |
3167 "+r"(pix) // %3 | 3167 "+r"(width) // %3 |
3168 : | 3168 : |
3169 : "memory", "cc", NACL_R14 | 3169 : "memory", "cc", NACL_R14 |
3170 "xmm0", "xmm1", "xmm5" | 3170 "xmm0", "xmm1", "xmm5" |
3171 ); | 3171 ); |
3172 } | 3172 } |
3173 | 3173 |
3174 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { | 3174 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) { |
3175 asm volatile ( | 3175 asm volatile ( |
3176 LABELALIGN | 3176 LABELALIGN |
3177 "1: \n" | 3177 "1: \n" |
3178 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3178 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
3179 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3179 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
3180 "lea " MEMLEA(0x20,0) ",%0 \n" | 3180 "lea " MEMLEA(0x20,0) ",%0 \n" |
3181 "psrlw $0x8,%%xmm0 \n" | 3181 "psrlw $0x8,%%xmm0 \n" |
3182 "psrlw $0x8,%%xmm1 \n" | 3182 "psrlw $0x8,%%xmm1 \n" |
3183 "packuswb %%xmm1,%%xmm0 \n" | 3183 "packuswb %%xmm1,%%xmm0 \n" |
3184 "movdqu %%xmm0," MEMACCESS(1) " \n" | 3184 "movdqu %%xmm0," MEMACCESS(1) " \n" |
3185 "lea " MEMLEA(0x10,1) ",%1 \n" | 3185 "lea " MEMLEA(0x10,1) ",%1 \n" |
3186 "sub $0x10,%2 \n" | 3186 "sub $0x10,%2 \n" |
3187 "jg 1b \n" | 3187 "jg 1b \n" |
3188 : "+r"(src_uyvy), // %0 | 3188 : "+r"(src_uyvy), // %0 |
3189 "+r"(dst_y), // %1 | 3189 "+r"(dst_y), // %1 |
3190 "+r"(pix) // %2 | 3190 "+r"(width) // %2 |
3191 : | 3191 : |
3192 : "memory", "cc" | 3192 : "memory", "cc" |
3193 , "xmm0", "xmm1" | 3193 , "xmm0", "xmm1" |
3194 ); | 3194 ); |
3195 } | 3195 } |
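Same idea for UYVY, where the byte order is U Y0 V Y1: luma sits in the odd bytes, which is exactly what the psrlw $0x8 / packuswb pair above selects. A hypothetical scalar equivalent:

    #include <stdint.h>
    // Hypothetical scalar sketch: UYVY -> Y plane (take every odd byte).
    static void UYVYToYRow_C_Sketch(const uint8_t* src_uyvy, uint8_t* dst_y,
                                    int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst_y[x] = src_uyvy[x * 2 + 1];  // U Y0 V Y1 ... -> Y0, Y1, ...
      }
    }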
3196 | 3196 |
3197 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, | 3197 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, |
3198 uint8* dst_u, uint8* dst_v, int pix) { | 3198 uint8* dst_u, uint8* dst_v, int width) { |
3199 asm volatile ( | 3199 asm volatile ( |
3200 "pcmpeqb %%xmm5,%%xmm5 \n" | 3200 "pcmpeqb %%xmm5,%%xmm5 \n" |
3201 "psrlw $0x8,%%xmm5 \n" | 3201 "psrlw $0x8,%%xmm5 \n" |
3202 "sub %1,%2 \n" | 3202 "sub %1,%2 \n" |
3203 LABELALIGN | 3203 LABELALIGN |
3204 "1: \n" | 3204 "1: \n" |
3205 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3205 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
3206 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3206 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
3207 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 | 3207 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 |
3208 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 | 3208 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 |
3209 "lea " MEMLEA(0x20,0) ",%0 \n" | 3209 "lea " MEMLEA(0x20,0) ",%0 \n" |
3210 "pavgb %%xmm2,%%xmm0 \n" | 3210 "pavgb %%xmm2,%%xmm0 \n" |
3211 "pavgb %%xmm3,%%xmm1 \n" | 3211 "pavgb %%xmm3,%%xmm1 \n" |
3212 "pand %%xmm5,%%xmm0 \n" | 3212 "pand %%xmm5,%%xmm0 \n" |
3213 "pand %%xmm5,%%xmm1 \n" | 3213 "pand %%xmm5,%%xmm1 \n" |
3214 "packuswb %%xmm1,%%xmm0 \n" | 3214 "packuswb %%xmm1,%%xmm0 \n" |
3215 "movdqa %%xmm0,%%xmm1 \n" | 3215 "movdqa %%xmm0,%%xmm1 \n" |
3216 "pand %%xmm5,%%xmm0 \n" | 3216 "pand %%xmm5,%%xmm0 \n" |
3217 "packuswb %%xmm0,%%xmm0 \n" | 3217 "packuswb %%xmm0,%%xmm0 \n" |
3218 "psrlw $0x8,%%xmm1 \n" | 3218 "psrlw $0x8,%%xmm1 \n" |
3219 "packuswb %%xmm1,%%xmm1 \n" | 3219 "packuswb %%xmm1,%%xmm1 \n" |
3220 "movq %%xmm0," MEMACCESS(1) " \n" | 3220 "movq %%xmm0," MEMACCESS(1) " \n" |
3221 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) | 3221 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) |
3222 "lea " MEMLEA(0x8,1) ",%1 \n" | 3222 "lea " MEMLEA(0x8,1) ",%1 \n" |
3223 "sub $0x10,%3 \n" | 3223 "sub $0x10,%3 \n" |
3224 "jg 1b \n" | 3224 "jg 1b \n" |
3225 : "+r"(src_uyvy), // %0 | 3225 : "+r"(src_uyvy), // %0 |
3226 "+r"(dst_u), // %1 | 3226 "+r"(dst_u), // %1 |
3227 "+r"(dst_v), // %2 | 3227 "+r"(dst_v), // %2 |
3228 "+r"(pix) // %3 | 3228 "+r"(width) // %3 |
3229 : "r"((intptr_t)(stride_uyvy)) // %4 | 3229 : "r"((intptr_t)(stride_uyvy)) // %4 |
3230 : "memory", "cc", NACL_R14 | 3230 : "memory", "cc", NACL_R14 |
3231 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 3231 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
3232 ); | 3232 ); |
3233 } | 3233 } |
3234 | 3234 |
3235 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, | 3235 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, |
3236 uint8* dst_u, uint8* dst_v, int pix) { | 3236 uint8* dst_u, uint8* dst_v, int width) { |
3237 asm volatile ( | 3237 asm volatile ( |
3238 "pcmpeqb %%xmm5,%%xmm5 \n" | 3238 "pcmpeqb %%xmm5,%%xmm5 \n" |
3239 "psrlw $0x8,%%xmm5 \n" | 3239 "psrlw $0x8,%%xmm5 \n" |
3240 "sub %1,%2 \n" | 3240 "sub %1,%2 \n" |
3241 LABELALIGN | 3241 LABELALIGN |
3242 "1: \n" | 3242 "1: \n" |
3243 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3243 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
3244 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3244 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
3245 "lea " MEMLEA(0x20,0) ",%0 \n" | 3245 "lea " MEMLEA(0x20,0) ",%0 \n" |
3246 "pand %%xmm5,%%xmm0 \n" | 3246 "pand %%xmm5,%%xmm0 \n" |
3247 "pand %%xmm5,%%xmm1 \n" | 3247 "pand %%xmm5,%%xmm1 \n" |
3248 "packuswb %%xmm1,%%xmm0 \n" | 3248 "packuswb %%xmm1,%%xmm0 \n" |
3249 "movdqa %%xmm0,%%xmm1 \n" | 3249 "movdqa %%xmm0,%%xmm1 \n" |
3250 "pand %%xmm5,%%xmm0 \n" | 3250 "pand %%xmm5,%%xmm0 \n" |
3251 "packuswb %%xmm0,%%xmm0 \n" | 3251 "packuswb %%xmm0,%%xmm0 \n" |
3252 "psrlw $0x8,%%xmm1 \n" | 3252 "psrlw $0x8,%%xmm1 \n" |
3253 "packuswb %%xmm1,%%xmm1 \n" | 3253 "packuswb %%xmm1,%%xmm1 \n" |
3254 "movq %%xmm0," MEMACCESS(1) " \n" | 3254 "movq %%xmm0," MEMACCESS(1) " \n" |
3255 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) | 3255 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) |
3256 "lea " MEMLEA(0x8,1) ",%1 \n" | 3256 "lea " MEMLEA(0x8,1) ",%1 \n" |
3257 "sub $0x10,%3 \n" | 3257 "sub $0x10,%3 \n" |
3258 "jg 1b \n" | 3258 "jg 1b \n" |
3259 : "+r"(src_uyvy), // %0 | 3259 : "+r"(src_uyvy), // %0 |
3260 "+r"(dst_u), // %1 | 3260 "+r"(dst_u), // %1 |
3261 "+r"(dst_v), // %2 | 3261 "+r"(dst_v), // %2 |
3262 "+r"(pix) // %3 | 3262 "+r"(width) // %3 |
3263 : | 3263 : |
3264 : "memory", "cc", NACL_R14 | 3264 : "memory", "cc", NACL_R14 |
3265 "xmm0", "xmm1", "xmm5" | 3265 "xmm0", "xmm1", "xmm5" |
3266 ); | 3266 ); |
3267 } | 3267 } |
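UYVYToUV422Row takes no stride, so chroma passes straight through from one row; the pand with the 0x00ff mask in xmm5 keeps the even bytes, which are U and V in UYVY. A hypothetical scalar equivalent:

    #include <stdint.h>
    // Hypothetical scalar sketch: single-row UYVY -> planar U and V at half width.
    static void UYVYToUV422Row_C_Sketch(const uint8_t* src_uyvy,
                                        uint8_t* dst_u, uint8_t* dst_v,
                                        int width) {
      int x;
      for (x = 0; x < width; x += 2) {
        *dst_u++ = src_uyvy[0];  // U shared by the pixel pair
        *dst_v++ = src_uyvy[2];  // V shared by the pixel pair
        src_uyvy += 4;
      }
    }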
3268 #endif // HAS_YUY2TOYROW_SSE2 | 3268 #endif // HAS_YUY2TOYROW_SSE2 |
3269 | 3269 |
3270 #ifdef HAS_YUY2TOYROW_AVX2 | 3270 #ifdef HAS_YUY2TOYROW_AVX2 |
3271 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { | 3271 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { |
3272 asm volatile ( | 3272 asm volatile ( |
3273 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3273 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
3274 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 3274 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
3275 LABELALIGN | 3275 LABELALIGN |
3276 "1: \n" | 3276 "1: \n" |
3277 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 3277 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
3278 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 3278 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
3279 "lea " MEMLEA(0x40,0) ",%0 \n" | 3279 "lea " MEMLEA(0x40,0) ",%0 \n" |
3280 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | 3280 "vpand %%ymm5,%%ymm0,%%ymm0 \n" |
3281 "vpand %%ymm5,%%ymm1,%%ymm1 \n" | 3281 "vpand %%ymm5,%%ymm1,%%ymm1 \n" |
3282 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | 3282 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" |
3283 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 3283 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
3284 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | 3284 "vmovdqu %%ymm0," MEMACCESS(1) " \n" |
3285 "lea " MEMLEA(0x20,1) ",%1 \n" | 3285 "lea " MEMLEA(0x20,1) ",%1 \n" |
3286 "sub $0x20,%2 \n" | 3286 "sub $0x20,%2 \n" |
3287 "jg 1b \n" | 3287 "jg 1b \n" |
3288 "vzeroupper \n" | 3288 "vzeroupper \n" |
3289 : "+r"(src_yuy2), // %0 | 3289 : "+r"(src_yuy2), // %0 |
3290 "+r"(dst_y), // %1 | 3290 "+r"(dst_y), // %1 |
3291 "+r"(pix) // %2 | 3291 "+r"(width) // %2 |
3292 : | 3292 : |
3293 : "memory", "cc" | 3293 : "memory", "cc" |
3294 , "xmm0", "xmm1", "xmm5" | 3294 , "xmm0", "xmm1", "xmm5" |
3295 ); | 3295 ); |
3296 } | 3296 } |
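Reviewer note on the vpermq $0xd8 that recurs in these AVX2 rows: vpackuswb packs within each 128-bit lane, so the two input halves come out interleaved across lanes; permuting qwords into order 0,2,1,3 restores linear byte order. An intrinsics sketch of the pattern (function name hypothetical):

    #include <immintrin.h>
    // Hypothetical sketch of the mask + pack + lane-fixup sequence above.
    static __m256i PackEvenBytes(__m256i v0, __m256i v1) {
      const __m256i mask00ff = _mm256_set1_epi16(0x00ff);
      v0 = _mm256_and_si256(v0, mask00ff);       // keep even bytes (Y in YUY2)
      v1 = _mm256_and_si256(v1, mask00ff);
      __m256i y = _mm256_packus_epi16(v0, v1);   // packs per 128-bit lane
      return _mm256_permute4x64_epi64(y, 0xd8);  // qwords 0,2,1,3 -> linear
    }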
3297 | 3297 |
3298 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, | 3298 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, |
3299 uint8* dst_u, uint8* dst_v, int pix) { | 3299 uint8* dst_u, uint8* dst_v, int width) { |
3300 asm volatile ( | 3300 asm volatile ( |
3301 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3301 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
3302 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 3302 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
3303 "sub %1,%2 \n" | 3303 "sub %1,%2 \n" |
3304 LABELALIGN | 3304 LABELALIGN |
3305 "1: \n" | 3305 "1: \n" |
3306 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 3306 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
3307 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 3307 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
3308 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 | 3308 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 |
3309 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) | 3309 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) |
(...skipping 10 matching lines...) |
3320 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 3320 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
3321 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" | 3321 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" |
3322 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) | 3322 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) |
3323 "lea " MEMLEA(0x10,1) ",%1 \n" | 3323 "lea " MEMLEA(0x10,1) ",%1 \n" |
3324 "sub $0x20,%3 \n" | 3324 "sub $0x20,%3 \n" |
3325 "jg 1b \n" | 3325 "jg 1b \n" |
3326 "vzeroupper \n" | 3326 "vzeroupper \n" |
3327 : "+r"(src_yuy2), // %0 | 3327 : "+r"(src_yuy2), // %0 |
3328 "+r"(dst_u), // %1 | 3328 "+r"(dst_u), // %1 |
3329 "+r"(dst_v), // %2 | 3329 "+r"(dst_v), // %2 |
3330 "+r"(pix) // %3 | 3330 "+r"(width) // %3 |
3331 : "r"((intptr_t)(stride_yuy2)) // %4 | 3331 : "r"((intptr_t)(stride_yuy2)) // %4 |
3332 : "memory", "cc", NACL_R14 | 3332 : "memory", "cc", NACL_R14 |
3333 "xmm0", "xmm1", "xmm5" | 3333 "xmm0", "xmm1", "xmm5" |
3334 ); | 3334 ); |
3335 } | 3335 } |
3336 | 3336 |
3337 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, | 3337 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, |
3338 uint8* dst_u, uint8* dst_v, int pix) { | 3338 uint8* dst_u, uint8* dst_v, int width) { |
3339 asm volatile ( | 3339 asm volatile ( |
3340 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3340 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
3341 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 3341 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
3342 "sub %1,%2 \n" | 3342 "sub %1,%2 \n" |
3343 LABELALIGN | 3343 LABELALIGN |
3344 "1: \n" | 3344 "1: \n" |
3345 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 3345 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
3346 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 3346 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
3347 "lea " MEMLEA(0x40,0) ",%0 \n" | 3347 "lea " MEMLEA(0x40,0) ",%0 \n" |
3348 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | 3348 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" |
3349 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" | 3349 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" |
3350 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | 3350 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" |
3351 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 3351 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
3352 "vpand %%ymm5,%%ymm0,%%ymm1 \n" | 3352 "vpand %%ymm5,%%ymm0,%%ymm1 \n" |
3353 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | 3353 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" |
3354 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" | 3354 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" |
3355 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" | 3355 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" |
3356 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | 3356 "vpermq $0xd8,%%ymm1,%%ymm1 \n" |
3357 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 3357 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
3358 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" | 3358 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" |
3359 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) | 3359 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) |
3360 "lea " MEMLEA(0x10,1) ",%1 \n" | 3360 "lea " MEMLEA(0x10,1) ",%1 \n" |
3361 "sub $0x20,%3 \n" | 3361 "sub $0x20,%3 \n" |
3362 "jg 1b \n" | 3362 "jg 1b \n" |
3363 "vzeroupper \n" | 3363 "vzeroupper \n" |
3364 : "+r"(src_yuy2), // %0 | 3364 : "+r"(src_yuy2), // %0 |
3365 "+r"(dst_u), // %1 | 3365 "+r"(dst_u), // %1 |
3366 "+r"(dst_v), // %2 | 3366 "+r"(dst_v), // %2 |
3367 "+r"(pix) // %3 | 3367 "+r"(width) // %3 |
3368 : | 3368 : |
3369 : "memory", "cc", NACL_R14 | 3369 : "memory", "cc", NACL_R14 |
3370 "xmm0", "xmm1", "xmm5" | 3370 "xmm0", "xmm1", "xmm5" |
3371 ); | 3371 ); |
3372 } | 3372 } |
3373 | 3373 |
3374 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) { | 3374 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) { |
3375 asm volatile ( | 3375 asm volatile ( |
3376 LABELALIGN | 3376 LABELALIGN |
3377 "1: \n" | 3377 "1: \n" |
3378 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 3378 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
3379 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 3379 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
3380 "lea " MEMLEA(0x40,0) ",%0 \n" | 3380 "lea " MEMLEA(0x40,0) ",%0 \n" |
3381 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | 3381 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" |
3382 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" | 3382 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" |
3383 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | 3383 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" |
3384 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 3384 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
3385 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | 3385 "vmovdqu %%ymm0," MEMACCESS(1) " \n" |
3386 "lea " MEMLEA(0x20,1) ",%1 \n" | 3386 "lea " MEMLEA(0x20,1) ",%1 \n" |
3387 "sub $0x20,%2 \n" | 3387 "sub $0x20,%2 \n" |
3388 "jg 1b \n" | 3388 "jg 1b \n" |
3389 "vzeroupper \n" | 3389 "vzeroupper \n" |
3390 : "+r"(src_uyvy), // %0 | 3390 : "+r"(src_uyvy), // %0 |
3391 "+r"(dst_y), // %1 | 3391 "+r"(dst_y), // %1 |
3392 "+r"(pix) // %2 | 3392 "+r"(width) // %2 |
3393 : | 3393 : |
3394 : "memory", "cc" | 3394 : "memory", "cc" |
3395 , "xmm0", "xmm1", "xmm5" | 3395 , "xmm0", "xmm1", "xmm5" |
3396 ); | 3396 ); |
3397 } | 3397 } |
3398 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, | 3398 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, |
3399 uint8* dst_u, uint8* dst_v, int pix) { | 3399 uint8* dst_u, uint8* dst_v, int width) { |
3400 asm volatile ( | 3400 asm volatile ( |
3401 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3401 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
3402 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 3402 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
3403 "sub %1,%2 \n" | 3403 "sub %1,%2 \n" |
3404 | 3404 |
3405 LABELALIGN | 3405 LABELALIGN |
3406 "1: \n" | 3406 "1: \n" |
3407 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 3407 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
3408 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 3408 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
3409 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 | 3409 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 |
(...skipping 11 matching lines...) |
3421 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 3421 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
3422 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" | 3422 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" |
3423 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) | 3423 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) |
3424 "lea " MEMLEA(0x10,1) ",%1 \n" | 3424 "lea " MEMLEA(0x10,1) ",%1 \n" |
3425 "sub $0x20,%3 \n" | 3425 "sub $0x20,%3 \n" |
3426 "jg 1b \n" | 3426 "jg 1b \n" |
3427 "vzeroupper \n" | 3427 "vzeroupper \n" |
3428 : "+r"(src_uyvy), // %0 | 3428 : "+r"(src_uyvy), // %0 |
3429 "+r"(dst_u), // %1 | 3429 "+r"(dst_u), // %1 |
3430 "+r"(dst_v), // %2 | 3430 "+r"(dst_v), // %2 |
3431 "+r"(pix) // %3 | 3431 "+r"(width) // %3 |
3432 : "r"((intptr_t)(stride_uyvy)) // %4 | 3432 : "r"((intptr_t)(stride_uyvy)) // %4 |
3433 : "memory", "cc", NACL_R14 | 3433 : "memory", "cc", NACL_R14 |
3434 "xmm0", "xmm1", "xmm5" | 3434 "xmm0", "xmm1", "xmm5" |
3435 ); | 3435 ); |
3436 } | 3436 } |
3437 | 3437 |
3438 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, | 3438 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, |
3439 uint8* dst_u, uint8* dst_v, int pix) { | 3439 uint8* dst_u, uint8* dst_v, int width) { |
3440 asm volatile ( | 3440 asm volatile ( |
3441 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3441 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
3442 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 3442 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
3443 "sub %1,%2 \n" | 3443 "sub %1,%2 \n" |
3444 LABELALIGN | 3444 LABELALIGN |
3445 "1: \n" | 3445 "1: \n" |
3446 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 3446 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
3447 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 3447 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
3448 "lea " MEMLEA(0x40,0) ",%0 \n" | 3448 "lea " MEMLEA(0x40,0) ",%0 \n" |
3449 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | 3449 "vpand %%ymm5,%%ymm0,%%ymm0 \n" |
3450 "vpand %%ymm5,%%ymm1,%%ymm1 \n" | 3450 "vpand %%ymm5,%%ymm1,%%ymm1 \n" |
3451 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | 3451 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" |
3452 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 3452 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
3453 "vpand %%ymm5,%%ymm0,%%ymm1 \n" | 3453 "vpand %%ymm5,%%ymm0,%%ymm1 \n" |
3454 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | 3454 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" |
3455 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" | 3455 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" |
3456 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" | 3456 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" |
3457 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | 3457 "vpermq $0xd8,%%ymm1,%%ymm1 \n" |
3458 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 3458 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
3459 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" | 3459 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" |
3460 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) | 3460 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) |
3461 "lea " MEMLEA(0x10,1) ",%1 \n" | 3461 "lea " MEMLEA(0x10,1) ",%1 \n" |
3462 "sub $0x20,%3 \n" | 3462 "sub $0x20,%3 \n" |
3463 "jg 1b \n" | 3463 "jg 1b \n" |
3464 "vzeroupper \n" | 3464 "vzeroupper \n" |
3465 : "+r"(src_uyvy), // %0 | 3465 : "+r"(src_uyvy), // %0 |
3466 "+r"(dst_u), // %1 | 3466 "+r"(dst_u), // %1 |
3467 "+r"(dst_v), // %2 | 3467 "+r"(dst_v), // %2 |
3468 "+r"(pix) // %3 | 3468 "+r"(width) // %3 |
3469 : | 3469 : |
3470 : "memory", "cc", NACL_R14 | 3470 : "memory", "cc", NACL_R14 |
3471 "xmm0", "xmm1", "xmm5" | 3471 "xmm0", "xmm1", "xmm5" |
3472 ); | 3472 ); |
3473 } | 3473 } |
3474 #endif // HAS_YUY2TOYROW_AVX2 | 3474 #endif // HAS_YUY2TOYROW_AVX2 |
3475 | 3475 |
3476 #ifdef HAS_ARGBBLENDROW_SSSE3 | 3476 #ifdef HAS_ARGBBLENDROW_SSSE3 |
3477 // Shuffle table for isolating alpha. | 3477 // Shuffle table for isolating alpha. |
3478 static uvec8 kShuffleAlpha = { | 3478 static uvec8 kShuffleAlpha = { |
(...skipping 1599 matching lines...) |
5078 : "r"((intptr_t)(src_stride)) // %4 | 5078 : "r"((intptr_t)(src_stride)) // %4 |
5079 : "memory", "cc", NACL_R14 | 5079 : "memory", "cc", NACL_R14 |
5080 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 5080 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
5081 ); | 5081 ); |
5082 } | 5082 } |
5083 #endif // HAS_INTERPOLATEROW_SSE2 | 5083 #endif // HAS_INTERPOLATEROW_SSE2 |
5084 | 5084 |
5085 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 | 5085 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 |
5086 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 5086 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
5087 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 5087 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
5088 const uint8* shuffler, int pix) { | 5088 const uint8* shuffler, int width) { |
5089 asm volatile ( | 5089 asm volatile ( |
5090 "movdqu " MEMACCESS(3) ",%%xmm5 \n" | 5090 "movdqu " MEMACCESS(3) ",%%xmm5 \n" |
5091 LABELALIGN | 5091 LABELALIGN |
5092 "1: \n" | 5092 "1: \n" |
5093 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 5093 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
5094 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 5094 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
5095 "lea " MEMLEA(0x20,0) ",%0 \n" | 5095 "lea " MEMLEA(0x20,0) ",%0 \n" |
5096 "pshufb %%xmm5,%%xmm0 \n" | 5096 "pshufb %%xmm5,%%xmm0 \n" |
5097 "pshufb %%xmm5,%%xmm1 \n" | 5097 "pshufb %%xmm5,%%xmm1 \n" |
5098 "movdqu %%xmm0," MEMACCESS(1) " \n" | 5098 "movdqu %%xmm0," MEMACCESS(1) " \n" |
5099 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | 5099 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
5100 "lea " MEMLEA(0x20,1) ",%1 \n" | 5100 "lea " MEMLEA(0x20,1) ",%1 \n" |
5101 "sub $0x8,%2 \n" | 5101 "sub $0x8,%2 \n" |
5102 "jg 1b \n" | 5102 "jg 1b \n" |
5103 : "+r"(src_argb), // %0 | 5103 : "+r"(src_argb), // %0 |
5104 "+r"(dst_argb), // %1 | 5104 "+r"(dst_argb), // %1 |
5105 "+r"(pix) // %2 | 5105 "+r"(width) // %2 |
5106 : "r"(shuffler) // %3 | 5106 : "r"(shuffler) // %3 |
5107 : "memory", "cc" | 5107 : "memory", "cc" |
5108 , "xmm0", "xmm1", "xmm5" | 5108 , "xmm0", "xmm1", "xmm5" |
5109 ); | 5109 ); |
5110 } | 5110 } |
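Usage note (illustrative, not from this CL): the shuffler argument is a 16-byte pshufb control applied to each group of four pixels; output byte i takes input byte shuffler[i]. For example, a hypothetical control that reverses the byte order inside every 4-byte pixel:

    // Hypothetical control table: reverse byte order within each pixel.
    static const uint8 kShuffleReverseBytes[16] = {
      3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
    };
    // ARGBShuffleRow_SSSE3(src_argb, dst_argb, kShuffleReverseBytes, width);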
5111 #endif // HAS_ARGBSHUFFLEROW_SSSE3 | 5111 #endif // HAS_ARGBSHUFFLEROW_SSSE3 |
5112 | 5112 |
5113 #ifdef HAS_ARGBSHUFFLEROW_AVX2 | 5113 #ifdef HAS_ARGBSHUFFLEROW_AVX2 |
5114 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 5114 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
5115 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 5115 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
5116 const uint8* shuffler, int pix) { | 5116 const uint8* shuffler, int width) { |
5117 asm volatile ( | 5117 asm volatile ( |
5118 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" | 5118 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" |
5119 LABELALIGN | 5119 LABELALIGN |
5120 "1: \n" | 5120 "1: \n" |
5121 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 5121 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
5122 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 5122 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
5123 "lea " MEMLEA(0x40,0) ",%0 \n" | 5123 "lea " MEMLEA(0x40,0) ",%0 \n" |
5124 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" | 5124 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" |
5125 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" | 5125 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" |
5126 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | 5126 "vmovdqu %%ymm0," MEMACCESS(1) " \n" |
5127 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" | 5127 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" |
5128 "lea " MEMLEA(0x40,1) ",%1 \n" | 5128 "lea " MEMLEA(0x40,1) ",%1 \n" |
5129 "sub $0x10,%2 \n" | 5129 "sub $0x10,%2 \n" |
5130 "jg 1b \n" | 5130 "jg 1b \n" |
5131 "vzeroupper \n" | 5131 "vzeroupper \n" |
5132 : "+r"(src_argb), // %0 | 5132 : "+r"(src_argb), // %0 |
5133 "+r"(dst_argb), // %1 | 5133 "+r"(dst_argb), // %1 |
5134 "+r"(pix) // %2 | 5134 "+r"(width) // %2 |
5135 : "r"(shuffler) // %3 | 5135 : "r"(shuffler) // %3 |
5136 : "memory", "cc" | 5136 : "memory", "cc" |
5137 , "xmm0", "xmm1", "xmm5" | 5137 , "xmm0", "xmm1", "xmm5" |
5138 ); | 5138 ); |
5139 } | 5139 } |
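The only structural change from the SSSE3 row: vbroadcastf128 replicates the 16-byte control into both 128-bit lanes, because vpshufb shuffles each lane independently. An intrinsics equivalent of that setup (a sketch; helper name hypothetical):

    #include <immintrin.h>
    // Hypothetical sketch: load the 16-byte shuffler once, copy to both lanes.
    static __m256i LoadShuffler256(const unsigned char* shuffler) {
      return _mm256_broadcastsi128_si256(
          _mm_loadu_si128((const __m128i*)shuffler));
    }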
5140 #endif // HAS_ARGBSHUFFLEROW_AVX2 | 5140 #endif // HAS_ARGBSHUFFLEROW_AVX2 |
5141 | 5141 |
5142 #ifdef HAS_ARGBSHUFFLEROW_SSE2 | 5142 #ifdef HAS_ARGBSHUFFLEROW_SSE2 |
5143 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 5143 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
5144 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 5144 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
5145 const uint8* shuffler, int pix) { | 5145 const uint8* shuffler, int width) { |
5146 uintptr_t pixel_temp = 0u; | 5146 uintptr_t pixel_temp = 0u; |
5147 asm volatile ( | 5147 asm volatile ( |
5148 "pxor %%xmm5,%%xmm5 \n" | 5148 "pxor %%xmm5,%%xmm5 \n" |
5149 "mov " MEMACCESS(4) ",%k2 \n" | 5149 "mov " MEMACCESS(4) ",%k2 \n" |
5150 "cmp $0x3000102,%k2 \n" | 5150 "cmp $0x3000102,%k2 \n" |
5151 "je 3012f \n" | 5151 "je 3012f \n" |
5152 "cmp $0x10203,%k2 \n" | 5152 "cmp $0x10203,%k2 \n" |
5153 "je 123f \n" | 5153 "je 123f \n" |
5154 "cmp $0x30201,%k2 \n" | 5154 "cmp $0x30201,%k2 \n" |
5155 "je 321f \n" | 5155 "je 321f \n" |
(...skipping 88 matching lines...) |
5244 "packuswb %%xmm1,%%xmm0 \n" | 5244 "packuswb %%xmm1,%%xmm0 \n" |
5245 "movdqu %%xmm0," MEMACCESS(1) " \n" | 5245 "movdqu %%xmm0," MEMACCESS(1) " \n" |
5246 "lea " MEMLEA(0x10,1) ",%1 \n" | 5246 "lea " MEMLEA(0x10,1) ",%1 \n" |
5247 "sub $0x4,%3 \n" | 5247 "sub $0x4,%3 \n" |
5248 "jg 3012b \n" | 5248 "jg 3012b \n" |
5249 | 5249 |
5250 "99: \n" | 5250 "99: \n" |
5251 : "+r"(src_argb), // %0 | 5251 : "+r"(src_argb), // %0 |
5252 "+r"(dst_argb), // %1 | 5252 "+r"(dst_argb), // %1 |
5253 "+d"(pixel_temp), // %2 | 5253 "+d"(pixel_temp), // %2 |
5254 "+r"(pix) // %3 | 5254 "+r"(width) // %3 |
5255 : "r"(shuffler) // %4 | 5255 : "r"(shuffler) // %4 |
5256 : "memory", "cc", NACL_R14 | 5256 : "memory", "cc", NACL_R14 |
5257 "xmm0", "xmm1", "xmm5" | 5257 "xmm0", "xmm1", "xmm5" |
5258 ); | 5258 ); |
5259 } | 5259 } |
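SSE2 has no pshufb, so this row recognizes a few common controls by reading the first four shuffler bytes as one little-endian dword (the "mov ...,%k2" above) and branching; anything else falls to the generic path in the elided lines. The cmp immediates decode by plain arithmetic: a shuffler starting {2,1,0,3} loads as 0x02 | 0x01<<8 | 0x00<<16 | 0x03<<24 = 0x03000102, matching "cmp $0x3000102". A hypothetical helper making the key explicit:

    #include <stdint.h>
    #include <string.h>
    // Hypothetical mirror of the dispatch key: first 4 shuffler bytes as an
    // LE dword, the same value the inline asm compares against.
    static uint32_t ShufflerKey(const uint8_t* shuffler) {
      uint32_t key;
      memcpy(&key, shuffler, 4);  // x86 is little-endian
      return key;                 // {2,1,0,3} -> 0x03000102
    }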
5260 #endif // HAS_ARGBSHUFFLEROW_SSE2 | 5260 #endif // HAS_ARGBSHUFFLEROW_SSE2 |
5261 | 5261 |
5262 #ifdef HAS_I422TOYUY2ROW_SSE2 | 5262 #ifdef HAS_I422TOYUY2ROW_SSE2 |
5263 void I422ToYUY2Row_SSE2(const uint8* src_y, | 5263 void I422ToYUY2Row_SSE2(const uint8* src_y, |
5264 const uint8* src_u, | 5264 const uint8* src_u, |
(...skipping 319 matching lines...) |
5584 ); | 5584 ); |
5585 } | 5585 } |
5586 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5586 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5587 | 5587 |
5588 #endif // defined(__x86_64__) || defined(__i386__) | 5588 #endif // defined(__x86_64__) || defined(__i386__) |
5589 | 5589 |
5590 #ifdef __cplusplus | 5590 #ifdef __cplusplus |
5591 } // extern "C" | 5591 } // extern "C" |
5592 } // namespace libyuv | 5592 } // namespace libyuv |
5593 #endif | 5593 #endif |