OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 5323 matching lines...)
5334 "+r"(dst), // %1 | 5334 "+r"(dst), // %1 |
5335 "+r"(width) // %2 | 5335 "+r"(width) // %2 |
5336 : "x"(scale * kScaleBias) // %3 | 5336 : "x"(scale * kScaleBias) // %3 |
5337 : "memory", "cc", | 5337 : "memory", "cc", |
5338 "xmm2", "xmm3", "xmm4", "xmm5" | 5338 "xmm2", "xmm3", "xmm4", "xmm5" |
5339 ); | 5339 ); |
5340 } | 5340 } |
5341 #endif // HAS_HALFFLOATROW_SSE2 | 5341 #endif // HAS_HALFFLOATROW_SSE2 |
5342 | 5342 |
5343 #ifdef HAS_HALFFLOATROW_AVX2 | 5343 #ifdef HAS_HALFFLOATROW_AVX2 |
| 5344 // TODO(fbarchard): consider vadddw instead of vmulps |
5344 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { | 5345 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { |
5345 asm volatile ( | 5346 asm volatile ( |
| 5347 "vbroadcastss %3, %%ymm4 \n" |
| 5348 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" |
| 5349 |
| 5350 // 16 pixel loop. |
| 5351 LABELALIGN |
| 5352 "1: \n" |
| 5353 "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts |
| 5354 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 5355 "vpunpckhwd %%ymm2,%%ymm5,%%ymm3 \n" |
| 5356 "vpunpcklwd %%ymm2,%%ymm5,%%ymm2 \n" |
| 5357 "vcvtdq2ps %%ymm3,%%ymm3 \n" |
| 5358 "vcvtdq2ps %%ymm2,%%ymm2 \n" |
| 5359 "vmulps %%ymm3,%%ymm4,%%ymm3 \n" |
| 5360 "vmulps %%ymm2,%%ymm4,%%ymm2 \n" |
| 5361 "vpsrld $0xd,%%ymm3,%%ymm3 \n" |
| 5362 "vpsrld $0xd,%%ymm2,%%ymm2 \n" |
| 5363 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates |
| 5364 "vmovdqu %%ymm2," MEMACCESS(1) " \n" |
| 5365 "lea " MEMLEA(0x20,1) ",%1 \n" |
| 5366 "sub $0x10,%2 \n" |
| 5367 "jg 1b \n" |
| 5368 "vzeroupper \n" |
| 5369 : "+r"(src), // %0 |
| 5370 "+r"(dst), // %1 |
| 5371 "+r"(width) // %2 |
| 5372 : "x"(scale * kScaleBias) // %3 |
| 5373 : "memory", "cc", |
| 5374 "xmm2", "xmm3", "xmm4", "xmm5" |
| 5375 ); |
| 5376 } |
| 5377 #endif // HAS_HALFFLOATROW_AVX2 |
| 5378 |
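Note: the SSE2 and AVX2 paths above share one bit trick: zero-extend each 16-bit value to a 32-bit int, convert to float, multiply by scale * kScaleBias, and take the float's bit pattern shifted right by 13. kScaleBias is 2^-112 (1.9259299444e-34f), which rebases the single-precision exponent (bias 127) onto the half-precision exponent (bias 15), so the shifted bits form a valid half float. A minimal scalar sketch of the same computation (it assumes the scaled values stay in half-float range, as the SIMD paths do):

  #include <stdint.h>
  #include <string.h>

  static uint16_t ToHalfFloat(uint16_t v, float scale) {
    const float kScaleBias = 1.9259299444e-34f;  // 2^-112
    float f = (float)v * (scale * kScaleBias);   // exponent now biased for half
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));             // reinterpret float as bits
    return (uint16_t)(bits >> 13);               // align exponent/mantissa fields
  }

For example, v = 16384 with scale = 1.0f / 16384 yields 0x3C00, the half-float encoding of 1.0.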
| 5379 #ifdef HAS_HALFFLOATROW_F16C |
| 5380 void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { |
| 5381 asm volatile ( |
5346 "vbroadcastss %3, %%ymm4 \n" | 5382 "vbroadcastss %3, %%ymm4 \n" |
5347 | 5383 |
5348 // 16 pixel loop. | 5384 // 16 pixel loop. |
5349 LABELALIGN | 5385 LABELALIGN |
5350 "1: \n" | 5386 "1: \n" |
5351 "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts -> 8 ints | 5387 "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts -> 8 ints |
5352 "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" // 8 more | 5388 "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" // 8 more |
5353 "lea " MEMLEA(0x20,0) ",%0 \n" | 5389 "lea " MEMLEA(0x20,0) ",%0 \n" |
5354 "vcvtdq2ps %%ymm2,%%ymm2 \n" | 5390 "vcvtdq2ps %%ymm2,%%ymm2 \n" |
5355 "vcvtdq2ps %%ymm3,%%ymm3 \n" | 5391 "vcvtdq2ps %%ymm3,%%ymm3 \n" |
5356 "vmulps %%ymm2,%%ymm4,%%ymm2 \n" | 5392 "vmulps %%ymm2,%%ymm4,%%ymm2 \n" |
5357 "vmulps %%ymm3,%%ymm4,%%ymm3 \n" | 5393 "vmulps %%ymm3,%%ymm4,%%ymm3 \n" |
5358 "vcvtps2ph $3, %%ymm2, %%xmm2 \n" | 5394 "vcvtps2ph $3, %%ymm2, %%xmm2 \n" |
5359 "vcvtps2ph $3, %%ymm3, %%xmm3 \n" | 5395 "vcvtps2ph $3, %%ymm3, %%xmm3 \n" |
5360 "vmovdqu %%xmm2," MEMACCESS(1) " \n" | 5396 "vmovdqu %%xmm2," MEMACCESS(1) " \n" |
5361 "vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n" | 5397 "vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n" |
5362 "lea " MEMLEA(0x20,1) ",%1 \n" | 5398 "lea " MEMLEA(0x20,1) ",%1 \n" |
5363 "sub $0x10,%2 \n" | 5399 "sub $0x10,%2 \n" |
5364 "jg 1b \n" | 5400 "jg 1b \n" |
| 5401 |
5365 "vzeroupper \n" | 5402 "vzeroupper \n" |
5366 : "+r"(src), // %0 | 5403 : "+r"(src), // %0 |
5367 "+r"(dst), // %1 | 5404 "+r"(dst), // %1 |
5368 "+r"(width) // %2 | 5405 "+r"(width) // %2 |
5369 : "x"(scale) // %3 | 5406 : "x"(scale) // %3 |
5370 : "memory", "cc", | 5407 : "memory", "cc", |
5371 "xmm2", "xmm3", "xmm4" | 5408 "xmm2", "xmm3", "xmm4" |
5372 ); | 5409 ); |
5373 } | 5410 } |
5374 #endif // HAS_HALFFLOATROW_AVX2 | 5411 #endif // HAS_HALFFLOATROW_F16C |
5375 | 5412 |
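Note: the F16C path converts with the hardware vcvtps2ph instruction rather than the shift trick, so it takes the unmodified scale (no kScaleBias) and gets rounding handled in hardware. A hedged intrinsics sketch of one 8-pixel step of the loop above; the function name is illustrative, not libyuv API (compile with -mavx2 -mf16c):

  #include <immintrin.h>
  #include <stdint.h>

  static void HalfFloat8_F16C(const uint16_t* src, uint16_t* dst, float scale) {
    __m256i ints = _mm256_cvtepu16_epi32(          // vpmovzxwd: 8 shorts -> 8 ints
        _mm_loadu_si128((const __m128i*)src));
    __m256 floats = _mm256_mul_ps(_mm256_cvtepi32_ps(ints),
                                  _mm256_set1_ps(scale));
    // Imm 3 selects round-toward-zero, matching "vcvtps2ph $3" above.
    _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(floats, 3));
  }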
5376 #ifdef HAS_ARGBCOLORTABLEROW_X86 | 5413 #ifdef HAS_ARGBCOLORTABLEROW_X86 |
5377 // Transform ARGB pixels with color table. | 5414 // Transform ARGB pixels with color table. |
5378 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, | 5415 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, |
5379 int width) { | 5416 int width) { |
5380 uintptr_t pixel_temp; | 5417 uintptr_t pixel_temp; |
5381 asm volatile ( | 5418 asm volatile ( |
5382 // 1 pixel loop. | 5419 // 1 pixel loop. |
5383 LABELALIGN | 5420 LABELALIGN |
5384 "1: \n" | 5421 "1: \n" |
(...skipping 147 matching lines...)
5532 ); | 5569 ); |
5533 } | 5570 } |
5534 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5571 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5535 | 5572 |
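Note: ARGBColorTableRow_X86 above (its body is mostly elided in this diff) looks up each channel in its own plane of an interleaved 256 x 4 byte table. A scalar sketch of the intended semantics, inferred from libyuv's C reference path rather than from the asm itself:

  #include <stdint.h>

  static void ARGBColorTableRow_Sketch(uint8_t* dst_argb,
                                       const uint8_t* table_argb, int width) {
    for (int i = 0; i < width; ++i) {
      dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
      dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
      dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
      dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
      dst_argb += 4;
    }
  }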
5536 #endif // defined(__x86_64__) || defined(__i386__) | 5573 #endif // defined(__x86_64__) || defined(__i386__) |
5537 | 5574 |
5538 #ifdef __cplusplus | 5575 #ifdef __cplusplus |
5539 } // extern "C" | 5576 } // extern "C" |
5540 } // namespace libyuv | 5577 } // namespace libyuv |
5541 #endif | 5578 #endif |