Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(219)

Side by Side Diff: source/row_gcc.cc

Issue 2421993002: Port HalfFloatRow_SSE2 to AVX2 but not using F16C. (Closed)
Patch Set: disable f16 Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_any.cc ('k') | source/row_win.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // VERSION 2 1 // VERSION 2
2 /* 2 /*
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
4 * 4 *
5 * Use of this source code is governed by a BSD-style license 5 * Use of this source code is governed by a BSD-style license
6 * that can be found in the LICENSE file in the root of the source 6 * that can be found in the LICENSE file in the root of the source
7 * tree. An additional intellectual property rights grant can be found 7 * tree. An additional intellectual property rights grant can be found
8 * in the file PATENTS. All contributing project authors may 8 * in the file PATENTS. All contributing project authors may
9 * be found in the AUTHORS file in the root of the source tree. 9 * be found in the AUTHORS file in the root of the source tree.
10 */ 10 */
(...skipping 5323 matching lines...) Expand 10 before | Expand all | Expand 10 after
5334 "+r"(dst), // %1 5334 "+r"(dst), // %1
5335 "+r"(width) // %2 5335 "+r"(width) // %2
5336 : "x"(scale * kScaleBias) // %3 5336 : "x"(scale * kScaleBias) // %3
5337 : "memory", "cc", 5337 : "memory", "cc",
5338 "xmm2", "xmm3", "xmm4", "xmm5" 5338 "xmm2", "xmm3", "xmm4", "xmm5"
5339 ); 5339 );
5340 } 5340 }
5341 #endif // HAS_HALFFLOATROW_SSE2 5341 #endif // HAS_HALFFLOATROW_SSE2
5342 5342
5343 #ifdef HAS_HALFFLOATROW_AVX2 5343 #ifdef HAS_HALFFLOATROW_AVX2
5344 // TODO(fbarchard): consider vadddw instead of vmulps
5344 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { 5345 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
5345 asm volatile ( 5346 asm volatile (
5347 "vbroadcastss %3, %%ymm4 \n"
5348 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
5349
5350 // 16 pixel loop.
5351 LABELALIGN
5352 "1: \n"
5353 "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts
5354 "lea " MEMLEA(0x20,0) ",%0 \n"
5355 "vpunpckhwd %%ymm2,%%ymm5,%%ymm3 \n"
5356 "vpunpcklwd %%ymm2,%%ymm5,%%ymm2 \n"
5357 "vcvtdq2ps %%ymm3,%%ymm3 \n"
5358 "vcvtdq2ps %%ymm2,%%ymm2 \n"
5359 "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
5360 "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
5361 "vpsrld $0xd,%%ymm3,%%ymm3 \n"
5362 "vpsrld $0xd,%%ymm2,%%ymm2 \n"
5363 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
5364 "vmovdqu %%ymm2," MEMACCESS(1) " \n"
5365 "lea " MEMLEA(0x20,1) ",%1 \n"
5366 "sub $0x10,%2 \n"
5367 "jg 1b \n"
5368 "vzeroupper \n"
5369 : "+r"(src), // %0
5370 "+r"(dst), // %1
5371 "+r"(width) // %2
5372 : "x"(scale * kScaleBias) // %3
5373 : "memory", "cc",
5374 "xmm2", "xmm3", "xmm4", "xmm5"
5375 );
5376 }
5377 #endif // HAS_HALFFLOATROW_AVX2
5378
5379 #ifdef HAS_HALFFLOATROW_F16C
5380 void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
5381 asm volatile (
5346 "vbroadcastss %3, %%ymm4 \n" 5382 "vbroadcastss %3, %%ymm4 \n"
5347 5383
5348 // 16 pixel loop. 5384 // 16 pixel loop.
5349 LABELALIGN 5385 LABELALIGN
5350 "1: \n" 5386 "1: \n"
5351 "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts -> 8 ints 5387 "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts -> 8 ints
5352 "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" // 8 more 5388 "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" // 8 more
5353 "lea " MEMLEA(0x20,0) ",%0 \n" 5389 "lea " MEMLEA(0x20,0) ",%0 \n"
5354 "vcvtdq2ps %%ymm2,%%ymm2 \n" 5390 "vcvtdq2ps %%ymm2,%%ymm2 \n"
5355 "vcvtdq2ps %%ymm3,%%ymm3 \n" 5391 "vcvtdq2ps %%ymm3,%%ymm3 \n"
5356 "vmulps %%ymm2,%%ymm4,%%ymm2 \n" 5392 "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
5357 "vmulps %%ymm3,%%ymm4,%%ymm3 \n" 5393 "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
5358 "vcvtps2ph $3, %%ymm2, %%xmm2 \n" 5394 "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
5359 "vcvtps2ph $3, %%ymm3, %%xmm3 \n" 5395 "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
5360 "vmovdqu %%xmm2," MEMACCESS(1) " \n" 5396 "vmovdqu %%xmm2," MEMACCESS(1) " \n"
5361 "vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n" 5397 "vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
5362 "lea " MEMLEA(0x20,1) ",%1 \n" 5398 "lea " MEMLEA(0x20,1) ",%1 \n"
5363 "sub $0x10,%2 \n" 5399 "sub $0x10,%2 \n"
5364 "jg 1b \n" 5400 "jg 1b \n"
5401
5365 "vzeroupper \n" 5402 "vzeroupper \n"
5366 : "+r"(src), // %0 5403 : "+r"(src), // %0
5367 "+r"(dst), // %1 5404 "+r"(dst), // %1
5368 "+r"(width) // %2 5405 "+r"(width) // %2
5369 : "x"(scale) // %3 5406 : "x"(scale) // %3
5370 : "memory", "cc", 5407 : "memory", "cc",
5371 "xmm2", "xmm3", "xmm4" 5408 "xmm2", "xmm3", "xmm4"
5372 ); 5409 );
5373 } 5410 }
5374 #endif // HAS_HALFFLOATROW_AVX2 5411 #endif // HAS_HALFFLOATROW_F16C
5375 5412
5376 #ifdef HAS_ARGBCOLORTABLEROW_X86 5413 #ifdef HAS_ARGBCOLORTABLEROW_X86
5377 // Transform ARGB pixels with color table. 5414 // Transform ARGB pixels with color table.
5378 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, 5415 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
5379 int width) { 5416 int width) {
5380 uintptr_t pixel_temp; 5417 uintptr_t pixel_temp;
5381 asm volatile ( 5418 asm volatile (
5382 // 1 pixel loop. 5419 // 1 pixel loop.
5383 LABELALIGN 5420 LABELALIGN
5384 "1: \n" 5421 "1: \n"
(...skipping 147 matching lines...) Expand 10 before | Expand all | Expand 10 after
5532 ); 5569 );
5533 } 5570 }
5534 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 5571 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5535 5572
5536 #endif // defined(__x86_64__) || defined(__i386__) 5573 #endif // defined(__x86_64__) || defined(__i386__)
5537 5574
5538 #ifdef __cplusplus 5575 #ifdef __cplusplus
5539 } // extern "C" 5576 } // extern "C"
5540 } // namespace libyuv 5577 } // namespace libyuv
5541 #endif 5578 #endif
OLDNEW
« no previous file with comments | « source/row_any.cc ('k') | source/row_win.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698