OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 5323 matching lines...)
5334 "+r"(dst), // %1 | 5334 "+r"(dst), // %1 |
5335 "+r"(width) // %2 | 5335 "+r"(width) // %2 |
5336 : "x"(scale * kScaleBias) // %3 | 5336 : "x"(scale * kScaleBias) // %3 |
5337 : "memory", "cc", | 5337 : "memory", "cc", |
5338 "xmm2", "xmm3", "xmm4", "xmm5" | 5338 "xmm2", "xmm3", "xmm4", "xmm5" |
5339 ); | 5339 ); |
5340 } | 5340 } |
5341 #endif // HAS_HALFFLOATROW_SSE2 | 5341 #endif // HAS_HALFFLOATROW_SSE2 |
5342 | 5342 |
5343 #ifdef HAS_HALFFLOATROW_AVX2 | 5343 #ifdef HAS_HALFFLOATROW_AVX2 |
| 5344 // TODO(fbarchard): consider vadddw instead of vmulps |
5344 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { | 5345 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { |
5345 asm volatile ( | 5346 asm volatile ( |
| 5347 "vbroadcastss %3, %%ymm4 \n" |
| 5348 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" |
| 5349 |
| 5350 // 16 pixel loop. |
| 5351 LABELALIGN |
| 5352 "1: \n" |
| 5353 "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts |
| 5354 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 5355 "vpunpckhwd %%ymm2,%%ymm5,%%ymm3 \n" |
| 5356 "vpunpcklwd %%ymm2,%%ymm5,%%ymm2 \n" |
| 5357 "vcvtdq2ps %%ymm3,%%ymm3 \n" |
| 5358 "vcvtdq2ps %%ymm2,%%ymm2 \n" |
| 5359 "vmulps %%ymm3,%%ymm4,%%ymm3 \n" |
| 5360 "vmulps %%ymm2,%%ymm4,%%ymm2 \n" |
| 5361 "vpsrld $0xd,%%ymm3,%%ymm3 \n" |
| 5362 "vpsrld $0xd,%%ymm2,%%ymm2 \n" |
| 5363 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates |
| 5364 "vmovdqu %%ymm2," MEMACCESS(1) " \n" |
| 5365 "lea " MEMLEA(0x20,1) ",%1 \n" |
| 5366 "sub $0x10,%2 \n" |
| 5367 "jg 1b \n" |
| 5368 "vzeroupper \n" |
| 5369 : "+r"(src), // %0 |
| 5370 "+r"(dst), // %1 |
| 5371 "+r"(width) // %2 |
| 5372 : "x"(scale * kScaleBias) // %3 |
| 5373 : "memory", "cc", |
| 5374 "xmm2", "xmm3", "xmm4", "xmm5" |
| 5375 ); |
| 5376 } |
| 5377 #endif // HAS_HALFFLOATROW_AVX2 |
| 5378 |
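Note: the SSE2 and AVX2 paths above share one bit trick: zero-extend each 16-bit value to a 32-bit int, convert to float, multiply by scale * kScaleBias, and take the float's bit pattern shifted right by 13. kScaleBias is 2^-112 (1.9259299444e-34f), which rebases the single-precision exponent (bias 127) onto the half-precision exponent (bias 15), so the shifted bits form a valid half float. A minimal scalar sketch of the same computation (it assumes the scaled values stay in half-float range, as the SIMD paths do):

  #include <stdint.h>
  #include <string.h>

  static uint16_t ToHalfFloat(uint16_t v, float scale) {
    const float kScaleBias = 1.9259299444e-34f;  // 2^-112
    float f = (float)v * (scale * kScaleBias);   // exponent now biased for half
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));             // reinterpret float as bits
    return (uint16_t)(bits >> 13);               // align exponent/mantissa fields
  }

For example, v = 16384 with scale = 1.0f / 16384 yields 0x3C00, the half-float encoding of 1.0.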
| 5379 #ifdef HAS_HALFFLOATROW_F16C |
| 5380 void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { |
| 5381 asm volatile ( |
5346 "vbroadcastss %3, %%ymm4 \n" | 5382 "vbroadcastss %3, %%ymm4 \n" |
5347 | 5383 |
5348 // 16 pixel loop. | 5384 // 16 pixel loop. |
5349 LABELALIGN | 5385 LABELALIGN |
5350 "1: \n" | 5386 "1: \n" |
5351 "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts -> 8 ints | 5387 "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts -> 8 ints |
5352 "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" // 8 more | 5388 "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" // 8 more |
5353 "lea " MEMLEA(0x20,0) ",%0 \n" | 5389 "lea " MEMLEA(0x20,0) ",%0 \n" |
5354 "vcvtdq2ps %%ymm2,%%ymm2 \n" | 5390 "vcvtdq2ps %%ymm2,%%ymm2 \n" |
5355 "vcvtdq2ps %%ymm3,%%ymm3 \n" | 5391 "vcvtdq2ps %%ymm3,%%ymm3 \n" |
5356 "vmulps %%ymm2,%%ymm4,%%ymm2 \n" | 5392 "vmulps %%ymm2,%%ymm4,%%ymm2 \n" |
5357 "vmulps %%ymm3,%%ymm4,%%ymm3 \n" | 5393 "vmulps %%ymm3,%%ymm4,%%ymm3 \n" |
5358 "vcvtps2ph $3, %%ymm2, %%xmm2 \n" | 5394 "vcvtps2ph $3, %%ymm2, %%xmm2 \n" |
5359 "vcvtps2ph $3, %%ymm3, %%xmm3 \n" | 5395 "vcvtps2ph $3, %%ymm3, %%xmm3 \n" |
5360 "vmovdqu %%xmm2," MEMACCESS(1) " \n" | 5396 "vmovdqu %%xmm2," MEMACCESS(1) " \n" |
5361 "vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n" | 5397 "vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n" |
5362 "lea " MEMLEA(0x20,1) ",%1 \n" | 5398 "lea " MEMLEA(0x20,1) ",%1 \n" |
5363 "sub $0x10,%2 \n" | 5399 "sub $0x10,%2 \n" |
5364 "jg 1b \n" | 5400 "jg 1b \n" |
| 5401 |
5365 "vzeroupper \n" | 5402 "vzeroupper \n" |
5366 : "+r"(src), // %0 | 5403 : "+r"(src), // %0 |
5367 "+r"(dst), // %1 | 5404 "+r"(dst), // %1 |
5368 "+r"(width) // %2 | 5405 "+r"(width) // %2 |
5369 : "x"(scale) // %3 | 5406 : "x"(scale) // %3 |
5370 : "memory", "cc", | 5407 : "memory", "cc", |
5371 "xmm2", "xmm3", "xmm4" | 5408 "xmm2", "xmm3", "xmm4" |
5372 ); | 5409 ); |
5373 } | 5410 } |
5374 #endif // HAS_HALFFLOATROW_AVX2 | 5411 #endif // HAS_HALFFLOATROW_F16C |
5375 | 5412 |
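Note: the F16C path converts with the hardware vcvtps2ph instruction rather than the shift trick, so it takes the unmodified scale (no kScaleBias) and gets rounding handled in hardware. A hedged intrinsics sketch of one 8-pixel step of the loop above; the function name is illustrative, not libyuv API (compile with -mavx2 -mf16c):

  #include <immintrin.h>
  #include <stdint.h>

  static void HalfFloat8_F16C(const uint16_t* src, uint16_t* dst, float scale) {
    __m256i ints = _mm256_cvtepu16_epi32(          // vpmovzxwd: 8 shorts -> 8 ints
        _mm_loadu_si128((const __m128i*)src));
    __m256 floats = _mm256_mul_ps(_mm256_cvtepi32_ps(ints),
                                  _mm256_set1_ps(scale));
    // Imm 3 selects round-toward-zero, matching "vcvtps2ph $3" above.
    _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(floats, 3));
  }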
5376 #ifdef HAS_ARGBCOLORTABLEROW_X86 | 5413 #ifdef HAS_ARGBCOLORTABLEROW_X86 |
5377 // Transform ARGB pixels with color table. | 5414 // Transform ARGB pixels with color table. |
5378 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, | 5415 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, |
5379 int width) { | 5416 int width) { |
5380 uintptr_t pixel_temp; | 5417 uintptr_t pixel_temp; |
5381 asm volatile ( | 5418 asm volatile ( |
5382 // 1 pixel loop. | 5419 // 1 pixel loop. |
5383 LABELALIGN | 5420 LABELALIGN |
5384 "1: \n" | 5421 "1: \n" |
(...skipping 147 matching lines...)
5532 ); | 5569 ); |
5533 } | 5570 } |
5534 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5571 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5535 | 5572 |
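Note: ARGBColorTableRow_X86 above (its body is mostly elided in this diff) looks up each channel in its own plane of an interleaved 256 x 4 byte table. A scalar sketch of the intended semantics, inferred from libyuv's C reference path rather than from the asm itself:

  #include <stdint.h>

  static void ARGBColorTableRow_Sketch(uint8_t* dst_argb,
                                       const uint8_t* table_argb, int width) {
    for (int i = 0; i < width; ++i) {
      dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
      dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
      dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
      dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
      dst_argb += 4;
    }
  }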
5536 #endif // defined(__x86_64__) || defined(__i386__) | 5573 #endif // defined(__x86_64__) || defined(__i386__) |
5537 | 5574 |
5538 #ifdef __cplusplus | 5575 #ifdef __cplusplus |
5539 } // extern "C" | 5576 } // extern "C" |
5540 } // namespace libyuv | 5577 } // namespace libyuv |
5541 #endif | 5578 #endif |