Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(706)

Side by Side Diff: source/row_gcc.cc

Issue 2438893002: HalfFloat avx2 unpack bug fix. (Closed)
Patch Set: "test use random" (created 4 years, 2 months ago)
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « include/libyuv/version.h ('k') | unit_test/planar_test.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // VERSION 2 1 // VERSION 2
2 /* 2 /*
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
4 * 4 *
5 * Use of this source code is governed by a BSD-style license 5 * Use of this source code is governed by a BSD-style license
6 * that can be found in the LICENSE file in the root of the source 6 * that can be found in the LICENSE file in the root of the source
7 * tree. An additional intellectual property rights grant can be found 7 * tree. An additional intellectual property rights grant can be found
8 * in the file PATENTS. All contributing project authors may 8 * in the file PATENTS. All contributing project authors may
9 * be found in the AUTHORS file in the root of the source tree. 9 * be found in the AUTHORS file in the root of the source tree.
10 */ 10 */
(...skipping 5332 matching lines...) Expand 10 before | Expand all | Expand 10 after
5343 #ifdef HAS_HALFFLOATROW_AVX2 5343 #ifdef HAS_HALFFLOATROW_AVX2
5344 // TODO(fbarchard): consider vadddw instead of vmulps 5344 // TODO(fbarchard): consider vadddw instead of vmulps
5345 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { 5345 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
5346 asm volatile ( 5346 asm volatile (
5347 "vbroadcastss %3, %%ymm4 \n" 5347 "vbroadcastss %3, %%ymm4 \n"
5348 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" 5348 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
5349 5349
5350 // 16 pixel loop. 5350 // 16 pixel loop.
5351 LABELALIGN 5351 LABELALIGN
5352 "1: \n" 5352 "1: \n"
5353 "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts 5353 "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts
5354 "lea " MEMLEA(0x20,0) ",%0 \n" 5354 "lea " MEMLEA(0x20,0) ",%0 \n"
5355 "vpunpckhwd %%ymm2,%%ymm5,%%ymm3 \n" 5355 "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
wangcheng (2016/10/20 22:38:30): reverse order of ymm5 and ymm2
5356 "vpunpcklwd %%ymm2,%%ymm5,%%ymm2 \n" 5356 "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
5357 "vcvtdq2ps %%ymm3,%%ymm3 \n" 5357 "vcvtdq2ps %%ymm3,%%ymm3 \n"
5358 "vcvtdq2ps %%ymm2,%%ymm2 \n" 5358 "vcvtdq2ps %%ymm2,%%ymm2 \n"
5359 "vmulps %%ymm3,%%ymm4,%%ymm3 \n" 5359 "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
5360 "vmulps %%ymm2,%%ymm4,%%ymm2 \n" 5360 "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
5361 "vpsrld $0xd,%%ymm3,%%ymm3 \n" 5361 "vpsrld $0xd,%%ymm3,%%ymm3 \n"
5362 "vpsrld $0xd,%%ymm2,%%ymm2 \n" 5362 "vpsrld $0xd,%%ymm2,%%ymm2 \n"
5363 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates 5363 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
5364 "vmovdqu %%ymm2," MEMACCESS(1) " \n" 5364 "vmovdqu %%ymm2," MEMACCESS(1) " \n"
5365 "lea " MEMLEA(0x20,1) ",%1 \n" 5365 "lea " MEMLEA(0x20,1) ",%1 \n"
5366 "sub $0x10,%2 \n" 5366 "sub $0x10,%2 \n"
5367 "jg 1b \n" 5367 "jg 1b \n"
5368 "vzeroupper \n" 5368 "vzeroupper \n"
5369 : "+r"(src), // %0 5369 : "+r"(src), // %0
5370 "+r"(dst), // %1 5370 "+r"(dst), // %1
5371 "+r"(width) // %2 5371 "+r"(width) // %2
5372 : "x"(scale * kScaleBias) // %3 5372 : "x"(scale * kScaleBias) // %3
5373 : "memory", "cc", 5373 : "memory", "cc",
5374 "xmm2", "xmm3", "xmm4", "xmm5" 5374 "xmm2", "xmm3", "xmm4", "xmm5"
5375 ); 5375 );
5376 } 5376 }
5377 #endif // HAS_HALFFLOATROW_AVX2 5377 #endif // HAS_HALFFLOATROW_AVX2
5378 5378
5379 #ifdef HAS_HALFFLOATROW_F16C 5379 #ifdef HAS_HALFFLOATROW_F16C
5380 void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { 5380 void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
5381 asm volatile ( 5381 asm volatile (
5382 "vbroadcastss %3, %%ymm4 \n" 5382 "vbroadcastss %3, %%ymm4 \n"
5383 5383
5384 // 16 pixel loop. 5384 // 16 pixel loop.
5385 LABELALIGN 5385 LABELALIGN
5386 "1: \n" 5386 "1: \n"
5387 "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts -> 8 ints 5387 "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
5388 "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" // 8 more 5388 "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
5389 "lea " MEMLEA(0x20,0) ",%0 \n" 5389 "lea " MEMLEA(0x20,0) ",%0 \n"
5390 "vcvtdq2ps %%ymm2,%%ymm2 \n" 5390 "vcvtdq2ps %%ymm2,%%ymm2 \n"
5391 "vcvtdq2ps %%ymm3,%%ymm3 \n" 5391 "vcvtdq2ps %%ymm3,%%ymm3 \n"
5392 "vmulps %%ymm2,%%ymm4,%%ymm2 \n" 5392 "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
5393 "vmulps %%ymm3,%%ymm4,%%ymm3 \n" 5393 "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
5394 "vcvtps2ph $3, %%ymm2, %%xmm2 \n" 5394 "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
5395 "vcvtps2ph $3, %%ymm3, %%xmm3 \n" 5395 "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
5396 "vmovdqu %%xmm2," MEMACCESS(1) " \n" 5396 "vmovdqu %%xmm2," MEMACCESS(1) " \n"
5397 "vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n" 5397 "vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
5398 "lea " MEMLEA(0x20,1) ",%1 \n" 5398 "lea " MEMLEA(0x20,1) ",%1 \n"
(...skipping 170 matching lines...) Expand 10 before | Expand all | Expand 10 after
5569 ); 5569 );
5570 } 5570 }
5571 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 5571 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5572 5572
5573 #endif // defined(__x86_64__) || defined(__i386__) 5573 #endif // defined(__x86_64__) || defined(__i386__)
5574 5574
5575 #ifdef __cplusplus 5575 #ifdef __cplusplus
5576 } // extern "C" 5576 } // extern "C"
5577 } // namespace libyuv 5577 } // namespace libyuv
5578 #endif 5578 #endif
OLD | NEW
« no previous file with comments | « include/libyuv/version.h ('k') | unit_test/planar_test.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698