Chromium Code Reviews

Unified Diff: source/row_gcc.cc

Issue 2484083003: clang-format row_gcc.cc with some functions disabled (Closed)
Patch Set: clang-format row_gcc.cc with some functions disabled (created 4 years, 1 month ago)
--- source/row_gcc.cc (old)
+++ source/row_gcc.cc (new)

 // VERSION 2
 /*
  * Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
  * Use of this source code is governed by a BSD-style license
  * that can be found in the LICENSE file in the root of the source
  * tree. An additional intellectual property rights grant can be found
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */

 #include "libyuv/row.h"

 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif

-// clang-format off
-
 // This module is for GCC x86 and x64.
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

 // Constants for ARGB
-static vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
+static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
+                        13, 65, 33, 0, 13, 65, 33, 0};

 // JPeg full range.
-static vec8 kARGBToYJ = {
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
+static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
+                         15, 75, 38, 0, 15, 75, 38, 0};
 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
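
The kARGBToY/kARGBToYJ tables above are 7-bit fixed-point luma weights laid out in B,G,R,A memory order. As a scalar sketch of what the SSSE3 row kernels compute per pixel (illustrative only: RGBToY_Sketch is a hypothetical helper, and the vector path reaches the same sum through pmaddubsw/phaddw with its own rounding):

// Hedged scalar equivalent of the ARGBToY math: BT.601 studio-range luma.
// The weights {13, 65, 33} are B,G,R scaled by 128; kAddY16 supplies the +16.
static int RGBToY_Sketch(int r, int g, int b) {
  return ((33 * r + 65 * g + 13 * b) >> 7) + 16;
}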

 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

-static vec8 kARGBToU = {
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
-
-static vec8 kARGBToUJ = {
-  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
-
-static vec8 kARGBToV = {
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-};
-
-static vec8 kARGBToVJ = {
-  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
+static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+                        112, -74, -38, 0, 112, -74, -38, 0};
+
+static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+                         127, -84, -43, 0, 127, -84, -43, 0};
+
+static vec8 kARGBToV = {
+    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+                         -20, -107, 127, 0, -20, -107, 127, 0};

 // Constants for BGRA
-static vec8 kBGRAToY = {
-  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
+static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
+                        0, 33, 65, 13, 0, 33, 65, 13};

-static vec8 kBGRAToU = {
-  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
+static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+                        0, -38, -74, 112, 0, -38, -74, 112};

-static vec8 kBGRAToV = {
-  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
+static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+                        0, 112, -94, -18, 0, 112, -94, -18};

 // Constants for ABGR
-static vec8 kABGRToY = {
-  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
+static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
+                        33, 65, 13, 0, 33, 65, 13, 0};

-static vec8 kABGRToU = {
-  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
+static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+                        -38, -74, 112, 0, -38, -74, 112, 0};

-static vec8 kABGRToV = {
-  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
+static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+                        112, -94, -18, 0, 112, -94, -18, 0};

 // Constants for RGBA.
-static vec8 kRGBAToY = {
-  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
+static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
+                        0, 13, 65, 33, 0, 13, 65, 33};

-static vec8 kRGBAToU = {
-  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
+static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+                        0, 112, -74, -38, 0, 112, -74, -38};

-static vec8 kRGBAToV = {
-  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
+static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+                        0, -18, -94, 112, 0, -18, -94, 112};

-static uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
+static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+                        16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

 // 7 bit fixed point 0.5.
-static vec16 kAddYJ64 = {
-  64, 64, 64, 64, 64, 64, 64, 64
-};
+static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

-static uvec8 kAddUV128 = {
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+                          128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

-static uvec16 kAddUVJ128 = {
-  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
+static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+                            0x8080u, 0x8080u, 0x8080u, 0x8080u};
 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
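
With the kAddUV128/kAddUVJ128 biases, the chroma tables above amount to the usual BT.601 fixed-point forms. A hedged scalar sketch (hypothetical helpers; the SIMD path gets there via pmaddubsw/phaddw plus a bias add, so rounding differs slightly):

// Scalar sketch of the chroma math behind kARGBToU/kARGBToV: 0x8080 is a
// 128.5 bias in 8.8 fixed point, matching kAddUVJ128's word values.
static int RGBToU_Sketch(int r, int g, int b) {
  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
}
static int RGBToV_Sketch(int r, int g, int b) {
  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
}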

 #ifdef HAS_RGB24TOARGBROW_SSSE3

 // Shuffle table for converting RGB24 to ARGB.
-static uvec8 kShuffleMaskRGB24ToARGB = {
-  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
+static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u,
+                                        6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

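These tables drive pshufb, which remaps bytes as dst[i] = src[mask[i] & 15] within a 16-byte register, or writes zero when bit 7 of the mask byte is set (the 128u entries in the tables below rely on that). A scalar model, handy for checking a mask (PshufbModel is an illustrative name, not part of this file):

// Scalar model of SSSE3 pshufb over one 16-byte register.
static void PshufbModel(const uint8 src[16], const uint8 mask[16],
                        uint8 dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}
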
 // Shuffle table for converting RAW to ARGB.
-static uvec8 kShuffleMaskRAWToARGB = {
-  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
+static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
+                                      8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

 // Shuffle table for converting RAW to RGB24. First 8.
 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
-  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

 // Shuffle table for converting RAW to RGB24. Middle 8.
 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
-  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

 // Shuffle table for converting RAW to RGB24. Last 8.
 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
-  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

 // Shuffle table for converting ARGB to RGB24.
 static uvec8 kShuffleMaskARGBToRGB24 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

 // Shuffle table for converting ARGB to RAW.
 static uvec8 kShuffleMaskARGBToRAW = {
-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
+    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
 static uvec8 kShuffleMaskARGBToRGB24_0 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

 // YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
-};
+static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
+                                    10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
+                                    6, 6, 8, 8, 10, 10, 12, 12, 14, 14};

 // YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
-};
+static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
+                                     11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
+                                     5, 7, 9, 11, 9, 11, 13, 15, 13, 15};

 // UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
-};
+static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
+                                    11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
+                                    7, 7, 9, 9, 11, 11, 13, 13, 15, 15};

 // UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
-};
+static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
+                                     10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
+                                     4, 6, 8, 10, 8, 10, 12, 14, 12, 14};

 // NV21 shuf 8 VU to 16 UV.
 static const lvec8 kShuffleNV21 = {
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
 };
 #endif  // HAS_RGB24TOARGBROW_SSSE3
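
For reference, YUY2 packs two pixels as Y0 U Y1 V and UYVY as U Y0 V Y1; the shuffle tables above duplicate each Y and each U/V byte so 8 packed pixels yield 16 Y bytes and 16 interleaved UV bytes per lane. A scalar model of the YUY2 split that kShuffleYUY2Y/kShuffleYUY2UV perform (hypothetical helper):

// 16 YUY2 bytes (8 pixels) -> doubled Y and doubled UV, mirroring the masks
// {0, 0, 2, 2, ...} and {1, 3, 1, 3, ...}.
static void Yuy2SplitModel(const uint8 s[16], uint8 y[16], uint8 uv[16]) {
  for (int i = 0; i < 8; ++i) {
    y[2 * i] = y[2 * i + 1] = s[2 * i];
    uv[2 * i] = s[(i & ~1) * 2 + 1];      // U of the pixel pair
    uv[2 * i + 1] = s[(i & ~1) * 2 + 3];  // V of the pixel pair
  }
}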

 #ifdef HAS_J400TOARGBROW_SSE2
 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "pslld $0x18,%%xmm5 \n"
     LABELALIGN
     "1: \n"
(...skipping 367 matching lines...)
564 "lea " MEMLEA(0x8,1) ",%1 \n" 530 "lea " MEMLEA(0x8,1) ",%1 \n"
565 "sub $0x4,%2 \n" 531 "sub $0x4,%2 \n"
566 "jg 1b \n" 532 "jg 1b \n"
567 : "+r"(src), // %0 533 : "+r"(src), // %0
568 "+r"(dst), // %1 534 "+r"(dst), // %1
569 "+r"(width) // %2 535 "+r"(width) // %2
570 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 536 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
571 ); 537 );
572 } 538 }
573 539
-void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
-                                const uint32 dither4, int width) {
-  asm volatile (
+void ARGBToRGB565DitherRow_SSE2(const uint8* src,
+                                uint8* dst,
+                                const uint32 dither4,
+                                int width) {
+  asm volatile(
     "movd %3,%%xmm6 \n"
     "punpcklbw %%xmm6,%%xmm6 \n"
     "movdqa %%xmm6,%%xmm7 \n"
     "punpcklwd %%xmm6,%%xmm6 \n"
     "punpckhwd %%xmm7,%%xmm7 \n"
     "pcmpeqb %%xmm3,%%xmm3 \n"
     "psrld $0x1b,%%xmm3 \n"
     "pcmpeqb %%xmm4,%%xmm4 \n"
     "psrld $0x1a,%%xmm4 \n"
     "pslld $0x5,%%xmm4 \n"
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "pslld $0xb,%%xmm5 \n"

     LABELALIGN
     "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "paddusb %%xmm6,%%xmm0 \n"
     "movdqa %%xmm0,%%xmm1 \n"
     "movdqa %%xmm0,%%xmm2 \n"
     "pslld $0x8,%%xmm0 \n"
     "psrld $0x3,%%xmm1 \n"
     "psrld $0x5,%%xmm2 \n"
     "psrad $0x10,%%xmm0 \n"
     "pand %%xmm3,%%xmm1 \n"
     "pand %%xmm4,%%xmm2 \n"
     "pand %%xmm5,%%xmm0 \n"
     "por %%xmm2,%%xmm1 \n"
     "por %%xmm1,%%xmm0 \n"
     "packssdw %%xmm0,%%xmm0 \n"
     "lea 0x10(%0),%0 \n"
     "movq %%xmm0,(%1) \n"
     "lea 0x8(%1),%1 \n"
     "sub $0x4,%2 \n"
     "jg 1b \n"
   : "+r"(src),     // %0
     "+r"(dst),     // %1
     "+r"(width)    // %2
   : "m"(dither4)   // %3
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+    "xmm7");
 }
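
The constant registers built above are the 5/6/5 field masks, and dither4 holds one dither byte per pixel in each group of four, replicated across the row by the punpck steps. Per pixel the kernel is, roughly (a hedged scalar sketch of ARGBToRGB565Dither; the SIMD code handles four pixels at once):

// Add the dither byte with unsigned saturation (paddusb), then pack
// little-endian RGB565: B in bits 0-4, G in 5-10, R in 11-15.
static uint16 Pack565Dither(int b, int g, int r, int d) {
  int b8 = b + d > 255 ? 255 : b + d;
  int g8 = g + d > 255 ? 255 : g + d;
  int r8 = r + d > 255 ? 255 : r + d;
  return (uint16)((b8 >> 3) | ((g8 >> 2) << 5) | ((r8 >> 3) << 11));
}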

 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
-                                const uint32 dither4, int width) {
-  asm volatile (
+void ARGBToRGB565DitherRow_AVX2(const uint8* src,
+                                uint8* dst,
+                                const uint32 dither4,
+                                int width) {
+  asm volatile(
     "vbroadcastss %3,%%xmm6 \n"
     "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
     "vpermq $0xd8,%%ymm6,%%ymm6 \n"
     "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
     "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
     "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
     "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
     "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
     "vpslld $0x5,%%ymm4,%%ymm4 \n"
     "vpslld $0xb,%%ymm3,%%ymm5 \n"

     LABELALIGN
     "1: \n"
     "vmovdqu (%0),%%ymm0 \n"
     "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
     "vpsrld $0x5,%%ymm0,%%ymm2 \n"
     "vpsrld $0x3,%%ymm0,%%ymm1 \n"
     "vpsrld $0x8,%%ymm0,%%ymm0 \n"
     "vpand %%ymm4,%%ymm2,%%ymm2 \n"
     "vpand %%ymm3,%%ymm1,%%ymm1 \n"
     "vpand %%ymm5,%%ymm0,%%ymm0 \n"
     "vpor %%ymm2,%%ymm1,%%ymm1 \n"
     "vpor %%ymm1,%%ymm0,%%ymm0 \n"
     "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
     "vpermq $0xd8,%%ymm0,%%ymm0 \n"
     "lea 0x20(%0),%0 \n"
     "vmovdqu %%xmm0,(%1) \n"
     "lea 0x10(%1),%1 \n"
     "sub $0x8,%2 \n"
     "jg 1b \n"
     "vzeroupper \n"
   : "+r"(src),     // %0
     "+r"(dst),     // %1
     "+r"(width)    // %2
   : "m"(dither4)   // %3
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+    "xmm7");
 }
 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2

-
 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
   asm volatile (
     "pcmpeqb %%xmm4,%%xmm4 \n"
     "psrld $0x1b,%%xmm4 \n"
     "movdqa %%xmm4,%%xmm5 \n"
     "pslld $0x5,%%xmm5 \n"
     "movdqa %%xmm4,%%xmm6 \n"
     "pslld $0xa,%%xmm6 \n"
     "pcmpeqb %%xmm7,%%xmm7 \n"
     "pslld $0xf,%%xmm7 \n"
(...skipping 128 matching lines...)
804 "+r"(width) // %2 771 "+r"(width) // %2
805 : "m"(kARGBToYJ), // %3 772 : "m"(kARGBToYJ), // %3
806 "m"(kAddYJ64) // %4 773 "m"(kAddYJ64) // %4
807 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 774 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
808 ); 775 );
809 } 776 }
810 #endif // HAS_ARGBTOYJROW_SSSE3 777 #endif // HAS_ARGBTOYJROW_SSSE3
811 778
 #ifdef HAS_ARGBTOYROW_AVX2
 // vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
-  0, 4, 1, 5, 2, 6, 3, 7
-};
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
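
vphaddw and vpackuswb operate within each 128-bit half of a ymm register, so the 32 packed Y bytes come out with the two lanes interleaved; the cross-lane vpermd with pattern {0, 4, 1, 5, 2, 6, 3, 7} puts them back in pixel order. A scalar model of vpermd itself (illustrative helper):

// AVX2 vpermd: every output dword selects an arbitrary input dword.
static void VpermdModel(const uint32 src[8], const int32 perm[8],
                        uint32 dst[8]) {
  for (int i = 0; i < 8; ++i) {
    dst[i] = src[perm[i] & 7];
  }
}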

 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
   asm volatile (
     "vbroadcastf128 %3,%%ymm4 \n"
     "vbroadcastf128 %4,%%ymm5 \n"
     "vmovdqu %5,%%ymm6 \n"
     LABELALIGN
     "1: \n"
     "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
(...skipping 64 matching lines...)
891 "+r"(width) // %2 856 "+r"(width) // %2
892 : "m"(kARGBToYJ), // %3 857 : "m"(kARGBToYJ), // %3
893 "m"(kAddYJ64), // %4 858 "m"(kAddYJ64), // %4
894 "m"(kPermdARGBToY_AVX) // %5 859 "m"(kPermdARGBToY_AVX) // %5
895 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 860 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
896 ); 861 );
897 } 862 }
898 #endif // HAS_ARGBTOYJROW_AVX2 863 #endif // HAS_ARGBTOYJROW_AVX2
899 864
 #ifdef HAS_ARGBTOUVROW_SSSE3
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVRow_SSSE3(const uint8* src_argb0,
+                       int src_stride_argb,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   asm volatile (
     "movdqa %5,%%xmm3 \n"
     "movdqa %6,%%xmm4 \n"
     "movdqa %7,%%xmm5 \n"
     "sub %1,%2 \n"
     LABELALIGN
     "1: \n"
     "movdqu " MEMACCESS(0) ",%%xmm0 \n"
     MEMOPREG(movdqu,0x00,0,4,1,xmm7)  //  movdqu (%0,%4,1),%%xmm7
     "pavgb %%xmm7,%%xmm0 \n"
(...skipping 43 matching lines...)
     "m"(kAddUV128)  // %7
   : "memory", "cc", NACL_R14
     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   );
 }
 #endif  // HAS_ARGBTOUVROW_SSSE3
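
ARGBToUVRow samples chroma from a 2x2 block: the MEMOPREG load pulls the row at src_stride_argb and pavgb folds it into the current row, then the (elided) middle of the loop averages horizontal pixel pairs before applying the U/V weights. A hedged sketch of that sampling (Avg4 is an illustrative helper; pavgb rounds half up at each step, which the cascade below mimics):

// Average a 2x2 block the way two rounds of pavgb would.
static uint8 Avg4(uint8 a, uint8 b, uint8 c, uint8 d) {
  uint8 ab = (uint8)((a + b + 1) >> 1);  // vertical pavgb
  uint8 cd = (uint8)((c + d + 1) >> 1);
  return (uint8)((ab + cd + 1) >> 1);    // horizontal pavgb
}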

 #ifdef HAS_ARGBTOUVROW_AVX2
 // vpshufb for vphaddw + vpackuswb packed to shorts.
 static const lvec8 kShufARGBToUV_AVX = {
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-};
-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+void ARGBToUVRow_AVX2(const uint8* src_argb0,
+                      int src_stride_argb,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   asm volatile (
     "vbroadcastf128 %5,%%ymm5 \n"
     "vbroadcastf128 %6,%%ymm6 \n"
     "vbroadcastf128 %7,%%ymm7 \n"
     "sub %1,%2 \n"
     LABELALIGN
     "1: \n"
     "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
     "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
     "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
(...skipping 38 matching lines...)
     "m"(kARGBToV),  // %6
     "m"(kARGBToU),  // %7
     "m"(kShufARGBToUV_AVX)  // %8
   : "memory", "cc", NACL_R14
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 #endif  // HAS_ARGBTOUVROW_AVX2

 #ifdef HAS_ARGBTOUVJROW_AVX2
-void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVJRow_AVX2(const uint8* src_argb0,
+                       int src_stride_argb,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   asm volatile (
     "vbroadcastf128 %5,%%ymm5 \n"
     "vbroadcastf128 %6,%%ymm6 \n"
     "vbroadcastf128 %7,%%ymm7 \n"
     "sub %1,%2 \n"
     LABELALIGN
     "1: \n"
     "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
     "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
     "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
(...skipping 39 matching lines...)
     "m"(kARGBToVJ),  // %6
     "m"(kARGBToUJ),  // %7
     "m"(kShufARGBToUV_AVX)  // %8
   : "memory", "cc", NACL_R14
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 #endif  // HAS_ARGBTOUVJROW_AVX2

 #ifdef HAS_ARGBTOUVJROW_SSSE3
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
+                        int src_stride_argb,
+                        uint8* dst_u,
+                        uint8* dst_v,
+                        int width) {
   asm volatile (
     "movdqa %5,%%xmm3 \n"
     "movdqa %6,%%xmm4 \n"
     "movdqa %7,%%xmm5 \n"
     "sub %1,%2 \n"
     LABELALIGN
     "1: \n"
     "movdqu " MEMACCESS(0) ",%%xmm0 \n"
     MEMOPREG(movdqu,0x00,0,4,1,xmm7)  //  movdqu (%0,%4,1),%%xmm7
     "pavgb %%xmm7,%%xmm0 \n"
(...skipping 42 matching lines...)
     "m"(kARGBToVJ),  // %5
     "m"(kARGBToUJ),  // %6
     "m"(kAddUVJ128)  // %7
   : "memory", "cc", NACL_R14
     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   );
 }
 #endif  // HAS_ARGBTOUVJROW_SSSE3

 #ifdef HAS_ARGBTOUV444ROW_SSSE3
-void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_SSSE3(const uint8* src_argb,
+                          uint8* dst_u,
+                          uint8* dst_v,
                           int width) {
   asm volatile (
     "movdqa %4,%%xmm3 \n"
     "movdqa %5,%%xmm4 \n"
     "movdqa %6,%%xmm5 \n"
     "sub %1,%2 \n"
     LABELALIGN
     "1: \n"
     "movdqu " MEMACCESS(0) ",%%xmm0 \n"
     "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
(...skipping 69 matching lines...)
     "jg 1b \n"
   : "+r"(src_bgra),  // %0
     "+r"(dst_y),     // %1
     "+r"(width)      // %2
   : "m"(kBGRAToY),   // %3
     "m"(kAddY16)     // %4
   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }

-void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void BGRAToUVRow_SSSE3(const uint8* src_bgra0,
+                       int src_stride_bgra,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   asm volatile (
     "movdqa %5,%%xmm3 \n"
     "movdqa %6,%%xmm4 \n"
     "movdqa %7,%%xmm5 \n"
     "sub %1,%2 \n"
     LABELALIGN
     "1: \n"
     "movdqu " MEMACCESS(0) ",%%xmm0 \n"
     MEMOPREG(movdqu,0x00,0,4,1,xmm7)  //  movdqu (%0,%4,1),%%xmm7
     "pavgb %%xmm7,%%xmm0 \n"
(...skipping 107 matching lines...)
     "jg 1b \n"
   : "+r"(src_rgba),  // %0
     "+r"(dst_y),     // %1
     "+r"(width)      // %2
   : "m"(kRGBAToY),   // %3
     "m"(kAddY16)     // %4
   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }

-void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void ABGRToUVRow_SSSE3(const uint8* src_abgr0,
+                       int src_stride_abgr,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   asm volatile (
     "movdqa %5,%%xmm3 \n"
     "movdqa %6,%%xmm4 \n"
     "movdqa %7,%%xmm5 \n"
     "sub %1,%2 \n"
     LABELALIGN
     "1: \n"
     "movdqu " MEMACCESS(0) ",%%xmm0 \n"
     MEMOPREG(movdqu,0x00,0,4,1,xmm7)  //  movdqu (%0,%4,1),%%xmm7
     "pavgb %%xmm7,%%xmm0 \n"
(...skipping 39 matching lines...)
     "+rm"(width)  // %3
   : "r"((intptr_t)(src_stride_abgr)),  // %4
     "m"(kABGRToV),  // %5
     "m"(kABGRToU),  // %6
     "m"(kAddUV128)  // %7
   : "memory", "cc", NACL_R14
     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   );
 }

-void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
+                       int src_stride_rgba,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   asm volatile (
     "movdqa %5,%%xmm3 \n"
     "movdqa %6,%%xmm4 \n"
     "movdqa %7,%%xmm5 \n"
     "sub %1,%2 \n"
     LABELALIGN
     "1: \n"
     "movdqu " MEMACCESS(0) ",%%xmm0 \n"
     MEMOPREG(movdqu,0x00,0,4,1,xmm7)  //  movdqu (%0,%4,1),%%xmm7
     "pavgb %%xmm7,%%xmm0 \n"
(...skipping 42 matching lines...)
     "m"(kRGBAToU),  // %6
     "m"(kAddUV128)  // %7
   : "memory", "cc", NACL_R14
     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   );
 }

 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

 // Read 8 UV from 444
 #define READYUV444 \
     "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
     MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
     "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
     "punpcklbw %%xmm1,%%xmm0 \n" \
     "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
     "punpcklbw %%xmm4,%%xmm4 \n" \
     "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

 // Read 4 UV from 422, upsample to 8 UV
 #define READYUV422 \
     "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
     "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
     "punpcklbw %%xmm1,%%xmm0 \n" \
     "punpcklwd %%xmm0,%%xmm0 \n" \
     "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
     "punpcklbw %%xmm4,%%xmm4 \n" \
     "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

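READYUV422 above loads 4 U and 4 V bytes, interleaves them with punpcklbw, then doubles each UV pair with punpcklwd so one chroma sample covers two pixels. In scalar terms (a sketch; names are illustrative):

// 4 U + 4 V bytes -> 16 interleaved UV bytes, each sample used twice.
static void Upsample422Model(const uint8 u[4], const uint8 v[4],
                             uint8 uv[16]) {
  for (int i = 0; i < 4; ++i) {
    uv[4 * i + 0] = u[i];  // pixel 2*i
    uv[4 * i + 1] = v[i];
    uv[4 * i + 2] = u[i];  // pixel 2*i + 1
    uv[4 * i + 3] = v[i];
  }
}
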
 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
 #define READYUVA422 \
     "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
     "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
     "punpcklbw %%xmm1,%%xmm0 \n" \
     "punpcklwd %%xmm0,%%xmm0 \n" \
     "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
     "punpcklbw %%xmm4,%%xmm4 \n" \
     "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
     "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \
     "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"

 // Read 4 UV from NV12, upsample to 8 UV
 #define READNV12 \
     "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
     "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
     "punpcklwd %%xmm0,%%xmm0 \n" \
     "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
     "punpcklbw %%xmm4,%%xmm4 \n" \
     "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

 // Read 4 VU from NV21, upsample to 8 UV
 #define READNV21 \
     "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
     "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
     "pshufb %[kShuffleNV21], %%xmm0 \n" \
     "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
     "punpcklbw %%xmm4,%%xmm4 \n" \
     "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
 #define READYUY2 \
     "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
     "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
     "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
     "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
     "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"

 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
 #define READUYVY \
     "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
     "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
     "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
     "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
     "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"

 #if defined(__x86_64__)
 #define YUVTORGB_SETUP(yuvconstants) \
     "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
     "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \
     "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \
     "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \
     "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \
     "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \
     "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"
 // Convert 8 pixels: 8 UV and 8 Y
 #define YUVTORGB(yuvconstants) \
     "movdqa %%xmm0,%%xmm1 \n" \
     "movdqa %%xmm0,%%xmm2 \n" \
     "movdqa %%xmm0,%%xmm3 \n" \
     "movdqa %%xmm11,%%xmm0 \n" \
     "pmaddubsw %%xmm8,%%xmm1 \n" \
     "psubw %%xmm1,%%xmm0 \n" \
     "movdqa %%xmm12,%%xmm1 \n" \
     "pmaddubsw %%xmm9,%%xmm2 \n" \
     "psubw %%xmm2,%%xmm1 \n" \
     "movdqa %%xmm13,%%xmm2 \n" \
     "pmaddubsw %%xmm10,%%xmm3 \n" \
     "psubw %%xmm3,%%xmm2 \n" \
     "pmulhuw %%xmm14,%%xmm4 \n" \
     "paddsw %%xmm4,%%xmm0 \n" \
     "paddsw %%xmm4,%%xmm1 \n" \
     "paddsw %%xmm4,%%xmm2 \n" \
     "psraw $0x6,%%xmm0 \n" \
     "psraw $0x6,%%xmm1 \n" \
     "psraw $0x6,%%xmm2 \n" \
     "packuswb %%xmm0,%%xmm0 \n" \
     "packuswb %%xmm1,%%xmm1 \n" \
     "packuswb %%xmm2,%%xmm2 \n"
 #define YUVTORGB_REGS \
     "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

 #else
 #define YUVTORGB_SETUP(yuvconstants)
 // Convert 8 pixels: 8 UV and 8 Y
 #define YUVTORGB(yuvconstants) \
     "movdqa %%xmm0,%%xmm1 \n" \
     "movdqa %%xmm0,%%xmm2 \n" \
     "movdqa %%xmm0,%%xmm3 \n" \
     "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \
     "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \
     "psubw %%xmm1,%%xmm0 \n" \
     "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \
     "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \
     "psubw %%xmm2,%%xmm1 \n" \
     "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
     "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
     "psubw %%xmm3,%%xmm2 \n" \
     "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
     "paddsw %%xmm4,%%xmm0 \n" \
     "paddsw %%xmm4,%%xmm1 \n" \
     "paddsw %%xmm4,%%xmm2 \n" \
     "psraw $0x6,%%xmm0 \n" \
     "psraw $0x6,%%xmm1 \n" \
     "psraw $0x6,%%xmm2 \n" \
     "packuswb %%xmm0,%%xmm0 \n" \
     "packuswb %%xmm1,%%xmm1 \n" \
     "packuswb %%xmm2,%%xmm2 \n"
 #define YUVTORGB_REGS
 #endif

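Both YUVTORGB variants implement the same 6-bit fixed-point matrix: each channel starts from a bias, subtracts the pmaddubsw U/V contribution, adds Y scaled by pmulhuw, then shifts right by 6 and saturates. As a generic scalar illustration only (classic BT.601 integer constants; the real coefficients live packed in the YuvConstants struct and are not these values verbatim):

// Hedged scalar picture of the YUV->RGB step; 298/409/100/208/516 are the
// textbook 8.8 fixed-point BT.601 factors, shown for intuition.
static uint8 Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8)v); }
static void YuvToRgbSketch(int y, int u, int v,
                           uint8* r, uint8* g, uint8* b) {
  int y1 = 298 * (y - 16) + 128;  // +128 for rounding
  *r = Clamp255((y1 + 409 * (v - 128)) >> 8);
  *g = Clamp255((y1 - 100 * (u - 128) - 208 * (v - 128)) >> 8);
  *b = Clamp255((y1 + 516 * (u - 128)) >> 8);
}
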
 // Store 8 ARGB values.
 #define STOREARGB \
     "punpcklbw %%xmm1,%%xmm0 \n" \
     "punpcklbw %%xmm5,%%xmm2 \n" \
     "movdqa %%xmm0,%%xmm1 \n" \
     "punpcklwd %%xmm2,%%xmm0 \n" \
     "punpckhwd %%xmm2,%%xmm1 \n" \
     "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
     "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
     "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"

 // Store 8 RGBA values.
 #define STORERGBA \
     "pcmpeqb %%xmm5,%%xmm5 \n" \
     "punpcklbw %%xmm2,%%xmm1 \n" \
     "punpcklbw %%xmm0,%%xmm5 \n" \
     "movdqa %%xmm5,%%xmm0 \n" \
     "punpcklwd %%xmm1,%%xmm5 \n" \
     "punpckhwd %%xmm1,%%xmm0 \n" \
     "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
     "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
     "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"

 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
(...skipping 98 matching lines...)
 }

 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                                      const uint8* u_buf,
                                      const uint8* v_buf,
                                      const uint8* a_buf,
                                      uint8* dst_argb,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "sub %[u_buf],%[v_buf] \n"
     LABELALIGN
     "1: \n"
     READYUVA422
     YUVTORGB(yuvconstants)
     STOREARGB
     "subl $0x8,%[width] \n"
     "jg 1b \n"
   : [y_buf]"+r"(y_buf),        // %[y_buf]
     [u_buf]"+r"(u_buf),        // %[u_buf]
     [v_buf]"+r"(v_buf),        // %[v_buf]
     [a_buf]"+r"(a_buf),        // %[a_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
 #if defined(__i386__)
     [width]"+m"(width)         // %[width]
 #else
     [width]"+rm"(width)        // %[width]
 #endif
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", NACL_R14 YUVTORGB_REGS
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_I422ALPHATOARGBROW_SSSE3

 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                 const uint8* uv_buf,
                                 uint8* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb %%xmm5,%%xmm5 \n"
     LABELALIGN
     "1: \n"
     READNV12
     YUVTORGB(yuvconstants)
     STOREARGB
     "sub $0x8,%[width] \n"
     "jg 1b \n"
   : [y_buf]"+r"(y_buf),        // %[y_buf]
     [uv_buf]"+r"(uv_buf),      // %[uv_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)        // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }

 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                 const uint8* vu_buf,
                                 uint8* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb %%xmm5,%%xmm5 \n"
     LABELALIGN
     "1: \n"
     READNV21
     YUVTORGB(yuvconstants)
     STOREARGB
     "sub $0x8,%[width] \n"
     "jg 1b \n"
   : [y_buf]"+r"(y_buf),        // %[y_buf]
     [vu_buf]"+r"(vu_buf),      // %[vu_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)        // %[width]
   : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
     [kShuffleNV21]"m"(kShuffleNV21)
   : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }

 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
                                 uint8* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb %%xmm5,%%xmm5 \n"
     LABELALIGN
     "1: \n"
     READYUY2
     YUVTORGB(yuvconstants)
     STOREARGB
     "sub $0x8,%[width] \n"
     "jg 1b \n"
   : [yuy2_buf]"+r"(yuy2_buf),  // %[yuy2_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)        // %[width]
   : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
   : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }

 void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
                                 uint8* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb %%xmm5,%%xmm5 \n"
     LABELALIGN
     "1: \n"
     READUYVY
     YUVTORGB(yuvconstants)
     STOREARGB
     "sub $0x8,%[width] \n"
     "jg 1b \n"
   : [uyvy_buf]"+r"(uyvy_buf),  // %[uyvy_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)        // %[width]
   : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
     [kShuffleUYVYY]"m"(kShuffleUYVYY),
     [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
   : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }

 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* dst_rgba,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
(...skipping 13 matching lines...)
     [width]"+rm"(width)  // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
   : "memory", "cc", NACL_R14 YUVTORGB_REGS
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }

 #endif  // HAS_I422TOARGBROW_SSSE3

 // Read 16 UV from 444
 #define READYUV444_AVX2 \
     "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
     MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
     "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
     "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
     "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
     "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
     "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
     "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
     "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"

 // Read 8 UV from 422, upsample to 16 UV.
 #define READYUV422_AVX2 \
     "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
     MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
     "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
     "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
     "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
     "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
     "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
     "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
     "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"

 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
 #define READYUVA422_AVX2 \
     "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
     MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
     "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
     "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
     "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
     "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
     "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
     "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
     "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
     "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \
     "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
     "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"

 // Read 8 UV from NV12, upsample to 16 UV.
 #define READNV12_AVX2 \
     "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
     "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
     "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
     "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
     "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
     "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
     "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
     "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"

 // Read 8 VU from NV21, upsample to 16 UV.
 #define READNV21_AVX2 \
     "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
     "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
     "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
     "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
     "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
     "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
     "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
     "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"

 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
 #define READYUY2_AVX2 \
     "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
     "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
     "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
     "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
     "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"

 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
1993 #define READUYVY_AVX2 \ 1990 #define READUYVY_AVX2 \
1994 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ 1991 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
1995 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ 1992 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
1996 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ 1993 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
1997 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ 1994 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
1998 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" 1995 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
1999 1996
2000 #if defined(__x86_64__) 1997 #if defined(__x86_64__)
2001 #define YUVTORGB_SETUP_AVX2(yuvconstants) \ 1998 #define YUVTORGB_SETUP_AVX2(yuvconstants) \
2002 "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \ 1999 "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
2003 "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \ 2000 "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \
2004 "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \ 2001 "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \
2005 "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \ 2002 "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \
2006 "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \ 2003 "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \
2007 "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \ 2004 "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \
2008 "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n" 2005 "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"
2009 #define YUVTORGB_AVX2(yuvconstants) \ 2006 #define YUVTORGB_AVX2(yuvconstants) \
2010 "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ 2007 "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
2011 "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ 2008 "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
2012 "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ 2009 "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
2013 "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ 2010 "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
2014 "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ 2011 "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
2015 "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ 2012 "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
2016 "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ 2013 "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
2017 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ 2014 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
2018 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ 2015 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
2019 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ 2016 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
2020 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ 2017 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
2021 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ 2018 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
2022 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ 2019 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
2023 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ 2020 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
2024 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ 2021 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
2025 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" 2022 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
2026 #define YUVTORGB_REGS_AVX2 \ 2023 #define YUVTORGB_REGS_AVX2 \
2027 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", 2024 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
2028 #else // Convert 16 pixels: 16 UV and 16 Y. 2025 #else // Convert 16 pixels: 16 UV and 16 Y.
2029 #define YUVTORGB_SETUP_AVX2(yuvconstants) 2026 #define YUVTORGB_SETUP_AVX2(yuvconstants)
2030 #define YUVTORGB_AVX2(yuvconstants) \ 2027 #define YUVTORGB_AVX2(yuvconstants) \
2031 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ 2028 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
2032 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ 2029 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
2033 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ 2030 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
2034 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ 2031 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \
2035 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ 2032 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
2036 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ 2033 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \
2037 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ 2034 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
2038 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ 2035 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \
2039 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ 2036 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
2040 "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \ 2037 "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \
2041 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ 2038 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
2042 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ 2039 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
2043 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ 2040 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
2044 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ 2041 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
2045 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ 2042 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
2046 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ 2043 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
2047 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ 2044 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
2048 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ 2045 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
2049 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" 2046 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
2050 #define YUVTORGB_REGS_AVX2 2047 #define YUVTORGB_REGS_AVX2
2051 #endif 2048 #endif
2052 2049
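Both YUVTORGB_AVX2 variants above evaluate the same fixed-point matrix; they differ only in where the coefficients live (registers ymm8..ymm14 on x86_64, memory operands off yuvconstants otherwise). Below is a hedged scalar sketch of the per-pixel math implied by the vpmaddubsw/vpsubw/vpmulhuw/vpaddsw/vpsraw chain; the parameter names are stand-ins for the YuvConstants fields at offsets 0..192, not the real struct layout:

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Sketch of one pixel of the fixed-point YUV->RGB conversion:
 * chan = clamp((bias[c] - (u*coef[c][0] + v*coef[c][1]) + y_scaled) >> 6). */
static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                           const int16_t bias[3],      /* offsets 96/128/160 */
                           const int8_t uvcoef[3][2],  /* offsets 0/32/64 */
                           uint16_t ygain,             /* offset 192 (YG) */
                           uint8_t bgr[3]) {
  uint32_t y16 = (uint32_t)y * 0x0101u;        /* vpunpcklbw y,y */
  int y_scaled = (int)((y16 * ygain) >> 16);   /* vpmulhuw keeps high word */
  for (int c = 0; c < 3; ++c) {
    int uv = u * uvcoef[c][0] + v * uvcoef[c][1];        /* vpmaddubsw */
    bgr[c] = Clamp255((bias[c] - uv + y_scaled) >> 6);   /* vpsubw, vpaddsw,
                                                            vpsraw, vpackuswb */
  }
}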
2053 // Store 16 ARGB values. 2050 // Store 16 ARGB values.
2054 #define STOREARGB_AVX2 \ 2051 #define STOREARGB_AVX2 \
2055 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ 2052 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
2056 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ 2053 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2057 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ 2054 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
2058 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ 2055 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
2059 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ 2056 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
2060 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ 2057 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
2061 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ 2058 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
2062 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ 2059 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \
2063 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" 2060 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"
2064 2061
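STOREARGB_AVX2 interleaves the B (ymm0), G (ymm1), and R (ymm2) results with an all-ones alpha register (ymm5); in memory, libyuv's ARGB is the little-endian byte order B,G,R,A. A scalar equivalent of the store, for orientation only (not libyuv API):

#include <stdint.h>

/* Illustrative only: the byte order STOREARGB_AVX2 writes for 16 pixels. */
static void StoreArgbSketch(const uint8_t* b, const uint8_t* g,
                            const uint8_t* r, uint8_t* dst_argb) {
  for (int i = 0; i < 16; ++i) {
    dst_argb[4 * i + 0] = b[i];
    dst_argb[4 * i + 1] = g[i];
    dst_argb[4 * i + 2] = r[i];
    dst_argb[4 * i + 3] = 0xff;  /* alpha register set by vpcmpeqb in callers */
  }
}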
2065 #ifdef HAS_I444TOARGBROW_AVX2 2062 #ifdef HAS_I444TOARGBROW_AVX2
(...skipping 59 matching lines...)
2125 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 2122 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2126 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2123 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2127 ); 2124 );
2128 } 2125 }
2129 #endif // HAS_I422TOARGBROW_AVX2 2126 #endif // HAS_I422TOARGBROW_AVX2
2130 2127
2131 #if defined(HAS_I422ALPHATOARGBROW_AVX2) 2128 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
2132 // 16 pixels 2129 // 16 pixels
2133 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. 2130 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2134 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, 2131 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
2135 const uint8* u_buf, 2132 const uint8* u_buf,
2136 const uint8* v_buf, 2133 const uint8* v_buf,
2137 const uint8* a_buf, 2134 const uint8* a_buf,
2138 uint8* dst_argb, 2135 uint8* dst_argb,
2139 const struct YuvConstants* yuvconstants, 2136 const struct YuvConstants* yuvconstants,
2140 int width) { 2137 int width) {
2138 // clang-format off
2141 asm volatile ( 2139 asm volatile (
2142 YUVTORGB_SETUP_AVX2(yuvconstants) 2140 YUVTORGB_SETUP_AVX2(yuvconstants)
2143 "sub %[u_buf],%[v_buf] \n" 2141 "sub %[u_buf],%[v_buf] \n"
2144 LABELALIGN 2142 LABELALIGN
2145 "1: \n" 2143 "1: \n"
2146 READYUVA422_AVX2 2144 READYUVA422_AVX2
2147 YUVTORGB_AVX2(yuvconstants) 2145 YUVTORGB_AVX2(yuvconstants)
2148 STOREARGB_AVX2 2146 STOREARGB_AVX2
2149 "subl $0x10,%[width] \n" 2147 "subl $0x10,%[width] \n"
2150 "jg 1b \n" 2148 "jg 1b \n"
2151 "vzeroupper \n" 2149 "vzeroupper \n"
2152 : [y_buf]"+r"(y_buf), // %[y_buf] 2150 : [y_buf]"+r"(y_buf), // %[y_buf]
2153 [u_buf]"+r"(u_buf), // %[u_buf] 2151 [u_buf]"+r"(u_buf), // %[u_buf]
2154 [v_buf]"+r"(v_buf), // %[v_buf] 2152 [v_buf]"+r"(v_buf), // %[v_buf]
2155 [a_buf]"+r"(a_buf), // %[a_buf] 2153 [a_buf]"+r"(a_buf), // %[a_buf]
2156 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2154 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2157 #if defined(__i386__) 2155 #if defined(__i386__)
2158 [width]"+m"(width) // %[width] 2156 [width]"+m"(width) // %[width]
2159 #else 2157 #else
2160 [width]"+rm"(width) // %[width] 2158 [width]"+rm"(width) // %[width]
2161 #endif 2159 #endif
2162 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2160 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2163 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 2161 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2164 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2162 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2165 ); 2163 );
2164 // clang-format on
2166 } 2165 }
2167 #endif // HAS_I422ALPHATOARGBROW_AVX2 2166 #endif // HAS_I422ALPHATOARGBROW_AVX2
2168 2167
2169 #if defined(HAS_I422TORGBAROW_AVX2) 2168 #if defined(HAS_I422TORGBAROW_AVX2)
2170 // 16 pixels 2169 // 16 pixels
2171 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 2170 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2172 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, 2171 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
2173 const uint8* u_buf, 2172 const uint8* u_buf,
2174 const uint8* v_buf, 2173 const uint8* v_buf,
2175 uint8* dst_argb, 2174 uint8* dst_argb,
(...skipping 34 matching lines...)
2210 #endif // HAS_I422TORGBAROW_AVX2 2209 #endif // HAS_I422TORGBAROW_AVX2
2211 2210
2212 #if defined(HAS_NV12TOARGBROW_AVX2) 2211 #if defined(HAS_NV12TOARGBROW_AVX2)
2213 // 16 pixels. 2212 // 16 pixels.
2214 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 2213 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2215 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, 2214 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
2216 const uint8* uv_buf, 2215 const uint8* uv_buf,
2217 uint8* dst_argb, 2216 uint8* dst_argb,
2218 const struct YuvConstants* yuvconstants, 2217 const struct YuvConstants* yuvconstants,
2219 int width) { 2218 int width) {
2219 // clang-format off
2220 asm volatile ( 2220 asm volatile (
2221 YUVTORGB_SETUP_AVX2(yuvconstants) 2221 YUVTORGB_SETUP_AVX2(yuvconstants)
2222 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2222 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2223 LABELALIGN 2223 LABELALIGN
2224 "1: \n" 2224 "1: \n"
2225 READNV12_AVX2 2225 READNV12_AVX2
2226 YUVTORGB_AVX2(yuvconstants) 2226 YUVTORGB_AVX2(yuvconstants)
2227 STOREARGB_AVX2 2227 STOREARGB_AVX2
2228 "sub $0x10,%[width] \n" 2228 "sub $0x10,%[width] \n"
2229 "jg 1b \n" 2229 "jg 1b \n"
2230 "vzeroupper \n" 2230 "vzeroupper \n"
2231 : [y_buf]"+r"(y_buf), // %[y_buf] 2231 : [y_buf]"+r"(y_buf), // %[y_buf]
2232 [uv_buf]"+r"(uv_buf), // %[uv_buf] 2232 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2233 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2233 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2234 [width]"+rm"(width) // %[width] 2234 [width]"+rm"(width) // %[width]
2235 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2235 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2236 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. 2236 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2237 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2237 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2238 ); 2238 );
2239 // clang-format on
2239 } 2240 }
2240 #endif // HAS_NV12TOARGBROW_AVX2 2241 #endif // HAS_NV12TOARGBROW_AVX2
2241 2242
2242 #if defined(HAS_NV21TOARGBROW_AVX2) 2243 #if defined(HAS_NV21TOARGBROW_AVX2)
2243 // 16 pixels. 2244 // 16 pixels.
2244 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 2245 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2245 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, 2246 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
2246 const uint8* vu_buf, 2247 const uint8* vu_buf,
2247 uint8* dst_argb, 2248 uint8* dst_argb,
2248 const struct YuvConstants* yuvconstants, 2249 const struct YuvConstants* yuvconstants,
2249 int width) { 2250 int width) {
2251 // clang-format off
2250 asm volatile ( 2252 asm volatile (
2251 YUVTORGB_SETUP_AVX2(yuvconstants) 2253 YUVTORGB_SETUP_AVX2(yuvconstants)
2252 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2254 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2253 LABELALIGN 2255 LABELALIGN
2254 "1: \n" 2256 "1: \n"
2255 READNV21_AVX2 2257 READNV21_AVX2
2256 YUVTORGB_AVX2(yuvconstants) 2258 YUVTORGB_AVX2(yuvconstants)
2257 STOREARGB_AVX2 2259 STOREARGB_AVX2
2258 "sub $0x10,%[width] \n" 2260 "sub $0x10,%[width] \n"
2259 "jg 1b \n" 2261 "jg 1b \n"
2260 "vzeroupper \n" 2262 "vzeroupper \n"
2261 : [y_buf]"+r"(y_buf), // %[y_buf] 2263 : [y_buf]"+r"(y_buf), // %[y_buf]
2262 [vu_buf]"+r"(vu_buf), // %[vu_buf] 2264 [vu_buf]"+r"(vu_buf), // %[vu_buf]
2263 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2265 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2264 [width]"+rm"(width) // %[width] 2266 [width]"+rm"(width) // %[width]
2265 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 2267 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2266 [kShuffleNV21]"m"(kShuffleNV21) 2268 [kShuffleNV21]"m"(kShuffleNV21)
2267 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. 2269 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2268 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2270 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2269 ); 2271 );
2272 // clang-format on
2270 } 2273 }
2271 #endif // HAS_NV21TOARGBROW_AVX2 2274 #endif // HAS_NV21TOARGBROW_AVX2
2272 2275
2273 #if defined(HAS_YUY2TOARGBROW_AVX2) 2276 #if defined(HAS_YUY2TOARGBROW_AVX2)
2274 // 16 pixels. 2277 // 16 pixels.
2275 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 2278 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2276 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, 2279 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
2277 uint8* dst_argb, 2280 uint8* dst_argb,
2278 const struct YuvConstants* yuvconstants, 2281 const struct YuvConstants* yuvconstants,
2279 int width) { 2282 int width) {
2283 // clang-format off
2280 asm volatile ( 2284 asm volatile (
2281 YUVTORGB_SETUP_AVX2(yuvconstants) 2285 YUVTORGB_SETUP_AVX2(yuvconstants)
2282 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2286 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2283 LABELALIGN 2287 LABELALIGN
2284 "1: \n" 2288 "1: \n"
2285 READYUY2_AVX2 2289 READYUY2_AVX2
2286 YUVTORGB_AVX2(yuvconstants) 2290 YUVTORGB_AVX2(yuvconstants)
2287 STOREARGB_AVX2 2291 STOREARGB_AVX2
2288 "sub $0x10,%[width] \n" 2292 "sub $0x10,%[width] \n"
2289 "jg 1b \n" 2293 "jg 1b \n"
2290 "vzeroupper \n" 2294 "vzeroupper \n"
2291 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] 2295 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
2292 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2296 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2293 [width]"+rm"(width) // %[width] 2297 [width]"+rm"(width) // %[width]
2294 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 2298 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2295 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), 2299 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2296 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) 2300 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2297 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. 2301 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2298 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2302 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2299 ); 2303 );
2304 // clang-format on
2300 } 2305 }
2301 #endif // HAS_YUY2TOARGBROW_AVX2 2306 #endif // HAS_YUY2TOARGBROW_AVX2
2302 2307
2303 #if defined(HAS_UYVYTOARGBROW_AVX2) 2308 #if defined(HAS_UYVYTOARGBROW_AVX2)
2304 // 16 pixels. 2309 // 16 pixels.
2305 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 2310 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2306 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, 2311 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
2307 uint8* dst_argb, 2312 uint8* dst_argb,
2308 const struct YuvConstants* yuvconstants, 2313 const struct YuvConstants* yuvconstants,
2309 int width) { 2314 int width) {
2315 // clang-format off
2310 asm volatile ( 2316 asm volatile (
2311 YUVTORGB_SETUP_AVX2(yuvconstants) 2317 YUVTORGB_SETUP_AVX2(yuvconstants)
2312 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2318 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2313 LABELALIGN 2319 LABELALIGN
2314 "1: \n" 2320 "1: \n"
2315 READUYVY_AVX2 2321 READUYVY_AVX2
2316 YUVTORGB_AVX2(yuvconstants) 2322 YUVTORGB_AVX2(yuvconstants)
2317 STOREARGB_AVX2 2323 STOREARGB_AVX2
2318 "sub $0x10,%[width] \n" 2324 "sub $0x10,%[width] \n"
2319 "jg 1b \n" 2325 "jg 1b \n"
2320 "vzeroupper \n" 2326 "vzeroupper \n"
2321 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] 2327 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
2322 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2328 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2323 [width]"+rm"(width) // %[width] 2329 [width]"+rm"(width) // %[width]
2324 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 2330 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2325 [kShuffleUYVYY]"m"(kShuffleUYVYY), 2331 [kShuffleUYVYY]"m"(kShuffleUYVYY),
2326 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) 2332 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
2327 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. 2333 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2328 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2334 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2329 ); 2335 );
2336 // clang-format on
2330 } 2337 }
2331 #endif // HAS_UYVYTOARGBROW_AVX2 2338 #endif // HAS_UYVYTOARGBROW_AVX2
2332 2339
2333 #ifdef HAS_I400TOARGBROW_SSE2 2340 #ifdef HAS_I400TOARGBROW_SSE2
2334 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { 2341 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
2335 asm volatile ( 2342 asm volatile (
2336 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 2343 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
2337 "movd %%eax,%%xmm2 \n" 2344 "movd %%eax,%%xmm2 \n"
2338 "pshufd $0x0,%%xmm2,%%xmm2 \n" 2345 "pshufd $0x0,%%xmm2,%%xmm2 \n"
2339 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 2346 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
(...skipping 77 matching lines...)
2417 "+rm"(width) // %2 2424 "+rm"(width) // %2
2418 : 2425 :
2419 : "memory", "cc", "eax" 2426 : "memory", "cc", "eax"
2420 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" 2427 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2421 ); 2428 );
2422 } 2429 }
2423 #endif // HAS_I400TOARGBROW_AVX2 2430 #endif // HAS_I400TOARGBROW_AVX2
2424 2431
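Per the inline comments in I400ToARGBRow_SSE2 (0x4a35 is 1.164 in fixed point, 0x0488 the 16-level bias times 1.164), the I400 rows expand studio-swing luma to full range and splat it to gray ARGB. A hedged scalar sketch; the fixed-point asm rounds slightly differently:

#include <stdint.h>

static uint8_t ClampGray(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Illustrative only: gray = 1.164 * (Y - 16), replicated to B,G,R, A = 255. */
static void I400ToArgbSketch(const uint8_t* src_y, uint8_t* dst_argb,
                             int width) {
  for (int i = 0; i < width; ++i) {
    int gray = ClampGray((int)(1.164f * ((int)src_y[i] - 16) + 0.5f));
    dst_argb[4 * i + 0] = dst_argb[4 * i + 1] = dst_argb[4 * i + 2] =
        (uint8_t)gray;
    dst_argb[4 * i + 3] = 255;
  }
}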
2425 #ifdef HAS_MIRRORROW_SSSE3 2432 #ifdef HAS_MIRRORROW_SSSE3
2426 // Shuffle table for reversing the bytes. 2433 // Shuffle table for reversing the bytes.
2427 static uvec8 kShuffleMirror = { 2434 static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
2428 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u 2435 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
2429 };
2430 2436
2431 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { 2437 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2432 intptr_t temp_width = (intptr_t)(width); 2438 intptr_t temp_width = (intptr_t)(width);
2433 asm volatile ( 2439 asm volatile (
2434 "movdqa %3,%%xmm5 \n" 2440 "movdqa %3,%%xmm5 \n"
2435 LABELALIGN 2441 LABELALIGN
2436 "1: \n" 2442 "1: \n"
2437 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 2443 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
2438 "pshufb %%xmm5,%%xmm0 \n" 2444 "pshufb %%xmm5,%%xmm0 \n"
2439 "movdqu %%xmm0," MEMACCESS(1) " \n" 2445 "movdqu %%xmm0," MEMACCESS(1) " \n"
(...skipping 30 matching lines...)
2470 "+r"(temp_width) // %2 2476 "+r"(temp_width) // %2
2471 : "m"(kShuffleMirror) // %3 2477 : "m"(kShuffleMirror) // %3
2472 : "memory", "cc", NACL_R14 2478 : "memory", "cc", NACL_R14
2473 "xmm0", "xmm5" 2479 "xmm0", "xmm5"
2474 ); 2480 );
2475 } 2481 }
2476 #endif // HAS_MIRRORROW_AVX2 2482 #endif // HAS_MIRRORROW_AVX2
2477 2483
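The mirror rows implement a plain byte reversal: kShuffleMirror reverses 16 bytes per vector while the source pointer walks backward. Scalar equivalent, for orientation:

#include <stdint.h>

/* Illustrative only: what MirrorRow_* computes. */
static void MirrorRowSketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}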
2478 #ifdef HAS_MIRRORUVROW_SSSE3 2484 #ifdef HAS_MIRRORUVROW_SSSE3
2479 // Shuffle table for reversing the bytes of UV channels. 2485 // Shuffle table for reversing the bytes of UV channels.
2480 static uvec8 kShuffleMirrorUV = { 2486 static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
2481 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u 2487 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
2482 }; 2488 void MirrorUVRow_SSSE3(const uint8* src,
2483 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, 2489 uint8* dst_u,
2490 uint8* dst_v,
2484 int width) { 2491 int width) {
2485 intptr_t temp_width = (intptr_t)(width); 2492 intptr_t temp_width = (intptr_t)(width);
2486 asm volatile ( 2493 asm volatile (
2487 "movdqa %4,%%xmm1 \n" 2494 "movdqa %4,%%xmm1 \n"
2488 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" 2495 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
2489 "sub %1,%2 \n" 2496 "sub %1,%2 \n"
2490 LABELALIGN 2497 LABELALIGN
2491 "1: \n" 2498 "1: \n"
2492 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2499 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2493 "lea " MEMLEA(-0x10,0) ",%0 \n" 2500 "lea " MEMLEA(-0x10,0) ",%0 \n"
(...skipping 34 matching lines...)
2528 "+r"(temp_width) // %2 2535 "+r"(temp_width) // %2
2529 : 2536 :
2530 : "memory", "cc" 2537 : "memory", "cc"
2531 , "xmm0" 2538 , "xmm0"
2532 ); 2539 );
2533 } 2540 }
2534 #endif // HAS_ARGBMIRRORROW_SSE2 2541 #endif // HAS_ARGBMIRRORROW_SSE2
2535 2542
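MirrorUVRow_SSSE3 reverses an interleaved UV row while splitting it into planes: kShuffleMirrorUV gathers the reversed even (U) bytes into the low half of the vector and the reversed odd (V) bytes into the high half. A scalar sketch with illustrative names:

#include <stdint.h>

/* Illustrative only: reverse an interleaved UV row into separate planes. */
static void MirrorUVSketch(const uint8_t* src_uv, uint8_t* dst_u,
                           uint8_t* dst_v, int width) {
  for (int i = 0; i < width; ++i) {
    dst_u[i] = src_uv[2 * (width - 1 - i) + 0];
    dst_v[i] = src_uv[2 * (width - 1 - i) + 1];
  }
}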
2536 #ifdef HAS_ARGBMIRRORROW_AVX2 2543 #ifdef HAS_ARGBMIRRORROW_AVX2
2537 // Shuffle table for reversing the bytes. 2544 // Shuffle table for reversing the bytes.
2538 static const ulvec32 kARGBShuffleMirror_AVX2 = { 2545 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
2539 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2540 };
2541 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { 2546 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2542 intptr_t temp_width = (intptr_t)(width); 2547 intptr_t temp_width = (intptr_t)(width);
2543 asm volatile ( 2548 asm volatile (
2544 "vmovdqu %3,%%ymm5 \n" 2549 "vmovdqu %3,%%ymm5 \n"
2545 LABELALIGN 2550 LABELALIGN
2546 "1: \n" 2551 "1: \n"
2547 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 2552 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
2548 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 2553 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2549 "lea " MEMLEA(0x20,1) ",%1 \n" 2554 "lea " MEMLEA(0x20,1) ",%1 \n"
2550 "sub $0x8,%2 \n" 2555 "sub $0x8,%2 \n"
2551 "jg 1b \n" 2556 "jg 1b \n"
2552 "vzeroupper \n" 2557 "vzeroupper \n"
2553 : "+r"(src), // %0 2558 : "+r"(src), // %0
2554 "+r"(dst), // %1 2559 "+r"(dst), // %1
2555 "+r"(temp_width) // %2 2560 "+r"(temp_width) // %2
2556 : "m"(kARGBShuffleMirror_AVX2) // %3 2561 : "m"(kARGBShuffleMirror_AVX2) // %3
2557 : "memory", "cc", NACL_R14 2562 : "memory", "cc", NACL_R14
2558 "xmm0", "xmm5" 2563 "xmm0", "xmm5"
2559 ); 2564 );
2560 } 2565 }
2561 #endif // HAS_ARGBMIRRORROW_AVX2 2566 #endif // HAS_ARGBMIRRORROW_AVX2
2562 2567
2563 #ifdef HAS_SPLITUVROW_AVX2 2568 #ifdef HAS_SPLITUVROW_AVX2
2564 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 2569 void SplitUVRow_AVX2(const uint8* src_uv,
2570 uint8* dst_u,
2571 uint8* dst_v,
2565 int width) { 2572 int width) {
2566 asm volatile ( 2573 asm volatile (
2567 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2574 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2568 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 2575 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
2569 "sub %1,%2 \n" 2576 "sub %1,%2 \n"
2570 LABELALIGN 2577 LABELALIGN
2571 "1: \n" 2578 "1: \n"
2572 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 2579 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2573 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 2580 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2574 "lea " MEMLEA(0x40,0) ",%0 \n" 2581 "lea " MEMLEA(0x40,0) ",%0 \n"
(...skipping 16 matching lines...)
2591 "+r"(dst_v), // %2 2598 "+r"(dst_v), // %2
2592 "+r"(width) // %3 2599 "+r"(width) // %3
2593 : 2600 :
2594 : "memory", "cc", NACL_R14 2601 : "memory", "cc", NACL_R14
2595 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 2602 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2596 ); 2603 );
2597 } 2604 }
2598 #endif // HAS_SPLITUVROW_AVX2 2605 #endif // HAS_SPLITUVROW_AVX2
2599 2606
2600 #ifdef HAS_SPLITUVROW_SSE2 2607 #ifdef HAS_SPLITUVROW_SSE2
2601 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 2608 void SplitUVRow_SSE2(const uint8* src_uv,
2609 uint8* dst_u,
2610 uint8* dst_v,
2602 int width) { 2611 int width) {
2603 asm volatile ( 2612 asm volatile (
2604 "pcmpeqb %%xmm5,%%xmm5 \n" 2613 "pcmpeqb %%xmm5,%%xmm5 \n"
2605 "psrlw $0x8,%%xmm5 \n" 2614 "psrlw $0x8,%%xmm5 \n"
2606 "sub %1,%2 \n" 2615 "sub %1,%2 \n"
2607 LABELALIGN 2616 LABELALIGN
2608 "1: \n" 2617 "1: \n"
2609 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2618 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2610 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 2619 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2611 "lea " MEMLEA(0x20,0) ",%0 \n" 2620 "lea " MEMLEA(0x20,0) ",%0 \n"
(...skipping 15 matching lines...)
2627 "+r"(dst_v), // %2 2636 "+r"(dst_v), // %2
2628 "+r"(width) // %3 2637 "+r"(width) // %3
2629 : 2638 :
2630 : "memory", "cc", NACL_R14 2639 : "memory", "cc", NACL_R14
2631 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 2640 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2632 ); 2641 );
2633 } 2642 }
2634 #endif // HAS_SPLITUVROW_SSE2 2643 #endif // HAS_SPLITUVROW_SSE2
2635 2644
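SplitUVRow deinterleaves an NV12-style UV row into separate planes; the 0x00ff word mask keeps the even bytes and a right shift exposes the odd ones. Scalar equivalent, for orientation:

#include <stdint.h>

/* Illustrative only: what SplitUVRow_* computes per UV pair. */
static void SplitUVSketch(const uint8_t* src_uv, uint8_t* dst_u,
                          uint8_t* dst_v, int width) {
  for (int i = 0; i < width; ++i) {
    dst_u[i] = src_uv[2 * i + 0];  /* even bytes */
    dst_v[i] = src_uv[2 * i + 1];  /* odd bytes */
  }
}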
2636 #ifdef HAS_MERGEUVROW_AVX2 2645 #ifdef HAS_MERGEUVROW_AVX2
2637 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 2646 void MergeUVRow_AVX2(const uint8* src_u,
2647 const uint8* src_v,
2648 uint8* dst_uv,
2638 int width) { 2649 int width) {
2639 asm volatile ( 2650 asm volatile (
2640 "sub %0,%1 \n" 2651 "sub %0,%1 \n"
2641 LABELALIGN 2652 LABELALIGN
2642 "1: \n" 2653 "1: \n"
2643 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 2654 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2644 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 2655 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
2645 "lea " MEMLEA(0x20,0) ",%0 \n" 2656 "lea " MEMLEA(0x20,0) ",%0 \n"
2646 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" 2657 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
2647 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" 2658 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
(...skipping 10 matching lines...)
2658 "+r"(dst_uv), // %2 2669 "+r"(dst_uv), // %2
2659 "+r"(width) // %3 2670 "+r"(width) // %3
2660 : 2671 :
2661 : "memory", "cc", NACL_R14 2672 : "memory", "cc", NACL_R14
2662 "xmm0", "xmm1", "xmm2" 2673 "xmm0", "xmm1", "xmm2"
2663 ); 2674 );
2664 } 2675 }
2665 #endif // HAS_MERGEUVROW_AVX2 2676 #endif // HAS_MERGEUVROW_AVX2
2666 2677
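MergeUVRow is the inverse operation: punpcklbw/punpckhbw interleave the two planes back into UV pairs. Scalar equivalent:

#include <stdint.h>

/* Illustrative only: what MergeUVRow_* computes. */
static void MergeUVSketch(const uint8_t* src_u, const uint8_t* src_v,
                          uint8_t* dst_uv, int width) {
  for (int i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = src_u[i];
    dst_uv[2 * i + 1] = src_v[i];
  }
}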
2667 #ifdef HAS_MERGEUVROW_SSE2 2678 #ifdef HAS_MERGEUVROW_SSE2
2668 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 2679 void MergeUVRow_SSE2(const uint8* src_u,
2680 const uint8* src_v,
2681 uint8* dst_uv,
2669 int width) { 2682 int width) {
2670 asm volatile ( 2683 asm volatile (
2671 "sub %0,%1 \n" 2684 "sub %0,%1 \n"
2672 LABELALIGN 2685 LABELALIGN
2673 "1: \n" 2686 "1: \n"
2674 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2687 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2675 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 2688 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
2676 "lea " MEMLEA(0x10,0) ",%0 \n" 2689 "lea " MEMLEA(0x10,0) ",%0 \n"
2677 "movdqa %%xmm0,%%xmm2 \n" 2690 "movdqa %%xmm0,%%xmm2 \n"
2678 "punpcklbw %%xmm1,%%xmm0 \n" 2691 "punpcklbw %%xmm1,%%xmm0 \n"
(...skipping 73 matching lines...)
2752 : "memory", "cc" 2765 : "memory", "cc"
2753 , "xmm0", "xmm1" 2766 , "xmm0", "xmm1"
2754 ); 2767 );
2755 } 2768 }
2756 #endif // HAS_COPYROW_AVX 2769 #endif // HAS_COPYROW_AVX
2757 2770
2758 #ifdef HAS_COPYROW_ERMS 2771 #ifdef HAS_COPYROW_ERMS
2759 // Multiple of 1. 2772 // Multiple of 1.
2760 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { 2773 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
2761 size_t width_tmp = (size_t)(width); 2774 size_t width_tmp = (size_t)(width);
2762 asm volatile ( 2775 asm volatile("rep movsb " MEMMOVESTRING(0, 1) " \n"
2763 "rep movsb " MEMMOVESTRING(0,1) " \n" 2776 : "+S"(src), // %0
2764 : "+S"(src), // %0 2777 "+D"(dst), // %1
2765 "+D"(dst), // %1 2778 "+c"(width_tmp) // %2
2766 "+c"(width_tmp) // %2 2779 :
2767 : 2780 : "memory", "cc");
2768 : "memory", "cc"
2769 );
2770 } 2781 }
2771 #endif // HAS_COPYROW_ERMS 2782 #endif // HAS_COPYROW_ERMS
2772 2783
2773 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 2784 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
2774 // width in pixels 2785 // width in pixels
2775 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { 2786 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2776 asm volatile ( 2787 asm volatile (
2777 "pcmpeqb %%xmm0,%%xmm0 \n" 2788 "pcmpeqb %%xmm0,%%xmm0 \n"
2778 "pslld $0x18,%%xmm0 \n" 2789 "pslld $0x18,%%xmm0 \n"
2779 "pcmpeqb %%xmm1,%%xmm1 \n" 2790 "pcmpeqb %%xmm1,%%xmm1 \n"
(...skipping 51 matching lines...)
2831 : 2842 :
2832 : "memory", "cc" 2843 : "memory", "cc"
2833 , "xmm0", "xmm1", "xmm2" 2844 , "xmm0", "xmm1", "xmm2"
2834 ); 2845 );
2835 } 2846 }
2836 #endif // HAS_ARGBCOPYALPHAROW_AVX2 2847 #endif // HAS_ARGBCOPYALPHAROW_AVX2
2837 2848
2838 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 2849 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
2839 // width in pixels 2850 // width in pixels
2840 void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { 2851 void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
2841 asm volatile ( 2852 asm volatile (
2842 LABELALIGN 2853 LABELALIGN
2843 "1: \n" 2854 "1: \n"
2844 "movdqu " MEMACCESS(0) ", %%xmm0 \n" 2855 "movdqu " MEMACCESS(0) ", %%xmm0 \n"
2845 "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n" 2856 "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
2846 "lea " MEMLEA(0x20, 0) ", %0 \n" 2857 "lea " MEMLEA(0x20, 0) ", %0 \n"
2847 "psrld $0x18, %%xmm0 \n" 2858 "psrld $0x18, %%xmm0 \n"
2848 "psrld $0x18, %%xmm1 \n" 2859 "psrld $0x18, %%xmm1 \n"
2849 "packssdw %%xmm1, %%xmm0 \n" 2860 "packssdw %%xmm1, %%xmm0 \n"
2850 "packuswb %%xmm0, %%xmm0 \n" 2861 "packuswb %%xmm0, %%xmm0 \n"
2851 "movq %%xmm0," MEMACCESS(1) " \n" 2862 "movq %%xmm0," MEMACCESS(1) " \n"
2852 "lea " MEMLEA(0x8, 1) ", %1 \n" 2863 "lea " MEMLEA(0x8, 1) ", %1 \n"
2853 "sub $0x8, %2 \n" 2864 "sub $0x8, %2 \n"
2854 "jg 1b \n" 2865 "jg 1b \n"
2855 : "+r"(src_argb), // %0 2866 : "+r"(src_argb), // %0
2856 "+r"(dst_a), // %1 2867 "+r"(dst_a), // %1
2857 "+rm"(width) // %2 2868 "+rm"(width) // %2
2858 : 2869 :
2859 : "memory", "cc" 2870 : "memory", "cc"
2860 , "xmm0", "xmm1" 2871 , "xmm0", "xmm1"
2861 ); 2872 );
2862 } 2873 }
2863 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 2874 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
2864 2875
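ARGBExtractAlphaRow pulls the A byte out of each 4-byte pixel (psrld $0x18 shifts it into the low byte before the packs narrow it). Scalar equivalent:

#include <stdint.h>

/* Illustrative only: alpha is the high byte of each little-endian ARGB pixel. */
static void ExtractAlphaSketch(const uint8_t* src_argb, uint8_t* dst_a,
                               int width) {
  for (int i = 0; i < width; ++i) {
    dst_a[i] = src_argb[4 * i + 3];
  }
}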
2865 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 2876 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
2866 static const uvec8 kShuffleAlphaShort_AVX2 = { 2877 static const uvec8 kShuffleAlphaShort_AVX2 = {
2867 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, 2878 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
2868 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u 2879 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
2869 };
2870 2880
2871 void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) { 2881 void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
2872 asm volatile ( 2882 asm volatile (
2873 "vmovdqa %3,%%ymm4 \n" 2883 "vmovdqa %3,%%ymm4 \n"
2874 "vbroadcastf128 %4,%%ymm5 \n" 2884 "vbroadcastf128 %4,%%ymm5 \n"
2875 LABELALIGN 2885 LABELALIGN
2876 "1: \n" 2886 "1: \n"
2877 "vmovdqu " MEMACCESS(0) ", %%ymm0 \n" 2887 "vmovdqu " MEMACCESS(0) ", %%ymm0 \n"
2878 "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n" 2888 "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n"
2879 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 2889 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
2880 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" 2890 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
2881 "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n" 2891 "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n"
2882 "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n" 2892 "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n"
(...skipping 86 matching lines...)
2969 : "memory", "cc" 2979 : "memory", "cc"
2970 , "xmm0", "xmm1", "xmm2" 2980 , "xmm0", "xmm1", "xmm2"
2971 ); 2981 );
2972 } 2982 }
2973 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 2983 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
2974 2984
2975 #ifdef HAS_SETROW_X86 2985 #ifdef HAS_SETROW_X86
2976 void SetRow_X86(uint8* dst, uint8 v8, int width) { 2986 void SetRow_X86(uint8* dst, uint8 v8, int width) {
2977 size_t width_tmp = (size_t)(width >> 2); 2987 size_t width_tmp = (size_t)(width >> 2);
2978 const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. 2988 const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
2979 asm volatile ( 2989 asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n"
2980 "rep stosl " MEMSTORESTRING(eax,0) " \n" 2990 : "+D"(dst), // %0
2981 : "+D"(dst), // %0 2991 "+c"(width_tmp) // %1
2982 "+c"(width_tmp) // %1 2992 : "a"(v32) // %2
2983 : "a"(v32) // %2 2993 : "memory", "cc");
2984 : "memory", "cc");
2985 } 2994 }
2986 2995
2987 void SetRow_ERMS(uint8* dst, uint8 v8, int width) { 2996 void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
2988 size_t width_tmp = (size_t)(width); 2997 size_t width_tmp = (size_t)(width);
2989 asm volatile ( 2998 asm volatile("rep stosb " MEMSTORESTRING(al, 0) " \n"
2990 "rep stosb " MEMSTORESTRING(al,0) " \n" 2999 : "+D"(dst), // %0
2991 : "+D"(dst), // %0 3000 "+c"(width_tmp) // %1
2992 "+c"(width_tmp) // %1 3001 : "a"(v8) // %2
2993 : "a"(v8) // %2 3002 : "memory", "cc");
2994 : "memory", "cc");
2995 } 3003 }
2996 3004
2997 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { 3005 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
2998 size_t width_tmp = (size_t)(width); 3006 size_t width_tmp = (size_t)(width);
2999 asm volatile ( 3007 asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n"
3000 "rep stosl " MEMSTORESTRING(eax,0) " \n" 3008 : "+D"(dst_argb), // %0
3001 : "+D"(dst_argb), // %0 3009 "+c"(width_tmp) // %1
3002 "+c"(width_tmp) // %1 3010 : "a"(v32) // %2
3003 : "a"(v32) // %2 3011 : "memory", "cc");
3004 : "memory", "cc");
3005 } 3012 }
3006 #endif // HAS_SETROW_X86 3013 #endif // HAS_SETROW_X86
3007 3014
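The multiply by 0x01010101u in SetRow_X86 replicates one byte into all four bytes of a 32-bit word, so rep stosl can fill four pixels per store; for example 0x5a * 0x01010101 = 0x5a5a5a5a. A scalar sketch of the fill (illustrative, not libyuv API):

#include <stdint.h>
#include <string.h>

/* Illustrative only: the byte-replication trick behind SetRow_X86. */
static void SetRowSketch(uint8_t* dst, uint8_t v8, int width) {
  const uint32_t v32 = v8 * 0x01010101u;  /* 0x5a -> 0x5a5a5a5a */
  int i = 0;
  for (; i + 4 <= width; i += 4) {
    memcpy(dst + i, &v32, 4);  /* one 32-bit store, as rep stosl would do */
  }
  for (; i < width; ++i) {
    dst[i] = v8;  /* tail; SetRow_X86 itself rounds width down (width >> 2) */
  }
}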
3008 #ifdef HAS_YUY2TOYROW_SSE2 3015 #ifdef HAS_YUY2TOYROW_SSE2
3009 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) { 3016 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
3010 asm volatile ( 3017 asm volatile (
3011 "pcmpeqb %%xmm5,%%xmm5 \n" 3018 "pcmpeqb %%xmm5,%%xmm5 \n"
3012 "psrlw $0x8,%%xmm5 \n" 3019 "psrlw $0x8,%%xmm5 \n"
3013 LABELALIGN 3020 LABELALIGN
3014 "1: \n" 3021 "1: \n"
3015 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3022 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3016 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3023 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3017 "lea " MEMLEA(0x20,0) ",%0 \n" 3024 "lea " MEMLEA(0x20,0) ",%0 \n"
3018 "pand %%xmm5,%%xmm0 \n" 3025 "pand %%xmm5,%%xmm0 \n"
3019 "pand %%xmm5,%%xmm1 \n" 3026 "pand %%xmm5,%%xmm1 \n"
3020 "packuswb %%xmm1,%%xmm0 \n" 3027 "packuswb %%xmm1,%%xmm0 \n"
3021 "movdqu %%xmm0," MEMACCESS(1) " \n" 3028 "movdqu %%xmm0," MEMACCESS(1) " \n"
3022 "lea " MEMLEA(0x10,1) ",%1 \n" 3029 "lea " MEMLEA(0x10,1) ",%1 \n"
3023 "sub $0x10,%2 \n" 3030 "sub $0x10,%2 \n"
3024 "jg 1b \n" 3031 "jg 1b \n"
3025 : "+r"(src_yuy2), // %0 3032 : "+r"(src_yuy2), // %0
3026 "+r"(dst_y), // %1 3033 "+r"(dst_y), // %1
3027 "+r"(width) // %2 3034 "+r"(width) // %2
3028 : 3035 :
3029 : "memory", "cc" 3036 : "memory", "cc"
3030 , "xmm0", "xmm1", "xmm5" 3037 , "xmm0", "xmm1", "xmm5"
3031 ); 3038 );
3032 } 3039 }
3033 3040
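YUY2 packs pixels as Y0,U,Y1,V, so luma sits on the even bytes (the 0x00ff mask above keeps them). In YUY2ToUVRow, chroma from two adjacent rows is averaged with pavgb, which rounds up. A scalar sketch; src_yuy2b is a stand-in for src_yuy2 + stride_yuy2:

#include <stdint.h>

/* Illustrative only: YUY2 luma extraction and two-row chroma averaging. */
static void Yuy2Sketch(const uint8_t* src_yuy2, const uint8_t* src_yuy2b,
                       uint8_t* dst_y, uint8_t* dst_u, uint8_t* dst_v,
                       int width) {
  for (int i = 0; i < width; ++i) {
    dst_y[i] = src_yuy2[2 * i];  /* even bytes are Y */
  }
  for (int i = 0; i < width / 2; ++i) {
    dst_u[i] = (uint8_t)((src_yuy2[4 * i + 1] + src_yuy2b[4 * i + 1] + 1) >> 1);
    dst_v[i] = (uint8_t)((src_yuy2[4 * i + 3] + src_yuy2b[4 * i + 3] + 1) >> 1);
  }
}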
3034 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, 3041 void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
3035 uint8* dst_u, uint8* dst_v, int width) { 3042 int stride_yuy2,
3043 uint8* dst_u,
3044 uint8* dst_v,
3045 int width) {
3036 asm volatile ( 3046 asm volatile (
3037 "pcmpeqb %%xmm5,%%xmm5 \n" 3047 "pcmpeqb %%xmm5,%%xmm5 \n"
3038 "psrlw $0x8,%%xmm5 \n" 3048 "psrlw $0x8,%%xmm5 \n"
3039 "sub %1,%2 \n" 3049 "sub %1,%2 \n"
3040 LABELALIGN 3050 LABELALIGN
3041 "1: \n" 3051 "1: \n"
3042 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3052 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3043 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3053 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3044 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 3054 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
3045 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 3055 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
(...skipping 17 matching lines...)
3063 "+r"(dst_u), // %1 3073 "+r"(dst_u), // %1
3064 "+r"(dst_v), // %2 3074 "+r"(dst_v), // %2
3065 "+r"(width) // %3 3075 "+r"(width) // %3
3066 : "r"((intptr_t)(stride_yuy2)) // %4 3076 : "r"((intptr_t)(stride_yuy2)) // %4
3067 : "memory", "cc", NACL_R14 3077 : "memory", "cc", NACL_R14
3068 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 3078 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3069 ); 3079 );
3070 } 3080 }
3071 3081
3072 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 3082 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3073 uint8* dst_u, uint8* dst_v, int width) { 3083 uint8* dst_u,
3084 uint8* dst_v,
3085 int width) {
3074 asm volatile ( 3086 asm volatile (
3075 "pcmpeqb %%xmm5,%%xmm5 \n" 3087 "pcmpeqb %%xmm5,%%xmm5 \n"
3076 "psrlw $0x8,%%xmm5 \n" 3088 "psrlw $0x8,%%xmm5 \n"
3077 "sub %1,%2 \n" 3089 "sub %1,%2 \n"
3078 LABELALIGN 3090 LABELALIGN
3079 "1: \n" 3091 "1: \n"
3080 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3092 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3081 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3093 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3082 "lea " MEMLEA(0x20,0) ",%0 \n" 3094 "lea " MEMLEA(0x20,0) ",%0 \n"
3083 "psrlw $0x8,%%xmm0 \n" 3095 "psrlw $0x8,%%xmm0 \n"
(...skipping 35 matching lines...)
3119 "jg 1b \n" 3131 "jg 1b \n"
3120 : "+r"(src_uyvy), // %0 3132 : "+r"(src_uyvy), // %0
3121 "+r"(dst_y), // %1 3133 "+r"(dst_y), // %1
3122 "+r"(width) // %2 3134 "+r"(width) // %2
3123 : 3135 :
3124 : "memory", "cc" 3136 : "memory", "cc"
3125 , "xmm0", "xmm1" 3137 , "xmm0", "xmm1"
3126 ); 3138 );
3127 } 3139 }
3128 3140
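UYVY is the byte-swapped layout, U,Y0,V,Y1: luma on the odd bytes, chroma on the even. A scalar orientation sketch for the single-row (UV422) case:

#include <stdint.h>

/* Illustrative only: where UYVY keeps its samples. */
static void UyvySketch(const uint8_t* src_uyvy, uint8_t* dst_y,
                       uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int i = 0; i < width; ++i) {
    dst_y[i] = src_uyvy[2 * i + 1];  /* odd bytes are Y */
  }
  for (int i = 0; i < width / 2; ++i) {
    dst_u[i] = src_uyvy[4 * i + 0];
    dst_v[i] = src_uyvy[4 * i + 2];
  }
}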
3129 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, 3141 void UYVYToUVRow_SSE2(const uint8* src_uyvy,
3130 uint8* dst_u, uint8* dst_v, int width) { 3142 int stride_uyvy,
3143 uint8* dst_u,
3144 uint8* dst_v,
3145 int width) {
3131 asm volatile ( 3146 asm volatile (
3132 "pcmpeqb %%xmm5,%%xmm5 \n" 3147 "pcmpeqb %%xmm5,%%xmm5 \n"
3133 "psrlw $0x8,%%xmm5 \n" 3148 "psrlw $0x8,%%xmm5 \n"
3134 "sub %1,%2 \n" 3149 "sub %1,%2 \n"
3135 LABELALIGN 3150 LABELALIGN
3136 "1: \n" 3151 "1: \n"
3137 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3152 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3138 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3153 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3139 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 3154 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
3140 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 3155 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
(...skipping 17 matching lines...)
3158 "+r"(dst_u), // %1 3173 "+r"(dst_u), // %1
3159 "+r"(dst_v), // %2 3174 "+r"(dst_v), // %2
3160 "+r"(width) // %3 3175 "+r"(width) // %3
3161 : "r"((intptr_t)(stride_uyvy)) // %4 3176 : "r"((intptr_t)(stride_uyvy)) // %4
3162 : "memory", "cc", NACL_R14 3177 : "memory", "cc", NACL_R14
3163 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 3178 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3164 ); 3179 );
3165 } 3180 }
3166 3181
3167 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, 3182 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3168 uint8* dst_u, uint8* dst_v, int width) { 3183 uint8* dst_u,
3184 uint8* dst_v,
3185 int width) {
3169 asm volatile ( 3186 asm volatile (
3170 "pcmpeqb %%xmm5,%%xmm5 \n" 3187 "pcmpeqb %%xmm5,%%xmm5 \n"
3171 "psrlw $0x8,%%xmm5 \n" 3188 "psrlw $0x8,%%xmm5 \n"
3172 "sub %1,%2 \n" 3189 "sub %1,%2 \n"
3173 LABELALIGN 3190 LABELALIGN
3174 "1: \n" 3191 "1: \n"
3175 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3192 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3176 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3193 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3177 "lea " MEMLEA(0x20,0) ",%0 \n" 3194 "lea " MEMLEA(0x20,0) ",%0 \n"
3178 "pand %%xmm5,%%xmm0 \n" 3195 "pand %%xmm5,%%xmm0 \n"
(...skipping 41 matching lines...)
3220 "vzeroupper \n" 3237 "vzeroupper \n"
3221 : "+r"(src_yuy2), // %0 3238 : "+r"(src_yuy2), // %0
3222 "+r"(dst_y), // %1 3239 "+r"(dst_y), // %1
3223 "+r"(width) // %2 3240 "+r"(width) // %2
3224 : 3241 :
3225 : "memory", "cc" 3242 : "memory", "cc"
3226 , "xmm0", "xmm1", "xmm5" 3243 , "xmm0", "xmm1", "xmm5"
3227 ); 3244 );
3228 } 3245 }
3229 3246
3230 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, 3247 void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
3231 uint8* dst_u, uint8* dst_v, int width) { 3248 int stride_yuy2,
3249 uint8* dst_u,
3250 uint8* dst_v,
3251 int width) {
3232 asm volatile ( 3252 asm volatile (
3233 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3253 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3234 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3254 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3235 "sub %1,%2 \n" 3255 "sub %1,%2 \n"
3236 LABELALIGN 3256 LABELALIGN
3237 "1: \n" 3257 "1: \n"
3238 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3258 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3239 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 3259 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3240 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 3260 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3241 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) 3261 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
(...skipping 18 matching lines...)
3260 "+r"(dst_u), // %1 3280 "+r"(dst_u), // %1
3261 "+r"(dst_v), // %2 3281 "+r"(dst_v), // %2
3262 "+r"(width) // %3 3282 "+r"(width) // %3
3263 : "r"((intptr_t)(stride_yuy2)) // %4 3283 : "r"((intptr_t)(stride_yuy2)) // %4
3264 : "memory", "cc", NACL_R14 3284 : "memory", "cc", NACL_R14
3265 "xmm0", "xmm1", "xmm5" 3285 "xmm0", "xmm1", "xmm5"
3266 ); 3286 );
3267 } 3287 }
3268 3288
3269 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, 3289 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3270 uint8* dst_u, uint8* dst_v, int width) { 3290 uint8* dst_u,
3291 uint8* dst_v,
3292 int width) {
3271 asm volatile ( 3293 asm volatile (
3272 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3294 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3273 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3295 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3274 "sub %1,%2 \n" 3296 "sub %1,%2 \n"
3275 LABELALIGN 3297 LABELALIGN
3276 "1: \n" 3298 "1: \n"
3277 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3299 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3278 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 3300 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3279 "lea " MEMLEA(0x40,0) ",%0 \n" 3301 "lea " MEMLEA(0x40,0) ",%0 \n"
3280 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" 3302 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
(...skipping 39 matching lines...)
3320 "jg 1b \n" 3342 "jg 1b \n"
3321 "vzeroupper \n" 3343 "vzeroupper \n"
3322 : "+r"(src_uyvy), // %0 3344 : "+r"(src_uyvy), // %0
3323 "+r"(dst_y), // %1 3345 "+r"(dst_y), // %1
3324 "+r"(width) // %2 3346 "+r"(width) // %2
3325 : 3347 :
3326 : "memory", "cc" 3348 : "memory", "cc"
3327 , "xmm0", "xmm1", "xmm5" 3349 , "xmm0", "xmm1", "xmm5"
3328 ); 3350 );
3329 } 3351 }
3330 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, 3352 void UYVYToUVRow_AVX2(const uint8* src_uyvy,
3331 uint8* dst_u, uint8* dst_v, int width) { 3353 int stride_uyvy,
3354 uint8* dst_u,
3355 uint8* dst_v,
3356 int width) {
3332 asm volatile ( 3357 asm volatile (
3333 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3358 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3334 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3359 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3335 "sub %1,%2 \n" 3360 "sub %1,%2 \n"
3336 3361
3337 LABELALIGN 3362 LABELALIGN
3338 "1: \n" 3363 "1: \n"
3339 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3364 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3340 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 3365 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3341 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 3366 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
(...skipping 19 matching lines...)
3361 "+r"(dst_u), // %1 3386 "+r"(dst_u), // %1
3362 "+r"(dst_v), // %2 3387 "+r"(dst_v), // %2
3363 "+r"(width) // %3 3388 "+r"(width) // %3
3364 : "r"((intptr_t)(stride_uyvy)) // %4 3389 : "r"((intptr_t)(stride_uyvy)) // %4
3365 : "memory", "cc", NACL_R14 3390 : "memory", "cc", NACL_R14
3366 "xmm0", "xmm1", "xmm5" 3391 "xmm0", "xmm1", "xmm5"
3367 ); 3392 );
3368 } 3393 }
3369 3394
3370 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, 3395 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3371 uint8* dst_u, uint8* dst_v, int width) { 3396 uint8* dst_u,
3397 uint8* dst_v,
3398 int width) {
3372 asm volatile ( 3399 asm volatile (
3373 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3400 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3374 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3401 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3375 "sub %1,%2 \n" 3402 "sub %1,%2 \n"
3376 LABELALIGN 3403 LABELALIGN
3377 "1: \n" 3404 "1: \n"
3378 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3405 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3379 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 3406 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3380 "lea " MEMLEA(0x40,0) ",%0 \n" 3407 "lea " MEMLEA(0x40,0) ",%0 \n"
3381 "vpand %%ymm5,%%ymm0,%%ymm0 \n" 3408 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
(...skipping 18 matching lines...) Expand all
3400 "+r"(width) // %3 3427 "+r"(width) // %3
3401 : 3428 :
3402 : "memory", "cc", NACL_R14 3429 : "memory", "cc", NACL_R14
3403 "xmm0", "xmm1", "xmm5" 3430 "xmm0", "xmm1", "xmm5"
3404 ); 3431 );
3405 } 3432 }
3406 #endif // HAS_YUY2TOYROW_AVX2 3433 #endif // HAS_YUY2TOYROW_AVX2
3407 3434
3408 #ifdef HAS_ARGBBLENDROW_SSSE3 3435 #ifdef HAS_ARGBBLENDROW_SSSE3
3409 // Shuffle table for isolating alpha. 3436 // Shuffle table for isolating alpha.
3410 static uvec8 kShuffleAlpha = { 3437 static uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3411 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 3438 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
3412 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3413 };
3414 3439
3415 // Blend 8 pixels at a time 3440 // Blend 8 pixels at a time
3416 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, 3441 void ARGBBlendRow_SSSE3(const uint8* src_argb0,
3417 uint8* dst_argb, int width) { 3442 const uint8* src_argb1,
3443 uint8* dst_argb,
3444 int width) {
3418 asm volatile ( 3445 asm volatile (
3419 "pcmpeqb %%xmm7,%%xmm7 \n" 3446 "pcmpeqb %%xmm7,%%xmm7 \n"
3420 "psrlw $0xf,%%xmm7 \n" 3447 "psrlw $0xf,%%xmm7 \n"
3421 "pcmpeqb %%xmm6,%%xmm6 \n" 3448 "pcmpeqb %%xmm6,%%xmm6 \n"
3422 "psrlw $0x8,%%xmm6 \n" 3449 "psrlw $0x8,%%xmm6 \n"
3423 "pcmpeqb %%xmm5,%%xmm5 \n" 3450 "pcmpeqb %%xmm5,%%xmm5 \n"
3424 "psllw $0x8,%%xmm5 \n" 3451 "psllw $0x8,%%xmm5 \n"
3425 "pcmpeqb %%xmm4,%%xmm4 \n" 3452 "pcmpeqb %%xmm4,%%xmm4 \n"
3426 "pslld $0x18,%%xmm4 \n" 3453 "pslld $0x18,%%xmm4 \n"
3427 "sub $0x4,%3 \n" 3454 "sub $0x4,%3 \n"
(...skipping 64 matching lines...)
3492 ); 3519 );
3493 } 3520 }
3494 #endif // HAS_ARGBBLENDROW_SSSE3 3521 #endif // HAS_ARGBBLENDROW_SSSE3
3495 3522
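ARGBBlendRow composites src_argb0 over src_argb1 using src0's alpha; the masks built in the prologue (0x0001 words, 0x00ff words, 0xff00 words, 0xff000000 dwords) drive an 8-bit fixed-point "over". A hedged scalar sketch consistent with treating src0 as premultiplied; the SIMD rounding is not guaranteed bit-identical to this:

#include <stdint.h>

static uint8_t ClampBlend(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Illustrative only: per-pixel "src0 over src1"; the result's alpha is
 * forced opaque (the pslld $0x18 mask ORs 0xff into the alpha byte). */
static void BlendPixelSketch(const uint8_t fg[4], const uint8_t bg[4],
                             uint8_t dst[4]) {
  int a = fg[3];
  for (int c = 0; c < 3; ++c) {
    dst[c] = ClampBlend(fg[c] + (((256 - a) * bg[c]) >> 8));
  }
  dst[3] = 255;
}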
3496 #ifdef HAS_BLENDPLANEROW_SSSE3 3523 #ifdef HAS_BLENDPLANEROW_SSSE3
3497 // Blend 8 pixels at a time. 3524 // Blend 8 pixels at a time.
3498 // unsigned version of math 3525 // unsigned version of math
3499 // =((A2*C2)+(B2*(255-C2))+255)/256 3526 // =((A2*C2)+(B2*(255-C2))+255)/256
3500 // signed version of math 3527 // signed version of math
3501 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 3528 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
3502 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, 3529 void BlendPlaneRow_SSSE3(const uint8* src0,
3503 const uint8* alpha, uint8* dst, int width) { 3530 const uint8* src1,
3504 asm volatile ( 3531 const uint8* alpha,
3505 "pcmpeqb %%xmm5,%%xmm5 \n" 3532 uint8* dst,
3506 "psllw $0x8,%%xmm5 \n" 3533 int width) {
3507 "mov $0x80808080,%%eax \n" 3534 asm volatile(
3508 "movd %%eax,%%xmm6 \n" 3535 "pcmpeqb %%xmm5,%%xmm5 \n"
3509 "pshufd $0x0,%%xmm6,%%xmm6 \n" 3536 "psllw $0x8,%%xmm5 \n"
3510 "mov $0x807f807f,%%eax \n" 3537 "mov $0x80808080,%%eax \n"
3511 "movd %%eax,%%xmm7 \n" 3538 "movd %%eax,%%xmm6 \n"
3512 "pshufd $0x0,%%xmm7,%%xmm7 \n" 3539 "pshufd $0x0,%%xmm6,%%xmm6 \n"
3513 "sub %2,%0 \n" 3540 "mov $0x807f807f,%%eax \n"
3514 "sub %2,%1 \n" 3541 "movd %%eax,%%xmm7 \n"
3515 "sub %2,%3 \n" 3542 "pshufd $0x0,%%xmm7,%%xmm7 \n"
3543 "sub %2,%0 \n"
3544 "sub %2,%1 \n"
3545 "sub %2,%3 \n"
3516 3546
3517 // 8 pixel loop. 3547 // 8 pixel loop.
3518 LABELALIGN 3548 LABELALIGN
3519 "1: \n" 3549 "1: \n"
3520 "movq (%2),%%xmm0 \n" 3550 "movq (%2),%%xmm0 \n"
3521 "punpcklbw %%xmm0,%%xmm0 \n" 3551 "punpcklbw %%xmm0,%%xmm0 \n"
3522 "pxor %%xmm5,%%xmm0 \n" 3552 "pxor %%xmm5,%%xmm0 \n"
3523 "movq (%0,%2,1),%%xmm1 \n" 3553 "movq (%0,%2,1),%%xmm1 \n"
3524 "movq (%1,%2,1),%%xmm2 \n" 3554 "movq (%1,%2,1),%%xmm2 \n"
3525 "punpcklbw %%xmm2,%%xmm1 \n" 3555 "punpcklbw %%xmm2,%%xmm1 \n"
3526 "psubb %%xmm6,%%xmm1 \n" 3556 "psubb %%xmm6,%%xmm1 \n"
3527 "pmaddubsw %%xmm1,%%xmm0 \n" 3557 "pmaddubsw %%xmm1,%%xmm0 \n"
3528 "paddw %%xmm7,%%xmm0 \n" 3558 "paddw %%xmm7,%%xmm0 \n"
3529 "psrlw $0x8,%%xmm0 \n" 3559 "psrlw $0x8,%%xmm0 \n"
3530 "packuswb %%xmm0,%%xmm0 \n" 3560 "packuswb %%xmm0,%%xmm0 \n"
3531 "movq %%xmm0,(%3,%2,1) \n" 3561 "movq %%xmm0,(%3,%2,1) \n"
3532 "lea 0x8(%2),%2 \n" 3562 "lea 0x8(%2),%2 \n"
3533 "sub $0x8,%4 \n" 3563 "sub $0x8,%4 \n"
3534 "jg 1b \n" 3564 "jg 1b \n"
3535 : "+r"(src0), // %0 3565 : "+r"(src0), // %0
3536 "+r"(src1), // %1 3566 "+r"(src1), // %1
3537 "+r"(alpha), // %2 3567 "+r"(alpha), // %2
3538 "+r"(dst), // %3 3568 "+r"(dst), // %3
3539 "+rm"(width) // %4 3569 "+rm"(width) // %4
3540 :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" 3570 ::"memory",
3541 ); 3571 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
3542 } 3572 }
3543 #endif // HAS_BLENDPLANEROW_SSSE3 3573 #endif // HAS_BLENDPLANEROW_SSSE3
3544 3574
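The unsigned formula in the comment above, ((A2*C2)+(B2*(255-C2))+255)/256, translates directly to scalar code; the signed variant is the same math rearranged so pmaddubsw can evaluate it. A scalar restatement (illustrative names):

#include <stdint.h>

/* Scalar restatement of the commented formula: src0 weighted by alpha,
 * src1 by (255 - alpha), biased by 255 and divided by 256. */
static void BlendPlaneSketch(const uint8_t* src0, const uint8_t* src1,
                             const uint8_t* alpha, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = (uint8_t)(
        (src0[i] * alpha[i] + src1[i] * (255 - alpha[i]) + 255) >> 8);
  }
}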
3545 #ifdef HAS_BLENDPLANEROW_AVX2 3575 #ifdef HAS_BLENDPLANEROW_AVX2
3546 // Blend 32 pixels at a time. 3576 // Blend 32 pixels at a time.
3547 // unsigned version of math 3577 // unsigned version of math
3548 // =((A2*C2)+(B2*(255-C2))+255)/256 3578 // =((A2*C2)+(B2*(255-C2))+255)/256
3549 // signed version of math 3579 // signed version of math
3550 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 3580 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
3551 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, 3581 void BlendPlaneRow_AVX2(const uint8* src0,
3552 const uint8* alpha, uint8* dst, int width) { 3582 const uint8* src1,
3553 asm volatile ( 3583 const uint8* alpha,
3554 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3584 uint8* dst,
3555 "vpsllw $0x8,%%ymm5,%%ymm5 \n" 3585 int width) {
3556 "mov $0x80808080,%%eax \n" 3586 asm volatile(
3557 "vmovd %%eax,%%xmm6 \n" 3587 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3558 "vbroadcastss %%xmm6,%%ymm6 \n" 3588 "vpsllw $0x8,%%ymm5,%%ymm5 \n"
3559 "mov $0x807f807f,%%eax \n" 3589 "mov $0x80808080,%%eax \n"
3560 "vmovd %%eax,%%xmm7 \n" 3590 "vmovd %%eax,%%xmm6 \n"
3561 "vbroadcastss %%xmm7,%%ymm7 \n" 3591 "vbroadcastss %%xmm6,%%ymm6 \n"
3562 "sub %2,%0 \n" 3592 "mov $0x807f807f,%%eax \n"
3563 "sub %2,%1 \n" 3593 "vmovd %%eax,%%xmm7 \n"
3564 "sub %2,%3 \n" 3594 "vbroadcastss %%xmm7,%%ymm7 \n"
3595 "sub %2,%0 \n"
3596 "sub %2,%1 \n"
3597 "sub %2,%3 \n"
3565 3598
3566 // 32 pixel loop. 3599 // 32 pixel loop.
3567 LABELALIGN 3600 LABELALIGN
3568 "1: \n" 3601 "1: \n"
3569 "vmovdqu (%2),%%ymm0 \n" 3602 "vmovdqu (%2),%%ymm0 \n"
3570 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" 3603 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
3571 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" 3604 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
3572 "vpxor %%ymm5,%%ymm3,%%ymm3 \n" 3605 "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
3573 "vpxor %%ymm5,%%ymm0,%%ymm0 \n" 3606 "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
3574 "vmovdqu (%0,%2,1),%%ymm1 \n" 3607 "vmovdqu (%0,%2,1),%%ymm1 \n"
3575 "vmovdqu (%1,%2,1),%%ymm2 \n" 3608 "vmovdqu (%1,%2,1),%%ymm2 \n"
3576 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" 3609 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
3577 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" 3610 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
3578 "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" 3611 "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
3579 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" 3612 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
3580 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" 3613 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
3581 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" 3614 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
3582 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" 3615 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
3583 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" 3616 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
3584 "vpsrlw $0x8,%%ymm3,%%ymm3 \n" 3617 "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
3585 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" 3618 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3586 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" 3619 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
3587 "vmovdqu %%ymm0,(%3,%2,1) \n" 3620 "vmovdqu %%ymm0,(%3,%2,1) \n"
3588 "lea 0x20(%2),%2 \n" 3621 "lea 0x20(%2),%2 \n"
3589 "sub $0x20,%4 \n" 3622 "sub $0x20,%4 \n"
3590 "jg 1b \n" 3623 "jg 1b \n"
3591 "vzeroupper \n" 3624 "vzeroupper \n"
3592 : "+r"(src0), // %0 3625 : "+r"(src0), // %0
3593 "+r"(src1), // %1 3626 "+r"(src1), // %1
3594 "+r"(alpha), // %2 3627 "+r"(alpha), // %2
3595 "+r"(dst), // %3 3628 "+r"(dst), // %3
3596 "+rm"(width) // %4 3629 "+rm"(width) // %4
3597 :: "memory", "cc", "eax", 3630 ::"memory",
3598 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 3631 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
3599 ); 3632 "xmm7");
3600 } 3633 }
3601 #endif // HAS_BLENDPLANEROW_AVX2 3634 #endif // HAS_BLENDPLANEROW_AVX2
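
For reference, the two formulas in the comments above are the same computation. A scalar sketch of the unsigned form (illustrative only; the _Ref name and exact layout are not part of this patch):

#include <stdint.h>

// Per byte: dst = (src0 * alpha + src1 * (255 - alpha) + 255) >> 8.
static void BlendPlaneRow_Ref(const uint8_t* src0, const uint8_t* src1,
                              const uint8_t* alpha, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = (uint8_t)(
        (src0[i] * alpha[i] + src1[i] * (255 - alpha[i]) + 255) >> 8);
  }
}

The signed form exists so a single (v)pmaddubsw can produce both products at once: subtracting 0x80 from each source (the 0x80808080 constant) moves the operands into signed range, and the per-word 0x807f bias equals 128 * 255 + 255, restoring the offset plus the +255 rounding before the shift.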
3602 3635
3603 #ifdef HAS_ARGBATTENUATEROW_SSSE3 3636 #ifdef HAS_ARGBATTENUATEROW_SSSE3
3604 // Shuffle table duplicating alpha 3637 // Shuffle table duplicating alpha
3605 static uvec8 kShuffleAlpha0 = { 3638 static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
3606 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u 3639 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
3607 }; 3640 static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3608 static uvec8 kShuffleAlpha1 = { 3641 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
3609 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3610 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
3611 };
3612 // Attenuate 4 pixels at a time. 3642 // Attenuate 4 pixels at a time.
3613 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 3643 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3614 asm volatile ( 3644 asm volatile (
3615 "pcmpeqb %%xmm3,%%xmm3 \n" 3645 "pcmpeqb %%xmm3,%%xmm3 \n"
3616 "pslld $0x18,%%xmm3 \n" 3646 "pslld $0x18,%%xmm3 \n"
3617 "movdqa %3,%%xmm4 \n" 3647 "movdqa %3,%%xmm4 \n"
3618 "movdqa %4,%%xmm5 \n" 3648 "movdqa %4,%%xmm5 \n"
3619 3649
3620 // 4 pixel loop. 3650 // 4 pixel loop.
3621 LABELALIGN 3651 LABELALIGN
(...skipping 25 matching lines...)

3647 : "m"(kShuffleAlpha0), // %3 3677 : "m"(kShuffleAlpha0), // %3
3648 "m"(kShuffleAlpha1) // %4 3678 "m"(kShuffleAlpha1) // %4
3649 : "memory", "cc" 3679 : "memory", "cc"
3650 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 3680 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3651 ); 3681 );
3652 } 3682 }
3653 #endif // HAS_ARGBATTENUATEROW_SSSE3 3683 #endif // HAS_ARGBATTENUATEROW_SSSE3
3654 3684
3655 #ifdef HAS_ARGBATTENUATEROW_AVX2 3685 #ifdef HAS_ARGBATTENUATEROW_AVX2
3656 // Shuffle table duplicating alpha. 3686 // Shuffle table duplicating alpha.
3657 static const uvec8 kShuffleAlpha_AVX2 = { 3687 static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
3658 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u 3688 128u, 128u, 14u, 15u, 14u, 15u,
3659 }; 3689 14u, 15u, 128u, 128u};
3660 // Attenuate 8 pixels at a time. 3690 // Attenuate 8 pixels at a time.
3661 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { 3691 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
3662 asm volatile ( 3692 asm volatile (
3663 "vbroadcastf128 %3,%%ymm4 \n" 3693 "vbroadcastf128 %3,%%ymm4 \n"
3664 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3694 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3665 "vpslld $0x18,%%ymm5,%%ymm5 \n" 3695 "vpslld $0x18,%%ymm5,%%ymm5 \n"
3666 "sub %0,%1 \n" 3696 "sub %0,%1 \n"
3667 3697
3668 // 8 pixel loop. 3698 // 8 pixel loop.
3669 LABELALIGN 3699 LABELALIGN
(...skipping 20 matching lines...)
3690 "+r"(width) // %2 3720 "+r"(width) // %2
3691 : "m"(kShuffleAlpha_AVX2) // %3 3721 : "m"(kShuffleAlpha_AVX2) // %3
3692 : "memory", "cc" 3722 : "memory", "cc"
3693 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 3723 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3694 ); 3724 );
3695 } 3725 }
3696 #endif // HAS_ARGBATTENUATEROW_AVX2 3726 #endif // HAS_ARGBATTENUATEROW_AVX2
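
The shuffle tables above copy each pixel's alpha byte into the B, G and R word lanes so one multiply scales all three channels. As a semantic reference (a sketch; the pmulhuw rounding can differ from plain division by one, and the _Ref naming is invented):

#include <stdint.h>

// Attenuate: scale each color channel by alpha/255, pass alpha through.
static void ARGBAttenuateRow_Ref(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = src[i * 4 + 3];
    dst[i * 4 + 0] = (uint8_t)(src[i * 4 + 0] * a / 255);  // B
    dst[i * 4 + 1] = (uint8_t)(src[i * 4 + 1] * a / 255);  // G
    dst[i * 4 + 2] = (uint8_t)(src[i * 4 + 2] * a / 255);  // R
    dst[i * 4 + 3] = (uint8_t)a;                           // A unchanged
  }
}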
3697 3727
3698 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 3728 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
3699 // Unattenuate 4 pixels at a time. 3729 // Unattenuate 4 pixels at a time.
3700 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, 3730 void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
3731 uint8* dst_argb,
3701 int width) { 3732 int width) {
3702 uintptr_t alpha; 3733 uintptr_t alpha;
3703 asm volatile ( 3734 asm volatile (
3704 // 4 pixel loop. 3735 // 4 pixel loop.
3705 LABELALIGN 3736 LABELALIGN
3706 "1: \n" 3737 "1: \n"
3707 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3738 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3708 "movzb " MEMACCESS2(0x03,0) ",%3 \n" 3739 "movzb " MEMACCESS2(0x03,0) ",%3 \n"
3709 "punpcklbw %%xmm0,%%xmm0 \n" 3740 "punpcklbw %%xmm0,%%xmm0 \n"
3710 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 3741 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
(...skipping 26 matching lines...)
3737 : "r"(fixed_invtbl8) // %4 3768 : "r"(fixed_invtbl8) // %4
3738 : "memory", "cc", NACL_R14 3769 : "memory", "cc", NACL_R14
3739 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 3770 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3740 ); 3771 );
3741 } 3772 }
3742 #endif // HAS_ARGBUNATTENUATEROW_SSE2 3773 #endif // HAS_ARGBUNATTENUATEROW_SSE2
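
Unattenuate is the inverse: channels are scaled by 255/alpha with saturation, and the movzb/MEMOPREG pair above replaces the per-pixel division with a fixed_invtbl8 reciprocal lookup indexed by the alpha byte. A hedged scalar sketch (the alpha == 0 behavior here is an illustrative choice, not taken from the table):

#include <stdint.h>

static void ARGBUnattenuateRow_Ref(const uint8_t* src, uint8_t* dst,
                                   int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = src[i * 4 + 3];
    for (int c = 0; c < 3; ++c) {
      uint32_t v = a ? src[i * 4 + c] * 255u / a : 0;  // scale by 255/alpha
      dst[i * 4 + c] = (uint8_t)(v > 255 ? 255 : v);   // saturate
    }
    dst[i * 4 + 3] = (uint8_t)a;
  }
}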
3743 3774
3744 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 3775 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
3745 // Shuffle table duplicating alpha. 3776 // Shuffle table duplicating alpha.
3746 static const uvec8 kUnattenShuffleAlpha_AVX2 = { 3777 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
3747 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u 3778 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
3748 };
3749 // Unattenuate 8 pixels at a time. 3779 // Unattenuate 8 pixels at a time.
3750 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, 3780 void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
3781 uint8* dst_argb,
3751 int width) { 3782 int width) {
3752 uintptr_t alpha; 3783 uintptr_t alpha;
3753 asm volatile ( 3784 asm volatile (
3754 "sub %0,%1 \n" 3785 "sub %0,%1 \n"
3755 "vbroadcastf128 %5,%%ymm5 \n" 3786 "vbroadcastf128 %5,%%ymm5 \n"
3756 3787
3757 // 8 pixel loop. 3788 // 8 pixel loop.
3758 LABELALIGN 3789 LABELALIGN
3759 "1: \n" 3790 "1: \n"
3760 // replace VPGATHER 3791 // replace VPGATHER
(...skipping 94 matching lines...)
3855 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 3886 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3856 ); 3887 );
3857 } 3888 }
3858 #endif // HAS_ARGBGRAYROW_SSSE3 3889 #endif // HAS_ARGBGRAYROW_SSSE3
3859 3890
3860 #ifdef HAS_ARGBSEPIAROW_SSSE3 3891 #ifdef HAS_ARGBSEPIAROW_SSSE3
3861 // b = (r * 35 + g * 68 + b * 17) >> 7 3892 // b = (r * 35 + g * 68 + b * 17) >> 7
3862 // g = (r * 45 + g * 88 + b * 22) >> 7 3893 // g = (r * 45 + g * 88 + b * 22) >> 7
3863 // r = (r * 50 + g * 98 + b * 24) >> 7 3894 // r = (r * 50 + g * 98 + b * 24) >> 7
3864 // Constant for ARGB color to sepia tone 3895 // Constant for ARGB color to sepia tone
3865 static vec8 kARGBToSepiaB = { 3896 static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
3866 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 3897 17, 68, 35, 0, 17, 68, 35, 0};
3867 };
3868 3898
3869 static vec8 kARGBToSepiaG = { 3899 static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
3870 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 3900 22, 88, 45, 0, 22, 88, 45, 0};
3871 };
3872 3901
3873 static vec8 kARGBToSepiaR = { 3902 static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
3874 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 3903 24, 98, 50, 0, 24, 98, 50, 0};
3875 };
3876 3904
3877 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 3905 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
3878 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { 3906 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3879 asm volatile ( 3907 asm volatile (
3880 "movdqa %2,%%xmm2 \n" 3908 "movdqa %2,%%xmm2 \n"
3881 "movdqa %3,%%xmm3 \n" 3909 "movdqa %3,%%xmm3 \n"
3882 "movdqa %4,%%xmm4 \n" 3910 "movdqa %4,%%xmm4 \n"
3883 3911
3884 // 8 pixel loop. 3912 // 8 pixel loop.
3885 LABELALIGN 3913 LABELALIGN
(...skipping 42 matching lines...)
3928 "m"(kARGBToSepiaR) // %4 3956 "m"(kARGBToSepiaR) // %4
3929 : "memory", "cc" 3957 : "memory", "cc"
3930 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 3958 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3931 ); 3959 );
3932 } 3960 }
3933 #endif // HAS_ARGBSEPIAROW_SSSE3 3961 #endif // HAS_ARGBSEPIAROW_SSSE3
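
The coefficient comments above translate directly to scalar code. Note the green and red weights sum to more than 128, so the >> 7 result can exceed 255 and the packuswb saturation in the SSSE3 version is load-bearing. A sketch (illustrative naming, not part of this change):

#include <stdint.h>

static void ARGBSepiaRow_Ref(uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int b = dst_argb[i * 4 + 0];
    int g = dst_argb[i * 4 + 1];
    int r = dst_argb[i * 4 + 2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;  // can reach 308, must clamp
    int sr = (r * 50 + g * 98 + b * 24) >> 7;  // can reach 342, must clamp
    dst_argb[i * 4 + 0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[i * 4 + 1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[i * 4 + 2] = (uint8_t)(sr > 255 ? 255 : sr);  // alpha untouched
  }
}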
3934 3962
3935 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 3963 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3936 // Transform 8 ARGB pixels (32 bytes) with color matrix. 3964 // Transform 8 ARGB pixels (32 bytes) with color matrix.
3937 // Same as Sepia except matrix is provided. 3965 // Same as Sepia except matrix is provided.
3938 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 3966 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
3939 const int8* matrix_argb, int width) { 3967 uint8* dst_argb,
3968 const int8* matrix_argb,
3969 int width) {
3940 asm volatile ( 3970 asm volatile (
3941 "movdqu " MEMACCESS(3) ",%%xmm5 \n" 3971 "movdqu " MEMACCESS(3) ",%%xmm5 \n"
3942 "pshufd $0x00,%%xmm5,%%xmm2 \n" 3972 "pshufd $0x00,%%xmm5,%%xmm2 \n"
3943 "pshufd $0x55,%%xmm5,%%xmm3 \n" 3973 "pshufd $0x55,%%xmm5,%%xmm3 \n"
3944 "pshufd $0xaa,%%xmm5,%%xmm4 \n" 3974 "pshufd $0xaa,%%xmm5,%%xmm4 \n"
3945 "pshufd $0xff,%%xmm5,%%xmm5 \n" 3975 "pshufd $0xff,%%xmm5,%%xmm5 \n"
3946 3976
3947 // 8 pixel loop. 3977 // 8 pixel loop.
3948 LABELALIGN 3978 LABELALIGN
3949 "1: \n" 3979 "1: \n"
(...skipping 41 matching lines...)
3991 "+r"(width) // %2 4021 "+r"(width) // %2
3992 : "r"(matrix_argb) // %3 4022 : "r"(matrix_argb) // %3
3993 : "memory", "cc" 4023 : "memory", "cc"
3994 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 4024 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3995 ); 4025 );
3996 } 4026 }
3997 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 4027 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
3998 4028
3999 #ifdef HAS_ARGBQUANTIZEROW_SSE2 4029 #ifdef HAS_ARGBQUANTIZEROW_SSE2
4000 // Quantize 4 ARGB pixels (16 bytes). 4030 // Quantize 4 ARGB pixels (16 bytes).
4001 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, 4031 void ARGBQuantizeRow_SSE2(uint8* dst_argb,
4002 int interval_offset, int width) { 4032 int scale,
4033 int interval_size,
4034 int interval_offset,
4035 int width) {
4003 asm volatile ( 4036 asm volatile (
4004 "movd %2,%%xmm2 \n" 4037 "movd %2,%%xmm2 \n"
4005 "movd %3,%%xmm3 \n" 4038 "movd %3,%%xmm3 \n"
4006 "movd %4,%%xmm4 \n" 4039 "movd %4,%%xmm4 \n"
4007 "pshuflw $0x40,%%xmm2,%%xmm2 \n" 4040 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
4008 "pshufd $0x44,%%xmm2,%%xmm2 \n" 4041 "pshufd $0x44,%%xmm2,%%xmm2 \n"
4009 "pshuflw $0x40,%%xmm3,%%xmm3 \n" 4042 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
4010 "pshufd $0x44,%%xmm3,%%xmm3 \n" 4043 "pshufd $0x44,%%xmm3,%%xmm3 \n"
4011 "pshuflw $0x40,%%xmm4,%%xmm4 \n" 4044 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
4012 "pshufd $0x44,%%xmm4,%%xmm4 \n" 4045 "pshufd $0x44,%%xmm4,%%xmm4 \n"
(...skipping 28 matching lines...)
4041 "r"(interval_size), // %3 4074 "r"(interval_size), // %3
4042 "r"(interval_offset) // %4 4075 "r"(interval_offset) // %4
4043 : "memory", "cc" 4076 : "memory", "cc"
4044 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 4077 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4045 ); 4078 );
4046 } 4079 }
4047 #endif // HAS_ARGBQUANTIZEROW_SSE2 4080 #endif // HAS_ARGBQUANTIZEROW_SSE2
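
A scalar reading of the quantize math (my interpretation of the constants: 'scale' is a 16-bit fixed-point factor, so the multiply-high in the elided loop body yields v * scale >> 16, and alpha appears to be left untouched by the masking setup):

#include <stdint.h>

static void ARGBQuantizeRow_Ref(uint8_t* dst_argb, int scale,
                                int interval_size, int interval_offset,
                                int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 3; ++c) {  // B, G, R; alpha left as-is.
      int v = dst_argb[i * 4 + c];
      dst_argb[i * 4 + c] =
          (uint8_t)((v * scale >> 16) * interval_size + interval_offset);
    }
  }
}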
4048 4081
4049 #ifdef HAS_ARGBSHADEROW_SSE2 4082 #ifdef HAS_ARGBSHADEROW_SSE2
4050 // Shade 4 pixels at a time by specified value. 4083 // Shade 4 pixels at a time by specified value.
4051 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, 4084 void ARGBShadeRow_SSE2(const uint8* src_argb,
4085 uint8* dst_argb,
4086 int width,
4052 uint32 value) { 4087 uint32 value) {
4053 asm volatile ( 4088 asm volatile (
4054 "movd %3,%%xmm2 \n" 4089 "movd %3,%%xmm2 \n"
4055 "punpcklbw %%xmm2,%%xmm2 \n" 4090 "punpcklbw %%xmm2,%%xmm2 \n"
4056 "punpcklqdq %%xmm2,%%xmm2 \n" 4091 "punpcklqdq %%xmm2,%%xmm2 \n"
4057 4092
4058 // 4 pixel loop. 4093 // 4 pixel loop.
4059 LABELALIGN 4094 LABELALIGN
4060 "1: \n" 4095 "1: \n"
4061 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4096 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
(...skipping 15 matching lines...)
4077 "+r"(width) // %2 4112 "+r"(width) // %2
4078 : "r"(value) // %3 4113 : "r"(value) // %3
4079 : "memory", "cc" 4114 : "memory", "cc"
4080 , "xmm0", "xmm1", "xmm2" 4115 , "xmm0", "xmm1", "xmm2"
4081 ); 4116 );
4082 } 4117 }
4083 #endif // HAS_ARGBSHADEROW_SSE2 4118 #endif // HAS_ARGBSHADEROW_SSE2
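
Semantically, ARGBShadeRow scales each channel of every pixel by the matching byte of 'value', treated as a fraction of 255; the punpcklbw/punpcklqdq setup above broadcasts those bytes into word lanes for the multiply. A sketch (rounding in the real kernel may differ by one):

#include <stdint.h>

static void ARGBShadeRow_Ref(const uint8_t* src, uint8_t* dst, int width,
                             uint32_t value) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      uint32_t f = (value >> (8 * c)) & 0xff;  // per-channel factor
      dst[i * 4 + c] = (uint8_t)(src[i * 4 + c] * f / 255);
    }
  }
}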
4084 4119
4085 #ifdef HAS_ARGBMULTIPLYROW_SSE2 4120 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4086 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 4121 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
4087 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4122 void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
4088 uint8* dst_argb, int width) { 4123 const uint8* src_argb1,
4124 uint8* dst_argb,
4125 int width) {
4089 asm volatile ( 4126 asm volatile (
4090 "pxor %%xmm5,%%xmm5 \n" 4127 "pxor %%xmm5,%%xmm5 \n"
4091 4128
4092 // 4 pixel loop. 4129 // 4 pixel loop.
4093 LABELALIGN 4130 LABELALIGN
4094 "1: \n" 4131 "1: \n"
4095 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4132 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4096 "lea " MEMLEA(0x10,0) ",%0 \n" 4133 "lea " MEMLEA(0x10,0) ",%0 \n"
4097 "movdqu " MEMACCESS(1) ",%%xmm2 \n" 4134 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
4098 "lea " MEMLEA(0x10,1) ",%1 \n" 4135 "lea " MEMLEA(0x10,1) ",%1 \n"
(...skipping 16 matching lines...)
4115 "+r"(width) // %3 4152 "+r"(width) // %3
4116 : 4153 :
4117 : "memory", "cc" 4154 : "memory", "cc"
4118 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4155 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4119 ); 4156 );
4120 } 4157 }
4121 #endif // HAS_ARGBMULTIPLYROW_SSE2 4158 #endif // HAS_ARGBMULTIPLYROW_SSE2
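
The multiply kernels compute a per-channel product normalized back to byte range, roughly dst = a * b / 255. A scalar sketch with one reasonable rounding (the pmulhuw-based path can differ by one; illustrative naming):

#include <stdint.h>

static void ARGBMultiplyRow_Ref(const uint8_t* s0, const uint8_t* s1,
                                uint8_t* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {  // every byte, alpha included
    dst[i] = (uint8_t)((s0[i] * s1[i] + 127) / 255);
  }
}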
4122 4159
4123 #ifdef HAS_ARGBMULTIPLYROW_AVX2 4160 #ifdef HAS_ARGBMULTIPLYROW_AVX2
4124 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 4161 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
4125 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4162 void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
4126 uint8* dst_argb, int width) { 4163 const uint8* src_argb1,
4164 uint8* dst_argb,
4165 int width) {
4127 asm volatile ( 4166 asm volatile (
4128 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" 4167 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
4129 4168
4130 // 8 pixel loop. 4169 // 8 pixel loop.
4131 LABELALIGN 4170 LABELALIGN
4132 "1: \n" 4171 "1: \n"
4133 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" 4172 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
4134 "lea " MEMLEA(0x20,0) ",%0 \n" 4173 "lea " MEMLEA(0x20,0) ",%0 \n"
4135 "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" 4174 "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
4136 "lea " MEMLEA(0x20,1) ",%1 \n" 4175 "lea " MEMLEA(0x20,1) ",%1 \n"
(...skipping 17 matching lines...)
4154 : "memory", "cc" 4193 : "memory", "cc"
4155 #if defined(__AVX2__) 4194 #if defined(__AVX2__)
4156 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4195 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4157 #endif 4196 #endif
4158 ); 4197 );
4159 } 4198 }
4160 #endif // HAS_ARGBMULTIPLYROW_AVX2 4199 #endif // HAS_ARGBMULTIPLYROW_AVX2
4161 4200
4162 #ifdef HAS_ARGBADDROW_SSE2 4201 #ifdef HAS_ARGBADDROW_SSE2
4163 // Add 2 rows of ARGB pixels together, 4 pixels at a time. 4202 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
4164 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4203 void ARGBAddRow_SSE2(const uint8* src_argb0,
4165 uint8* dst_argb, int width) { 4204 const uint8* src_argb1,
4205 uint8* dst_argb,
4206 int width) {
4166 asm volatile ( 4207 asm volatile (
4167 // 4 pixel loop. 4208 // 4 pixel loop.
4168 LABELALIGN 4209 LABELALIGN
4169 "1: \n" 4210 "1: \n"
4170 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4211 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4171 "lea " MEMLEA(0x10,0) ",%0 \n" 4212 "lea " MEMLEA(0x10,0) ",%0 \n"
4172 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 4213 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4173 "lea " MEMLEA(0x10,1) ",%1 \n" 4214 "lea " MEMLEA(0x10,1) ",%1 \n"
4174 "paddusb %%xmm1,%%xmm0 \n" 4215 "paddusb %%xmm1,%%xmm0 \n"
4175 "movdqu %%xmm0," MEMACCESS(2) " \n" 4216 "movdqu %%xmm0," MEMACCESS(2) " \n"
4176 "lea " MEMLEA(0x10,2) ",%2 \n" 4217 "lea " MEMLEA(0x10,2) ",%2 \n"
4177 "sub $0x4,%3 \n" 4218 "sub $0x4,%3 \n"
4178 "jg 1b \n" 4219 "jg 1b \n"
4179 : "+r"(src_argb0), // %0 4220 : "+r"(src_argb0), // %0
4180 "+r"(src_argb1), // %1 4221 "+r"(src_argb1), // %1
4181 "+r"(dst_argb), // %2 4222 "+r"(dst_argb), // %2
4182 "+r"(width) // %3 4223 "+r"(width) // %3
4183 : 4224 :
4184 : "memory", "cc" 4225 : "memory", "cc"
4185 , "xmm0", "xmm1" 4226 , "xmm0", "xmm1"
4186 ); 4227 );
4187 } 4228 }
4188 #endif // HAS_ARGBADDROW_SSE2 4229 #endif // HAS_ARGBADDROW_SSE2
4189 4230
4190 #ifdef HAS_ARGBADDROW_AVX2 4231 #ifdef HAS_ARGBADDROW_AVX2
4191 // Add 2 rows of ARGB pixels together, 8 pixels at a time. 4232 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
4192 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4233 void ARGBAddRow_AVX2(const uint8* src_argb0,
4193 uint8* dst_argb, int width) { 4234 const uint8* src_argb1,
4235 uint8* dst_argb,
4236 int width) {
4194 asm volatile ( 4237 asm volatile (
4195 // 8 pixel loop. 4238 // 8 pixel loop.
4196 LABELALIGN 4239 LABELALIGN
4197 "1: \n" 4240 "1: \n"
4198 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 4241 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
4199 "lea " MEMLEA(0x20,0) ",%0 \n" 4242 "lea " MEMLEA(0x20,0) ",%0 \n"
4200 "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" 4243 "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4201 "lea " MEMLEA(0x20,1) ",%1 \n" 4244 "lea " MEMLEA(0x20,1) ",%1 \n"
4202 "vmovdqu %%ymm0," MEMACCESS(2) " \n" 4245 "vmovdqu %%ymm0," MEMACCESS(2) " \n"
4203 "lea " MEMLEA(0x20,2) ",%2 \n" 4246 "lea " MEMLEA(0x20,2) ",%2 \n"
4204 "sub $0x8,%3 \n" 4247 "sub $0x8,%3 \n"
4205 "jg 1b \n" 4248 "jg 1b \n"
4206 "vzeroupper \n" 4249 "vzeroupper \n"
4207 : "+r"(src_argb0), // %0 4250 : "+r"(src_argb0), // %0
4208 "+r"(src_argb1), // %1 4251 "+r"(src_argb1), // %1
4209 "+r"(dst_argb), // %2 4252 "+r"(dst_argb), // %2
4210 "+r"(width) // %3 4253 "+r"(width) // %3
4211 : 4254 :
4212 : "memory", "cc" 4255 : "memory", "cc"
4213 , "xmm0" 4256 , "xmm0"
4214 ); 4257 );
4215 } 4258 }
4216 #endif // HAS_ARGBADDROW_AVX2 4259 #endif // HAS_ARGBADDROW_AVX2
4217 4260
4218 #ifdef HAS_ARGBSUBTRACTROW_SSE2 4261 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4219 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. 4262 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
4220 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4263 void ARGBSubtractRow_SSE2(const uint8* src_argb0,
4221 uint8* dst_argb, int width) { 4264 const uint8* src_argb1,
4265 uint8* dst_argb,
4266 int width) {
4222 asm volatile ( 4267 asm volatile (
4223 // 4 pixel loop. 4268 // 4 pixel loop.
4224 LABELALIGN 4269 LABELALIGN
4225 "1: \n" 4270 "1: \n"
4226 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4271 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4227 "lea " MEMLEA(0x10,0) ",%0 \n" 4272 "lea " MEMLEA(0x10,0) ",%0 \n"
4228 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 4273 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4229 "lea " MEMLEA(0x10,1) ",%1 \n" 4274 "lea " MEMLEA(0x10,1) ",%1 \n"
4230 "psubusb %%xmm1,%%xmm0 \n" 4275 "psubusb %%xmm1,%%xmm0 \n"
4231 "movdqu %%xmm0," MEMACCESS(2) " \n" 4276 "movdqu %%xmm0," MEMACCESS(2) " \n"
4232 "lea " MEMLEA(0x10,2) ",%2 \n" 4277 "lea " MEMLEA(0x10,2) ",%2 \n"
4233 "sub $0x4,%3 \n" 4278 "sub $0x4,%3 \n"
4234 "jg 1b \n" 4279 "jg 1b \n"
4235 : "+r"(src_argb0), // %0 4280 : "+r"(src_argb0), // %0
4236 "+r"(src_argb1), // %1 4281 "+r"(src_argb1), // %1
4237 "+r"(dst_argb), // %2 4282 "+r"(dst_argb), // %2
4238 "+r"(width) // %3 4283 "+r"(width) // %3
4239 : 4284 :
4240 : "memory", "cc" 4285 : "memory", "cc"
4241 , "xmm0", "xmm1" 4286 , "xmm0", "xmm1"
4242 ); 4287 );
4243 } 4288 }
4244 #endif // HAS_ARGBSUBTRACTROW_SSE2 4289 #endif // HAS_ARGBSUBTRACTROW_SSE2
4245 4290
4246 #ifdef HAS_ARGBSUBTRACTROW_AVX2 4291 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4247 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. 4292 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
4248 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4293 void ARGBSubtractRow_AVX2(const uint8* src_argb0,
4249 uint8* dst_argb, int width) { 4294 const uint8* src_argb1,
4295 uint8* dst_argb,
4296 int width) {
4250 asm volatile ( 4297 asm volatile (
4251 // 8 pixel loop. 4298 // 8 pixel loop.
4252 LABELALIGN 4299 LABELALIGN
4253 "1: \n" 4300 "1: \n"
4254 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 4301 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
4255 "lea " MEMLEA(0x20,0) ",%0 \n" 4302 "lea " MEMLEA(0x20,0) ",%0 \n"
4256 "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" 4303 "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4257 "lea " MEMLEA(0x20,1) ",%1 \n" 4304 "lea " MEMLEA(0x20,1) ",%1 \n"
4258 "vmovdqu %%ymm0," MEMACCESS(2) " \n" 4305 "vmovdqu %%ymm0," MEMACCESS(2) " \n"
4259 "lea " MEMLEA(0x20,2) ",%2 \n" 4306 "lea " MEMLEA(0x20,2) ",%2 \n"
4260 "sub $0x8,%3 \n" 4307 "sub $0x8,%3 \n"
4261 "jg 1b \n" 4308 "jg 1b \n"
4262 "vzeroupper \n" 4309 "vzeroupper \n"
4263 : "+r"(src_argb0), // %0 4310 : "+r"(src_argb0), // %0
4264 "+r"(src_argb1), // %1 4311 "+r"(src_argb1), // %1
4265 "+r"(dst_argb), // %2 4312 "+r"(dst_argb), // %2
4266 "+r"(width) // %3 4313 "+r"(width) // %3
4267 : 4314 :
4268 : "memory", "cc" 4315 : "memory", "cc"
4269 , "xmm0" 4316 , "xmm0"
4270 ); 4317 );
4271 } 4318 }
4272 #endif // HAS_ARGBSUBTRACTROW_AVX2 4319 #endif // HAS_ARGBSUBTRACTROW_AVX2
4273 4320
4274 #ifdef HAS_SOBELXROW_SSE2 4321 #ifdef HAS_SOBELXROW_SSE2
4275 // SobelX as a matrix is 4322 // SobelX as a matrix is
4276 // -1 0 1 4323 // -1 0 1
4277 // -2 0 2 4324 // -2 0 2
4278 // -1 0 1 4325 // -1 0 1
4279 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, 4326 void SobelXRow_SSE2(const uint8* src_y0,
4280 const uint8* src_y2, uint8* dst_sobelx, int width) { 4327 const uint8* src_y1,
4328 const uint8* src_y2,
4329 uint8* dst_sobelx,
4330 int width) {
4281 asm volatile ( 4331 asm volatile (
4282 "sub %0,%1 \n" 4332 "sub %0,%1 \n"
4283 "sub %0,%2 \n" 4333 "sub %0,%2 \n"
4284 "sub %0,%3 \n" 4334 "sub %0,%3 \n"
4285 "pxor %%xmm5,%%xmm5 \n" 4335 "pxor %%xmm5,%%xmm5 \n"
4286 4336
4287 // 8 pixel loop. 4337 // 8 pixel loop.
4288 LABELALIGN 4338 LABELALIGN
4289 "1: \n" 4339 "1: \n"
4290 "movq " MEMACCESS(0) ",%%xmm0 \n" 4340 "movq " MEMACCESS(0) ",%%xmm0 \n"
(...skipping 32 matching lines...)
4323 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4373 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4324 ); 4374 );
4325 } 4375 }
4326 #endif // HAS_SOBELXROW_SSE2 4376 #endif // HAS_SOBELXROW_SSE2
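
Reading the kernel comment as scalar code: each output is the absolute value of horizontal differences taken two columns apart across three rows, weighted 1:2:1 (a sketch; the clamp comes free from packing in the SSE2 version). SobelYRow below is the transposed kernel: one row difference across three adjacent columns.

#include <stdint.h>
#include <stdlib.h>  // abs

static void SobelXRow_Ref(const uint8_t* y0, const uint8_t* y1,
                          const uint8_t* y2, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    int a = y0[i] - y0[i + 2];
    int b = y1[i] - y1[i + 2];
    int c = y2[i] - y2[i + 2];
    int sobel = abs(a + 2 * b + c);
    dst[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
  }
}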
4327 4377
4328 #ifdef HAS_SOBELYROW_SSE2 4378 #ifdef HAS_SOBELYROW_SSE2
4329 // SobelY as a matrix is 4379 // SobelY as a matrix is
4330 // -1 -2 -1 4380 // -1 -2 -1
4331 // 0 0 0 4381 // 0 0 0
4332 // 1 2 1 4382 // 1 2 1
4333 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, 4383 void SobelYRow_SSE2(const uint8* src_y0,
4334 uint8* dst_sobely, int width) { 4384 const uint8* src_y1,
4385 uint8* dst_sobely,
4386 int width) {
4335 asm volatile ( 4387 asm volatile (
4336 "sub %0,%1 \n" 4388 "sub %0,%1 \n"
4337 "sub %0,%2 \n" 4389 "sub %0,%2 \n"
4338 "pxor %%xmm5,%%xmm5 \n" 4390 "pxor %%xmm5,%%xmm5 \n"
4339 4391
4340 // 8 pixel loop. 4392 // 8 pixel loop.
4341 LABELALIGN 4393 LABELALIGN
4342 "1: \n" 4394 "1: \n"
4343 "movq " MEMACCESS(0) ",%%xmm0 \n" 4395 "movq " MEMACCESS(0) ",%%xmm0 \n"
4344 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 4396 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
(...skipping 31 matching lines...)
4376 ); 4428 );
4377 } 4429 }
4378 #endif // HAS_SOBELYROW_SSE2 4430 #endif // HAS_SOBELYROW_SSE2
4379 4431
4380 #ifdef HAS_SOBELROW_SSE2 4432 #ifdef HAS_SOBELROW_SSE2
4381 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. 4433 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
4382 // A = 255 4434 // A = 255
4383 // R = Sobel 4435 // R = Sobel
4384 // G = Sobel 4436 // G = Sobel
4385 // B = Sobel 4437 // B = Sobel
4386 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 4438 void SobelRow_SSE2(const uint8* src_sobelx,
4387 uint8* dst_argb, int width) { 4439 const uint8* src_sobely,
4440 uint8* dst_argb,
4441 int width) {
4388 asm volatile ( 4442 asm volatile (
4389 "sub %0,%1 \n" 4443 "sub %0,%1 \n"
4390 "pcmpeqb %%xmm5,%%xmm5 \n" 4444 "pcmpeqb %%xmm5,%%xmm5 \n"
4391 "pslld $0x18,%%xmm5 \n" 4445 "pslld $0x18,%%xmm5 \n"
4392 4446
4393 // 8 pixel loop. 4447 // 8 pixel loop.
4394 LABELALIGN 4448 LABELALIGN
4395 "1: \n" 4449 "1: \n"
4396 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4450 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4397 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 4451 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
(...skipping 25 matching lines...)
4423 "+r"(width) // %3 4477 "+r"(width) // %3
4424 : 4478 :
4425 : "memory", "cc", NACL_R14 4479 : "memory", "cc", NACL_R14
4426 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4480 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4427 ); 4481 );
4428 } 4482 }
4429 #endif // HAS_SOBELROW_SSE2 4483 #endif // HAS_SOBELROW_SSE2
4430 4484
4431 #ifdef HAS_SOBELTOPLANEROW_SSE2 4485 #ifdef HAS_SOBELTOPLANEROW_SSE2
4432 // Adds Sobel X and Sobel Y and stores Sobel into a plane. 4486 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
4433 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 4487 void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
4434 uint8* dst_y, int width) { 4488 const uint8* src_sobely,
4489 uint8* dst_y,
4490 int width) {
4435 asm volatile ( 4491 asm volatile (
4436 "sub %0,%1 \n" 4492 "sub %0,%1 \n"
4437 "pcmpeqb %%xmm5,%%xmm5 \n" 4493 "pcmpeqb %%xmm5,%%xmm5 \n"
4438 "pslld $0x18,%%xmm5 \n" 4494 "pslld $0x18,%%xmm5 \n"
4439 4495
4440 // 8 pixel loop. 4496 // 8 pixel loop.
4441 LABELALIGN 4497 LABELALIGN
4442 "1: \n" 4498 "1: \n"
4443 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4499 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4444 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 4500 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
(...skipping 13 matching lines...)
4458 ); 4514 );
4459 } 4515 }
4460 #endif // HAS_SOBELTOPLANEROW_SSE2 4516 #endif // HAS_SOBELTOPLANEROW_SSE2
4461 4517
4462 #ifdef HAS_SOBELXYROW_SSE2 4518 #ifdef HAS_SOBELXYROW_SSE2
4463 // Mixes Sobel X, Sobel Y and Sobel into ARGB. 4519 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
4464 // A = 255 4520 // A = 255
4465 // R = Sobel X 4521 // R = Sobel X
4466 // G = Sobel 4522 // G = Sobel
4467 // B = Sobel Y 4523 // B = Sobel Y
4468 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 4524 void SobelXYRow_SSE2(const uint8* src_sobelx,
4469 uint8* dst_argb, int width) { 4525 const uint8* src_sobely,
4526 uint8* dst_argb,
4527 int width) {
4470 asm volatile ( 4528 asm volatile (
4471 "sub %0,%1 \n" 4529 "sub %0,%1 \n"
4472 "pcmpeqb %%xmm5,%%xmm5 \n" 4530 "pcmpeqb %%xmm5,%%xmm5 \n"
4473 4531
4474 // 8 pixel loop. 4532 // 8 pixel loop.
4475 LABELALIGN 4533 LABELALIGN
4476 "1: \n" 4534 "1: \n"
4477 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4535 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4478 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 4536 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
4479 "lea " MEMLEA(0x10,0) ",%0 \n" 4537 "lea " MEMLEA(0x10,0) ",%0 \n"
(...skipping 25 matching lines...)
4505 : 4563 :
4506 : "memory", "cc", NACL_R14 4564 : "memory", "cc", NACL_R14
4507 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 4565 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4508 ); 4566 );
4509 } 4567 }
4510 #endif // HAS_SOBELXYROW_SSE2 4568 #endif // HAS_SOBELXYROW_SSE2
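
The three packing variants above share one scalar shape; SobelXYRow is the most general (SobelRow writes the saturated sum into all three color channels, SobelToPlaneRow writes it to a single plane). An illustrative sketch of the channel assignment spelled out in the comments:

#include <stdint.h>

static void SobelXYRow_Ref(const uint8_t* sobelx, const uint8_t* sobely,
                           uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = sobelx[i] + sobely[i];
    if (s > 255) s = 255;              // saturate the sum to 255
    dst_argb[i * 4 + 0] = sobely[i];   // B = Sobel Y
    dst_argb[i * 4 + 1] = (uint8_t)s;  // G = Sobel
    dst_argb[i * 4 + 2] = sobelx[i];   // R = Sobel X
    dst_argb[i * 4 + 3] = 255;         // A = 255
  }
}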
4511 4569
4512 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 4570 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
4513 // Creates a table of cumulative sums where each value is a sum of all values 4571 // Creates a table of cumulative sums where each value is a sum of all values
4514 // above and to the left of the value, inclusive of the value. 4572 // above and to the left of the value, inclusive of the value.
4515 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, 4573 void ComputeCumulativeSumRow_SSE2(const uint8* row,
4516 const int32* previous_cumsum, int width) { 4574 int32* cumsum,
4575 const int32* previous_cumsum,
4576 int width) {
4517 asm volatile ( 4577 asm volatile (
4518 "pxor %%xmm0,%%xmm0 \n" 4578 "pxor %%xmm0,%%xmm0 \n"
4519 "pxor %%xmm1,%%xmm1 \n" 4579 "pxor %%xmm1,%%xmm1 \n"
4520 "sub $0x4,%3 \n" 4580 "sub $0x4,%3 \n"
4521 "jl 49f \n" 4581 "jl 49f \n"
4522 "test $0xf,%1 \n" 4582 "test $0xf,%1 \n"
4523 "jne 49f \n" 4583 "jne 49f \n"
4524 4584
4525 // 4 pixel loop \n" 4585 // 4 pixel loop \n"
4526 LABELALIGN 4586 LABELALIGN
(...skipping 56 matching lines...)
4583 "+r"(previous_cumsum), // %2 4643 "+r"(previous_cumsum), // %2
4584 "+r"(width) // %3 4644 "+r"(width) // %3
4585 : 4645 :
4586 : "memory", "cc" 4646 : "memory", "cc"
4587 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 4647 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4588 ); 4648 );
4589 } 4649 }
4590 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 4650 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
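
The comment describes an integral image; per row that reduces to a running sum of this row's four channels plus the already-finished row above. Scalar sketch (illustrative naming):

#include <stdint.h>

static void ComputeCumulativeSumRow_Ref(const uint8_t* row, int32_t* cumsum,
                                        const int32_t* previous_cumsum,
                                        int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];  // running sum left to right
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];  // plus above
    }
  }
}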
4591 4651
4592 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 4652 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
4593 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, 4653 void CumulativeSumToAverageRow_SSE2(const int32* topleft,
4594 int width, int area, uint8* dst, 4654 const int32* botleft,
4655 int width,
4656 int area,
4657 uint8* dst,
4595 int count) { 4658 int count) {
4596 asm volatile ( 4659 asm volatile (
4597 "movd %5,%%xmm5 \n" 4660 "movd %5,%%xmm5 \n"
4598 "cvtdq2ps %%xmm5,%%xmm5 \n" 4661 "cvtdq2ps %%xmm5,%%xmm5 \n"
4599 "rcpss %%xmm5,%%xmm4 \n" 4662 "rcpss %%xmm5,%%xmm4 \n"
4600 "pshufd $0x0,%%xmm4,%%xmm4 \n" 4663 "pshufd $0x0,%%xmm4,%%xmm4 \n"
4601 "sub $0x4,%3 \n" 4664 "sub $0x4,%3 \n"
4602 "jl 49f \n" 4665 "jl 49f \n"
4603 "cmpl $0x80,%5 \n" 4666 "cmpl $0x80,%5 \n"
4604 "ja 40f \n" 4667 "ja 40f \n"
(...skipping 111 matching lines...)
4716 "rm"(area) // %5 4779 "rm"(area) // %5
4717 : "memory", "cc", NACL_R14 4780 : "memory", "cc", NACL_R14
4718 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 4781 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4719 ); 4782 );
4720 } 4783 }
4721 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 4784 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
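
The averaging pass uses the standard integral-image identity, total = BR + TL - TR - BL, then multiplies by the 1/area reciprocal that rcpss computes above. A hedged sketch (my reading of the operands: 'width' is the right-edge offset in int32 lanes; not verified against the elided loop body):

#include <stdint.h>

static void CumulativeSumToAverageRow_Ref(const int32_t* topleft,
                                          const int32_t* botleft, int width,
                                          int area, uint8_t* dst, int count) {
  float ooa = 1.0f / (float)area;
  for (int i = 0; i < count * 4; ++i) {  // 4 channels per pixel
    int32_t total = botleft[width + i] + topleft[i] -
                    botleft[i] - topleft[width + i];
    dst[i] = (uint8_t)(total * ooa);
  }
}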
4722 4785
4723 #ifdef HAS_ARGBAFFINEROW_SSE2 4786 #ifdef HAS_ARGBAFFINEROW_SSE2
4724 // Copy ARGB pixels from source image with slope to a row of destination. 4787 // Copy ARGB pixels from source image with slope to a row of destination.
4725 LIBYUV_API 4788 LIBYUV_API
4726 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, 4789 void ARGBAffineRow_SSE2(const uint8* src_argb,
4727 uint8* dst_argb, const float* src_dudv, int width) { 4790 int src_argb_stride,
4791 uint8* dst_argb,
4792 const float* src_dudv,
4793 int width) {
4728 intptr_t src_argb_stride_temp = src_argb_stride; 4794 intptr_t src_argb_stride_temp = src_argb_stride;
4729 intptr_t temp; 4795 intptr_t temp;
4730 asm volatile ( 4796 asm volatile (
4731 "movq " MEMACCESS(3) ",%%xmm2 \n" 4797 "movq " MEMACCESS(3) ",%%xmm2 \n"
4732 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" 4798 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
4733 "shl $0x10,%1 \n" 4799 "shl $0x10,%1 \n"
4734 "add $0x4,%1 \n" 4800 "add $0x4,%1 \n"
4735 "movd %1,%%xmm5 \n" 4801 "movd %1,%%xmm5 \n"
4736 "sub $0x4,%4 \n" 4802 "sub $0x4,%4 \n"
4737 "jl 49f \n" 4803 "jl 49f \n"
(...skipping 63 matching lines...)
4801 "=&r"(temp) // %5 4867 "=&r"(temp) // %5
4802 : 4868 :
4803 : "memory", "cc", NACL_R14 4869 : "memory", "cc", NACL_R14
4804 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 4870 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4805 ); 4871 );
4806 } 4872 }
4807 #endif // HAS_ARGBAFFINEROW_SSE2 4873 #endif // HAS_ARGBAFFINEROW_SSE2
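
src_dudv packs the starting texture coordinate (u, v) followed by the per-pixel step (du, dv), which is what the two movq loads at the top fetch. The scalar equivalent is a stepped nearest-pixel copy (an illustrative sketch):

#include <stdint.h>
#include <string.h>

static void ARGBAffineRow_Ref(const uint8_t* src_argb, int src_stride,
                              uint8_t* dst_argb, const float* src_dudv,
                              int width) {
  float u = src_dudv[0], v = src_dudv[1];
  float du = src_dudv[2], dv = src_dudv[3];
  for (int i = 0; i < width; ++i) {
    int x = (int)u, y = (int)v;  // truncate to the nearest source pixel
    memcpy(dst_argb + i * 4, src_argb + y * src_stride + x * 4, 4);
    u += du;
    v += dv;
  }
}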
4808 4874
4809 #ifdef HAS_INTERPOLATEROW_SSSE3 4875 #ifdef HAS_INTERPOLATEROW_SSSE3
4810 // Bilinear filter 16x2 -> 16x1 4876 // Bilinear filter 16x2 -> 16x1
4811 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 4877 void InterpolateRow_SSSE3(uint8* dst_ptr,
4812 ptrdiff_t src_stride, int dst_width, 4878 const uint8* src_ptr,
4879 ptrdiff_t src_stride,
4880 int dst_width,
4813 int source_y_fraction) { 4881 int source_y_fraction) {
4814 asm volatile ( 4882 asm volatile (
4815 "sub %1,%0 \n" 4883 "sub %1,%0 \n"
4816 "cmp $0x0,%3 \n" 4884 "cmp $0x0,%3 \n"
4817 "je 100f \n" 4885 "je 100f \n"
4818 "cmp $0x80,%3 \n" 4886 "cmp $0x80,%3 \n"
4819 "je 50f \n" 4887 "je 50f \n"
4820 4888
4821 "movd %3,%%xmm0 \n" 4889 "movd %3,%%xmm0 \n"
4822 "neg %3 \n" 4890 "neg %3 \n"
(...skipping 59 matching lines...)
4882 "+r"(source_y_fraction) // %3 4950 "+r"(source_y_fraction) // %3
4883 : "r"((intptr_t)(src_stride)) // %4 4951 : "r"((intptr_t)(src_stride)) // %4
4884 : "memory", "cc", "eax", NACL_R14 4952 : "memory", "cc", "eax", NACL_R14
4885 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 4953 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4886 ); 4954 );
4887 } 4955 }
4888 #endif // HAS_INTERPOLATEROW_SSSE3 4956 #endif // HAS_INTERPOLATEROW_SSSE3
4889 4957
4890 #ifdef HAS_INTERPOLATEROW_AVX2 4958 #ifdef HAS_INTERPOLATEROW_AVX2
4891 // Bilinear filter 32x2 -> 32x1 4959 // Bilinear filter 32x2 -> 32x1
4892 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, 4960 void InterpolateRow_AVX2(uint8* dst_ptr,
4893 ptrdiff_t src_stride, int dst_width, 4961 const uint8* src_ptr,
4962 ptrdiff_t src_stride,
4963 int dst_width,
4894 int source_y_fraction) { 4964 int source_y_fraction) {
4895 asm volatile ( 4965 asm volatile (
4896 "cmp $0x0,%3 \n" 4966 "cmp $0x0,%3 \n"
4897 "je 100f \n" 4967 "je 100f \n"
4898 "sub %1,%0 \n" 4968 "sub %1,%0 \n"
4899 "cmp $0x80,%3 \n" 4969 "cmp $0x80,%3 \n"
4900 "je 50f \n" 4970 "je 50f \n"
4901 4971
4902 "vmovd %3,%%xmm0 \n" 4972 "vmovd %3,%%xmm0 \n"
4903 "neg %3 \n" 4973 "neg %3 \n"
(...skipping 54 matching lines...)
4958 "+r"(source_y_fraction) // %3 5028 "+r"(source_y_fraction) // %3
4959 : "r"((intptr_t)(src_stride)) // %4 5029 : "r"((intptr_t)(src_stride)) // %4
4960 : "memory", "cc", "eax", NACL_R14 5030 : "memory", "cc", "eax", NACL_R14
4961 "xmm0", "xmm1", "xmm2", "xmm4", "xmm5" 5031 "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
4962 ); 5032 );
4963 } 5033 }
4964 #endif // HAS_INTERPOLATEROW_AVX2 5034 #endif // HAS_INTERPOLATEROW_AVX2
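
source_y_fraction is on a 0..256 scale, and the cmp $0x0 / cmp $0x80 tests in both versions branch to fast paths for "all row 0" and "average the two rows". The general case, as a scalar sketch (the +128 rounding bias is an assumption about the intended rounding):

#include <stddef.h>
#include <stdint.h>

static void InterpolateRow_Ref(uint8_t* dst, const uint8_t* src,
                               ptrdiff_t src_stride, int width,
                               int source_y_fraction) {
  int f1 = source_y_fraction;
  int f0 = 256 - f1;  // weight of the first row
  for (int i = 0; i < width; ++i) {
    dst[i] = (uint8_t)((src[i] * f0 + src[i + src_stride] * f1 + 128) >> 8);
  }
}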
4965 5035
4966 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 5036 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
4967 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 5037 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
4968 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 5038 void ARGBShuffleRow_SSSE3(const uint8* src_argb,
4969 const uint8* shuffler, int width) { 5039 uint8* dst_argb,
5040 const uint8* shuffler,
5041 int width) {
4970 asm volatile ( 5042 asm volatile (
4971 "movdqu " MEMACCESS(3) ",%%xmm5 \n" 5043 "movdqu " MEMACCESS(3) ",%%xmm5 \n"
4972 LABELALIGN 5044 LABELALIGN
4973 "1: \n" 5045 "1: \n"
4974 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 5046 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4975 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 5047 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4976 "lea " MEMLEA(0x20,0) ",%0 \n" 5048 "lea " MEMLEA(0x20,0) ",%0 \n"
4977 "pshufb %%xmm5,%%xmm0 \n" 5049 "pshufb %%xmm5,%%xmm0 \n"
4978 "pshufb %%xmm5,%%xmm1 \n" 5050 "pshufb %%xmm5,%%xmm1 \n"
4979 "movdqu %%xmm0," MEMACCESS(1) " \n" 5051 "movdqu %%xmm0," MEMACCESS(1) " \n"
4980 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" 5052 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
4981 "lea " MEMLEA(0x20,1) ",%1 \n" 5053 "lea " MEMLEA(0x20,1) ",%1 \n"
4982 "sub $0x8,%2 \n" 5054 "sub $0x8,%2 \n"
4983 "jg 1b \n" 5055 "jg 1b \n"
4984 : "+r"(src_argb), // %0 5056 : "+r"(src_argb), // %0
4985 "+r"(dst_argb), // %1 5057 "+r"(dst_argb), // %1
4986 "+r"(width) // %2 5058 "+r"(width) // %2
4987 : "r"(shuffler) // %3 5059 : "r"(shuffler) // %3
4988 : "memory", "cc" 5060 : "memory", "cc"
4989 , "xmm0", "xmm1", "xmm5" 5061 , "xmm0", "xmm1", "xmm5"
4990 ); 5062 );
4991 } 5063 }
4992 #endif // HAS_ARGBSHUFFLEROW_SSSE3 5064 #endif // HAS_ARGBSHUFFLEROW_SSSE3
4993 5065
4994 #ifdef HAS_ARGBSHUFFLEROW_AVX2 5066 #ifdef HAS_ARGBSHUFFLEROW_AVX2
4995 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 5067 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
4996 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, 5068 void ARGBShuffleRow_AVX2(const uint8* src_argb,
4997 const uint8* shuffler, int width) { 5069 uint8* dst_argb,
5070 const uint8* shuffler,
5071 int width) {
4998 asm volatile ( 5072 asm volatile (
4999 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" 5073 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
5000 LABELALIGN 5074 LABELALIGN
5001 "1: \n" 5075 "1: \n"
5002 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 5076 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
5003 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 5077 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
5004 "lea " MEMLEA(0x40,0) ",%0 \n" 5078 "lea " MEMLEA(0x40,0) ",%0 \n"
5005 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" 5079 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
5006 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" 5080 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
5007 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 5081 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
5008 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" 5082 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
5009 "lea " MEMLEA(0x40,1) ",%1 \n" 5083 "lea " MEMLEA(0x40,1) ",%1 \n"
5010 "sub $0x10,%2 \n" 5084 "sub $0x10,%2 \n"
5011 "jg 1b \n" 5085 "jg 1b \n"
5012 "vzeroupper \n" 5086 "vzeroupper \n"
5013 : "+r"(src_argb), // %0 5087 : "+r"(src_argb), // %0
5014 "+r"(dst_argb), // %1 5088 "+r"(dst_argb), // %1
5015 "+r"(width) // %2 5089 "+r"(width) // %2
5016 : "r"(shuffler) // %3 5090 : "r"(shuffler) // %3
5017 : "memory", "cc" 5091 : "memory", "cc"
5018 , "xmm0", "xmm1", "xmm5" 5092 , "xmm0", "xmm1", "xmm5"
5019 ); 5093 );
5020 } 5094 }
5021 #endif // HAS_ARGBSHUFFLEROW_AVX2 5095 #endif // HAS_ARGBSHUFFLEROW_AVX2
5022 5096
5023 #ifdef HAS_ARGBSHUFFLEROW_SSE2 5097 #ifdef HAS_ARGBSHUFFLEROW_SSE2
5024 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 5098 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5025 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, 5099 void ARGBShuffleRow_SSE2(const uint8* src_argb,
5026 const uint8* shuffler, int width) { 5100 uint8* dst_argb,
5101 const uint8* shuffler,
5102 int width) {
5027 uintptr_t pixel_temp; 5103 uintptr_t pixel_temp;
5028 asm volatile ( 5104 asm volatile (
5029 "pxor %%xmm5,%%xmm5 \n" 5105 "pxor %%xmm5,%%xmm5 \n"
5030 "mov " MEMACCESS(4) ",%k2 \n" 5106 "mov " MEMACCESS(4) ",%k2 \n"
5031 "cmp $0x3000102,%k2 \n" 5107 "cmp $0x3000102,%k2 \n"
5032 "je 3012f \n" 5108 "je 3012f \n"
5033 "cmp $0x10203,%k2 \n" 5109 "cmp $0x10203,%k2 \n"
5034 "je 123f \n" 5110 "je 123f \n"
5035 "cmp $0x30201,%k2 \n" 5111 "cmp $0x30201,%k2 \n"
5036 "je 321f \n" 5112 "je 321f \n"
(...skipping 100 matching lines...)
5137 : "memory", "cc", NACL_R14 5213 : "memory", "cc", NACL_R14
5138 "xmm0", "xmm1", "xmm5" 5214 "xmm0", "xmm1", "xmm5"
5139 ); 5215 );
5140 } 5216 }
5141 #endif // HAS_ARGBSHUFFLEROW_SSE2 5217 #endif // HAS_ARGBSHUFFLEROW_SSE2
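
All three shuffle variants implement the same byte permutation: 'shuffler' lists, for each output byte of a pixel, the source byte index within that pixel. The SSSE3/AVX2 versions hand the table straight to pshufb; the SSE2 fallback has no pshufb, so it loads the first four shuffler bytes as a little-endian dword and compares against the few supported orders (0x3000102 is {2,1,0,3}, and so on) before branching to a hand-coded path. Scalar sketch (illustrative):

#include <stdint.h>

static void ARGBShuffleRow_Ref(const uint8_t* src, uint8_t* dst,
                               const uint8_t* shuffler, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      dst[i * 4 + c] = src[i * 4 + shuffler[c]];  // permute within the pixel
    }
  }
}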
5142 5218
5143 #ifdef HAS_I422TOYUY2ROW_SSE2 5219 #ifdef HAS_I422TOYUY2ROW_SSE2
5144 void I422ToYUY2Row_SSE2(const uint8* src_y, 5220 void I422ToYUY2Row_SSE2(const uint8* src_y,
5145 const uint8* src_u, 5221 const uint8* src_u,
5146 const uint8* src_v, 5222 const uint8* src_v,
5147 uint8* dst_frame, int width) { 5223 uint8* dst_frame,
5148 asm volatile ( 5224 int width) {
5225 asm volatile (
5149 "sub %1,%2 \n" 5226 "sub %1,%2 \n"
5150 LABELALIGN 5227 LABELALIGN
5151 "1: \n" 5228 "1: \n"
5152 "movq " MEMACCESS(1) ",%%xmm2 \n" 5229 "movq " MEMACCESS(1) ",%%xmm2 \n"
5153 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 5230 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
5154 "lea " MEMLEA(0x8,1) ",%1 \n" 5231 "lea " MEMLEA(0x8,1) ",%1 \n"
5155 "punpcklbw %%xmm3,%%xmm2 \n" 5232 "punpcklbw %%xmm3,%%xmm2 \n"
5156 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 5233 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5157 "lea " MEMLEA(0x10,0) ",%0 \n" 5234 "lea " MEMLEA(0x10,0) ",%0 \n"
5158 "movdqa %%xmm0,%%xmm1 \n" 5235 "movdqa %%xmm0,%%xmm1 \n"
(...skipping 13 matching lines...)
5172 : "memory", "cc", NACL_R14 5249 : "memory", "cc", NACL_R14
5173 "xmm0", "xmm1", "xmm2", "xmm3" 5250 "xmm0", "xmm1", "xmm2", "xmm3"
5174 ); 5251 );
5175 } 5252 }
5176 #endif // HAS_I422TOYUY2ROW_SSE2 5253 #endif // HAS_I422TOYUY2ROW_SSE2
5177 5254
5178 #ifdef HAS_I422TOUYVYROW_SSE2 5255 #ifdef HAS_I422TOUYVYROW_SSE2
5179 void I422ToUYVYRow_SSE2(const uint8* src_y, 5256 void I422ToUYVYRow_SSE2(const uint8* src_y,
5180 const uint8* src_u, 5257 const uint8* src_u,
5181 const uint8* src_v, 5258 const uint8* src_v,
5182 uint8* dst_frame, int width) { 5259 uint8* dst_frame,
5183 asm volatile ( 5260 int width) {
5261 asm volatile (
5184 "sub %1,%2 \n" 5262 "sub %1,%2 \n"
5185 LABELALIGN 5263 LABELALIGN
5186 "1: \n" 5264 "1: \n"
5187 "movq " MEMACCESS(1) ",%%xmm2 \n" 5265 "movq " MEMACCESS(1) ",%%xmm2 \n"
5188 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 5266 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
5189 "lea " MEMLEA(0x8,1) ",%1 \n" 5267 "lea " MEMLEA(0x8,1) ",%1 \n"
5190 "punpcklbw %%xmm3,%%xmm2 \n" 5268 "punpcklbw %%xmm3,%%xmm2 \n"
5191 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 5269 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5192 "movdqa %%xmm2,%%xmm1 \n" 5270 "movdqa %%xmm2,%%xmm1 \n"
5193 "lea " MEMLEA(0x10,0) ",%0 \n" 5271 "lea " MEMLEA(0x10,0) ",%0 \n"
(...skipping 11 matching lines...)
5205 "+rm"(width) // %4 5283 "+rm"(width) // %4
5206 : 5284 :
5207 : "memory", "cc", NACL_R14 5285 : "memory", "cc", NACL_R14
5208 "xmm0", "xmm1", "xmm2", "xmm3" 5286 "xmm0", "xmm1", "xmm2", "xmm3"
5209 ); 5287 );
5210 } 5288 }
5211 #endif // HAS_I422TOUYVYROW_SSE2 5289 #endif // HAS_I422TOUYVYROW_SSE2
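
Both packers above interleave one U and one V sample with two Y samples; only the byte order differs (YUY2: Y0,U,Y1,V; UYVY: U,Y0,V,Y1). Scalar sketch for the YUY2 case, assuming an even width as the SIMD versions do (illustrative naming):

#include <stdint.h>

static void I422ToYUY2Row_Ref(const uint8_t* src_y, const uint8_t* src_u,
                              const uint8_t* src_v, uint8_t* dst_frame,
                              int width) {
  for (int i = 0; i + 1 < width; i += 2) {
    dst_frame[0] = src_y[0];  // Y0
    dst_frame[1] = src_u[0];  // U, shared by the pixel pair
    dst_frame[2] = src_y[1];  // Y1
    dst_frame[3] = src_v[0];  // V, shared by the pixel pair
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_frame += 4;
  }
}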
5212 5290
5213 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 5291 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
5214 void ARGBPolynomialRow_SSE2(const uint8* src_argb, 5292 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
5215 uint8* dst_argb, const float* poly, 5293 uint8* dst_argb,
5294 const float* poly,
5216 int width) { 5295 int width) {
5217 asm volatile ( 5296 asm volatile (
5218 "pxor %%xmm3,%%xmm3 \n" 5297 "pxor %%xmm3,%%xmm3 \n"
5219 5298
5220 // 2 pixel loop. 5299 // 2 pixel loop.
5221 LABELALIGN 5300 LABELALIGN
5222 "1: \n" 5301 "1: \n"
5223 "movq " MEMACCESS(0) ",%%xmm0 \n" 5302 "movq " MEMACCESS(0) ",%%xmm0 \n"
5224 "lea " MEMLEA(0x8,0) ",%0 \n" 5303 "lea " MEMLEA(0x8,0) ",%0 \n"
5225 "punpcklbw %%xmm3,%%xmm0 \n" 5304 "punpcklbw %%xmm3,%%xmm0 \n"
(...skipping 35 matching lines...)
5261 "+r"(width) // %2 5340 "+r"(width) // %2
5262 : "r"(poly) // %3 5341 : "r"(poly) // %3
5263 : "memory", "cc" 5342 : "memory", "cc"
5264 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 5343 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
5265 ); 5344 );
5266 } 5345 }
5267 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 5346 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
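
The four vbroadcastf128 loads in the AVX2 version below give away the layout: 'poly' holds four vectors of four floats, the constant, linear, quadratic and cubic coefficients for B, G, R and A. Scalar sketch (illustrative):

#include <stdint.h>

static void ARGBPolynomialRow_Ref(const uint8_t* src, uint8_t* dst,
                                  const float* poly, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      float v = (float)src[i * 4 + c];
      float r = poly[c] + poly[4 + c] * v + poly[8 + c] * v * v +
                poly[12 + c] * v * v * v;
      int out = (int)r;
      dst[i * 4 + c] = (uint8_t)(out < 0 ? 0 : (out > 255 ? 255 : out));
    }
  }
}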
5268 5347
5269 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 5348 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
5270 void ARGBPolynomialRow_AVX2(const uint8* src_argb, 5349 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
5271 uint8* dst_argb, const float* poly, 5350 uint8* dst_argb,
5351 const float* poly,
5272 int width) { 5352 int width) {
5273 asm volatile ( 5353 asm volatile (
5274 "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" 5354 "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
5275 "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" 5355 "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
5276 "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" 5356 "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
5277 "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" 5357 "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
5278 5358
5279 // 2 pixel loop. 5359 // 2 pixel loop.
5280 LABELALIGN 5360 LABELALIGN
5281 "1: \n" 5361 "1: \n"
(...skipping 155 matching lines...)
5437 "+r"(width) // %2 5517 "+r"(width) // %2
5438 : 5518 :
5439 : "memory", "cc", 5519 : "memory", "cc",
5440 "xmm2", "xmm3" 5520 "xmm2", "xmm3"
5441 ); 5521 );
5442 } 5522 }
5443 #endif // HAS_HALFFLOATROW_F16C 5523 #endif // HAS_HALFFLOATROW_F16C
5444 5524
5445 #ifdef HAS_ARGBCOLORTABLEROW_X86 5525 #ifdef HAS_ARGBCOLORTABLEROW_X86
5446 // Transform ARGB pixels with color table. 5526 // Transform ARGB pixels with color table.
5447 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, 5527 void ARGBColorTableRow_X86(uint8* dst_argb,
5528 const uint8* table_argb,
5448 int width) { 5529 int width) {
5449 uintptr_t pixel_temp; 5530 uintptr_t pixel_temp;
5450 asm volatile ( 5531 asm volatile (
5451 // 1 pixel loop. 5532 // 1 pixel loop.
5452 LABELALIGN 5533 LABELALIGN
5453 "1: \n" 5534 "1: \n"
5454 "movzb " MEMACCESS(0) ",%1 \n" 5535 "movzb " MEMACCESS(0) ",%1 \n"
5455 "lea " MEMLEA(0x4,0) ",%0 \n" 5536 "lea " MEMLEA(0x4,0) ",%0 \n"
5456 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 5537 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
5457 "mov %b1," MEMACCESS2(-0x4,0) " \n" 5538 "mov %b1," MEMACCESS2(-0x4,0) " \n"
(...skipping 39 matching lines...)
5497 : "+r"(dst_argb), // %0 5578 : "+r"(dst_argb), // %0
5498 "=&d"(pixel_temp), // %1 5579 "=&d"(pixel_temp), // %1
5499 "+r"(width) // %2 5580 "+r"(width) // %2
5500 : "r"(table_argb) // %3 5581 : "r"(table_argb) // %3
5501 : "memory", "cc"); 5582 : "memory", "cc");
5502 } 5583 }
5503 #endif // HAS_RGBCOLORTABLEROW_X86 5584 #endif // HAS_RGBCOLORTABLEROW_X86
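
The movzb pair above spells out the lookup: every channel byte is replaced by its entry in a per-channel, 256-entry table (table_argb is 256 four-byte groups, so the index is value * 4 + channel). Scalar sketch (illustrative naming):

#include <stdint.h>

static void ARGBColorTableRow_Ref(uint8_t* dst_argb,
                                  const uint8_t* table_argb, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      dst_argb[i * 4 + c] = table_argb[dst_argb[i * 4 + c] * 4 + c];
    }
  }
}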
5504 5585
5505 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 5586 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
5506 // Transform RGB pixels with luma table. 5587 // Transform RGB pixels with luma table.
5507 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 5588 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
5589 uint8* dst_argb,
5508 int width, 5590 int width,
5509 const uint8* luma, uint32 lumacoeff) { 5591 const uint8* luma,
5592 uint32 lumacoeff) {
5510 uintptr_t pixel_temp; 5593 uintptr_t pixel_temp;
5511 uintptr_t table_temp; 5594 uintptr_t table_temp;
5512 asm volatile ( 5595 asm volatile (
5513 "movd %6,%%xmm3 \n" 5596 "movd %6,%%xmm3 \n"
5514 "pshufd $0x0,%%xmm3,%%xmm3 \n" 5597 "pshufd $0x0,%%xmm3,%%xmm3 \n"
5515 "pcmpeqb %%xmm4,%%xmm4 \n" 5598 "pcmpeqb %%xmm4,%%xmm4 \n"
5516 "psllw $0x8,%%xmm4 \n" 5599 "psllw $0x8,%%xmm4 \n"
5517 "pxor %%xmm5,%%xmm5 \n" 5600 "pxor %%xmm5,%%xmm5 \n"
5518 5601
5519 // 4 pixel loop. 5602 // 4 pixel loop.
(...skipping 77 matching lines...)
5597 "+rm"(width) // %4 5680 "+rm"(width) // %4
5598 : "r"(luma), // %5 5681 : "r"(luma), // %5
5599 "rm"(lumacoeff) // %6 5682 "rm"(lumacoeff) // %6
5600 : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" 5683 : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
5601 ); 5684 );
5602 } 5685 }
5603 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 5686 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5604 5687
5605 #endif // defined(__x86_64__) || defined(__i386__) 5688 #endif // defined(__x86_64__) || defined(__i386__)
5606 5689
5607 // clang-format on
5608
5609 #ifdef __cplusplus 5690 #ifdef __cplusplus
5610 } // extern "C" 5691 } // extern "C"
5611 } // namespace libyuv 5692 } // namespace libyuv
5612 #endif 5693 #endif