OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
11 | 11 |
12 #include "libyuv/row.h" | 12 #include "libyuv/row.h" |
13 | 13 |
14 #ifdef __cplusplus | 14 #ifdef __cplusplus |
15 namespace libyuv { | 15 namespace libyuv { |
16 extern "C" { | 16 extern "C" { |
17 #endif | 17 #endif |
18 | 18 |
19 // clang-format off | |
20 | |
21 // This module is for GCC x86 and x64. | 19 // This module is for GCC x86 and x64. |
22 #if !defined(LIBYUV_DISABLE_X86) && \ | 20 #if !defined(LIBYUV_DISABLE_X86) && \ |
23 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) | 21 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) |
24 | 22 |
25 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) | 23 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) |
26 | 24 |
27 // Constants for ARGB | 25 // Constants for ARGB |
28 static vec8 kARGBToY = { | 26 static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, |
29 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 | 27 13, 65, 33, 0, 13, 65, 33, 0}; |
30 }; | |
31 | 28 |
32 // JPeg full range. | 29 // JPeg full range. |
33 static vec8 kARGBToYJ = { | 30 static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, |
34 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 | 31 15, 75, 38, 0, 15, 75, 38, 0}; |
35 }; | |
36 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) | 32 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) |
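// A minimal scalar sketch (illustrative only; these helper names are not
// libyuv API) of what the vector kernels compute with the 7-bit fixed-point
// coefficients above. ARGB is stored little-endian as B,G,R,A, matching the
// {B,G,R,0} vector ordering. kARGBToY is limited-range BT.601 (the +16
// offset comes from kAddY16 below); kARGBToYJ is full-range JPEG, rounded
// with kAddYJ64 (0.5 in 7-bit fixed point).
static unsigned char ARGBToY_Scalar(unsigned char b, unsigned char g,
                                    unsigned char r) {
  return (unsigned char)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}
static unsigned char ARGBToYJ_Scalar(unsigned char b, unsigned char g,
                                     unsigned char r) {
  return (unsigned char)((15 * b + 75 * g + 38 * r + 64) >> 7);
}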
37 | 33 |
38 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) | 34 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) |
39 | 35 |
40 static vec8 kARGBToU = { | 36 static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, |
41 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 | 37 112, -74, -38, 0, 112, -74, -38, 0}; |
| 38 |
| 39 static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, |
| 40 127, -84, -43, 0, 127, -84, -43, 0}; |
| 41 |
| 42 static vec8 kARGBToV = { |
| 43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, |
42 }; | 44 }; |
43 | 45 |
44 static vec8 kARGBToUJ = { | 46 static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, |
45 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 | 47 -20, -107, 127, 0, -20, -107, 127, 0}; |
46 }; | |
47 | |
48 static vec8 kARGBToV = { | |
49 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, | |
50 }; | |
51 | |
52 static vec8 kARGBToVJ = { | |
53 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 | |
54 }; | |
55 | 48 |
56 // Constants for BGRA | 49 // Constants for BGRA |
57 static vec8 kBGRAToY = { | 50 static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, |
58 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 | 51 0, 33, 65, 13, 0, 33, 65, 13}; |
59 }; | |
60 | 52 |
61 static vec8 kBGRAToU = { | 53 static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, |
62 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 | 54 0, -38, -74, 112, 0, -38, -74, 112}; |
63 }; | |
64 | 55 |
65 static vec8 kBGRAToV = { | 56 static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, |
66 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 | 57 0, 112, -94, -18, 0, 112, -94, -18}; |
67 }; | |
68 | 58 |
69 // Constants for ABGR | 59 // Constants for ABGR |
70 static vec8 kABGRToY = { | 60 static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, |
71 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 | 61 33, 65, 13, 0, 33, 65, 13, 0}; |
72 }; | |
73 | 62 |
74 static vec8 kABGRToU = { | 63 static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, |
75 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 | 64 -38, -74, 112, 0, -38, -74, 112, 0}; |
76 }; | |
77 | 65 |
78 static vec8 kABGRToV = { | 66 static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, |
79 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 | 67 112, -94, -18, 0, 112, -94, -18, 0}; |
80 }; | |
81 | 68 |
82 // Constants for RGBA. | 69 // Constants for RGBA. |
83 static vec8 kRGBAToY = { | 70 static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, |
84 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 | 71 0, 13, 65, 33, 0, 13, 65, 33}; |
85 }; | |
86 | 72 |
87 static vec8 kRGBAToU = { | 73 static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, |
88 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 | 74 0, 112, -74, -38, 0, 112, -74, -38}; |
89 }; | |
90 | 75 |
91 static vec8 kRGBAToV = { | 76 static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, |
92 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 | 77 0, -18, -94, 112, 0, -18, -94, 112}; |
93 }; | |
94 | 78 |
95 static uvec8 kAddY16 = { | 79 static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, |
96 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u | 80 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; |
97 }; | |
98 | 81 |
99 // 7 bit fixed point 0.5. | 82 // 7 bit fixed point 0.5. |
100 static vec16 kAddYJ64 = { | 83 static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; |
101 64, 64, 64, 64, 64, 64, 64, 64 | |
102 }; | |
103 | 84 |
104 static uvec8 kAddUV128 = { | 85 static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, |
105 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, | 86 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; |
106 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u | |
107 }; | |
108 | 87 |
109 static uvec16 kAddUVJ128 = { | 88 static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, |
110 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u | 89 0x8080u, 0x8080u, 0x8080u, 0x8080u}; |
111 }; | |
112 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) | 90 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) |
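// A minimal scalar sketch (illustrative only; helper names assumed) of the
// U/V dot products these constants drive. The chroma coefficients are 8-bit
// fixed point (scaled by 256); the vector code shifts right arithmetically
// by 8 (psraw, modeled by the signed shift here) and re-centers on 128 via
// kAddUV128. kAddUVJ128 (0x8080) folds the +128 bias and a 0.5 rounding
// term into one 16-bit add for the full-range JPEG variant.
static unsigned char ARGBToU_Scalar(unsigned char b, unsigned char g,
                                    unsigned char r) {
  return (unsigned char)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static unsigned char ARGBToV_Scalar(unsigned char b, unsigned char g,
                                    unsigned char r) {
  return (unsigned char)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}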
113 | 91 |
114 #ifdef HAS_RGB24TOARGBROW_SSSE3 | 92 #ifdef HAS_RGB24TOARGBROW_SSSE3 |
115 | 93 |
116 // Shuffle table for converting RGB24 to ARGB. | 94 // Shuffle table for converting RGB24 to ARGB. |
117 static uvec8 kShuffleMaskRGB24ToARGB = { | 95 static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, |
118 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u | 96 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; |
119 }; | |
120 | 97 |
121 // Shuffle table for converting RAW to ARGB. | 98 // Shuffle table for converting RAW to ARGB. |
122 static uvec8 kShuffleMaskRAWToARGB = { | 99 static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, |
123 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u | 100 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; |
124 }; | |
125 | 101 |
126 // Shuffle table for converting RAW to RGB24. First 8. | 102 // Shuffle table for converting RAW to RGB24. First 8. |
127 static const uvec8 kShuffleMaskRAWToRGB24_0 = { | 103 static const uvec8 kShuffleMaskRAWToRGB24_0 = { |
128 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, | 104 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, |
129 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u | 105 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; |
130 }; | |
131 | 106 |
132 // Shuffle table for converting RAW to RGB24. Middle 8. | 107 // Shuffle table for converting RAW to RGB24. Middle 8. |
133 static const uvec8 kShuffleMaskRAWToRGB24_1 = { | 108 static const uvec8 kShuffleMaskRAWToRGB24_1 = { |
134 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, | 109 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, |
135 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u | 110 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; |
136 }; | |
137 | 111 |
138 // Shuffle table for converting RAW to RGB24. Last 8. | 112 // Shuffle table for converting RAW to RGB24. Last 8. |
139 static const uvec8 kShuffleMaskRAWToRGB24_2 = { | 113 static const uvec8 kShuffleMaskRAWToRGB24_2 = { |
140 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, | 114 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, |
141 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u | 115 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; |
142 }; | |
143 | 116 |
144 // Shuffle table for converting ARGB to RGB24. | 117 // Shuffle table for converting ARGB to RGB24. |
145 static uvec8 kShuffleMaskARGBToRGB24 = { | 118 static uvec8 kShuffleMaskARGBToRGB24 = { |
146 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u | 119 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; |
147 }; | |
148 | 120 |
149 // Shuffle table for converting ARGB to RAW. | 121 // Shuffle table for converting ARGB to RAW. |
150 static uvec8 kShuffleMaskARGBToRAW = { | 122 static uvec8 kShuffleMaskARGBToRAW = { |
151 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u | 123 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; |
152 }; | |
153 | 124 |
154 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 | 125 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 |
155 static uvec8 kShuffleMaskARGBToRGB24_0 = { | 126 static uvec8 kShuffleMaskARGBToRGB24_0 = { |
156 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u | 127 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; |
157 }; | |
158 | 128 |
159 // YUY2 shuf 16 Y to 32 Y. | 129 // YUY2 shuf 16 Y to 32 Y. |
160 static const lvec8 kShuffleYUY2Y = { | 130 static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, |
161 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, | 131 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, |
162 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 | 132 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; |
163 }; | |
164 | 133 |
165 // YUY2 shuf 8 UV to 16 UV. | 134 // YUY2 shuf 8 UV to 16 UV. |
166 static const lvec8 kShuffleYUY2UV = { | 135 static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, |
167 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, | 136 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, |
168 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 | 137 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; |
169 }; | |
170 | 138 |
171 // UYVY shuf 16 Y to 32 Y. | 139 // UYVY shuf 16 Y to 32 Y. |
172 static const lvec8 kShuffleUYVYY = { | 140 static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, |
173 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, | 141 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, |
174 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 | 142 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; |
175 }; | |
176 | 143 |
177 // UYVY shuf 8 UV to 16 UV. | 144 // UYVY shuf 8 UV to 16 UV. |
178 static const lvec8 kShuffleUYVYUV = { | 145 static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, |
179 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, | 146 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, |
180 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 | 147 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; |
181 }; | |
182 | 148 |
183 // NV21 shuf 8 VU to 16 UV. | 149 // NV21 shuf 8 VU to 16 UV. |
184 static const lvec8 kShuffleNV21 = { | 150 static const lvec8 kShuffleNV21 = { |
185 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, | 151 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, |
186 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, | 152 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, |
187 }; | 153 }; |
188 #endif // HAS_RGB24TOARGBROW_SSSE3 | 154 #endif // HAS_RGB24TOARGBROW_SSSE3 |
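// A minimal sketch (illustrative only; not libyuv API) of the pshufb
// semantics all the shuffle tables above rely on: each output byte copies
// src[mask[i] & 15], and a mask byte with its high bit set (the 128u
// entries) zeroes that output byte.
static void PshufbEmulate(const unsigned char src[16],
                          const unsigned char mask[16],
                          unsigned char dst[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (unsigned char)((mask[i] & 0x80) ? 0 : src[mask[i] & 15]);
  }
}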
189 | 155 |
190 #ifdef HAS_J400TOARGBROW_SSE2 | 156 #ifdef HAS_J400TOARGBROW_SSE2 |
191 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { | 157 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { |
192 asm volatile ( | 158 asm volatile ( |
193 "pcmpeqb %%xmm5,%%xmm5 \n" | 159 "pcmpeqb %%xmm5,%%xmm5 \n" |
194 "pslld $0x18,%%xmm5 \n" | 160 "pslld $0x18,%%xmm5 \n" |
195 LABELALIGN | 161 LABELALIGN |
196 "1: \n" | 162 "1: \n" |
(...skipping 367 matching lines...) |
564 "lea " MEMLEA(0x8,1) ",%1 \n" | 530 "lea " MEMLEA(0x8,1) ",%1 \n" |
565 "sub $0x4,%2 \n" | 531 "sub $0x4,%2 \n" |
566 "jg 1b \n" | 532 "jg 1b \n" |
567 : "+r"(src), // %0 | 533 : "+r"(src), // %0 |
568 "+r"(dst), // %1 | 534 "+r"(dst), // %1 |
569 "+r"(width) // %2 | 535 "+r"(width) // %2 |
570 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 536 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
571 ); | 537 ); |
572 } | 538 } |
573 | 539 |
574 void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst, | 540 void ARGBToRGB565DitherRow_SSE2(const uint8* src, |
575 const uint32 dither4, int width) { | 541 uint8* dst, |
576 asm volatile ( | 542 const uint32 dither4, |
577 "movd %3,%%xmm6 \n" | 543 int width) { |
578 "punpcklbw %%xmm6,%%xmm6 \n" | 544 asm volatile( |
579 "movdqa %%xmm6,%%xmm7 \n" | 545 "movd %3,%%xmm6 \n" |
580 "punpcklwd %%xmm6,%%xmm6 \n" | 546 "punpcklbw %%xmm6,%%xmm6 \n" |
581 "punpckhwd %%xmm7,%%xmm7 \n" | 547 "movdqa %%xmm6,%%xmm7 \n" |
582 "pcmpeqb %%xmm3,%%xmm3 \n" | 548 "punpcklwd %%xmm6,%%xmm6 \n" |
583 "psrld $0x1b,%%xmm3 \n" | 549 "punpckhwd %%xmm7,%%xmm7 \n" |
584 "pcmpeqb %%xmm4,%%xmm4 \n" | 550 "pcmpeqb %%xmm3,%%xmm3 \n" |
585 "psrld $0x1a,%%xmm4 \n" | 551 "psrld $0x1b,%%xmm3 \n" |
586 "pslld $0x5,%%xmm4 \n" | 552 "pcmpeqb %%xmm4,%%xmm4 \n" |
587 "pcmpeqb %%xmm5,%%xmm5 \n" | 553 "psrld $0x1a,%%xmm4 \n" |
588 "pslld $0xb,%%xmm5 \n" | 554 "pslld $0x5,%%xmm4 \n" |
| 555 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 556 "pslld $0xb,%%xmm5 \n" |
589 | 557 |
590 LABELALIGN | 558 LABELALIGN |
591 "1: \n" | 559 "1: \n" |
592 "movdqu (%0),%%xmm0 \n" | 560 "movdqu (%0),%%xmm0 \n" |
593 "paddusb %%xmm6,%%xmm0 \n" | 561 "paddusb %%xmm6,%%xmm0 \n" |
594 "movdqa %%xmm0,%%xmm1 \n" | 562 "movdqa %%xmm0,%%xmm1 \n" |
595 "movdqa %%xmm0,%%xmm2 \n" | 563 "movdqa %%xmm0,%%xmm2 \n" |
596 "pslld $0x8,%%xmm0 \n" | 564 "pslld $0x8,%%xmm0 \n" |
597 "psrld $0x3,%%xmm1 \n" | 565 "psrld $0x3,%%xmm1 \n" |
598 "psrld $0x5,%%xmm2 \n" | 566 "psrld $0x5,%%xmm2 \n" |
599 "psrad $0x10,%%xmm0 \n" | 567 "psrad $0x10,%%xmm0 \n" |
600 "pand %%xmm3,%%xmm1 \n" | 568 "pand %%xmm3,%%xmm1 \n" |
601 "pand %%xmm4,%%xmm2 \n" | 569 "pand %%xmm4,%%xmm2 \n" |
602 "pand %%xmm5,%%xmm0 \n" | 570 "pand %%xmm5,%%xmm0 \n" |
603 "por %%xmm2,%%xmm1 \n" | 571 "por %%xmm2,%%xmm1 \n" |
604 "por %%xmm1,%%xmm0 \n" | 572 "por %%xmm1,%%xmm0 \n" |
605 "packssdw %%xmm0,%%xmm0 \n" | 573 "packssdw %%xmm0,%%xmm0 \n" |
606 "lea 0x10(%0),%0 \n" | 574 "lea 0x10(%0),%0 \n" |
607 "movq %%xmm0,(%1) \n" | 575 "movq %%xmm0,(%1) \n" |
608 "lea 0x8(%1),%1 \n" | 576 "lea 0x8(%1),%1 \n" |
609 "sub $0x4,%2 \n" | 577 "sub $0x4,%2 \n" |
610 "jg 1b \n" | 578 "jg 1b \n" |
611 : "+r"(src), // %0 | 579 : "+r"(src), // %0 |
612 "+r"(dst), // %1 | 580 "+r"(dst), // %1 |
613 "+r"(width) // %2 | 581 "+r"(width) // %2 |
614 : "m"(dither4) // %3 | 582 : "m"(dither4) // %3 |
615 : "memory", "cc", | 583 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
616 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 584 "xmm7"); |
617 ); | |
618 } | 585 } |
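// A minimal per-pixel model (illustrative only; helper name assumed) of the
// kernel above: dither4 packs one dither byte per pixel of a 2x2 ordered
// pattern, which the setup broadcasts to all four channels; it is added
// with unsigned saturation (paddusb) before the 8-8-8 to 5-6-5 pack.
static unsigned short ARGBToRGB565Dither_Scalar(unsigned char b,
                                                unsigned char g,
                                                unsigned char r,
                                                unsigned char dither) {
  unsigned int b8 = b + dither, g8 = g + dither, r8 = r + dither;
  if (b8 > 255) b8 = 255;
  if (g8 > 255) g8 = 255;
  if (r8 > 255) r8 = 255;
  return (unsigned short)((b8 >> 3) | ((g8 >> 2) << 5) | ((r8 >> 3) << 11));
}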
619 | 586 |
620 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 | 587 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 |
621 void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst, | 588 void ARGBToRGB565DitherRow_AVX2(const uint8* src, |
622 const uint32 dither4, int width) { | 589 uint8* dst, |
623 asm volatile ( | 590 const uint32 dither4, |
624 "vbroadcastss %3,%%xmm6 \n" | 591 int width) { |
625 "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" | 592 asm volatile( |
626 "vpermq $0xd8,%%ymm6,%%ymm6 \n" | 593 "vbroadcastss %3,%%xmm6 \n" |
627 "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" | 594 "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" |
628 "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" | 595 "vpermq $0xd8,%%ymm6,%%ymm6 \n" |
629 "vpsrld $0x1b,%%ymm3,%%ymm3 \n" | 596 "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" |
630 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" | 597 "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" |
631 "vpsrld $0x1a,%%ymm4,%%ymm4 \n" | 598 "vpsrld $0x1b,%%ymm3,%%ymm3 \n" |
632 "vpslld $0x5,%%ymm4,%%ymm4 \n" | 599 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" |
633 "vpslld $0xb,%%ymm3,%%ymm5 \n" | 600 "vpsrld $0x1a,%%ymm4,%%ymm4 \n" |
| 601 "vpslld $0x5,%%ymm4,%%ymm4 \n" |
| 602 "vpslld $0xb,%%ymm3,%%ymm5 \n" |
634 | 603 |
635 LABELALIGN | 604 LABELALIGN |
636 "1: \n" | 605 "1: \n" |
637 "vmovdqu (%0),%%ymm0 \n" | 606 "vmovdqu (%0),%%ymm0 \n" |
638 "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" | 607 "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" |
639 "vpsrld $0x5,%%ymm0,%%ymm2 \n" | 608 "vpsrld $0x5,%%ymm0,%%ymm2 \n" |
640 "vpsrld $0x3,%%ymm0,%%ymm1 \n" | 609 "vpsrld $0x3,%%ymm0,%%ymm1 \n" |
641 "vpsrld $0x8,%%ymm0,%%ymm0 \n" | 610 "vpsrld $0x8,%%ymm0,%%ymm0 \n" |
642 "vpand %%ymm4,%%ymm2,%%ymm2 \n" | 611 "vpand %%ymm4,%%ymm2,%%ymm2 \n" |
643 "vpand %%ymm3,%%ymm1,%%ymm1 \n" | 612 "vpand %%ymm3,%%ymm1,%%ymm1 \n" |
644 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | 613 "vpand %%ymm5,%%ymm0,%%ymm0 \n" |
645 "vpor %%ymm2,%%ymm1,%%ymm1 \n" | 614 "vpor %%ymm2,%%ymm1,%%ymm1 \n" |
646 "vpor %%ymm1,%%ymm0,%%ymm0 \n" | 615 "vpor %%ymm1,%%ymm0,%%ymm0 \n" |
647 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" | 616 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" |
648 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 617 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
649 "lea 0x20(%0),%0 \n" | 618 "lea 0x20(%0),%0 \n" |
650 "vmovdqu %%xmm0,(%1) \n" | 619 "vmovdqu %%xmm0,(%1) \n" |
651 "lea 0x10(%1),%1 \n" | 620 "lea 0x10(%1),%1 \n" |
652 "sub $0x8,%2 \n" | 621 "sub $0x8,%2 \n" |
653 "jg 1b \n" | 622 "jg 1b \n" |
654 "vzeroupper \n" | 623 "vzeroupper \n" |
655 : "+r"(src), // %0 | 624 : "+r"(src), // %0 |
656 "+r"(dst), // %1 | 625 "+r"(dst), // %1 |
657 "+r"(width) // %2 | 626 "+r"(width) // %2 |
658 : "m"(dither4) // %3 | 627 : "m"(dither4) // %3 |
659 : "memory", "cc", | 628 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
660 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 629 "xmm7"); |
661 ); | |
662 } | 630 } |
663 #endif // HAS_ARGBTORGB565DITHERROW_AVX2 | 631 #endif // HAS_ARGBTORGB565DITHERROW_AVX2 |
664 | 632 |
665 | |
666 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) { | 633 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) { |
667 asm volatile ( | 634 asm volatile ( |
668 "pcmpeqb %%xmm4,%%xmm4 \n" | 635 "pcmpeqb %%xmm4,%%xmm4 \n" |
669 "psrld $0x1b,%%xmm4 \n" | 636 "psrld $0x1b,%%xmm4 \n" |
670 "movdqa %%xmm4,%%xmm5 \n" | 637 "movdqa %%xmm4,%%xmm5 \n" |
671 "pslld $0x5,%%xmm5 \n" | 638 "pslld $0x5,%%xmm5 \n" |
672 "movdqa %%xmm4,%%xmm6 \n" | 639 "movdqa %%xmm4,%%xmm6 \n" |
673 "pslld $0xa,%%xmm6 \n" | 640 "pslld $0xa,%%xmm6 \n" |
674 "pcmpeqb %%xmm7,%%xmm7 \n" | 641 "pcmpeqb %%xmm7,%%xmm7 \n" |
675 "pslld $0xf,%%xmm7 \n" | 642 "pslld $0xf,%%xmm7 \n" |
(...skipping 128 matching lines...) |
804 "+r"(width) // %2 | 771 "+r"(width) // %2 |
805 : "m"(kARGBToYJ), // %3 | 772 : "m"(kARGBToYJ), // %3 |
806 "m"(kAddYJ64) // %4 | 773 "m"(kAddYJ64) // %4 |
807 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 774 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
808 ); | 775 ); |
809 } | 776 } |
810 #endif // HAS_ARGBTOYJROW_SSSE3 | 777 #endif // HAS_ARGBTOYJROW_SSSE3 |
811 | 778 |
812 #ifdef HAS_ARGBTOYROW_AVX2 | 779 #ifdef HAS_ARGBTOYROW_AVX2 |
813 // vpermd for vphaddw + vpackuswb vpermd. | 780 // vpermd for vphaddw + vpackuswb vpermd. |
814 static const lvec32 kPermdARGBToY_AVX = { | 781 static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; |
815 0, 4, 1, 5, 2, 6, 3, 7 | |
816 }; | |
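// A minimal sketch (illustrative only) of the dword permute this constant
// drives: AVX2 vphaddw and vpackuswb work within 128-bit lanes, so the 32
// packed Y bytes come out lane-interleaved; vpermd with {0, 4, 1, 5, 2, 6,
// 3, 7} restores linear order, i.e. dst dword i = src dword perm[i].
static void Permd8x32Emulate(const unsigned int src[8], const int perm[8],
                             unsigned int dst[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    dst[i] = src[perm[i]];
  }
}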
817 | 782 |
818 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 783 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
819 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { | 784 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { |
820 asm volatile ( | 785 asm volatile ( |
821 "vbroadcastf128 %3,%%ymm4 \n" | 786 "vbroadcastf128 %3,%%ymm4 \n" |
822 "vbroadcastf128 %4,%%ymm5 \n" | 787 "vbroadcastf128 %4,%%ymm5 \n" |
823 "vmovdqu %5,%%ymm6 \n" | 788 "vmovdqu %5,%%ymm6 \n" |
824 LABELALIGN | 789 LABELALIGN |
825 "1: \n" | 790 "1: \n" |
826 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 791 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
(...skipping 64 matching lines...) |
891 "+r"(width) // %2 | 856 "+r"(width) // %2 |
892 : "m"(kARGBToYJ), // %3 | 857 : "m"(kARGBToYJ), // %3 |
893 "m"(kAddYJ64), // %4 | 858 "m"(kAddYJ64), // %4 |
894 "m"(kPermdARGBToY_AVX) // %5 | 859 "m"(kPermdARGBToY_AVX) // %5 |
895 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 860 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
896 ); | 861 ); |
897 } | 862 } |
898 #endif // HAS_ARGBTOYJROW_AVX2 | 863 #endif // HAS_ARGBTOYJROW_AVX2 |
899 | 864 |
900 #ifdef HAS_ARGBTOUVROW_SSSE3 | 865 #ifdef HAS_ARGBTOUVROW_SSSE3 |
901 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 866 void ARGBToUVRow_SSSE3(const uint8* src_argb0, |
902 uint8* dst_u, uint8* dst_v, int width) { | 867 int src_stride_argb, |
| 868 uint8* dst_u, |
| 869 uint8* dst_v, |
| 870 int width) { |
903 asm volatile ( | 871 asm volatile ( |
904 "movdqa %5,%%xmm3 \n" | 872 "movdqa %5,%%xmm3 \n" |
905 "movdqa %6,%%xmm4 \n" | 873 "movdqa %6,%%xmm4 \n" |
906 "movdqa %7,%%xmm5 \n" | 874 "movdqa %7,%%xmm5 \n" |
907 "sub %1,%2 \n" | 875 "sub %1,%2 \n" |
908 LABELALIGN | 876 LABELALIGN |
909 "1: \n" | 877 "1: \n" |
910 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 878 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
911 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | 879 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
912 "pavgb %%xmm7,%%xmm0 \n" | 880 "pavgb %%xmm7,%%xmm0 \n" |
(...skipping 43 matching lines...) |
956 "m"(kAddUV128) // %7 | 924 "m"(kAddUV128) // %7 |
957 : "memory", "cc", NACL_R14 | 925 : "memory", "cc", NACL_R14 |
958 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | 926 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
959 ); | 927 ); |
960 } | 928 } |
961 #endif // HAS_ARGBTOUVROW_SSSE3 | 929 #endif // HAS_ARGBTOUVROW_SSSE3 |
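// A minimal scalar model (illustrative only; helper name assumed) of the
// 2x2 chroma subsampling in the UV row functions: pavgb first averages each
// pixel with the one a stride below, then horizontal neighbours are
// averaged, and the U/V dot products shown earlier run on the result.
// pavgb rounds up:
static unsigned char AvgRound(unsigned char a, unsigned char b) {
  return (unsigned char)((a + b + 1) >> 1);
}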
962 | 930 |
963 #ifdef HAS_ARGBTOUVROW_AVX2 | 931 #ifdef HAS_ARGBTOUVROW_AVX2 |
964 // vpshufb for vphaddw + vpackuswb packed to shorts. | 932 // vpshufb for vphaddw + vpackuswb packed to shorts. |
965 static const lvec8 kShufARGBToUV_AVX = { | 933 static const lvec8 kShufARGBToUV_AVX = { |
966 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, | 934 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, |
967 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 | 935 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; |
968 }; | 936 void ARGBToUVRow_AVX2(const uint8* src_argb0, |
969 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, | 937 int src_stride_argb, |
970 uint8* dst_u, uint8* dst_v, int width) { | 938 uint8* dst_u, |
| 939 uint8* dst_v, |
| 940 int width) { |
971 asm volatile ( | 941 asm volatile ( |
972 "vbroadcastf128 %5,%%ymm5 \n" | 942 "vbroadcastf128 %5,%%ymm5 \n" |
973 "vbroadcastf128 %6,%%ymm6 \n" | 943 "vbroadcastf128 %6,%%ymm6 \n" |
974 "vbroadcastf128 %7,%%ymm7 \n" | 944 "vbroadcastf128 %7,%%ymm7 \n" |
975 "sub %1,%2 \n" | 945 "sub %1,%2 \n" |
976 LABELALIGN | 946 LABELALIGN |
977 "1: \n" | 947 "1: \n" |
978 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 948 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
979 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 949 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
980 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" | 950 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" |
(...skipping 38 matching lines...) |
1019 "m"(kARGBToV), // %6 | 989 "m"(kARGBToV), // %6 |
1020 "m"(kARGBToU), // %7 | 990 "m"(kARGBToU), // %7 |
1021 "m"(kShufARGBToUV_AVX) // %8 | 991 "m"(kShufARGBToUV_AVX) // %8 |
1022 : "memory", "cc", NACL_R14 | 992 : "memory", "cc", NACL_R14 |
1023 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 993 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
1024 ); | 994 ); |
1025 } | 995 } |
1026 #endif // HAS_ARGBTOUVROW_AVX2 | 996 #endif // HAS_ARGBTOUVROW_AVX2 |
1027 | 997 |
1028 #ifdef HAS_ARGBTOUVJROW_AVX2 | 998 #ifdef HAS_ARGBTOUVJROW_AVX2 |
1029 void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, | 999 void ARGBToUVJRow_AVX2(const uint8* src_argb0, |
1030 uint8* dst_u, uint8* dst_v, int width) { | 1000 int src_stride_argb, |
| 1001 uint8* dst_u, |
| 1002 uint8* dst_v, |
| 1003 int width) { |
1031 asm volatile ( | 1004 asm volatile ( |
1032 "vbroadcastf128 %5,%%ymm5 \n" | 1005 "vbroadcastf128 %5,%%ymm5 \n" |
1033 "vbroadcastf128 %6,%%ymm6 \n" | 1006 "vbroadcastf128 %6,%%ymm6 \n" |
1034 "vbroadcastf128 %7,%%ymm7 \n" | 1007 "vbroadcastf128 %7,%%ymm7 \n" |
1035 "sub %1,%2 \n" | 1008 "sub %1,%2 \n" |
1036 LABELALIGN | 1009 LABELALIGN |
1037 "1: \n" | 1010 "1: \n" |
1038 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 1011 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
1039 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 1012 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
1040 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" | 1013 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" |
(...skipping 39 matching lines...) |
1080 "m"(kARGBToVJ), // %6 | 1053 "m"(kARGBToVJ), // %6 |
1081 "m"(kARGBToUJ), // %7 | 1054 "m"(kARGBToUJ), // %7 |
1082 "m"(kShufARGBToUV_AVX) // %8 | 1055 "m"(kShufARGBToUV_AVX) // %8 |
1083 : "memory", "cc", NACL_R14 | 1056 : "memory", "cc", NACL_R14 |
1084 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 1057 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
1085 ); | 1058 ); |
1086 } | 1059 } |
1087 #endif // HAS_ARGBTOUVJROW_AVX2 | 1060 #endif // HAS_ARGBTOUVJROW_AVX2 |
1088 | 1061 |
1089 #ifdef HAS_ARGBTOUVJROW_SSSE3 | 1062 #ifdef HAS_ARGBTOUVJROW_SSSE3 |
1090 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1063 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, |
1091 uint8* dst_u, uint8* dst_v, int width) { | 1064 int src_stride_argb, |
| 1065 uint8* dst_u, |
| 1066 uint8* dst_v, |
| 1067 int width) { |
1092 asm volatile ( | 1068 asm volatile ( |
1093 "movdqa %5,%%xmm3 \n" | 1069 "movdqa %5,%%xmm3 \n" |
1094 "movdqa %6,%%xmm4 \n" | 1070 "movdqa %6,%%xmm4 \n" |
1095 "movdqa %7,%%xmm5 \n" | 1071 "movdqa %7,%%xmm5 \n" |
1096 "sub %1,%2 \n" | 1072 "sub %1,%2 \n" |
1097 LABELALIGN | 1073 LABELALIGN |
1098 "1: \n" | 1074 "1: \n" |
1099 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 1075 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
1100 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | 1076 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
1101 "pavgb %%xmm7,%%xmm0 \n" | 1077 "pavgb %%xmm7,%%xmm0 \n" |
(...skipping 42 matching lines...) |
1144 "m"(kARGBToVJ), // %5 | 1120 "m"(kARGBToVJ), // %5 |
1145 "m"(kARGBToUJ), // %6 | 1121 "m"(kARGBToUJ), // %6 |
1146 "m"(kAddUVJ128) // %7 | 1122 "m"(kAddUVJ128) // %7 |
1147 : "memory", "cc", NACL_R14 | 1123 : "memory", "cc", NACL_R14 |
1148 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | 1124 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
1149 ); | 1125 ); |
1150 } | 1126 } |
1151 #endif // HAS_ARGBTOUVJROW_SSSE3 | 1127 #endif // HAS_ARGBTOUVJROW_SSSE3 |
1152 | 1128 |
1153 #ifdef HAS_ARGBTOUV444ROW_SSSE3 | 1129 #ifdef HAS_ARGBTOUV444ROW_SSSE3 |
1154 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 1130 void ARGBToUV444Row_SSSE3(const uint8* src_argb, |
| 1131 uint8* dst_u, |
| 1132 uint8* dst_v, |
1155 int width) { | 1133 int width) { |
1156 asm volatile ( | 1134 asm volatile ( |
1157 "movdqa %4,%%xmm3 \n" | 1135 "movdqa %4,%%xmm3 \n" |
1158 "movdqa %5,%%xmm4 \n" | 1136 "movdqa %5,%%xmm4 \n" |
1159 "movdqa %6,%%xmm5 \n" | 1137 "movdqa %6,%%xmm5 \n" |
1160 "sub %1,%2 \n" | 1138 "sub %1,%2 \n" |
1161 LABELALIGN | 1139 LABELALIGN |
1162 "1: \n" | 1140 "1: \n" |
1163 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 1141 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
1164 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 1142 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
(...skipping 69 matching lines...) |
1234 "jg 1b \n" | 1212 "jg 1b \n" |
1235 : "+r"(src_bgra), // %0 | 1213 : "+r"(src_bgra), // %0 |
1236 "+r"(dst_y), // %1 | 1214 "+r"(dst_y), // %1 |
1237 "+r"(width) // %2 | 1215 "+r"(width) // %2 |
1238 : "m"(kBGRAToY), // %3 | 1216 : "m"(kBGRAToY), // %3 |
1239 "m"(kAddY16) // %4 | 1217 "m"(kAddY16) // %4 |
1240 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1218 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1241 ); | 1219 ); |
1242 } | 1220 } |
1243 | 1221 |
1244 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, | 1222 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, |
1245 uint8* dst_u, uint8* dst_v, int width) { | 1223 int src_stride_bgra, |
| 1224 uint8* dst_u, |
| 1225 uint8* dst_v, |
| 1226 int width) { |
1246 asm volatile ( | 1227 asm volatile ( |
1247 "movdqa %5,%%xmm3 \n" | 1228 "movdqa %5,%%xmm3 \n" |
1248 "movdqa %6,%%xmm4 \n" | 1229 "movdqa %6,%%xmm4 \n" |
1249 "movdqa %7,%%xmm5 \n" | 1230 "movdqa %7,%%xmm5 \n" |
1250 "sub %1,%2 \n" | 1231 "sub %1,%2 \n" |
1251 LABELALIGN | 1232 LABELALIGN |
1252 "1: \n" | 1233 "1: \n" |
1253 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 1234 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
1254 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | 1235 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
1255 "pavgb %%xmm7,%%xmm0 \n" | 1236 "pavgb %%xmm7,%%xmm0 \n" |
(...skipping 107 matching lines...) |
1363 "jg 1b \n" | 1344 "jg 1b \n" |
1364 : "+r"(src_rgba), // %0 | 1345 : "+r"(src_rgba), // %0 |
1365 "+r"(dst_y), // %1 | 1346 "+r"(dst_y), // %1 |
1366 "+r"(width) // %2 | 1347 "+r"(width) // %2 |
1367 : "m"(kRGBAToY), // %3 | 1348 : "m"(kRGBAToY), // %3 |
1368 "m"(kAddY16) // %4 | 1349 "m"(kAddY16) // %4 |
1369 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1350 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1370 ); | 1351 ); |
1371 } | 1352 } |
1372 | 1353 |
1373 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, | 1354 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, |
1374 uint8* dst_u, uint8* dst_v, int width) { | 1355 int src_stride_abgr, |
| 1356 uint8* dst_u, |
| 1357 uint8* dst_v, |
| 1358 int width) { |
1375 asm volatile ( | 1359 asm volatile ( |
1376 "movdqa %5,%%xmm3 \n" | 1360 "movdqa %5,%%xmm3 \n" |
1377 "movdqa %6,%%xmm4 \n" | 1361 "movdqa %6,%%xmm4 \n" |
1378 "movdqa %7,%%xmm5 \n" | 1362 "movdqa %7,%%xmm5 \n" |
1379 "sub %1,%2 \n" | 1363 "sub %1,%2 \n" |
1380 LABELALIGN | 1364 LABELALIGN |
1381 "1: \n" | 1365 "1: \n" |
1382 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 1366 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
1383 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | 1367 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
1384 "pavgb %%xmm7,%%xmm0 \n" | 1368 "pavgb %%xmm7,%%xmm0 \n" |
(...skipping 39 matching lines...) |
1424 "+rm"(width) // %3 | 1408 "+rm"(width) // %3 |
1425 : "r"((intptr_t)(src_stride_abgr)), // %4 | 1409 : "r"((intptr_t)(src_stride_abgr)), // %4 |
1426 "m"(kABGRToV), // %5 | 1410 "m"(kABGRToV), // %5 |
1427 "m"(kABGRToU), // %6 | 1411 "m"(kABGRToU), // %6 |
1428 "m"(kAddUV128) // %7 | 1412 "m"(kAddUV128) // %7 |
1429 : "memory", "cc", NACL_R14 | 1413 : "memory", "cc", NACL_R14 |
1430 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | 1414 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
1431 ); | 1415 ); |
1432 } | 1416 } |
1433 | 1417 |
1434 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, | 1418 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, |
1435 uint8* dst_u, uint8* dst_v, int width) { | 1419 int src_stride_rgba, |
| 1420 uint8* dst_u, |
| 1421 uint8* dst_v, |
| 1422 int width) { |
1436 asm volatile ( | 1423 asm volatile ( |
1437 "movdqa %5,%%xmm3 \n" | 1424 "movdqa %5,%%xmm3 \n" |
1438 "movdqa %6,%%xmm4 \n" | 1425 "movdqa %6,%%xmm4 \n" |
1439 "movdqa %7,%%xmm5 \n" | 1426 "movdqa %7,%%xmm5 \n" |
1440 "sub %1,%2 \n" | 1427 "sub %1,%2 \n" |
1441 LABELALIGN | 1428 LABELALIGN |
1442 "1: \n" | 1429 "1: \n" |
1443 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 1430 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
1444 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | 1431 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
1445 "pavgb %%xmm7,%%xmm0 \n" | 1432 "pavgb %%xmm7,%%xmm0 \n" |
(...skipping 42 matching lines...) |
1488 "m"(kRGBAToU), // %6 | 1475 "m"(kRGBAToU), // %6 |
1489 "m"(kAddUV128) // %7 | 1476 "m"(kAddUV128) // %7 |
1490 : "memory", "cc", NACL_R14 | 1477 : "memory", "cc", NACL_R14 |
1491 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | 1478 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
1492 ); | 1479 ); |
1493 } | 1480 } |
1494 | 1481 |
1495 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) | 1482 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) |
1496 | 1483 |
1497 // Read 8 UV from 444 | 1484 // Read 8 UV from 444 |
1498 #define READYUV444 \ | 1485 #define READYUV444 \ |
1499 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1486 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
1500 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1487 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1501 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | 1488 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
1502 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1489 "punpcklbw %%xmm1,%%xmm0 \n" \ |
1503 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1490 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1504 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1491 "punpcklbw %%xmm4,%%xmm4 \n" \ |
1505 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1492 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
1506 | 1493 |
1507 // Read 4 UV from 422, upsample to 8 UV | 1494 // Read 4 UV from 422, upsample to 8 UV |
1508 #define READYUV422 \ | 1495 #define READYUV422 \ |
1509 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1496 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
1510 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1497 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1511 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ | 1498 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ |
1512 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1499 "punpcklbw %%xmm1,%%xmm0 \n" \ |
1513 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1500 "punpcklwd %%xmm0,%%xmm0 \n" \ |
1514 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1501 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1515 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1502 "punpcklbw %%xmm4,%%xmm4 \n" \ |
1516 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1503 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
1517 | 1504 |
1518 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. | 1505 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. |
1519 #define READYUVA422 \ | 1506 #define READYUVA422 \ |
1520 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1507 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
1521 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1508 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1522 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ | 1509 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ |
1523 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1510 "punpcklbw %%xmm1,%%xmm0 \n" \ |
1524 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1511 "punpcklwd %%xmm0,%%xmm0 \n" \ |
1525 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1512 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1526 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1513 "punpcklbw %%xmm4,%%xmm4 \n" \ |
1527 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ | 1514 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ |
1528 "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ | 1515 "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ |
1529 "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" | 1516 "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" |
1530 | 1517 |
1531 // Read 4 UV from NV12, upsample to 8 UV | 1518 // Read 4 UV from NV12, upsample to 8 UV |
1532 #define READNV12 \ | 1519 #define READNV12 \ |
1533 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1520 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
1534 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ | 1521 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ |
1535 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1522 "punpcklwd %%xmm0,%%xmm0 \n" \ |
1536 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1523 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1537 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1524 "punpcklbw %%xmm4,%%xmm4 \n" \ |
1538 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1525 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
1539 | 1526 |
1540 // Read 4 VU from NV21, upsample to 8 UV | 1527 // Read 4 VU from NV21, upsample to 8 UV |
1541 #define READNV21 \ | 1528 #define READNV21 \ |
1542 "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ | 1529 "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ |
1543 "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \ | 1530 "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \ |
1544 "pshufb %[kShuffleNV21], %%xmm0 \n" \ | 1531 "pshufb %[kShuffleNV21], %%xmm0 \n" \ |
1545 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1532 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1546 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1533 "punpcklbw %%xmm4,%%xmm4 \n" \ |
1547 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1534 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
1548 | 1535 |
1549 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. | 1536 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. |
1550 #define READYUY2 \ | 1537 #define READYUY2 \ |
1551 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ | 1538 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ |
1552 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ | 1539 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ |
1553 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ | 1540 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ |
1554 "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ | 1541 "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ |
1555 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" | 1542 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" |
1556 | 1543 |
1557 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. | 1544 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. |
1558 #define READUYVY \ | 1545 #define READUYVY \ |
1559 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ | 1546 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ |
1560 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ | 1547 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ |
1561 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ | 1548 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ |
1562 "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ | 1549 "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ |
1563 "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" | 1550 "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" |
1564 | 1551 |
1565 #if defined(__x86_64__) | 1552 #if defined(__x86_64__) |
1566 #define YUVTORGB_SETUP(yuvconstants) \ | 1553 #define YUVTORGB_SETUP(yuvconstants) \ |
1567 "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \ | 1554 "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \ |
1568 "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \ | 1555 "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \ |
1569 "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \ | 1556 "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \ |
1570 "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \ | 1557 "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \ |
1571 "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \ | 1558 "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \ |
1572 "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \ | 1559 "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \ |
1573 "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n" | 1560 "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n" |
1574 // Convert 8 pixels: 8 UV and 8 Y | 1561 // Convert 8 pixels: 8 UV and 8 Y |
1575 #define YUVTORGB(yuvconstants) \ | 1562 #define YUVTORGB(yuvconstants) \ |
1576 "movdqa %%xmm0,%%xmm1 \n" \ | 1563 "movdqa %%xmm0,%%xmm1 \n" \ |
1577 "movdqa %%xmm0,%%xmm2 \n" \ | 1564 "movdqa %%xmm0,%%xmm2 \n" \ |
1578 "movdqa %%xmm0,%%xmm3 \n" \ | 1565 "movdqa %%xmm0,%%xmm3 \n" \ |
1579 "movdqa %%xmm11,%%xmm0 \n" \ | 1566 "movdqa %%xmm11,%%xmm0 \n" \ |
1580 "pmaddubsw %%xmm8,%%xmm1 \n" \ | 1567 "pmaddubsw %%xmm8,%%xmm1 \n" \ |
1581 "psubw %%xmm1,%%xmm0 \n" \ | 1568 "psubw %%xmm1,%%xmm0 \n" \ |
1582 "movdqa %%xmm12,%%xmm1 \n" \ | 1569 "movdqa %%xmm12,%%xmm1 \n" \ |
1583 "pmaddubsw %%xmm9,%%xmm2 \n" \ | 1570 "pmaddubsw %%xmm9,%%xmm2 \n" \ |
1584 "psubw %%xmm2,%%xmm1 \n" \ | 1571 "psubw %%xmm2,%%xmm1 \n" \ |
1585 "movdqa %%xmm13,%%xmm2 \n" \ | 1572 "movdqa %%xmm13,%%xmm2 \n" \ |
1586 "pmaddubsw %%xmm10,%%xmm3 \n" \ | 1573 "pmaddubsw %%xmm10,%%xmm3 \n" \ |
1587 "psubw %%xmm3,%%xmm2 \n" \ | 1574 "psubw %%xmm3,%%xmm2 \n" \ |
1588 "pmulhuw %%xmm14,%%xmm4 \n" \ | 1575 "pmulhuw %%xmm14,%%xmm4 \n" \ |
1589 "paddsw %%xmm4,%%xmm0 \n" \ | 1576 "paddsw %%xmm4,%%xmm0 \n" \ |
1590 "paddsw %%xmm4,%%xmm1 \n" \ | 1577 "paddsw %%xmm4,%%xmm1 \n" \ |
1591 "paddsw %%xmm4,%%xmm2 \n" \ | 1578 "paddsw %%xmm4,%%xmm2 \n" \ |
1592 "psraw $0x6,%%xmm0 \n" \ | 1579 "psraw $0x6,%%xmm0 \n" \ |
1593 "psraw $0x6,%%xmm1 \n" \ | 1580 "psraw $0x6,%%xmm1 \n" \ |
1594 "psraw $0x6,%%xmm2 \n" \ | 1581 "psraw $0x6,%%xmm2 \n" \ |
1595 "packuswb %%xmm0,%%xmm0 \n" \ | 1582 "packuswb %%xmm0,%%xmm0 \n" \ |
1596 "packuswb %%xmm1,%%xmm1 \n" \ | 1583 "packuswb %%xmm1,%%xmm1 \n" \ |
1597 "packuswb %%xmm2,%%xmm2 \n" | 1584 "packuswb %%xmm2,%%xmm2 \n" |
1598 #define YUVTORGB_REGS \ | 1585 #define YUVTORGB_REGS \ |
1599 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", | 1586 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", |
1600 | 1587 |
1601 #else | 1588 #else |
1602 #define YUVTORGB_SETUP(yuvconstants) | 1589 #define YUVTORGB_SETUP(yuvconstants) |
1603 // Convert 8 pixels: 8 UV and 8 Y | 1590 // Convert 8 pixels: 8 UV and 8 Y |
1604 #define YUVTORGB(yuvconstants) \ | 1591 #define YUVTORGB(yuvconstants) \ |
1605 "movdqa %%xmm0,%%xmm1 \n" \ | 1592 "movdqa %%xmm0,%%xmm1 \n" \ |
1606 "movdqa %%xmm0,%%xmm2 \n" \ | 1593 "movdqa %%xmm0,%%xmm2 \n" \ |
1607 "movdqa %%xmm0,%%xmm3 \n" \ | 1594 "movdqa %%xmm0,%%xmm3 \n" \ |
1608 "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \ | 1595 "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \ |
1609 "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \ | 1596 "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \ |
1610 "psubw %%xmm1,%%xmm0 \n" \ | 1597 "psubw %%xmm1,%%xmm0 \n" \ |
1611 "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \ | 1598 "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \ |
1612 "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \ | 1599 "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \ |
1613 "psubw %%xmm2,%%xmm1 \n" \ | 1600 "psubw %%xmm2,%%xmm1 \n" \ |
1614 "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \ | 1601 "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \ |
1615 "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \ | 1602 "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \ |
1616 "psubw %%xmm3,%%xmm2 \n" \ | 1603 "psubw %%xmm3,%%xmm2 \n" \ |
1617 "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \ | 1604 "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \ |
1618 "paddsw %%xmm4,%%xmm0 \n" \ | 1605 "paddsw %%xmm4,%%xmm0 \n" \ |
1619 "paddsw %%xmm4,%%xmm1 \n" \ | 1606 "paddsw %%xmm4,%%xmm1 \n" \ |
1620 "paddsw %%xmm4,%%xmm2 \n" \ | 1607 "paddsw %%xmm4,%%xmm2 \n" \ |
1621 "psraw $0x6,%%xmm0 \n" \ | 1608 "psraw $0x6,%%xmm0 \n" \ |
1622 "psraw $0x6,%%xmm1 \n" \ | 1609 "psraw $0x6,%%xmm1 \n" \ |
1623 "psraw $0x6,%%xmm2 \n" \ | 1610 "psraw $0x6,%%xmm2 \n" \ |
1624 "packuswb %%xmm0,%%xmm0 \n" \ | 1611 "packuswb %%xmm0,%%xmm0 \n" \ |
1625 "packuswb %%xmm1,%%xmm1 \n" \ | 1612 "packuswb %%xmm1,%%xmm1 \n" \ |
1626 "packuswb %%xmm2,%%xmm2 \n" | 1613 "packuswb %%xmm2,%%xmm2 \n" |
1627 #define YUVTORGB_REGS | 1614 #define YUVTORGB_REGS |
1628 #endif | 1615 #endif |
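// A minimal scalar model (illustrative only; UB/UG/VG/VR, BB/BG/BR and YG
// stand in for the YuvConstants fields loaded at offsets 0 through 192
// above) of one YUVTORGB pixel in the same 6-bit fixed point:
static unsigned char Clamp255(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvToRgb_Scalar(unsigned char y, unsigned char u, unsigned char v,
                            int UB, int UG, int VG, int VR,
                            int BB, int BG, int BR, int YG,
                            unsigned char* b, unsigned char* g,
                            unsigned char* r) {
  // punpcklbw of y with itself yields y * 0x0101; pmulhuw keeps the high
  // 16 bits of the unsigned product with kYToRgb.
  int y1 = (int)(((unsigned int)(y * 0x0101) * (unsigned int)YG) >> 16);
  *b = Clamp255((y1 + BB - u * UB) >> 6);             // xmm0 path
  *g = Clamp255((y1 + BG - (u * UG + v * VG)) >> 6);  // xmm1 path
  *r = Clamp255((y1 + BR - v * VR) >> 6);             // xmm2 path
}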
1629 | 1616 |
1630 // Store 8 ARGB values. | 1617 // Store 8 ARGB values. |
1631 #define STOREARGB \ | 1618 #define STOREARGB \ |
1632 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1619 "punpcklbw %%xmm1,%%xmm0 \n" \ |
1633 "punpcklbw %%xmm5,%%xmm2 \n" \ | 1620 "punpcklbw %%xmm5,%%xmm2 \n" \ |
1634 "movdqa %%xmm0,%%xmm1 \n" \ | 1621 "movdqa %%xmm0,%%xmm1 \n" \ |
1635 "punpcklwd %%xmm2,%%xmm0 \n" \ | 1622 "punpcklwd %%xmm2,%%xmm0 \n" \ |
1636 "punpckhwd %%xmm2,%%xmm1 \n" \ | 1623 "punpckhwd %%xmm2,%%xmm1 \n" \ |
1637 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ | 1624 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ |
1638 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ | 1625 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ |
1639 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" | 1626 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" |
1640 | 1627 |
1641 // Store 8 RGBA values. | 1628 // Store 8 RGBA values. |
1642 #define STORERGBA \ | 1629 #define STORERGBA \ |
1643 "pcmpeqb %%xmm5,%%xmm5 \n" \ | 1630 "pcmpeqb %%xmm5,%%xmm5 \n" \ |
1644 "punpcklbw %%xmm2,%%xmm1 \n" \ | 1631 "punpcklbw %%xmm2,%%xmm1 \n" \ |
1645 "punpcklbw %%xmm0,%%xmm5 \n" \ | 1632 "punpcklbw %%xmm0,%%xmm5 \n" \ |
1646 "movdqa %%xmm5,%%xmm0 \n" \ | 1633 "movdqa %%xmm5,%%xmm0 \n" \ |
1647 "punpcklwd %%xmm1,%%xmm5 \n" \ | 1634 "punpcklwd %%xmm1,%%xmm5 \n" \ |
1648 "punpckhwd %%xmm1,%%xmm0 \n" \ | 1635 "punpckhwd %%xmm1,%%xmm0 \n" \ |
1649 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ | 1636 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ |
1650 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ | 1637 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ |
1651 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" | 1638 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" |
1652 | 1639 |
1653 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, | 1640 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, |
(...skipping 98 matching lines...) |
1752 } | 1739 } |
1753 | 1740 |
1754 #ifdef HAS_I422ALPHATOARGBROW_SSSE3 | 1741 #ifdef HAS_I422ALPHATOARGBROW_SSSE3 |
1755 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, | 1742 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, |
1756 const uint8* u_buf, | 1743 const uint8* u_buf, |
1757 const uint8* v_buf, | 1744 const uint8* v_buf, |
1758 const uint8* a_buf, | 1745 const uint8* a_buf, |
1759 uint8* dst_argb, | 1746 uint8* dst_argb, |
1760 const struct YuvConstants* yuvconstants, | 1747 const struct YuvConstants* yuvconstants, |
1761 int width) { | 1748 int width) { |
| 1749 // clang-format off |
1762 asm volatile ( | 1750 asm volatile ( |
1763 YUVTORGB_SETUP(yuvconstants) | 1751 YUVTORGB_SETUP(yuvconstants) |
1764 "sub %[u_buf],%[v_buf] \n" | 1752 "sub %[u_buf],%[v_buf] \n" |
1765 LABELALIGN | 1753 LABELALIGN |
1766 "1: \n" | 1754 "1: \n" |
1767 READYUVA422 | 1755 READYUVA422 |
1768 YUVTORGB(yuvconstants) | 1756 YUVTORGB(yuvconstants) |
1769 STOREARGB | 1757 STOREARGB |
1770 "subl $0x8,%[width] \n" | 1758 "subl $0x8,%[width] \n" |
1771 "jg 1b \n" | 1759 "jg 1b \n" |
1772 : [y_buf]"+r"(y_buf), // %[y_buf] | 1760 : [y_buf]"+r"(y_buf), // %[y_buf] |
1773 [u_buf]"+r"(u_buf), // %[u_buf] | 1761 [u_buf]"+r"(u_buf), // %[u_buf] |
1774 [v_buf]"+r"(v_buf), // %[v_buf] | 1762 [v_buf]"+r"(v_buf), // %[v_buf] |
1775 [a_buf]"+r"(a_buf), // %[a_buf] | 1763 [a_buf]"+r"(a_buf), // %[a_buf] |
1776 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1764 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1777 #if defined(__i386__) | 1765 #if defined(__i386__) |
1778 [width]"+m"(width) // %[width] | 1766 [width]"+m"(width) // %[width] |
1779 #else | 1767 #else |
1780 [width]"+rm"(width) // %[width] | 1768 [width]"+rm"(width) // %[width] |
1781 #endif | 1769 #endif |
1782 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1770 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1783 : "memory", "cc", NACL_R14 YUVTORGB_REGS | 1771 : "memory", "cc", NACL_R14 YUVTORGB_REGS |
1784 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1772 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1785 ); | 1773 ); |
| 1774 // clang-format on |
1786 } | 1775 } |
1787 #endif // HAS_I422ALPHATOARGBROW_SSSE3 | 1776 #endif // HAS_I422ALPHATOARGBROW_SSSE3 |
1788 | 1777 |
1789 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, | 1778 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, |
1790 const uint8* uv_buf, | 1779 const uint8* uv_buf, |
1791 uint8* dst_argb, | 1780 uint8* dst_argb, |
1792 const struct YuvConstants* yuvconstants, | 1781 const struct YuvConstants* yuvconstants, |
1793 int width) { | 1782 int width) { |
| 1783 // clang-format off |
1794 asm volatile ( | 1784 asm volatile ( |
1795 YUVTORGB_SETUP(yuvconstants) | 1785 YUVTORGB_SETUP(yuvconstants) |
1796 "pcmpeqb %%xmm5,%%xmm5 \n" | 1786 "pcmpeqb %%xmm5,%%xmm5 \n" |
1797 LABELALIGN | 1787 LABELALIGN |
1798 "1: \n" | 1788 "1: \n" |
1799 READNV12 | 1789 READNV12 |
1800 YUVTORGB(yuvconstants) | 1790 YUVTORGB(yuvconstants) |
1801 STOREARGB | 1791 STOREARGB |
1802 "sub $0x8,%[width] \n" | 1792 "sub $0x8,%[width] \n" |
1803 "jg 1b \n" | 1793 "jg 1b \n" |
1804 : [y_buf]"+r"(y_buf), // %[y_buf] | 1794 : [y_buf]"+r"(y_buf), // %[y_buf] |
1805 [uv_buf]"+r"(uv_buf), // %[uv_buf] | 1795 [uv_buf]"+r"(uv_buf), // %[uv_buf] |
1806 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1796 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1807 [width]"+rm"(width) // %[width] | 1797 [width]"+rm"(width) // %[width] |
1808 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1798 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1809 : "memory", "cc", YUVTORGB_REGS // Does not use r14. | 1799 : "memory", "cc", YUVTORGB_REGS // Does not use r14. |
1810 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1800 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1811 ); | 1801 ); |
| 1802 // clang-format on |
1812 } | 1803 } |
1813 | 1804 |
1814 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, | 1805 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, |
1815 const uint8* vu_buf, | 1806 const uint8* vu_buf, |
1816 uint8* dst_argb, | 1807 uint8* dst_argb, |
1817 const struct YuvConstants* yuvconstants, | 1808 const struct YuvConstants* yuvconstants, |
1818 int width) { | 1809 int width) { |
| 1810 // clang-format off |
1819 asm volatile ( | 1811 asm volatile ( |
1820 YUVTORGB_SETUP(yuvconstants) | 1812 YUVTORGB_SETUP(yuvconstants) |
1821 "pcmpeqb %%xmm5,%%xmm5 \n" | 1813 "pcmpeqb %%xmm5,%%xmm5 \n" |
1822 LABELALIGN | 1814 LABELALIGN |
1823 "1: \n" | 1815 "1: \n" |
1824 READNV21 | 1816 READNV21 |
1825 YUVTORGB(yuvconstants) | 1817 YUVTORGB(yuvconstants) |
1826 STOREARGB | 1818 STOREARGB |
1827 "sub $0x8,%[width] \n" | 1819 "sub $0x8,%[width] \n" |
1828 "jg 1b \n" | 1820 "jg 1b \n" |
1829 : [y_buf]"+r"(y_buf), // %[y_buf] | 1821 : [y_buf]"+r"(y_buf), // %[y_buf] |
1830 [vu_buf]"+r"(vu_buf), // %[vu_buf] | 1822 [vu_buf]"+r"(vu_buf), // %[vu_buf] |
1831 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1823 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1832 [width]"+rm"(width) // %[width] | 1824 [width]"+rm"(width) // %[width] |
1833 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 1825 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
1834 [kShuffleNV21]"m"(kShuffleNV21) | 1826 [kShuffleNV21]"m"(kShuffleNV21) |
1835 : "memory", "cc", YUVTORGB_REGS // Does not use r14. | 1827 : "memory", "cc", YUVTORGB_REGS // Does not use r14. |
1836 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1828 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1837 ); | 1829 ); |
| 1830 // clang-format on |
1838 } | 1831 } |
1839 | 1832 |
1840 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, | 1833 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, |
1841 uint8* dst_argb, | 1834 uint8* dst_argb, |
1842 const struct YuvConstants* yuvconstants, | 1835 const struct YuvConstants* yuvconstants, |
1843 int width) { | 1836 int width) { |
| 1837 // clang-format off |
1844 asm volatile ( | 1838 asm volatile ( |
1845 YUVTORGB_SETUP(yuvconstants) | 1839 YUVTORGB_SETUP(yuvconstants) |
1846 "pcmpeqb %%xmm5,%%xmm5 \n" | 1840 "pcmpeqb %%xmm5,%%xmm5 \n" |
1847 LABELALIGN | 1841 LABELALIGN |
1848 "1: \n" | 1842 "1: \n" |
1849 READYUY2 | 1843 READYUY2 |
1850 YUVTORGB(yuvconstants) | 1844 YUVTORGB(yuvconstants) |
1851 STOREARGB | 1845 STOREARGB |
1852 "sub $0x8,%[width] \n" | 1846 "sub $0x8,%[width] \n" |
1853 "jg 1b \n" | 1847 "jg 1b \n" |
1854 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] | 1848 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] |
1855 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1849 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1856 [width]"+rm"(width) // %[width] | 1850 [width]"+rm"(width) // %[width] |
1857 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 1851 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
1858 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), | 1852 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), |
1859 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) | 1853 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) |
1860 : "memory", "cc", YUVTORGB_REGS // Does not use r14. | 1854 : "memory", "cc", YUVTORGB_REGS // Does not use r14. |
1861 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1855 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1862 ); | 1856 ); |
| 1857 // clang-format on |
1863 } | 1858 } |
1864 | 1859 |
1865 void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, | 1860 void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, |
1866 uint8* dst_argb, | 1861 uint8* dst_argb, |
1867 const struct YuvConstants* yuvconstants, | 1862 const struct YuvConstants* yuvconstants, |
1868 int width) { | 1863 int width) { |
| 1864 // clang-format off |
1869 asm volatile ( | 1865 asm volatile ( |
1870 YUVTORGB_SETUP(yuvconstants) | 1866 YUVTORGB_SETUP(yuvconstants) |
1871 "pcmpeqb %%xmm5,%%xmm5 \n" | 1867 "pcmpeqb %%xmm5,%%xmm5 \n" |
1872 LABELALIGN | 1868 LABELALIGN |
1873 "1: \n" | 1869 "1: \n" |
1874 READUYVY | 1870 READUYVY |
1875 YUVTORGB(yuvconstants) | 1871 YUVTORGB(yuvconstants) |
1876 STOREARGB | 1872 STOREARGB |
1877 "sub $0x8,%[width] \n" | 1873 "sub $0x8,%[width] \n" |
1878 "jg 1b \n" | 1874 "jg 1b \n" |
1879 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] | 1875 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] |
1880 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1876 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1881 [width]"+rm"(width) // %[width] | 1877 [width]"+rm"(width) // %[width] |
1882 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 1878 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
1883 [kShuffleUYVYY]"m"(kShuffleUYVYY), | 1879 [kShuffleUYVYY]"m"(kShuffleUYVYY), |
1884 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) | 1880 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) |
1885 : "memory", "cc", YUVTORGB_REGS // Does not use r14. | 1881 : "memory", "cc", YUVTORGB_REGS // Does not use r14. |
1886 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1882 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1887 ); | 1883 ); |
| 1884 // clang-format on |
1888 } | 1885 } |
1889 | 1886 |
1890 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, | 1887 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, |
1891 const uint8* u_buf, | 1888 const uint8* u_buf, |
1892 const uint8* v_buf, | 1889 const uint8* v_buf, |
1893 uint8* dst_rgba, | 1890 uint8* dst_rgba, |
1894 const struct YuvConstants* yuvconstants, | 1891 const struct YuvConstants* yuvconstants, |
1895 int width) { | 1892 int width) { |
1896 asm volatile ( | 1893 asm volatile ( |
1897 YUVTORGB_SETUP(yuvconstants) | 1894 YUVTORGB_SETUP(yuvconstants) |
(...skipping 13 matching lines...) |
1911 [width]"+rm"(width) // %[width] | 1908 [width]"+rm"(width) // %[width] |
1912 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1909 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1913 : "memory", "cc", NACL_R14 YUVTORGB_REGS | 1910 : "memory", "cc", NACL_R14 YUVTORGB_REGS |
1914 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1911 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1915 ); | 1912 ); |
1916 } | 1913 } |
1917 | 1914 |
1918 #endif // HAS_I422TOARGBROW_SSSE3 | 1915 #endif // HAS_I422TOARGBROW_SSSE3 |
1919 | 1916 |
1920 // Read 16 UV from 444 | 1917 // Read 16 UV from 444 |
1921 #define READYUV444_AVX2 \ | 1918 #define READYUV444_AVX2 \ |
1922 "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1919 "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
1923 MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1920 MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1924 "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \ | 1921 "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \ |
1925 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1922 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1926 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ | 1923 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ |
1927 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 1924 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
1928 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1925 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1929 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1926 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
1930 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1927 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
1931 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1928 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
1932 | 1929 |
1933 // Read 8 UV from 422, upsample to 16 UV. | 1930 // Read 8 UV from 422, upsample to 16 UV. |
1934 #define READYUV422_AVX2 \ | 1931 #define READYUV422_AVX2 \ |
1935 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1932 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
1936 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1933 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1937 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | 1934 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
1938 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 1935 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
1939 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1936 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1940 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1937 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
1941 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1938 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1942 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1939 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
1943 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1940 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
1944 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1941 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
1945 | 1942 |
1946 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. | 1943 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. |
1947 #define READYUVA422_AVX2 \ | 1944 #define READYUVA422_AVX2 \ |
1948 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1945 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
1949 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1946 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1950 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | 1947 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
1951 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 1948 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
1952 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1949 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1953 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1950 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
1954 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1951 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1955 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1952 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
1956 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1953 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
1957 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ | 1954 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ |
1958 "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ | 1955 "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ |
1959 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ | 1956 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ |
1960 "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" | 1957 "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" |
1961 | 1958 |
1962 // Read 8 UV from NV12, upsample to 16 UV. | 1959 // Read 8 UV from NV12, upsample to 16 UV. |
1963 #define READNV12_AVX2 \ | 1960 #define READNV12_AVX2 \ |
1964 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1961 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
1965 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ | 1962 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ |
1966 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1963 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1967 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1964 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
1968 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1965 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1969 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1966 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
1970 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1967 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
1971 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1968 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
1972 | 1969 |
1973 // Read 8 VU from NV21, upsample to 16 UV. | 1970 // Read 8 VU from NV21, upsample to 16 UV. |
1974 #define READNV21_AVX2 \ | 1971 #define READNV21_AVX2 \ |
1975 "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ | 1972 "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ |
1976 "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \ | 1973 "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \ |
1977 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1974 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1978 "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ | 1975 "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ |
1979 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1976 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1980 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1977 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
1981 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1978 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
1982 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1979 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
1983 | 1980 |
1984 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. | 1981 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. |
1985 #define READYUY2_AVX2 \ | 1982 #define READYUY2_AVX2 \ |
1986 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ | 1983 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ |
1987 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ | 1984 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ |
1988 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ | 1985 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ |
1989 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ | 1986 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ |
1990 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" | 1987 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" |
1991 | 1988 |
1992 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. | 1989 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. |
1993 #define READUYVY_AVX2 \ | 1990 #define READUYVY_AVX2 \ |
1994 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ | 1991 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ |
1995 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ | 1992 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ |
1996 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ | 1993 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ |
1997 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ | 1994 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ |
1998 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" | 1995 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" |
1999 | 1996 |
2000 #if defined(__x86_64__) | 1997 #if defined(__x86_64__) |
2001 #define YUVTORGB_SETUP_AVX2(yuvconstants) \ | 1998 #define YUVTORGB_SETUP_AVX2(yuvconstants) \ |
2002 "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \ | 1999 "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \ |
2003 "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \ | 2000 "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \ |
2004 "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \ | 2001 "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \ |
2005 "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \ | 2002 "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \ |
2006 "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \ | 2003 "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \ |
2007 "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \ | 2004 "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \ |
2008 "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n" | 2005 "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n" |
2009 #define YUVTORGB_AVX2(yuvconstants) \ | 2006 #define YUVTORGB_AVX2(yuvconstants) \ |
2010 "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ | 2007 "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ |
2011 "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ | 2008 "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ |
2012 "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ | 2009 "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ |
2013 "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ | 2010 "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ |
2014 "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ | 2011 "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ |
2015 "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ | 2012 "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ |
2016 "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ | 2013 "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ |
2017 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ | 2014 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ |
2018 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ | 2015 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ |
2019 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ | 2016 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ |
2020 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ | 2017 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ |
2021 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ | 2018 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ |
2022 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ | 2019 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ |
2023 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ | 2020 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ |
2024 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ | 2021 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ |
2025 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" | 2022 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" |
2026 #define YUVTORGB_REGS_AVX2 \ | 2023 #define YUVTORGB_REGS_AVX2 \ |
2027 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", | 2024 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", |
2028 #else // Convert 16 pixels: 16 UV and 16 Y. | 2025 #else // Convert 16 pixels: 16 UV and 16 Y. |
2029 #define YUVTORGB_SETUP_AVX2(yuvconstants) | 2026 #define YUVTORGB_SETUP_AVX2(yuvconstants) |
2030 #define YUVTORGB_AVX2(yuvconstants) \ | 2027 #define YUVTORGB_AVX2(yuvconstants) \ |
2031 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ | 2028 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ |
2032 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ | 2029 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ |
2033 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ | 2030 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ |
2034 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ | 2031 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ |
2035 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ | 2032 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ |
2036 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ | 2033 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ |
2037 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ | 2034 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ |
2038 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ | 2035 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ |
2039 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ | 2036 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ |
2040 "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \ | 2037 "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \ |
2041 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ | 2038 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ |
2042 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ | 2039 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ |
2043 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ | 2040 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ |
2044 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ | 2041 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ |
2045 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ | 2042 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ |
2046 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ | 2043 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ |
2047 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ | 2044 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ |
2048 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ | 2045 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ |
2049 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" | 2046 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" |
2050 #define YUVTORGB_REGS_AVX2 | 2047 #define YUVTORGB_REGS_AVX2 |
2051 #endif | 2048 #endif |
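The register-cached (x86_64) and memory-operand (i386) variants above implement the same fixed-point math: vpmaddubsw forms the per-channel UV term, the rows at offsets 96/128/160 of YuvConstants carry the combined biases, vpmulhuw applies the luma gain to Y replicated as y*0x0101, and vpsraw $0x6 drops the six fraction bits before the saturating pack. A one-pixel scalar model of that pipeline; the coefficient names are placeholders for the YuvConstants rows, whose values depend on the colorspace:

#include <stdint.h>

static uint8_t Clamp255(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// ub..vr stand in for the unsigned-byte UV coefficients, bb..br for the
// precomputed biases, yg for the luma gain.
static void YuvPixel_C(uint8_t y, uint8_t u, uint8_t v,
                       int32_t ub, int32_t ug, int32_t vg, int32_t vr,
                       int32_t bb, int32_t bg, int32_t br, int32_t yg,
                       uint8_t* b, uint8_t* g, uint8_t* r) {
  uint32_t y1 = ((uint32_t)y * 0x0101 * (uint32_t)yg) >> 16;  // vpmulhuw
  *b = Clamp255(((int32_t)y1 + bb - u * ub) >> 6);            // vpsraw $6
  *g = Clamp255(((int32_t)y1 + bg - (u * ug + v * vg)) >> 6);
  *r = Clamp255(((int32_t)y1 + br - v * vr) >> 6);
}

This mirrors the shape of libyuv's C reference path (YuvPixel in row_common.cc), which is the easiest place to check the SIMD output against.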
2052 | 2049 |
2053 // Store 16 ARGB values. | 2050 // Store 16 ARGB values. |
2054 #define STOREARGB_AVX2 \ | 2051 #define STOREARGB_AVX2 \ |
2055 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 2052 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
2056 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 2053 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
2057 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ | 2054 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ |
2058 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ | 2055 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ |
2059 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ | 2056 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ |
2060 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ | 2057 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ |
2061 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ | 2058 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ |
2062 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ | 2059 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ |
2063 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" | 2060 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" |
2064 | 2061 |
2065 #ifdef HAS_I444TOARGBROW_AVX2 | 2062 #ifdef HAS_I444TOARGBROW_AVX2 |
(...skipping 59 matching lines...)
2125 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 | 2122 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 |
2126 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2123 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2127 ); | 2124 ); |
2128 } | 2125 } |
2129 #endif // HAS_I422TOARGBROW_AVX2 | 2126 #endif // HAS_I422TOARGBROW_AVX2 |
2130 | 2127 |
2131 #if defined(HAS_I422ALPHATOARGBROW_AVX2) | 2128 #if defined(HAS_I422ALPHATOARGBROW_AVX2) |
2132 // 16 pixels | 2129 // 16 pixels |
2133 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. | 2130 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. |
2134 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, | 2131 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, |
2135 const uint8* u_buf, | 2132 const uint8* u_buf, |
2136 const uint8* v_buf, | 2133 const uint8* v_buf, |
2137 const uint8* a_buf, | 2134 const uint8* a_buf, |
2138 uint8* dst_argb, | 2135 uint8* dst_argb, |
2139 const struct YuvConstants* yuvconstants, | 2136 const struct YuvConstants* yuvconstants, |
2140 int width) { | 2137 int width) { |
| 2138 // clang-format off |
2141 asm volatile ( | 2139 asm volatile ( |
2142 YUVTORGB_SETUP_AVX2(yuvconstants) | 2140 YUVTORGB_SETUP_AVX2(yuvconstants) |
2143 "sub %[u_buf],%[v_buf] \n" | 2141 "sub %[u_buf],%[v_buf] \n" |
2144 LABELALIGN | 2142 LABELALIGN |
2145 "1: \n" | 2143 "1: \n" |
2146 READYUVA422_AVX2 | 2144 READYUVA422_AVX2 |
2147 YUVTORGB_AVX2(yuvconstants) | 2145 YUVTORGB_AVX2(yuvconstants) |
2148 STOREARGB_AVX2 | 2146 STOREARGB_AVX2 |
2149 "subl $0x10,%[width] \n" | 2147 "subl $0x10,%[width] \n" |
2150 "jg 1b \n" | 2148 "jg 1b \n" |
2151 "vzeroupper \n" | 2149 "vzeroupper \n" |
2152 : [y_buf]"+r"(y_buf), // %[y_buf] | 2150 : [y_buf]"+r"(y_buf), // %[y_buf] |
2153 [u_buf]"+r"(u_buf), // %[u_buf] | 2151 [u_buf]"+r"(u_buf), // %[u_buf] |
2154 [v_buf]"+r"(v_buf), // %[v_buf] | 2152 [v_buf]"+r"(v_buf), // %[v_buf] |
2155 [a_buf]"+r"(a_buf), // %[a_buf] | 2153 [a_buf]"+r"(a_buf), // %[a_buf] |
2156 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2154 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2157 #if defined(__i386__) | 2155 #if defined(__i386__) |
2158 [width]"+m"(width) // %[width] | 2156 [width]"+m"(width) // %[width] |
2159 #else | 2157 #else |
2160 [width]"+rm"(width) // %[width] | 2158 [width]"+rm"(width) // %[width] |
2161 #endif | 2159 #endif |
2162 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2160 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
2163 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 | 2161 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 |
2164 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2162 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2165 ); | 2163 ); |
| 2164 // clang-format on |
2166 } | 2165 } |
2167 #endif // HAS_I422ALPHATOARGBROW_AVX2 | 2166 #endif // HAS_I422ALPHATOARGBROW_AVX2 |
2168 | 2167 |
2169 #if defined(HAS_I422TORGBAROW_AVX2) | 2168 #if defined(HAS_I422TORGBAROW_AVX2) |
2170 // 16 pixels | 2169 // 16 pixels |
2171 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). | 2170 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). |
2172 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, | 2171 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, |
2173 const uint8* u_buf, | 2172 const uint8* u_buf, |
2174 const uint8* v_buf, | 2173 const uint8* v_buf, |
2175 uint8* dst_argb, | 2174 uint8* dst_argb, |
(...skipping 34 matching lines...)
2210 #endif // HAS_I422TORGBAROW_AVX2 | 2209 #endif // HAS_I422TORGBAROW_AVX2 |
2211 | 2210 |
2212 #if defined(HAS_NV12TOARGBROW_AVX2) | 2211 #if defined(HAS_NV12TOARGBROW_AVX2) |
2213 // 16 pixels. | 2212 // 16 pixels. |
2214 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2213 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
2215 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, | 2214 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, |
2216 const uint8* uv_buf, | 2215 const uint8* uv_buf, |
2217 uint8* dst_argb, | 2216 uint8* dst_argb, |
2218 const struct YuvConstants* yuvconstants, | 2217 const struct YuvConstants* yuvconstants, |
2219 int width) { | 2218 int width) { |
| 2219 // clang-format off |
2220 asm volatile ( | 2220 asm volatile ( |
2221 YUVTORGB_SETUP_AVX2(yuvconstants) | 2221 YUVTORGB_SETUP_AVX2(yuvconstants) |
2222 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2222 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2223 LABELALIGN | 2223 LABELALIGN |
2224 "1: \n" | 2224 "1: \n" |
2225 READNV12_AVX2 | 2225 READNV12_AVX2 |
2226 YUVTORGB_AVX2(yuvconstants) | 2226 YUVTORGB_AVX2(yuvconstants) |
2227 STOREARGB_AVX2 | 2227 STOREARGB_AVX2 |
2228 "sub $0x10,%[width] \n" | 2228 "sub $0x10,%[width] \n" |
2229 "jg 1b \n" | 2229 "jg 1b \n" |
2230 "vzeroupper \n" | 2230 "vzeroupper \n" |
2231 : [y_buf]"+r"(y_buf), // %[y_buf] | 2231 : [y_buf]"+r"(y_buf), // %[y_buf] |
2232 [uv_buf]"+r"(uv_buf), // %[uv_buf] | 2232 [uv_buf]"+r"(uv_buf), // %[uv_buf] |
2233 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2233 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2234 [width]"+rm"(width) // %[width] | 2234 [width]"+rm"(width) // %[width] |
2235 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2235 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
2236 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. | 2236 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. |
2237 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2237 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2238 ); | 2238 ); |
| 2239 // clang-format on |
2239 } | 2240 } |
2240 #endif // HAS_NV12TOARGBROW_AVX2 | 2241 #endif // HAS_NV12TOARGBROW_AVX2 |
2241 | 2242 |
2242 #if defined(HAS_NV21TOARGBROW_AVX2) | 2243 #if defined(HAS_NV21TOARGBROW_AVX2) |
2243 // 16 pixels. | 2244 // 16 pixels. |
2244 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2245 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
2245 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, | 2246 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, |
2246 const uint8* vu_buf, | 2247 const uint8* vu_buf, |
2247 uint8* dst_argb, | 2248 uint8* dst_argb, |
2248 const struct YuvConstants* yuvconstants, | 2249 const struct YuvConstants* yuvconstants, |
2249 int width) { | 2250 int width) { |
| 2251 // clang-format off |
2250 asm volatile ( | 2252 asm volatile ( |
2251 YUVTORGB_SETUP_AVX2(yuvconstants) | 2253 YUVTORGB_SETUP_AVX2(yuvconstants) |
2252 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2254 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2253 LABELALIGN | 2255 LABELALIGN |
2254 "1: \n" | 2256 "1: \n" |
2255 READNV21_AVX2 | 2257 READNV21_AVX2 |
2256 YUVTORGB_AVX2(yuvconstants) | 2258 YUVTORGB_AVX2(yuvconstants) |
2257 STOREARGB_AVX2 | 2259 STOREARGB_AVX2 |
2258 "sub $0x10,%[width] \n" | 2260 "sub $0x10,%[width] \n" |
2259 "jg 1b \n" | 2261 "jg 1b \n" |
2260 "vzeroupper \n" | 2262 "vzeroupper \n" |
2261 : [y_buf]"+r"(y_buf), // %[y_buf] | 2263 : [y_buf]"+r"(y_buf), // %[y_buf] |
2262 [vu_buf]"+r"(vu_buf), // %[vu_buf] | 2264 [vu_buf]"+r"(vu_buf), // %[vu_buf] |
2263 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2265 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2264 [width]"+rm"(width) // %[width] | 2266 [width]"+rm"(width) // %[width] |
2265 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 2267 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
2266 [kShuffleNV21]"m"(kShuffleNV21) | 2268 [kShuffleNV21]"m"(kShuffleNV21) |
2267 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. | 2269 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. |
2268 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2270 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2269 ); | 2271 ); |
| 2272 // clang-format on |
2270 } | 2273 } |
2271 #endif // HAS_NV21TOARGBROW_AVX2 | 2274 #endif // HAS_NV21TOARGBROW_AVX2 |
2272 | 2275 |
2273 #if defined(HAS_YUY2TOARGBROW_AVX2) | 2276 #if defined(HAS_YUY2TOARGBROW_AVX2) |
2274 // 16 pixels. | 2277 // 16 pixels. |
2275 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2278 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
2276 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, | 2279 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, |
2277 uint8* dst_argb, | 2280 uint8* dst_argb, |
2278 const struct YuvConstants* yuvconstants, | 2281 const struct YuvConstants* yuvconstants, |
2279 int width) { | 2282 int width) { |
| 2283 // clang-format off |
2280 asm volatile ( | 2284 asm volatile ( |
2281 YUVTORGB_SETUP_AVX2(yuvconstants) | 2285 YUVTORGB_SETUP_AVX2(yuvconstants) |
2282 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2286 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2283 LABELALIGN | 2287 LABELALIGN |
2284 "1: \n" | 2288 "1: \n" |
2285 READYUY2_AVX2 | 2289 READYUY2_AVX2 |
2286 YUVTORGB_AVX2(yuvconstants) | 2290 YUVTORGB_AVX2(yuvconstants) |
2287 STOREARGB_AVX2 | 2291 STOREARGB_AVX2 |
2288 "sub $0x10,%[width] \n" | 2292 "sub $0x10,%[width] \n" |
2289 "jg 1b \n" | 2293 "jg 1b \n" |
2290 "vzeroupper \n" | 2294 "vzeroupper \n" |
2291 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] | 2295 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] |
2292 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2296 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2293 [width]"+rm"(width) // %[width] | 2297 [width]"+rm"(width) // %[width] |
2294 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 2298 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
2295 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), | 2299 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), |
2296 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) | 2300 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) |
2297 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. | 2301 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. |
2298 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2302 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2299 ); | 2303 ); |
| 2304 // clang-format on |
2300 } | 2305 } |
2301 #endif // HAS_YUY2TOARGBROW_AVX2 | 2306 #endif // HAS_YUY2TOARGBROW_AVX2 |
2302 | 2307 |
2303 #if defined(HAS_UYVYTOARGBROW_AVX2) | 2308 #if defined(HAS_UYVYTOARGBROW_AVX2) |
2304 // 16 pixels. | 2309 // 16 pixels. |
2305 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2310 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
2306 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, | 2311 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, |
2307 uint8* dst_argb, | 2312 uint8* dst_argb, |
2308 const struct YuvConstants* yuvconstants, | 2313 const struct YuvConstants* yuvconstants, |
2309 int width) { | 2314 int width) { |
| 2315 // clang-format off |
2310 asm volatile ( | 2316 asm volatile ( |
2311 YUVTORGB_SETUP_AVX2(yuvconstants) | 2317 YUVTORGB_SETUP_AVX2(yuvconstants) |
2312 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2318 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2313 LABELALIGN | 2319 LABELALIGN |
2314 "1: \n" | 2320 "1: \n" |
2315 READUYVY_AVX2 | 2321 READUYVY_AVX2 |
2316 YUVTORGB_AVX2(yuvconstants) | 2322 YUVTORGB_AVX2(yuvconstants) |
2317 STOREARGB_AVX2 | 2323 STOREARGB_AVX2 |
2318 "sub $0x10,%[width] \n" | 2324 "sub $0x10,%[width] \n" |
2319 "jg 1b \n" | 2325 "jg 1b \n" |
2320 "vzeroupper \n" | 2326 "vzeroupper \n" |
2321 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] | 2327 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] |
2322 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2328 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2323 [width]"+rm"(width) // %[width] | 2329 [width]"+rm"(width) // %[width] |
2324 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 2330 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
2325 [kShuffleUYVYY]"m"(kShuffleUYVYY), | 2331 [kShuffleUYVYY]"m"(kShuffleUYVYY), |
2326 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) | 2332 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) |
2327 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. | 2333 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. |
2328 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2334 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2329 ); | 2335 ); |
| 2336 // clang-format on |
2330 } | 2337 } |
2331 #endif // HAS_UYVYTOARGBROW_AVX2 | 2338 #endif // HAS_UYVYTOARGBROW_AVX2 |
2332 | 2339 |
2333 #ifdef HAS_I400TOARGBROW_SSE2 | 2340 #ifdef HAS_I400TOARGBROW_SSE2 |
2334 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { | 2341 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { |
2335 asm volatile ( | 2342 asm volatile ( |
2336 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 | 2343 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 |
2337 "movd %%eax,%%xmm2 \n" | 2344 "movd %%eax,%%xmm2 \n" |
2338 "pshufd $0x0,%%xmm2,%%xmm2 \n" | 2345 "pshufd $0x0,%%xmm2,%%xmm2 \n" |
2339 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 | 2346 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 |
(...skipping 77 matching lines...)
2417 "+rm"(width) // %2 | 2424 "+rm"(width) // %2 |
2418 : | 2425 : |
2419 : "memory", "cc", "eax" | 2426 : "memory", "cc", "eax" |
2420 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | 2427 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
2421 ); | 2428 ); |
2422 } | 2429 } |
2423 #endif // HAS_I400TOARGBROW_AVX2 | 2430 #endif // HAS_I400TOARGBROW_AVX2 |
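The constants at the top of the SSE2 version spell out the whole I400 path: Y is widened by punpcklbw to y*0x0101, pmulhuw by 0x4a35 applies the 1.164 gain, 0x0488 (1160 = 1.164 * 16) subtracts the studio-swing black level, and the final shift drops the fraction bits. A scalar sketch of the per-pixel luma expansion, assuming the BT.601 studio-swing input these constants target:

#include <stdint.h>

static uint8_t ExpandLuma_C(uint8_t y) {
  uint32_t y16 = (uint32_t)y * 0x0101;                  // punpcklbw y,y
  int32_t v = (int32_t)((y16 * 18997u) >> 16) - 1160;   // pmulhuw, psubusw
  v >>= 6;                                              // psrlw $0x6
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));    // packuswb clamp
}

For y = 16 this yields 0 and for y = 235 it saturates to 255, i.e. (y - 16) * 1.164 with the SIMD fixed-point rounding.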
2424 | 2431 |
2425 #ifdef HAS_MIRRORROW_SSSE3 | 2432 #ifdef HAS_MIRRORROW_SSSE3 |
2426 // Shuffle table for reversing the bytes. | 2433 // Shuffle table for reversing the bytes. |
2427 static uvec8 kShuffleMirror = { | 2434 static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, |
2428 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | 2435 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; |
2429 }; | |
2430 | 2436 |
2431 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { | 2437 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { |
2432 intptr_t temp_width = (intptr_t)(width); | 2438 intptr_t temp_width = (intptr_t)(width); |
2433 asm volatile ( | 2439 asm volatile ( |
2434 "movdqa %3,%%xmm5 \n" | 2440 "movdqa %3,%%xmm5 \n" |
2435 LABELALIGN | 2441 LABELALIGN |
2436 "1: \n" | 2442 "1: \n" |
2437 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 | 2443 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 |
2438 "pshufb %%xmm5,%%xmm0 \n" | 2444 "pshufb %%xmm5,%%xmm0 \n" |
2439 "movdqu %%xmm0," MEMACCESS(1) " \n" | 2445 "movdqu %%xmm0," MEMACCESS(1) " \n" |
(...skipping 30 matching lines...)
2470 "+r"(temp_width) // %2 | 2476 "+r"(temp_width) // %2 |
2471 : "m"(kShuffleMirror) // %3 | 2477 : "m"(kShuffleMirror) // %3 |
2472 : "memory", "cc", NACL_R14 | 2478 : "memory", "cc", NACL_R14 |
2473 "xmm0", "xmm5" | 2479 "xmm0", "xmm5" |
2474 ); | 2480 ); |
2475 } | 2481 } |
2476 #endif // HAS_MIRRORROW_AVX2 | 2482 #endif // HAS_MIRRORROW_AVX2 |
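Both mirror variants are table-driven byte reversal: the row is read from the tail end (the -0x10(%0,%2) addressing) and each 16-byte block is flipped with pshufb and kShuffleMirror. The scalar equivalent (name illustrative):

#include <stdint.h>

static void MirrorRow_Scalar(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}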
2477 | 2483 |
2478 #ifdef HAS_MIRRORUVROW_SSSE3 | 2484 #ifdef HAS_MIRRORUVROW_SSSE3 |
2479 // Shuffle table for reversing the bytes of UV channels. | 2485 // Shuffle table for reversing the bytes of UV channels. |
2480 static uvec8 kShuffleMirrorUV = { | 2486 static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, |
2481 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u | 2487 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; |
2482 }; | 2488 void MirrorUVRow_SSSE3(const uint8* src, |
2483 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, | 2489 uint8* dst_u, |
| 2490 uint8* dst_v, |
2484 int width) { | 2491 int width) { |
2485 intptr_t temp_width = (intptr_t)(width); | 2492 intptr_t temp_width = (intptr_t)(width); |
2486 asm volatile ( | 2493 asm volatile ( |
2487 "movdqa %4,%%xmm1 \n" | 2494 "movdqa %4,%%xmm1 \n" |
2488 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" | 2495 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" |
2489 "sub %1,%2 \n" | 2496 "sub %1,%2 \n" |
2490 LABELALIGN | 2497 LABELALIGN |
2491 "1: \n" | 2498 "1: \n" |
2492 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 2499 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
2493 "lea " MEMLEA(-0x10,0) ",%0 \n" | 2500 "lea " MEMLEA(-0x10,0) ",%0 \n" |
(...skipping 34 matching lines...)
2528 "+r"(temp_width) // %2 | 2535 "+r"(temp_width) // %2 |
2529 : | 2536 : |
2530 : "memory", "cc" | 2537 : "memory", "cc" |
2531 , "xmm0" | 2538 , "xmm0" |
2532 ); | 2539 ); |
2533 } | 2540 } |
2534 #endif // HAS_ARGBMIRRORROW_SSE2 | 2541 #endif // HAS_ARGBMIRRORROW_SSE2 |
2535 | 2542 |
2536 #ifdef HAS_ARGBMIRRORROW_AVX2 | 2543 #ifdef HAS_ARGBMIRRORROW_AVX2 |
2537 // Shuffle table for reversing the bytes. | 2544 // Shuffle table for reversing the bytes. |
2538 static const ulvec32 kARGBShuffleMirror_AVX2 = { | 2545 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; |
2539 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | |
2540 }; | |
2541 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { | 2546 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { |
2542 intptr_t temp_width = (intptr_t)(width); | 2547 intptr_t temp_width = (intptr_t)(width); |
2543 asm volatile ( | 2548 asm volatile ( |
2544 "vmovdqu %3,%%ymm5 \n" | 2549 "vmovdqu %3,%%ymm5 \n" |
2545 LABELALIGN | 2550 LABELALIGN |
2546 "1: \n" | 2551 "1: \n" |
2547 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 | 2552 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 |
2548 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | 2553 "vmovdqu %%ymm0," MEMACCESS(1) " \n" |
2549 "lea " MEMLEA(0x20,1) ",%1 \n" | 2554 "lea " MEMLEA(0x20,1) ",%1 \n" |
2550 "sub $0x8,%2 \n" | 2555 "sub $0x8,%2 \n" |
2551 "jg 1b \n" | 2556 "jg 1b \n" |
2552 "vzeroupper \n" | 2557 "vzeroupper \n" |
2553 : "+r"(src), // %0 | 2558 : "+r"(src), // %0 |
2554 "+r"(dst), // %1 | 2559 "+r"(dst), // %1 |
2555 "+r"(temp_width) // %2 | 2560 "+r"(temp_width) // %2 |
2556 : "m"(kARGBShuffleMirror_AVX2) // %3 | 2561 : "m"(kARGBShuffleMirror_AVX2) // %3 |
2557 : "memory", "cc", NACL_R14 | 2562 : "memory", "cc", NACL_R14 |
2558 "xmm0", "xmm5" | 2563 "xmm0", "xmm5" |
2559 ); | 2564 ); |
2560 } | 2565 } |
2561 #endif // HAS_ARGBMIRRORROW_AVX2 | 2566 #endif // HAS_ARGBMIRRORROW_AVX2 |
2562 | 2567 |
2563 #ifdef HAS_SPLITUVROW_AVX2 | 2568 #ifdef HAS_SPLITUVROW_AVX2 |
2564 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 2569 void SplitUVRow_AVX2(const uint8* src_uv, |
| 2570 uint8* dst_u, |
| 2571 uint8* dst_v, |
2565 int width) { | 2572 int width) { |
2566 asm volatile ( | 2573 asm volatile ( |
2567 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2574 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2568 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 2575 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
2569 "sub %1,%2 \n" | 2576 "sub %1,%2 \n" |
2570 LABELALIGN | 2577 LABELALIGN |
2571 "1: \n" | 2578 "1: \n" |
2572 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 2579 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
2573 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 2580 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
2574 "lea " MEMLEA(0x40,0) ",%0 \n" | 2581 "lea " MEMLEA(0x40,0) ",%0 \n" |
(...skipping 16 matching lines...)
2591 "+r"(dst_v), // %2 | 2598 "+r"(dst_v), // %2 |
2592 "+r"(width) // %3 | 2599 "+r"(width) // %3 |
2593 : | 2600 : |
2594 : "memory", "cc", NACL_R14 | 2601 : "memory", "cc", NACL_R14 |
2595 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 2602 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
2596 ); | 2603 ); |
2597 } | 2604 } |
2598 #endif // HAS_SPLITUVROW_AVX2 | 2605 #endif // HAS_SPLITUVROW_AVX2 |
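The mask/shift/pack triple in both SplitUV versions is a 2:1 deinterleave: AND with the 0x00ff mask in xmm5/ymm5 keeps the even (U) bytes, the >>8 keeps the odd (V) bytes, and packuswb narrows each stream back to bytes. Scalar sketch (name illustrative):

#include <stdint.h>

static void SplitUVRow_Scalar(const uint8_t* src_uv, uint8_t* dst_u,
                              uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0];  // even bytes: kept by the 0x00ff mask
    dst_v[x] = src_uv[2 * x + 1];  // odd bytes: kept by the >>8
  }
}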
2599 | 2606 |
2600 #ifdef HAS_SPLITUVROW_SSE2 | 2607 #ifdef HAS_SPLITUVROW_SSE2 |
2601 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 2608 void SplitUVRow_SSE2(const uint8* src_uv, |
| 2609 uint8* dst_u, |
| 2610 uint8* dst_v, |
2602 int width) { | 2611 int width) { |
2603 asm volatile ( | 2612 asm volatile ( |
2604 "pcmpeqb %%xmm5,%%xmm5 \n" | 2613 "pcmpeqb %%xmm5,%%xmm5 \n" |
2605 "psrlw $0x8,%%xmm5 \n" | 2614 "psrlw $0x8,%%xmm5 \n" |
2606 "sub %1,%2 \n" | 2615 "sub %1,%2 \n" |
2607 LABELALIGN | 2616 LABELALIGN |
2608 "1: \n" | 2617 "1: \n" |
2609 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 2618 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
2610 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 2619 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
2611 "lea " MEMLEA(0x20,0) ",%0 \n" | 2620 "lea " MEMLEA(0x20,0) ",%0 \n" |
(...skipping 15 matching lines...)
2627 "+r"(dst_v), // %2 | 2636 "+r"(dst_v), // %2 |
2628 "+r"(width) // %3 | 2637 "+r"(width) // %3 |
2629 : | 2638 : |
2630 : "memory", "cc", NACL_R14 | 2639 : "memory", "cc", NACL_R14 |
2631 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 2640 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
2632 ); | 2641 ); |
2633 } | 2642 } |
2634 #endif // HAS_SPLITUVROW_SSE2 | 2643 #endif // HAS_SPLITUVROW_SSE2 |
2635 | 2644 |
2636 #ifdef HAS_MERGEUVROW_AVX2 | 2645 #ifdef HAS_MERGEUVROW_AVX2 |
2637 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 2646 void MergeUVRow_AVX2(const uint8* src_u, |
| 2647 const uint8* src_v, |
| 2648 uint8* dst_uv, |
2638 int width) { | 2649 int width) { |
2639 asm volatile ( | 2650 asm volatile ( |
2640 "sub %0,%1 \n" | 2651 "sub %0,%1 \n" |
2641 LABELALIGN | 2652 LABELALIGN |
2642 "1: \n" | 2653 "1: \n" |
2643 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 2654 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
2644 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 | 2655 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 |
2645 "lea " MEMLEA(0x20,0) ",%0 \n" | 2656 "lea " MEMLEA(0x20,0) ",%0 \n" |
2646 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" | 2657 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" |
2647 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" | 2658 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" |
(...skipping 10 matching lines...)
2658 "+r"(dst_uv), // %2 | 2669 "+r"(dst_uv), // %2 |
2659 "+r"(width) // %3 | 2670 "+r"(width) // %3 |
2660 : | 2671 : |
2661 : "memory", "cc", NACL_R14 | 2672 : "memory", "cc", NACL_R14 |
2662 "xmm0", "xmm1", "xmm2" | 2673 "xmm0", "xmm1", "xmm2" |
2663 ); | 2674 ); |
2664 } | 2675 } |
2665 #endif // HAS_MERGEUVROW_AVX2 | 2676 #endif // HAS_MERGEUVROW_AVX2 |
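MergeUVRow is the exact inverse: vpunpcklbw/vpunpckhbw interleave one U byte with one V byte per pixel. Scalar sketch (name illustrative):

#include <stdint.h>

static void MergeUVRow_Scalar(const uint8_t* src_u, const uint8_t* src_v,
                              uint8_t* dst_uv, int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}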
2666 | 2677 |
2667 #ifdef HAS_MERGEUVROW_SSE2 | 2678 #ifdef HAS_MERGEUVROW_SSE2 |
2668 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 2679 void MergeUVRow_SSE2(const uint8* src_u, |
| 2680 const uint8* src_v, |
| 2681 uint8* dst_uv, |
2669 int width) { | 2682 int width) { |
2670 asm volatile ( | 2683 asm volatile ( |
2671 "sub %0,%1 \n" | 2684 "sub %0,%1 \n" |
2672 LABELALIGN | 2685 LABELALIGN |
2673 "1: \n" | 2686 "1: \n" |
2674 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 2687 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
2675 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | 2688 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 |
2676 "lea " MEMLEA(0x10,0) ",%0 \n" | 2689 "lea " MEMLEA(0x10,0) ",%0 \n" |
2677 "movdqa %%xmm0,%%xmm2 \n" | 2690 "movdqa %%xmm0,%%xmm2 \n" |
2678 "punpcklbw %%xmm1,%%xmm0 \n" | 2691 "punpcklbw %%xmm1,%%xmm0 \n" |
(...skipping 73 matching lines...)
2752 : "memory", "cc" | 2765 : "memory", "cc" |
2753 , "xmm0", "xmm1" | 2766 , "xmm0", "xmm1" |
2754 ); | 2767 ); |
2755 } | 2768 } |
2756 #endif // HAS_COPYROW_AVX | 2769 #endif // HAS_COPYROW_AVX |
2757 | 2770 |
2758 #ifdef HAS_COPYROW_ERMS | 2771 #ifdef HAS_COPYROW_ERMS |
2759 // Multiple of 1. | 2772 // Multiple of 1. |
2760 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { | 2773 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { |
2761 size_t width_tmp = (size_t)(width); | 2774 size_t width_tmp = (size_t)(width); |
2762 asm volatile ( | 2775 asm volatile("rep movsb " MEMMOVESTRING(0, 1) " \n" |
2763 "rep movsb " MEMMOVESTRING(0,1) " \n" | 2776 : "+S"(src), // %0 |
2764 : "+S"(src), // %0 | 2777 "+D"(dst), // %1 |
2765 "+D"(dst), // %1 | 2778 "+c"(width_tmp) // %2 |
2766 "+c"(width_tmp) // %2 | 2779 : |
2767 : | 2780 : "memory", "cc"); |
2768 : "memory", "cc" | |
2769 ); | |
2770 } | 2781 } |
2771 #endif // HAS_COPYROW_ERMS | 2782 #endif // HAS_COPYROW_ERMS |
2772 | 2783 |
2773 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 | 2784 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 |
2774 // width in pixels | 2785 // width in pixels |
2775 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { | 2786 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
2776 asm volatile ( | 2787 asm volatile ( |
2777 "pcmpeqb %%xmm0,%%xmm0 \n" | 2788 "pcmpeqb %%xmm0,%%xmm0 \n" |
2778 "pslld $0x18,%%xmm0 \n" | 2789 "pslld $0x18,%%xmm0 \n" |
2779 "pcmpeqb %%xmm1,%%xmm1 \n" | 2790 "pcmpeqb %%xmm1,%%xmm1 \n" |
(...skipping 51 matching lines...)
2831 : | 2842 : |
2832 : "memory", "cc" | 2843 : "memory", "cc" |
2833 , "xmm0", "xmm1", "xmm2" | 2844 , "xmm0", "xmm1", "xmm2" |
2834 ); | 2845 ); |
2835 } | 2846 } |
2836 #endif // HAS_ARGBCOPYALPHAROW_AVX2 | 2847 #endif // HAS_ARGBCOPYALPHAROW_AVX2 |
2837 | 2848 |
2838 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 | 2849 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 |
2839 // width in pixels | 2850 // width in pixels |
2840 void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { | 2851 void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { |
2841 asm volatile ( | 2852 asm volatile ( |
2842 LABELALIGN | 2853 LABELALIGN |
2843 "1: \n" | 2854 "1: \n" |
2844 "movdqu " MEMACCESS(0) ", %%xmm0 \n" | 2855 "movdqu " MEMACCESS(0) ", %%xmm0 \n" |
2845 "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n" | 2856 "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n" |
2846 "lea " MEMLEA(0x20, 0) ", %0 \n" | 2857 "lea " MEMLEA(0x20, 0) ", %0 \n" |
2847 "psrld $0x18, %%xmm0 \n" | 2858 "psrld $0x18, %%xmm0 \n" |
2848 "psrld $0x18, %%xmm1 \n" | 2859 "psrld $0x18, %%xmm1 \n" |
2849 "packssdw %%xmm1, %%xmm0 \n" | 2860 "packssdw %%xmm1, %%xmm0 \n" |
2850 "packuswb %%xmm0, %%xmm0 \n" | 2861 "packuswb %%xmm0, %%xmm0 \n" |
2851 "movq %%xmm0," MEMACCESS(1) " \n" | 2862 "movq %%xmm0," MEMACCESS(1) " \n" |
2852 "lea " MEMLEA(0x8, 1) ", %1 \n" | 2863 "lea " MEMLEA(0x8, 1) ", %1 \n" |
2853 "sub $0x8, %2 \n" | 2864 "sub $0x8, %2 \n" |
2854 "jg 1b \n" | 2865 "jg 1b \n" |
2855 : "+r"(src_argb), // %0 | 2866 : "+r"(src_argb), // %0 |
2856 "+r"(dst_a), // %1 | 2867 "+r"(dst_a), // %1 |
2857 "+rm"(width) // %2 | 2868 "+rm"(width) // %2 |
2858 : | 2869 : |
2859 : "memory", "cc" | 2870 : "memory", "cc" |
2860 , "xmm0", "xmm1" | 2871 , "xmm0", "xmm1" |
2861 ); | 2872 ); |
2862 } | 2873 } |
2863 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 | 2874 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 |
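The SSE2 extract works because alpha is byte 3 of each little-endian B,G,R,A pixel: psrld $0x18 leaves it in byte 0 of every dword, then packssdw and packuswb narrow eight dwords down to eight bytes. Scalar sketch (name illustrative):

#include <stdint.h>

static void ExtractAlphaRow_Scalar(const uint8_t* src_argb, uint8_t* dst_a,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    dst_a[x] = src_argb[4 * x + 3];  // byte 3 = A of B,G,R,A
  }
}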
2864 | 2875 |
2865 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 | 2876 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 |
2866 static const uvec8 kShuffleAlphaShort_AVX2 = { | 2877 static const uvec8 kShuffleAlphaShort_AVX2 = { |
2867 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, | 2878 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, |
2868 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u | 2879 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; |
2869 }; | |
2870 | 2880 |
2871 void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) { | 2881 void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) { |
2872 asm volatile ( | 2882 asm volatile ( |
2873 "vmovdqa %3,%%ymm4 \n" | 2883 "vmovdqa %3,%%ymm4 \n" |
2874 "vbroadcastf128 %4,%%ymm5 \n" | 2884 "vbroadcastf128 %4,%%ymm5 \n" |
2875 LABELALIGN | 2885 LABELALIGN |
2876 "1: \n" | 2886 "1: \n" |
2877 "vmovdqu " MEMACCESS(0) ", %%ymm0 \n" | 2887 "vmovdqu " MEMACCESS(0) ", %%ymm0 \n" |
2878 "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n" | 2888 "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n" |
2879 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 | 2889 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 |
2880 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" | 2890 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" |
2881 "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n" | 2891 "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n" |
2882 "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n" | 2892 "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n" |
(...skipping 86 matching lines...)
2969 : "memory", "cc" | 2979 : "memory", "cc" |
2970 , "xmm0", "xmm1", "xmm2" | 2980 , "xmm0", "xmm1", "xmm2" |
2971 ); | 2981 ); |
2972 } | 2982 } |
2973 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 | 2983 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 |
2974 | 2984 |
2975 #ifdef HAS_SETROW_X86 | 2985 #ifdef HAS_SETROW_X86 |
2976 void SetRow_X86(uint8* dst, uint8 v8, int width) { | 2986 void SetRow_X86(uint8* dst, uint8 v8, int width) { |
2977 size_t width_tmp = (size_t)(width >> 2); | 2987 size_t width_tmp = (size_t)(width >> 2); |
2978 const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. | 2988 const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. |
2979 asm volatile ( | 2989 asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n" |
2980 "rep stosl " MEMSTORESTRING(eax,0) " \n" | 2990 : "+D"(dst), // %0 |
2981 : "+D"(dst), // %0 | 2991 "+c"(width_tmp) // %1 |
2982 "+c"(width_tmp) // %1 | 2992 : "a"(v32) // %2 |
2983 : "a"(v32) // %2 | 2993 : "memory", "cc"); |
2984 : "memory", "cc"); | |
2985 } | 2994 } |
2986 | 2995 |
2987 void SetRow_ERMS(uint8* dst, uint8 v8, int width) { | 2996 void SetRow_ERMS(uint8* dst, uint8 v8, int width) { |
2988 size_t width_tmp = (size_t)(width); | 2997 size_t width_tmp = (size_t)(width); |
2989 asm volatile ( | 2998 asm volatile("rep stosb " MEMSTORESTRING(al, 0) " \n" |
2990 "rep stosb " MEMSTORESTRING(al,0) " \n" | 2999 : "+D"(dst), // %0 |
2991 : "+D"(dst), // %0 | 3000 "+c"(width_tmp) // %1 |
2992 "+c"(width_tmp) // %1 | 3001 : "a"(v8) // %2 |
2993 : "a"(v8) // %2 | 3002 : "memory", "cc"); |
2994 : "memory", "cc"); | |
2995 } | 3003 } |
2996 | 3004 |
2997 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { | 3005 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { |
2998 size_t width_tmp = (size_t)(width); | 3006 size_t width_tmp = (size_t)(width); |
2999 asm volatile ( | 3007 asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n" |
3000 "rep stosl " MEMSTORESTRING(eax,0) " \n" | 3008 : "+D"(dst_argb), // %0 |
3001 : "+D"(dst_argb), // %0 | 3009 "+c"(width_tmp) // %1 |
3002 "+c"(width_tmp) // %1 | 3010 : "a"(v32) // %2 |
3003 : "a"(v32) // %2 | 3011 : "memory", "cc"); |
3004 : "memory", "cc"); | |
3005 } | 3012 } |
3006 #endif // HAS_SETROW_X86 | 3013 #endif // HAS_SETROW_X86 |
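All three setters rely on the byte-splat trick noted above: v8 * 0x01010101 copies the byte into every lane of a dword (0xAB becomes 0xABABABAB), so rep stosl can fill four bytes per store. A scalar sketch of SetRow_X86's fill, assuming, like the asm, that the caller handles any tail not covered by width >> 2:

#include <stdint.h>
#include <string.h>

static void SetRow_Scalar(uint8_t* dst, uint8_t v8, int width) {
  uint32_t v32 = v8 * 0x01010101u;  // duplicate byte to all four bytes
  for (int x = 0; x < (width >> 2); ++x) {
    memcpy(dst + 4 * x, &v32, 4);   // one "rep stosl" iteration
  }
}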
3007 | 3014 |
3008 #ifdef HAS_YUY2TOYROW_SSE2 | 3015 #ifdef HAS_YUY2TOYROW_SSE2 |
3009 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) { | 3016 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) { |
3010 asm volatile ( | 3017 asm volatile ( |
3011 "pcmpeqb %%xmm5,%%xmm5 \n" | 3018 "pcmpeqb %%xmm5,%%xmm5 \n" |
3012 "psrlw $0x8,%%xmm5 \n" | 3019 "psrlw $0x8,%%xmm5 \n" |
3013 LABELALIGN | 3020 LABELALIGN |
3014 "1: \n" | 3021 "1: \n" |
3015 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3022 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
3016 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3023 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
3017 "lea " MEMLEA(0x20,0) ",%0 \n" | 3024 "lea " MEMLEA(0x20,0) ",%0 \n" |
3018 "pand %%xmm5,%%xmm0 \n" | 3025 "pand %%xmm5,%%xmm0 \n" |
3019 "pand %%xmm5,%%xmm1 \n" | 3026 "pand %%xmm5,%%xmm1 \n" |
3020 "packuswb %%xmm1,%%xmm0 \n" | 3027 "packuswb %%xmm1,%%xmm0 \n" |
3021 "movdqu %%xmm0," MEMACCESS(1) " \n" | 3028 "movdqu %%xmm0," MEMACCESS(1) " \n" |
3022 "lea " MEMLEA(0x10,1) ",%1 \n" | 3029 "lea " MEMLEA(0x10,1) ",%1 \n" |
3023 "sub $0x10,%2 \n" | 3030 "sub $0x10,%2 \n" |
3024 "jg 1b \n" | 3031 "jg 1b \n" |
3025 : "+r"(src_yuy2), // %0 | 3032 : "+r"(src_yuy2), // %0 |
3026 "+r"(dst_y), // %1 | 3033 "+r"(dst_y), // %1 |
3027 "+r"(width) // %2 | 3034 "+r"(width) // %2 |
3028 : | 3035 : |
3029 : "memory", "cc" | 3036 : "memory", "cc" |
3030 , "xmm0", "xmm1", "xmm5" | 3037 , "xmm0", "xmm1", "xmm5" |
3031 ); | 3038 ); |
3032 } | 3039 } |
3033 | 3040 |
3034 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, | 3041 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, |
3035 uint8* dst_u, uint8* dst_v, int width) { | 3042 int stride_yuy2, |
| 3043 uint8* dst_u, |
| 3044 uint8* dst_v, |
| 3045 int width) { |
3036 asm volatile ( | 3046 asm volatile ( |
3037 "pcmpeqb %%xmm5,%%xmm5 \n" | 3047 "pcmpeqb %%xmm5,%%xmm5 \n" |
3038 "psrlw $0x8,%%xmm5 \n" | 3048 "psrlw $0x8,%%xmm5 \n" |
3039 "sub %1,%2 \n" | 3049 "sub %1,%2 \n" |
3040 LABELALIGN | 3050 LABELALIGN |
3041 "1: \n" | 3051 "1: \n" |
3042 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3052 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
3043 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3053 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
3044 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 | 3054 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 |
3045 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 | 3055 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 |
(...skipping 17 matching lines...)
3063 "+r"(dst_u), // %1 | 3073 "+r"(dst_u), // %1 |
3064 "+r"(dst_v), // %2 | 3074 "+r"(dst_v), // %2 |
3065 "+r"(width) // %3 | 3075 "+r"(width) // %3 |
3066 : "r"((intptr_t)(stride_yuy2)) // %4 | 3076 : "r"((intptr_t)(stride_yuy2)) // %4 |
3067 : "memory", "cc", NACL_R14 | 3077 : "memory", "cc", NACL_R14 |
3068 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 3078 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
3069 ); | 3079 ); |
3070 } | 3080 } |
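YUY2ToUVRow reads the row twice: once at %0 and once at %0 plus the stride in %4, and pavgb rounds-to-nearest averages the two loads before the odd-byte split; that averaging is the vertical half of the 4:2:2 to 4:2:0 chroma reduction. Scalar sketch (name illustrative):

#include <stdint.h>

static void YUY2ToUVRow_Scalar(const uint8_t* src_yuy2, int stride,
                               uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    const uint8_t* row0 = src_yuy2 + 2 * x;  // Y0 U Y1 V
    const uint8_t* row1 = row0 + stride;     // same columns, next row
    dst_u[x / 2] = (uint8_t)((row0[1] + row1[1] + 1) >> 1);  // pavgb rounds
    dst_v[x / 2] = (uint8_t)((row0[3] + row1[3] + 1) >> 1);
  }
}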
3071 | 3081 |
3072 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, | 3082 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, |
3073 uint8* dst_u, uint8* dst_v, int width) { | 3083 uint8* dst_u, |
| 3084 uint8* dst_v, |
| 3085 int width) { |
3074 asm volatile ( | 3086 asm volatile ( |
3075 "pcmpeqb %%xmm5,%%xmm5 \n" | 3087 "pcmpeqb %%xmm5,%%xmm5 \n" |
3076 "psrlw $0x8,%%xmm5 \n" | 3088 "psrlw $0x8,%%xmm5 \n" |
3077 "sub %1,%2 \n" | 3089 "sub %1,%2 \n" |
3078 LABELALIGN | 3090 LABELALIGN |
3079 "1: \n" | 3091 "1: \n" |
3080 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3092 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
3081 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3093 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
3082 "lea " MEMLEA(0x20,0) ",%0 \n" | 3094 "lea " MEMLEA(0x20,0) ",%0 \n" |
3083 "psrlw $0x8,%%xmm0 \n" | 3095 "psrlw $0x8,%%xmm0 \n" |
(...skipping 35 matching lines...)
3119 "jg 1b \n" | 3131 "jg 1b \n" |
3120 : "+r"(src_uyvy), // %0 | 3132 : "+r"(src_uyvy), // %0 |
3121 "+r"(dst_y), // %1 | 3133 "+r"(dst_y), // %1 |
3122 "+r"(width) // %2 | 3134 "+r"(width) // %2 |
3123 : | 3135 : |
3124 : "memory", "cc" | 3136 : "memory", "cc" |
3125 , "xmm0", "xmm1" | 3137 , "xmm0", "xmm1" |
3126 ); | 3138 ); |
3127 } | 3139 } |
3128 | 3140 |
3129 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, | 3141 void UYVYToUVRow_SSE2(const uint8* src_uyvy, |
3130 uint8* dst_u, uint8* dst_v, int width) { | 3142 int stride_uyvy, |
| 3143 uint8* dst_u, |
| 3144 uint8* dst_v, |
| 3145 int width) { |
3131 asm volatile ( | 3146 asm volatile ( |
3132 "pcmpeqb %%xmm5,%%xmm5 \n" | 3147 "pcmpeqb %%xmm5,%%xmm5 \n" |
3133 "psrlw $0x8,%%xmm5 \n" | 3148 "psrlw $0x8,%%xmm5 \n" |
3134 "sub %1,%2 \n" | 3149 "sub %1,%2 \n" |
3135 LABELALIGN | 3150 LABELALIGN |
3136 "1: \n" | 3151 "1: \n" |
3137 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3152 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
3138 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3153 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
3139 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 | 3154 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 |
3140 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 | 3155 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 |
(...skipping 17 matching lines...)
3158 "+r"(dst_u), // %1 | 3173 "+r"(dst_u), // %1 |
3159 "+r"(dst_v), // %2 | 3174 "+r"(dst_v), // %2 |
3160 "+r"(width) // %3 | 3175 "+r"(width) // %3 |
3161 : "r"((intptr_t)(stride_uyvy)) // %4 | 3176 : "r"((intptr_t)(stride_uyvy)) // %4 |
3162 : "memory", "cc", NACL_R14 | 3177 : "memory", "cc", NACL_R14 |
3163 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 3178 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
3164 ); | 3179 ); |
3165 } | 3180 } |
3166 | 3181 |
3167 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, | 3182 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, |
3168 uint8* dst_u, uint8* dst_v, int width) { | 3183 uint8* dst_u, |
| 3184 uint8* dst_v, |
| 3185 int width) { |
3169 asm volatile ( | 3186 asm volatile ( |
3170 "pcmpeqb %%xmm5,%%xmm5 \n" | 3187 "pcmpeqb %%xmm5,%%xmm5 \n" |
3171 "psrlw $0x8,%%xmm5 \n" | 3188 "psrlw $0x8,%%xmm5 \n" |
3172 "sub %1,%2 \n" | 3189 "sub %1,%2 \n" |
3173 LABELALIGN | 3190 LABELALIGN |
3174 "1: \n" | 3191 "1: \n" |
3175 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3192 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
3176 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3193 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
3177 "lea " MEMLEA(0x20,0) ",%0 \n" | 3194 "lea " MEMLEA(0x20,0) ",%0 \n" |
3178 "pand %%xmm5,%%xmm0 \n" | 3195 "pand %%xmm5,%%xmm0 \n" |
(...skipping 41 matching lines...)
3220 "vzeroupper \n" | 3237 "vzeroupper \n" |
3221 : "+r"(src_yuy2), // %0 | 3238 : "+r"(src_yuy2), // %0 |
3222 "+r"(dst_y), // %1 | 3239 "+r"(dst_y), // %1 |
3223 "+r"(width) // %2 | 3240 "+r"(width) // %2 |
3224 : | 3241 : |
3225 : "memory", "cc" | 3242 : "memory", "cc" |
3226 , "xmm0", "xmm1", "xmm5" | 3243 , "xmm0", "xmm1", "xmm5" |
3227 ); | 3244 ); |
3228 } | 3245 } |
3229 | 3246 |
3230 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, | 3247 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, |
3231 uint8* dst_u, uint8* dst_v, int width) { | 3248 int stride_yuy2, |
| 3249 uint8* dst_u, |
| 3250 uint8* dst_v, |
| 3251 int width) { |
3232 asm volatile ( | 3252 asm volatile ( |
3233 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3253 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
3234 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 3254 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
3235 "sub %1,%2 \n" | 3255 "sub %1,%2 \n" |
3236 LABELALIGN | 3256 LABELALIGN |
3237 "1: \n" | 3257 "1: \n" |
3238 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 3258 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
3239 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 3259 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
3240 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 | 3260 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 |
3241 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) | 3261 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) |
(...skipping 18 matching lines...)
3260 "+r"(dst_u), // %1 | 3280 "+r"(dst_u), // %1 |
3261 "+r"(dst_v), // %2 | 3281 "+r"(dst_v), // %2 |
3262 "+r"(width) // %3 | 3282 "+r"(width) // %3 |
3263 : "r"((intptr_t)(stride_yuy2)) // %4 | 3283 : "r"((intptr_t)(stride_yuy2)) // %4 |
3264 : "memory", "cc", NACL_R14 | 3284 : "memory", "cc", NACL_R14 |
3265 "xmm0", "xmm1", "xmm5" | 3285 "xmm0", "xmm1", "xmm5" |
3266 ); | 3286 ); |
3267 } | 3287 } |
3268 | 3288 |
3269 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, | 3289 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, |
3270 uint8* dst_u, uint8* dst_v, int width) { | 3290 uint8* dst_u, |
| 3291 uint8* dst_v, |
| 3292 int width) { |
3271 asm volatile ( | 3293 asm volatile ( |
3272 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3294 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
3273 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 3295 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
3274 "sub %1,%2 \n" | 3296 "sub %1,%2 \n" |
3275 LABELALIGN | 3297 LABELALIGN |
3276 "1: \n" | 3298 "1: \n" |
3277 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 3299 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
3278 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 3300 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
3279 "lea " MEMLEA(0x40,0) ",%0 \n" | 3301 "lea " MEMLEA(0x40,0) ",%0 \n" |
3280 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | 3302 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" |
(...skipping 39 matching lines...)
3320 "jg 1b \n" | 3342 "jg 1b \n" |
3321 "vzeroupper \n" | 3343 "vzeroupper \n" |
3322 : "+r"(src_uyvy), // %0 | 3344 : "+r"(src_uyvy), // %0 |
3323 "+r"(dst_y), // %1 | 3345 "+r"(dst_y), // %1 |
3324 "+r"(width) // %2 | 3346 "+r"(width) // %2 |
3325 : | 3347 : |
3326 : "memory", "cc" | 3348 : "memory", "cc" |
3327 , "xmm0", "xmm1", "xmm5" | 3349 , "xmm0", "xmm1", "xmm5" |
3328 ); | 3350 ); |
3329 } | 3351 } |
3330 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, | 3352 void UYVYToUVRow_AVX2(const uint8* src_uyvy, |
3331 uint8* dst_u, uint8* dst_v, int width) { | 3353 int stride_uyvy, |
| 3354 uint8* dst_u, |
| 3355 uint8* dst_v, |
| 3356 int width) { |
3332 asm volatile ( | 3357 asm volatile ( |
3333 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3358 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
3334 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 3359 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
3335 "sub %1,%2 \n" | 3360 "sub %1,%2 \n" |
3336 | 3361 |
3337 LABELALIGN | 3362 LABELALIGN |
3338 "1: \n" | 3363 "1: \n" |
3339 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 3364 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
3340 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 3365 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
3341 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 | 3366 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 |
(...skipping 19 matching lines...)
3361 "+r"(dst_u), // %1 | 3386 "+r"(dst_u), // %1 |
3362 "+r"(dst_v), // %2 | 3387 "+r"(dst_v), // %2 |
3363 "+r"(width) // %3 | 3388 "+r"(width) // %3 |
3364 : "r"((intptr_t)(stride_uyvy)) // %4 | 3389 : "r"((intptr_t)(stride_uyvy)) // %4 |
3365 : "memory", "cc", NACL_R14 | 3390 : "memory", "cc", NACL_R14 |
3366 "xmm0", "xmm1", "xmm5" | 3391 "xmm0", "xmm1", "xmm5" |
3367 ); | 3392 ); |
3368 } | 3393 } |
3369 | 3394 |
3370 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, | 3395 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, |
3371 uint8* dst_u, uint8* dst_v, int width) { | 3396 uint8* dst_u, |
| 3397 uint8* dst_v, |
| 3398 int width) { |
3372 asm volatile ( | 3399 asm volatile ( |
3373 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3400 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
3374 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 3401 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
3375 "sub %1,%2 \n" | 3402 "sub %1,%2 \n" |
3376 LABELALIGN | 3403 LABELALIGN |
3377 "1: \n" | 3404 "1: \n" |
3378 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 3405 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
3379 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 3406 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
3380 "lea " MEMLEA(0x40,0) ",%0 \n" | 3407 "lea " MEMLEA(0x40,0) ",%0 \n" |
3381 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | 3408 "vpand %%ymm5,%%ymm0,%%ymm0 \n" |
(...skipping 18 matching lines...)
3400 "+r"(width) // %3 | 3427 "+r"(width) // %3 |
3401 : | 3428 : |
3402 : "memory", "cc", NACL_R14 | 3429 : "memory", "cc", NACL_R14 |
3403 "xmm0", "xmm1", "xmm5" | 3430 "xmm0", "xmm1", "xmm5" |
3404 ); | 3431 ); |
3405 } | 3432 } |
3406 #endif // HAS_YUY2TOYROW_AVX2 | 3433 #endif // HAS_YUY2TOYROW_AVX2 |
3407 | 3434 |
3408 #ifdef HAS_ARGBBLENDROW_SSSE3 | 3435 #ifdef HAS_ARGBBLENDROW_SSSE3 |
3409 // Shuffle table for isolating alpha. | 3436 // Shuffle table for isolating alpha. |
3410 static uvec8 kShuffleAlpha = { | 3437 static uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, |
3411 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, | 3438 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; |
3412 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 | |
3413 }; | |
3414 | 3439 |
3415 // Blend 8 pixels at a time | 3440 // Blend 8 pixels at a time |
3416 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, | 3441 void ARGBBlendRow_SSSE3(const uint8* src_argb0, |
3417 uint8* dst_argb, int width) { | 3442 const uint8* src_argb1, |
| 3443 uint8* dst_argb, |
| 3444 int width) { |
3418 asm volatile ( | 3445 asm volatile ( |
3419 "pcmpeqb %%xmm7,%%xmm7 \n" | 3446 "pcmpeqb %%xmm7,%%xmm7 \n" |
3420 "psrlw $0xf,%%xmm7 \n" | 3447 "psrlw $0xf,%%xmm7 \n" |
3421 "pcmpeqb %%xmm6,%%xmm6 \n" | 3448 "pcmpeqb %%xmm6,%%xmm6 \n" |
3422 "psrlw $0x8,%%xmm6 \n" | 3449 "psrlw $0x8,%%xmm6 \n" |
3423 "pcmpeqb %%xmm5,%%xmm5 \n" | 3450 "pcmpeqb %%xmm5,%%xmm5 \n" |
3424 "psllw $0x8,%%xmm5 \n" | 3451 "psllw $0x8,%%xmm5 \n" |
3425 "pcmpeqb %%xmm4,%%xmm4 \n" | 3452 "pcmpeqb %%xmm4,%%xmm4 \n" |
3426 "pslld $0x18,%%xmm4 \n" | 3453 "pslld $0x18,%%xmm4 \n" |
3427 "sub $0x4,%3 \n" | 3454 "sub $0x4,%3 \n" |
(...skipping 64 matching lines...)
3492 ); | 3519 ); |
3493 } | 3520 } |
3494 #endif // HAS_ARGBBLENDROW_SSSE3 | 3521 #endif // HAS_ARGBBLENDROW_SSSE3 |
3495 | 3522 |
3496 #ifdef HAS_BLENDPLANEROW_SSSE3 | 3523 #ifdef HAS_BLENDPLANEROW_SSSE3 |
3497 // Blend 8 pixels at a time. | 3524 // Blend 8 pixels at a time. |
3498 // unsigned version of math | 3525 // unsigned version of math |
3499 // =((A2*C2)+(B2*(255-C2))+255)/256 | 3526 // =((A2*C2)+(B2*(255-C2))+255)/256 |
3500 // signed version of math | 3527 // signed version of math |
3501 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 | 3528 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 |
3502 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, | 3529 void BlendPlaneRow_SSSE3(const uint8* src0, |
3503 const uint8* alpha, uint8* dst, int width) { | 3530 const uint8* src1, |
3504 asm volatile ( | 3531 const uint8* alpha, |
3505 "pcmpeqb %%xmm5,%%xmm5 \n" | 3532 uint8* dst, |
3506 "psllw $0x8,%%xmm5 \n" | 3533 int width) { |
3507 "mov $0x80808080,%%eax \n" | 3534 asm volatile( |
3508 "movd %%eax,%%xmm6 \n" | 3535 "pcmpeqb %%xmm5,%%xmm5 \n" |
3509 "pshufd $0x0,%%xmm6,%%xmm6 \n" | 3536 "psllw $0x8,%%xmm5 \n" |
3510 "mov $0x807f807f,%%eax \n" | 3537 "mov $0x80808080,%%eax \n" |
3511 "movd %%eax,%%xmm7 \n" | 3538 "movd %%eax,%%xmm6 \n" |
3512 "pshufd $0x0,%%xmm7,%%xmm7 \n" | 3539 "pshufd $0x0,%%xmm6,%%xmm6 \n" |
3513 "sub %2,%0 \n" | 3540 "mov $0x807f807f,%%eax \n" |
3514 "sub %2,%1 \n" | 3541 "movd %%eax,%%xmm7 \n" |
3515 "sub %2,%3 \n" | 3542 "pshufd $0x0,%%xmm7,%%xmm7 \n" |
| 3543 "sub %2,%0 \n" |
| 3544 "sub %2,%1 \n" |
| 3545 "sub %2,%3 \n" |
3516 | 3546 |
3517 // 8 pixel loop. | 3547 // 8 pixel loop. |
3518 LABELALIGN | 3548 LABELALIGN |
3519 "1: \n" | 3549 "1: \n" |
3520 "movq (%2),%%xmm0 \n" | 3550 "movq (%2),%%xmm0 \n" |
3521 "punpcklbw %%xmm0,%%xmm0 \n" | 3551 "punpcklbw %%xmm0,%%xmm0 \n" |
3522 "pxor %%xmm5,%%xmm0 \n" | 3552 "pxor %%xmm5,%%xmm0 \n" |
3523 "movq (%0,%2,1),%%xmm1 \n" | 3553 "movq (%0,%2,1),%%xmm1 \n" |
3524 "movq (%1,%2,1),%%xmm2 \n" | 3554 "movq (%1,%2,1),%%xmm2 \n" |
3525 "punpcklbw %%xmm2,%%xmm1 \n" | 3555 "punpcklbw %%xmm2,%%xmm1 \n" |
3526 "psubb %%xmm6,%%xmm1 \n" | 3556 "psubb %%xmm6,%%xmm1 \n" |
3527 "pmaddubsw %%xmm1,%%xmm0 \n" | 3557 "pmaddubsw %%xmm1,%%xmm0 \n" |
3528 "paddw %%xmm7,%%xmm0 \n" | 3558 "paddw %%xmm7,%%xmm0 \n" |
3529 "psrlw $0x8,%%xmm0 \n" | 3559 "psrlw $0x8,%%xmm0 \n" |
3530 "packuswb %%xmm0,%%xmm0 \n" | 3560 "packuswb %%xmm0,%%xmm0 \n" |
3531 "movq %%xmm0,(%3,%2,1) \n" | 3561 "movq %%xmm0,(%3,%2,1) \n" |
3532 "lea 0x8(%2),%2 \n" | 3562 "lea 0x8(%2),%2 \n" |
3533 "sub $0x8,%4 \n" | 3563 "sub $0x8,%4 \n" |
3534 "jg 1b \n" | 3564 "jg 1b \n" |
3535 : "+r"(src0), // %0 | 3565 : "+r"(src0), // %0 |
3536 "+r"(src1), // %1 | 3566 "+r"(src1), // %1 |
3537 "+r"(alpha), // %2 | 3567 "+r"(alpha), // %2 |
3538 "+r"(dst), // %3 | 3568 "+r"(dst), // %3 |
3539 "+rm"(width) // %4 | 3569 "+rm"(width) // %4 |
3540 :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" | 3570 ::"memory", |
3541 ); | 3571 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); |
3542 } | 3572 } |
3543 #endif // HAS_BLENDPLANEROW_SSSE3 | 3573 #endif // HAS_BLENDPLANEROW_SSSE3 |
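// For reference, the unsigned blend math quoted above in plain C; the
// function name is illustrative, not the library's actual C fallback:
static void BlendPlaneRow_Sketch(const uint8* src0, const uint8* src1,
                                 const uint8* alpha, uint8* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    // ((A2*C2)+(B2*(255-C2))+255)/256: alpha 255 yields src0, 0 yields src1.
    dst[i] =
        (uint8)((src0[i] * alpha[i] + src1[i] * (255 - alpha[i]) + 255) >> 8);
  }
}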
3544 | 3574 |
3545 #ifdef HAS_BLENDPLANEROW_AVX2 | 3575 #ifdef HAS_BLENDPLANEROW_AVX2 |
3546 // Blend 32 pixels at a time. | 3576 // Blend 32 pixels at a time. |
3547 // unsigned version of math | 3577 // unsigned version of math |
3548 // =((A2*C2)+(B2*(255-C2))+255)/256 | 3578 // =((A2*C2)+(B2*(255-C2))+255)/256 |
3549 // signed version of math | 3579 // signed version of math |
3550 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 | 3580 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 |
3551 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, | 3581 void BlendPlaneRow_AVX2(const uint8* src0, |
3552 const uint8* alpha, uint8* dst, int width) { | 3582 const uint8* src1, |
3553 asm volatile ( | 3583 const uint8* alpha, |
3554 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3584 uint8* dst, |
3555 "vpsllw $0x8,%%ymm5,%%ymm5 \n" | 3585 int width) { |
3556 "mov $0x80808080,%%eax \n" | 3586 asm volatile( |
3557 "vmovd %%eax,%%xmm6 \n" | 3587 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
3558 "vbroadcastss %%xmm6,%%ymm6 \n" | 3588 "vpsllw $0x8,%%ymm5,%%ymm5 \n" |
3559 "mov $0x807f807f,%%eax \n" | 3589 "mov $0x80808080,%%eax \n" |
3560 "vmovd %%eax,%%xmm7 \n" | 3590 "vmovd %%eax,%%xmm6 \n" |
3561 "vbroadcastss %%xmm7,%%ymm7 \n" | 3591 "vbroadcastss %%xmm6,%%ymm6 \n" |
3562 "sub %2,%0 \n" | 3592 "mov $0x807f807f,%%eax \n" |
3563 "sub %2,%1 \n" | 3593 "vmovd %%eax,%%xmm7 \n" |
3564 "sub %2,%3 \n" | 3594 "vbroadcastss %%xmm7,%%ymm7 \n" |
| 3595 "sub %2,%0 \n" |
| 3596 "sub %2,%1 \n" |
| 3597 "sub %2,%3 \n" |
3565 | 3598 |
3566 // 32 pixel loop. | 3599 // 32 pixel loop. |
3567 LABELALIGN | 3600 LABELALIGN |
3568 "1: \n" | 3601 "1: \n" |
3569 "vmovdqu (%2),%%ymm0 \n" | 3602 "vmovdqu (%2),%%ymm0 \n" |
3570 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" | 3603 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" |
3571 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" | 3604 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" |
3572 "vpxor %%ymm5,%%ymm3,%%ymm3 \n" | 3605 "vpxor %%ymm5,%%ymm3,%%ymm3 \n" |
3573 "vpxor %%ymm5,%%ymm0,%%ymm0 \n" | 3606 "vpxor %%ymm5,%%ymm0,%%ymm0 \n" |
3574 "vmovdqu (%0,%2,1),%%ymm1 \n" | 3607 "vmovdqu (%0,%2,1),%%ymm1 \n" |
3575 "vmovdqu (%1,%2,1),%%ymm2 \n" | 3608 "vmovdqu (%1,%2,1),%%ymm2 \n" |
3576 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" | 3609 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" |
3577 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" | 3610 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" |
3578 "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" | 3611 "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" |
3579 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" | 3612 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" |
3580 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" | 3613 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" |
3581 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" | 3614 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" |
3582 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" | 3615 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" |
3583 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" | 3616 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" |
3584 "vpsrlw $0x8,%%ymm3,%%ymm3 \n" | 3617 "vpsrlw $0x8,%%ymm3,%%ymm3 \n" |
3585 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | 3618 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" |
3586 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" | 3619 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" |
3587 "vmovdqu %%ymm0,(%3,%2,1) \n" | 3620 "vmovdqu %%ymm0,(%3,%2,1) \n" |
3588 "lea 0x20(%2),%2 \n" | 3621 "lea 0x20(%2),%2 \n" |
3589 "sub $0x20,%4 \n" | 3622 "sub $0x20,%4 \n" |
3590 "jg 1b \n" | 3623 "jg 1b \n" |
3591 "vzeroupper \n" | 3624 "vzeroupper \n" |
3592 : "+r"(src0), // %0 | 3625 : "+r"(src0), // %0 |
3593 "+r"(src1), // %1 | 3626 "+r"(src1), // %1 |
3594 "+r"(alpha), // %2 | 3627 "+r"(alpha), // %2 |
3595 "+r"(dst), // %3 | 3628 "+r"(dst), // %3 |
3596 "+rm"(width) // %4 | 3629 "+rm"(width) // %4 |
3597 :: "memory", "cc", "eax", | 3630 ::"memory", |
3598 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 3631 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
3599 ); | 3632 "xmm7"); |
3600 } | 3633 } |
3601 #endif // HAS_BLENDPLANEROW_AVX2 | 3634 #endif // HAS_BLENDPLANEROW_AVX2 |
3602 | 3635 |
3603 #ifdef HAS_ARGBATTENUATEROW_SSSE3 | 3636 #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
3604 // Shuffle table duplicating alpha | 3637 // Shuffle table duplicating alpha |
3605 static uvec8 kShuffleAlpha0 = { | 3638 static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, |
3606 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u | 3639 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; |
3607 }; | 3640 static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, |
3608 static uvec8 kShuffleAlpha1 = { | 3641 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; |
3609 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, | |
3610 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u | |
3611 }; | |
3612 // Attenuate 4 pixels at a time. | 3642 // Attenuate 4 pixels at a time. |
3613 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { | 3643 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
3614 asm volatile ( | 3644 asm volatile ( |
3615 "pcmpeqb %%xmm3,%%xmm3 \n" | 3645 "pcmpeqb %%xmm3,%%xmm3 \n" |
3616 "pslld $0x18,%%xmm3 \n" | 3646 "pslld $0x18,%%xmm3 \n" |
3617 "movdqa %3,%%xmm4 \n" | 3647 "movdqa %3,%%xmm4 \n" |
3618 "movdqa %4,%%xmm5 \n" | 3648 "movdqa %4,%%xmm5 \n" |
3619 | 3649 |
3620 // 4 pixel loop. | 3650 // 4 pixel loop. |
3621 LABELALIGN | 3651 LABELALIGN |
(...skipping 25 matching lines...)
3647 : "m"(kShuffleAlpha0), // %3 | 3677 : "m"(kShuffleAlpha0), // %3 |
3648 "m"(kShuffleAlpha1) // %4 | 3678 "m"(kShuffleAlpha1) // %4 |
3649 : "memory", "cc" | 3679 : "memory", "cc" |
3650 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 3680 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
3651 ); | 3681 ); |
3652 } | 3682 } |
3653 #endif // HAS_ARGBATTENUATEROW_SSSE3 | 3683 #endif // HAS_ARGBATTENUATEROW_SSSE3 |
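// For reference, an approximate plain-C model of the attenuate step: each
// color channel is scaled by alpha (roughly c * a / 255) and alpha itself is
// kept. A sketch only; low-bit rounding may differ from the SIMD path:
static void ARGBAttenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                    int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    uint32 a = src_argb[i * 4 + 3];
    for (j = 0; j < 3; ++j) {
      // a * 257 replicates alpha into both bytes of a 16-bit value, so the
      // shift by 16 behaves like a divide by 255.
      dst_argb[i * 4 + j] = (uint8)((src_argb[i * 4 + j] * a * 257) >> 16);
    }
    dst_argb[i * 4 + 3] = (uint8)a;
  }
}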
3654 | 3684 |
3655 #ifdef HAS_ARGBATTENUATEROW_AVX2 | 3685 #ifdef HAS_ARGBATTENUATEROW_AVX2 |
3656 // Shuffle table duplicating alpha. | 3686 // Shuffle table duplicating alpha. |
3657 static const uvec8 kShuffleAlpha_AVX2 = { | 3687 static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, |
3658 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u | 3688 128u, 128u, 14u, 15u, 14u, 15u, |
3659 }; | 3689 14u, 15u, 128u, 128u}; |
3660 // Attenuate 8 pixels at a time. | 3690 // Attenuate 8 pixels at a time. |
3661 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { | 3691 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { |
3662 asm volatile ( | 3692 asm volatile ( |
3663 "vbroadcastf128 %3,%%ymm4 \n" | 3693 "vbroadcastf128 %3,%%ymm4 \n" |
3664 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3694 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
3665 "vpslld $0x18,%%ymm5,%%ymm5 \n" | 3695 "vpslld $0x18,%%ymm5,%%ymm5 \n" |
3666 "sub %0,%1 \n" | 3696 "sub %0,%1 \n" |
3667 | 3697 |
3668 // 8 pixel loop. | 3698 // 8 pixel loop. |
3669 LABELALIGN | 3699 LABELALIGN |
(...skipping 20 matching lines...)
3690 "+r"(width) // %2 | 3720 "+r"(width) // %2 |
3691 : "m"(kShuffleAlpha_AVX2) // %3 | 3721 : "m"(kShuffleAlpha_AVX2) // %3 |
3692 : "memory", "cc" | 3722 : "memory", "cc" |
3693 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 3723 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
3694 ); | 3724 ); |
3695 } | 3725 } |
3696 #endif // HAS_ARGBATTENUATEROW_AVX2 | 3726 #endif // HAS_ARGBATTENUATEROW_AVX2 |
3697 | 3727 |
3698 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 | 3728 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 |
3699 // Unattenuate 4 pixels at a time. | 3729 // Unattenuate 4 pixels at a time. |
3700 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 3730 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, |
| 3731 uint8* dst_argb, |
3701 int width) { | 3732 int width) { |
3702 uintptr_t alpha; | 3733 uintptr_t alpha; |
3703 asm volatile ( | 3734 asm volatile ( |
3704 // 4 pixel loop. | 3735 // 4 pixel loop. |
3705 LABELALIGN | 3736 LABELALIGN |
3706 "1: \n" | 3737 "1: \n" |
3707 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3738 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
3708 "movzb " MEMACCESS2(0x03,0) ",%3 \n" | 3739 "movzb " MEMACCESS2(0x03,0) ",%3 \n" |
3709 "punpcklbw %%xmm0,%%xmm0 \n" | 3740 "punpcklbw %%xmm0,%%xmm0 \n" |
3710 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 | 3741 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 |
(...skipping 26 matching lines...)
3737 : "r"(fixed_invtbl8) // %4 | 3768 : "r"(fixed_invtbl8) // %4 |
3738 : "memory", "cc", NACL_R14 | 3769 : "memory", "cc", NACL_R14 |
3739 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 3770 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
3740 ); | 3771 ); |
3741 } | 3772 } |
3742 #endif // HAS_ARGBUNATTENUATEROW_SSE2 | 3773 #endif // HAS_ARGBUNATTENUATEROW_SSE2 |
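// For reference, a plain-C model of unattenuation, the inverse of the
// attenuate step: divide each channel by alpha and clamp. The SIMD path
// avoids the division by looking up a fixed-point reciprocal in
// fixed_invtbl8. A sketch with an illustrative name:
static void ARGBUnattenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    uint32 a = src_argb[i * 4 + 3];
    for (j = 0; j < 3; ++j) {
      uint32 c = src_argb[i * 4 + j];
      if (a) {
        c = c * 255u / a;  // undo c' = c * a / 255
        if (c > 255) {
          c = 255;  // saturate, as packuswb does
        }
      }
      dst_argb[i * 4 + j] = (uint8)c;
    }
    dst_argb[i * 4 + 3] = (uint8)a;
  }
}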
3743 | 3774 |
3744 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 | 3775 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 |
3745 // Shuffle table duplicating alpha. | 3776 // Shuffle table duplicating alpha. |
3746 static const uvec8 kUnattenShuffleAlpha_AVX2 = { | 3777 static const uvec8 kUnattenShuffleAlpha_AVX2 = { |
3747 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u | 3778 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; |
3748 }; | |
3749 // Unattenuate 8 pixels at a time. | 3779 // Unattenuate 8 pixels at a time. |
3750 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 3780 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, |
| 3781 uint8* dst_argb, |
3751 int width) { | 3782 int width) { |
3752 uintptr_t alpha; | 3783 uintptr_t alpha; |
3753 asm volatile ( | 3784 asm volatile ( |
3754 "sub %0,%1 \n" | 3785 "sub %0,%1 \n" |
3755 "vbroadcastf128 %5,%%ymm5 \n" | 3786 "vbroadcastf128 %5,%%ymm5 \n" |
3756 | 3787 |
3757 // 8 pixel loop. | 3788 // 8 pixel loop. |
3758 LABELALIGN | 3789 LABELALIGN |
3759 "1: \n" | 3790 "1: \n" |
3760 // replace VPGATHER | 3791 // replace VPGATHER |
(...skipping 94 matching lines...)
3855 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 3886 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
3856 ); | 3887 ); |
3857 } | 3888 } |
3858 #endif // HAS_ARGBGRAYROW_SSSE3 | 3889 #endif // HAS_ARGBGRAYROW_SSSE3 |
3859 | 3890 |
3860 #ifdef HAS_ARGBSEPIAROW_SSSE3 | 3891 #ifdef HAS_ARGBSEPIAROW_SSSE3 |
3861 // b = (r * 35 + g * 68 + b * 17) >> 7 | 3892 // b = (r * 35 + g * 68 + b * 17) >> 7 |
3862 // g = (r * 45 + g * 88 + b * 22) >> 7 | 3893 // g = (r * 45 + g * 88 + b * 22) >> 7 |
3863 // r = (r * 50 + g * 98 + b * 24) >> 7 | 3894 // r = (r * 50 + g * 98 + b * 24) >> 7 |
3864 // Constant for ARGB color to sepia tone | 3895 // Constant for ARGB color to sepia tone |
3865 static vec8 kARGBToSepiaB = { | 3896 static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, |
3866 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 | 3897 17, 68, 35, 0, 17, 68, 35, 0}; |
3867 }; | |
3868 | 3898 |
3869 static vec8 kARGBToSepiaG = { | 3899 static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, |
3870 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 | 3900 22, 88, 45, 0, 22, 88, 45, 0}; |
3871 }; | |
3872 | 3901 |
3873 static vec8 kARGBToSepiaR = { | 3902 static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, |
3874 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 | 3903 24, 98, 50, 0, 24, 98, 50, 0}; |
3875 }; | |
3876 | 3904 |
3877 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. | 3905 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. |
3878 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { | 3906 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { |
3879 asm volatile ( | 3907 asm volatile ( |
3880 "movdqa %2,%%xmm2 \n" | 3908 "movdqa %2,%%xmm2 \n" |
3881 "movdqa %3,%%xmm3 \n" | 3909 "movdqa %3,%%xmm3 \n" |
3882 "movdqa %4,%%xmm4 \n" | 3910 "movdqa %4,%%xmm4 \n" |
3883 | 3911 |
3884 // 8 pixel loop. | 3912 // 8 pixel loop. |
3885 LABELALIGN | 3913 LABELALIGN |
(...skipping 42 matching lines...)
3928 "m"(kARGBToSepiaR) // %4 | 3956 "m"(kARGBToSepiaR) // %4 |
3929 : "memory", "cc" | 3957 : "memory", "cc" |
3930 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 3958 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
3931 ); | 3959 ); |
3932 } | 3960 } |
3933 #endif // HAS_ARGBSEPIAROW_SSSE3 | 3961 #endif // HAS_ARGBSEPIAROW_SSSE3 |
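// For reference, the sepia formulas above in plain C, with the saturation
// that packuswb provides made explicit (the name is illustrative). All three
// outputs are computed from the original pixel before any channel is
// overwritten:
static void ARGBSepiaRow_Sketch(uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int b = dst_argb[i * 4 + 0];
    int g = dst_argb[i * 4 + 1];
    int r = dst_argb[i * 4 + 2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[i * 4 + 0] = (uint8)(sb > 255 ? 255 : sb);
    dst_argb[i * 4 + 1] = (uint8)(sg > 255 ? 255 : sg);
    dst_argb[i * 4 + 2] = (uint8)(sr > 255 ? 255 : sr);
    // Alpha is left unchanged.
  }
}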
3934 | 3962 |
3935 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 | 3963 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 |
3936 // Transform 8 ARGB pixels (32 bytes) with color matrix. | 3964 // Transform 8 ARGB pixels (32 bytes) with color matrix. |
3937 // Same as Sepia except matrix is provided. | 3965 // Same as Sepia except matrix is provided. |
3938 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 3966 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, |
3939 const int8* matrix_argb, int width) { | 3967 uint8* dst_argb, |
| 3968 const int8* matrix_argb, |
| 3969 int width) { |
3940 asm volatile ( | 3970 asm volatile ( |
3941 "movdqu " MEMACCESS(3) ",%%xmm5 \n" | 3971 "movdqu " MEMACCESS(3) ",%%xmm5 \n" |
3942 "pshufd $0x00,%%xmm5,%%xmm2 \n" | 3972 "pshufd $0x00,%%xmm5,%%xmm2 \n" |
3943 "pshufd $0x55,%%xmm5,%%xmm3 \n" | 3973 "pshufd $0x55,%%xmm5,%%xmm3 \n" |
3944 "pshufd $0xaa,%%xmm5,%%xmm4 \n" | 3974 "pshufd $0xaa,%%xmm5,%%xmm4 \n" |
3945 "pshufd $0xff,%%xmm5,%%xmm5 \n" | 3975 "pshufd $0xff,%%xmm5,%%xmm5 \n" |
3946 | 3976 |
3947 // 8 pixel loop. | 3977 // 8 pixel loop. |
3948 LABELALIGN | 3978 LABELALIGN |
3949 "1: \n" | 3979 "1: \n" |
(...skipping 41 matching lines...)
3991 "+r"(width) // %2 | 4021 "+r"(width) // %2 |
3992 : "r"(matrix_argb) // %3 | 4022 : "r"(matrix_argb) // %3 |
3993 : "memory", "cc" | 4023 : "memory", "cc" |
3994 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 4024 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
3995 ); | 4025 ); |
3996 } | 4026 } |
3997 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 | 4027 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 |
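// For reference, a plain-C model of the matrix transform: each output
// channel is the dot product of the input B,G,R,A bytes with one row of
// four signed coefficients, scaled by 1/64 (the >> 6 mirrors the SIMD
// shift). A sketch; the name is illustrative:
static void ARGBColorMatrixRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      const int8* matrix_argb, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    const uint8* p = src_argb + i * 4;
    for (j = 0; j < 4; ++j) {
      const int8* m = matrix_argb + j * 4;  // row j produces output channel j
      int v = (p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3]) >> 6;
      dst_argb[i * 4 + j] = (uint8)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
  }
}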
3998 | 4028 |
3999 #ifdef HAS_ARGBQUANTIZEROW_SSE2 | 4029 #ifdef HAS_ARGBQUANTIZEROW_SSE2 |
4000 // Quantize 4 ARGB pixels (16 bytes). | 4030 // Quantize 4 ARGB pixels (16 bytes). |
4001 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, | 4031 void ARGBQuantizeRow_SSE2(uint8* dst_argb, |
4002 int interval_offset, int width) { | 4032 int scale, |
| 4033 int interval_size, |
| 4034 int interval_offset, |
| 4035 int width) { |
4003 asm volatile ( | 4036 asm volatile ( |
4004 "movd %2,%%xmm2 \n" | 4037 "movd %2,%%xmm2 \n" |
4005 "movd %3,%%xmm3 \n" | 4038 "movd %3,%%xmm3 \n" |
4006 "movd %4,%%xmm4 \n" | 4039 "movd %4,%%xmm4 \n" |
4007 "pshuflw $0x40,%%xmm2,%%xmm2 \n" | 4040 "pshuflw $0x40,%%xmm2,%%xmm2 \n" |
4008 "pshufd $0x44,%%xmm2,%%xmm2 \n" | 4041 "pshufd $0x44,%%xmm2,%%xmm2 \n" |
4009 "pshuflw $0x40,%%xmm3,%%xmm3 \n" | 4042 "pshuflw $0x40,%%xmm3,%%xmm3 \n" |
4010 "pshufd $0x44,%%xmm3,%%xmm3 \n" | 4043 "pshufd $0x44,%%xmm3,%%xmm3 \n" |
4011 "pshuflw $0x40,%%xmm4,%%xmm4 \n" | 4044 "pshuflw $0x40,%%xmm4,%%xmm4 \n" |
4012 "pshufd $0x44,%%xmm4,%%xmm4 \n" | 4045 "pshufd $0x44,%%xmm4,%%xmm4 \n" |
(...skipping 28 matching lines...)
4041 "r"(interval_size), // %3 | 4074 "r"(interval_size), // %3 |
4042 "r"(interval_offset) // %4 | 4075 "r"(interval_offset) // %4 |
4043 : "memory", "cc" | 4076 : "memory", "cc" |
4044 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 4077 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
4045 ); | 4078 ); |
4046 } | 4079 } |
4047 #endif // HAS_ARGBQUANTIZEROW_SSE2 | 4080 #endif // HAS_ARGBQUANTIZEROW_SSE2 |
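// For reference, the quantize math in plain C: scale is a 16.16 fixed-point
// reciprocal of interval_size, so each color channel is snapped to the
// bottom of its interval and offset back into range; alpha passes through.
// A sketch with an illustrative name:
static void ARGBQuantizeRow_Sketch(uint8* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 3; ++j) {
      int v = dst_argb[i * 4 + j];
      // (v * scale) >> 16 approximates v / interval_size without a divide.
      dst_argb[i * 4 + j] =
          (uint8)(((v * scale) >> 16) * interval_size + interval_offset);
    }
  }
}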
4048 | 4081 |
4049 #ifdef HAS_ARGBSHADEROW_SSE2 | 4082 #ifdef HAS_ARGBSHADEROW_SSE2 |
4050 // Shade 4 pixels at a time by specified value. | 4083 // Shade 4 pixels at a time by specified value. |
4051 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, | 4084 void ARGBShadeRow_SSE2(const uint8* src_argb, |
| 4085 uint8* dst_argb, |
| 4086 int width, |
4052 uint32 value) { | 4087 uint32 value) { |
4053 asm volatile ( | 4088 asm volatile ( |
4054 "movd %3,%%xmm2 \n" | 4089 "movd %3,%%xmm2 \n" |
4055 "punpcklbw %%xmm2,%%xmm2 \n" | 4090 "punpcklbw %%xmm2,%%xmm2 \n" |
4056 "punpcklqdq %%xmm2,%%xmm2 \n" | 4091 "punpcklqdq %%xmm2,%%xmm2 \n" |
4057 | 4092 |
4058 // 4 pixel loop. | 4093 // 4 pixel loop. |
4059 LABELALIGN | 4094 LABELALIGN |
4060 "1: \n" | 4095 "1: \n" |
4061 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4096 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
(...skipping 15 matching lines...)
4077 "+r"(width) // %2 | 4112 "+r"(width) // %2 |
4078 : "r"(value) // %3 | 4113 : "r"(value) // %3 |
4079 : "memory", "cc" | 4114 : "memory", "cc" |
4080 , "xmm0", "xmm1", "xmm2" | 4115 , "xmm0", "xmm1", "xmm2" |
4081 ); | 4116 ); |
4082 } | 4117 } |
4083 #endif // HAS_ARGBSHADEROW_SSE2 | 4118 #endif // HAS_ARGBSHADEROW_SSE2 |
4084 | 4119 |
4085 #ifdef HAS_ARGBMULTIPLYROW_SSE2 | 4120 #ifdef HAS_ARGBMULTIPLYROW_SSE2 |
4086 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. | 4121 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. |
4087 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4122 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, |
4088 uint8* dst_argb, int width) { | 4123 const uint8* src_argb1, |
| 4124 uint8* dst_argb, |
| 4125 int width) { |
4089 asm volatile ( | 4126 asm volatile ( |
4090 "pxor %%xmm5,%%xmm5 \n" | 4127 "pxor %%xmm5,%%xmm5 \n" |
4091 | 4128 |
4092 // 4 pixel loop. | 4129 // 4 pixel loop. |
4093 LABELALIGN | 4130 LABELALIGN |
4094 "1: \n" | 4131 "1: \n" |
4095 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4132 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
4096 "lea " MEMLEA(0x10,0) ",%0 \n" | 4133 "lea " MEMLEA(0x10,0) ",%0 \n" |
4097 "movdqu " MEMACCESS(1) ",%%xmm2 \n" | 4134 "movdqu " MEMACCESS(1) ",%%xmm2 \n" |
4098 "lea " MEMLEA(0x10,1) ",%1 \n" | 4135 "lea " MEMLEA(0x10,1) ",%1 \n" |
(...skipping 16 matching lines...)
4115 "+r"(width) // %3 | 4152 "+r"(width) // %3 |
4116 : | 4153 : |
4117 : "memory", "cc" | 4154 : "memory", "cc" |
4118 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 4155 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
4119 ); | 4156 ); |
4120 } | 4157 } |
4121 #endif // HAS_ARGBMULTIPLYROW_SSE2 | 4158 #endif // HAS_ARGBMULTIPLYROW_SSE2 |
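// For reference, a plain-C model of the per-byte multiply: roughly
// dst = s0 * s1 / 255, computed as ((s0 * 257) * s1) >> 16 so that 255 acts
// as the identity. ARGBShadeRow above uses the same fixed-point trick with a
// constant color. A sketch only; low-bit rounding differs slightly:
static void ARGBMultiplyRow_Sketch(const uint8* src0, const uint8* src1,
                                   uint8* dst, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {  // all four channels, alpha included
    dst[i] = (uint8)(((uint32)src0[i] * 257 * src1[i]) >> 16);
  }
}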
4122 | 4159 |
4123 #ifdef HAS_ARGBMULTIPLYROW_AVX2 | 4160 #ifdef HAS_ARGBMULTIPLYROW_AVX2 |
4124 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. | 4161 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. |
4125 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4162 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, |
4126 uint8* dst_argb, int width) { | 4163 const uint8* src_argb1, |
| 4164 uint8* dst_argb, |
| 4165 int width) { |
4127 asm volatile ( | 4166 asm volatile ( |
4128 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" | 4167 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" |
4129 | 4168 |
4130 // 8 pixel loop. | 4169 // 8 pixel loop. |
4131 LABELALIGN | 4170 LABELALIGN |
4132 "1: \n" | 4171 "1: \n" |
4133 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" | 4172 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" |
4134 "lea " MEMLEA(0x20,0) ",%0 \n" | 4173 "lea " MEMLEA(0x20,0) ",%0 \n" |
4135 "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" | 4174 "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" |
4136 "lea " MEMLEA(0x20,1) ",%1 \n" | 4175 "lea " MEMLEA(0x20,1) ",%1 \n" |
(...skipping 17 matching lines...)
4154 : "memory", "cc" | 4193 : "memory", "cc" |
4155 #if defined(__AVX2__) | 4194 #if defined(__AVX2__) |
4156 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 4195 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
4157 #endif | 4196 #endif |
4158 ); | 4197 ); |
4159 } | 4198 } |
4160 #endif // HAS_ARGBMULTIPLYROW_AVX2 | 4199 #endif // HAS_ARGBMULTIPLYROW_AVX2 |
4161 | 4200 |
4162 #ifdef HAS_ARGBADDROW_SSE2 | 4201 #ifdef HAS_ARGBADDROW_SSE2 |
4163 // Add 2 rows of ARGB pixels together, 4 pixels at a time. | 4202 // Add 2 rows of ARGB pixels together, 4 pixels at a time. |
4164 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4203 void ARGBAddRow_SSE2(const uint8* src_argb0, |
4165 uint8* dst_argb, int width) { | 4204 const uint8* src_argb1, |
| 4205 uint8* dst_argb, |
| 4206 int width) { |
4166 asm volatile ( | 4207 asm volatile ( |
4167 // 4 pixel loop. | 4208 // 4 pixel loop. |
4168 LABELALIGN | 4209 LABELALIGN |
4169 "1: \n" | 4210 "1: \n" |
4170 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4211 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
4171 "lea " MEMLEA(0x10,0) ",%0 \n" | 4212 "lea " MEMLEA(0x10,0) ",%0 \n" |
4172 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | 4213 "movdqu " MEMACCESS(1) ",%%xmm1 \n" |
4173 "lea " MEMLEA(0x10,1) ",%1 \n" | 4214 "lea " MEMLEA(0x10,1) ",%1 \n" |
4174 "paddusb %%xmm1,%%xmm0 \n" | 4215 "paddusb %%xmm1,%%xmm0 \n" |
4175 "movdqu %%xmm0," MEMACCESS(2) " \n" | 4216 "movdqu %%xmm0," MEMACCESS(2) " \n" |
4176 "lea " MEMLEA(0x10,2) ",%2 \n" | 4217 "lea " MEMLEA(0x10,2) ",%2 \n" |
4177 "sub $0x4,%3 \n" | 4218 "sub $0x4,%3 \n" |
4178 "jg 1b \n" | 4219 "jg 1b \n" |
4179 : "+r"(src_argb0), // %0 | 4220 : "+r"(src_argb0), // %0 |
4180 "+r"(src_argb1), // %1 | 4221 "+r"(src_argb1), // %1 |
4181 "+r"(dst_argb), // %2 | 4222 "+r"(dst_argb), // %2 |
4182 "+r"(width) // %3 | 4223 "+r"(width) // %3 |
4183 : | 4224 : |
4184 : "memory", "cc" | 4225 : "memory", "cc" |
4185 , "xmm0", "xmm1" | 4226 , "xmm0", "xmm1" |
4186 ); | 4227 ); |
4187 } | 4228 } |
4188 #endif // HAS_ARGBADDROW_SSE2 | 4229 #endif // HAS_ARGBADDROW_SSE2 |
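// For reference, paddusb is a saturating byte add; this is the scalar
// equivalent (the subtract rows further down mirror it with psubusb,
// clamping at 0). The name is illustrative:
static void ARGBAddRow_Sketch(const uint8* src0, const uint8* src1,
                              uint8* dst, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int v = src0[i] + src1[i];
    dst[i] = (uint8)(v > 255 ? 255 : v);  // saturate like paddusb
  }
}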
4189 | 4230 |
4190 #ifdef HAS_ARGBADDROW_AVX2 | 4231 #ifdef HAS_ARGBADDROW_AVX2 |
4191 // Add 2 rows of ARGB pixels together, 8 pixels at a time. | 4232 // Add 2 rows of ARGB pixels together, 8 pixels at a time. |
4192 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4233 void ARGBAddRow_AVX2(const uint8* src_argb0, |
4193 uint8* dst_argb, int width) { | 4234 const uint8* src_argb1, |
| 4235 uint8* dst_argb, |
| 4236 int width) { |
4194 asm volatile ( | 4237 asm volatile ( |
4195 // 8 pixel loop. | 4238 // 8 pixel loop. |
4196 LABELALIGN | 4239 LABELALIGN |
4197 "1: \n" | 4240 "1: \n" |
4198 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 4241 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
4199 "lea " MEMLEA(0x20,0) ",%0 \n" | 4242 "lea " MEMLEA(0x20,0) ",%0 \n" |
4200 "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" | 4243 "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" |
4201 "lea " MEMLEA(0x20,1) ",%1 \n" | 4244 "lea " MEMLEA(0x20,1) ",%1 \n" |
4202 "vmovdqu %%ymm0," MEMACCESS(2) " \n" | 4245 "vmovdqu %%ymm0," MEMACCESS(2) " \n" |
4203 "lea " MEMLEA(0x20,2) ",%2 \n" | 4246 "lea " MEMLEA(0x20,2) ",%2 \n" |
4204 "sub $0x8,%3 \n" | 4247 "sub $0x8,%3 \n" |
4205 "jg 1b \n" | 4248 "jg 1b \n" |
4206 "vzeroupper \n" | 4249 "vzeroupper \n" |
4207 : "+r"(src_argb0), // %0 | 4250 : "+r"(src_argb0), // %0 |
4208 "+r"(src_argb1), // %1 | 4251 "+r"(src_argb1), // %1 |
4209 "+r"(dst_argb), // %2 | 4252 "+r"(dst_argb), // %2 |
4210 "+r"(width) // %3 | 4253 "+r"(width) // %3 |
4211 : | 4254 : |
4212 : "memory", "cc" | 4255 : "memory", "cc" |
4213 , "xmm0" | 4256 , "xmm0" |
4214 ); | 4257 ); |
4215 } | 4258 } |
4216 #endif // HAS_ARGBADDROW_AVX2 | 4259 #endif // HAS_ARGBADDROW_AVX2 |
4217 | 4260 |
4218 #ifdef HAS_ARGBSUBTRACTROW_SSE2 | 4261 #ifdef HAS_ARGBSUBTRACTROW_SSE2 |
4219 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. | 4262 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. |
4220 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4263 void ARGBSubtractRow_SSE2(const uint8* src_argb0, |
4221 uint8* dst_argb, int width) { | 4264 const uint8* src_argb1, |
| 4265 uint8* dst_argb, |
| 4266 int width) { |
4222 asm volatile ( | 4267 asm volatile ( |
4223 // 4 pixel loop. | 4268 // 4 pixel loop. |
4224 LABELALIGN | 4269 LABELALIGN |
4225 "1: \n" | 4270 "1: \n" |
4226 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4271 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
4227 "lea " MEMLEA(0x10,0) ",%0 \n" | 4272 "lea " MEMLEA(0x10,0) ",%0 \n" |
4228 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | 4273 "movdqu " MEMACCESS(1) ",%%xmm1 \n" |
4229 "lea " MEMLEA(0x10,1) ",%1 \n" | 4274 "lea " MEMLEA(0x10,1) ",%1 \n" |
4230 "psubusb %%xmm1,%%xmm0 \n" | 4275 "psubusb %%xmm1,%%xmm0 \n" |
4231 "movdqu %%xmm0," MEMACCESS(2) " \n" | 4276 "movdqu %%xmm0," MEMACCESS(2) " \n" |
4232 "lea " MEMLEA(0x10,2) ",%2 \n" | 4277 "lea " MEMLEA(0x10,2) ",%2 \n" |
4233 "sub $0x4,%3 \n" | 4278 "sub $0x4,%3 \n" |
4234 "jg 1b \n" | 4279 "jg 1b \n" |
4235 : "+r"(src_argb0), // %0 | 4280 : "+r"(src_argb0), // %0 |
4236 "+r"(src_argb1), // %1 | 4281 "+r"(src_argb1), // %1 |
4237 "+r"(dst_argb), // %2 | 4282 "+r"(dst_argb), // %2 |
4238 "+r"(width) // %3 | 4283 "+r"(width) // %3 |
4239 : | 4284 : |
4240 : "memory", "cc" | 4285 : "memory", "cc" |
4241 , "xmm0", "xmm1" | 4286 , "xmm0", "xmm1" |
4242 ); | 4287 ); |
4243 } | 4288 } |
4244 #endif // HAS_ARGBSUBTRACTROW_SSE2 | 4289 #endif // HAS_ARGBSUBTRACTROW_SSE2 |
4245 | 4290 |
4246 #ifdef HAS_ARGBSUBTRACTROW_AVX2 | 4291 #ifdef HAS_ARGBSUBTRACTROW_AVX2 |
4247 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. | 4292 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. |
4248 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4293 void ARGBSubtractRow_AVX2(const uint8* src_argb0, |
4249 uint8* dst_argb, int width) { | 4294 const uint8* src_argb1, |
| 4295 uint8* dst_argb, |
| 4296 int width) { |
4250 asm volatile ( | 4297 asm volatile ( |
4251 // 8 pixel loop. | 4298 // 8 pixel loop. |
4252 LABELALIGN | 4299 LABELALIGN |
4253 "1: \n" | 4300 "1: \n" |
4254 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 4301 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
4255 "lea " MEMLEA(0x20,0) ",%0 \n" | 4302 "lea " MEMLEA(0x20,0) ",%0 \n" |
4256 "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" | 4303 "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" |
4257 "lea " MEMLEA(0x20,1) ",%1 \n" | 4304 "lea " MEMLEA(0x20,1) ",%1 \n" |
4258 "vmovdqu %%ymm0," MEMACCESS(2) " \n" | 4305 "vmovdqu %%ymm0," MEMACCESS(2) " \n" |
4259 "lea " MEMLEA(0x20,2) ",%2 \n" | 4306 "lea " MEMLEA(0x20,2) ",%2 \n" |
4260 "sub $0x8,%3 \n" | 4307 "sub $0x8,%3 \n" |
4261 "jg 1b \n" | 4308 "jg 1b \n" |
4262 "vzeroupper \n" | 4309 "vzeroupper \n" |
4263 : "+r"(src_argb0), // %0 | 4310 : "+r"(src_argb0), // %0 |
4264 "+r"(src_argb1), // %1 | 4311 "+r"(src_argb1), // %1 |
4265 "+r"(dst_argb), // %2 | 4312 "+r"(dst_argb), // %2 |
4266 "+r"(width) // %3 | 4313 "+r"(width) // %3 |
4267 : | 4314 : |
4268 : "memory", "cc" | 4315 : "memory", "cc" |
4269 , "xmm0" | 4316 , "xmm0" |
4270 ); | 4317 ); |
4271 } | 4318 } |
4272 #endif // HAS_ARGBSUBTRACTROW_AVX2 | 4319 #endif // HAS_ARGBSUBTRACTROW_AVX2 |
4273 | 4320 |
4274 #ifdef HAS_SOBELXROW_SSE2 | 4321 #ifdef HAS_SOBELXROW_SSE2 |
4275 // SobelX as a matrix is | 4322 // SobelX as a matrix is |
4276 // -1 0 1 | 4323 // -1 0 1 |
4277 // -2 0 2 | 4324 // -2 0 2 |
4278 // -1 0 1 | 4325 // -1 0 1 |
4279 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, | 4326 void SobelXRow_SSE2(const uint8* src_y0, |
4280 const uint8* src_y2, uint8* dst_sobelx, int width) { | 4327 const uint8* src_y1, |
| 4328 const uint8* src_y2, |
| 4329 uint8* dst_sobelx, |
| 4330 int width) { |
4281 asm volatile ( | 4331 asm volatile ( |
4282 "sub %0,%1 \n" | 4332 "sub %0,%1 \n" |
4283 "sub %0,%2 \n" | 4333 "sub %0,%2 \n" |
4284 "sub %0,%3 \n" | 4334 "sub %0,%3 \n" |
4285 "pxor %%xmm5,%%xmm5 \n" | 4335 "pxor %%xmm5,%%xmm5 \n" |
4286 | 4336 |
4287 // 8 pixel loop. | 4337 // 8 pixel loop. |
4288 LABELALIGN | 4338 LABELALIGN |
4289 "1: \n" | 4339 "1: \n" |
4290 "movq " MEMACCESS(0) ",%%xmm0 \n" | 4340 "movq " MEMACCESS(0) ",%%xmm0 \n" |
(...skipping 32 matching lines...)
4323 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 4373 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
4324 ); | 4374 ); |
4325 } | 4375 } |
4326 #endif // HAS_SOBELXROW_SSE2 | 4376 #endif // HAS_SOBELXROW_SSE2 |
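// For reference, a plain-C sketch of the tap pattern above: each output is
// the absolute difference of the columns two pixels apart, with the middle
// row weighted by 2, saturated to a byte. Assumes width + 2 readable bytes
// per input row, as the SIMD path requires:
static void SobelXRow_Sketch(const uint8* src_y0, const uint8* src_y1,
                             const uint8* src_y2, uint8* dst_sobelx,
                             int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = (src_y0[i] - src_y0[i + 2]) + 2 * (src_y1[i] - src_y1[i + 2]) +
            (src_y2[i] - src_y2[i + 2]);
    if (s < 0) {
      s = -s;
    }
    dst_sobelx[i] = (uint8)(s > 255 ? 255 : s);
  }
}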
4327 | 4377 |
4328 #ifdef HAS_SOBELYROW_SSE2 | 4378 #ifdef HAS_SOBELYROW_SSE2 |
4329 // SobelY as a matrix is | 4379 // SobelY as a matrix is |
4330 // -1 -2 -1 | 4380 // -1 -2 -1 |
4331 // 0 0 0 | 4381 // 0 0 0 |
4332 // 1 2 1 | 4382 // 1 2 1 |
4333 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, | 4383 void SobelYRow_SSE2(const uint8* src_y0, |
4334 uint8* dst_sobely, int width) { | 4384 const uint8* src_y1, |
| 4385 uint8* dst_sobely, |
| 4386 int width) { |
4335 asm volatile ( | 4387 asm volatile ( |
4336 "sub %0,%1 \n" | 4388 "sub %0,%1 \n" |
4337 "sub %0,%2 \n" | 4389 "sub %0,%2 \n" |
4338 "pxor %%xmm5,%%xmm5 \n" | 4390 "pxor %%xmm5,%%xmm5 \n" |
4339 | 4391 |
4340 // 8 pixel loop. | 4392 // 8 pixel loop. |
4341 LABELALIGN | 4393 LABELALIGN |
4342 "1: \n" | 4394 "1: \n" |
4343 "movq " MEMACCESS(0) ",%%xmm0 \n" | 4395 "movq " MEMACCESS(0) ",%%xmm0 \n" |
4344 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 | 4396 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 |
(...skipping 31 matching lines...)
4376 ); | 4428 ); |
4377 } | 4429 } |
4378 #endif // HAS_SOBELYROW_SSE2 | 4430 #endif // HAS_SOBELYROW_SSE2 |
4379 | 4431 |
4380 #ifdef HAS_SOBELROW_SSE2 | 4432 #ifdef HAS_SOBELROW_SSE2 |
4381 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. | 4433 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. |
4382 // A = 255 | 4434 // A = 255 |
4383 // R = Sobel | 4435 // R = Sobel |
4384 // G = Sobel | 4436 // G = Sobel |
4385 // B = Sobel | 4437 // B = Sobel |
4386 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 4438 void SobelRow_SSE2(const uint8* src_sobelx, |
4387 uint8* dst_argb, int width) { | 4439 const uint8* src_sobely, |
| 4440 uint8* dst_argb, |
| 4441 int width) { |
4388 asm volatile ( | 4442 asm volatile ( |
4389 "sub %0,%1 \n" | 4443 "sub %0,%1 \n" |
4390 "pcmpeqb %%xmm5,%%xmm5 \n" | 4444 "pcmpeqb %%xmm5,%%xmm5 \n" |
4391 "pslld $0x18,%%xmm5 \n" | 4445 "pslld $0x18,%%xmm5 \n" |
4392 | 4446 |
4393 // 8 pixel loop. | 4447 // 8 pixel loop. |
4394 LABELALIGN | 4448 LABELALIGN |
4395 "1: \n" | 4449 "1: \n" |
4396 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4450 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
4397 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | 4451 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 |
(...skipping 25 matching lines...)
4423 "+r"(width) // %3 | 4477 "+r"(width) // %3 |
4424 : | 4478 : |
4425 : "memory", "cc", NACL_R14 | 4479 : "memory", "cc", NACL_R14 |
4426 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 4480 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
4427 ); | 4481 ); |
4428 } | 4482 } |
4429 #endif // HAS_SOBELROW_SSE2 | 4483 #endif // HAS_SOBELROW_SSE2 |
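// For reference, the packing described above in plain C: the X and Y
// magnitudes are added with saturation and the sum replicated into B, G and
// R, with alpha forced to 255 (a sketch):
static void SobelRow_Sketch(const uint8* src_sobelx, const uint8* src_sobely,
                            uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    uint8 v = (uint8)(s > 255 ? 255 : s);  // paddusb saturation
    dst_argb[i * 4 + 0] = v;
    dst_argb[i * 4 + 1] = v;
    dst_argb[i * 4 + 2] = v;
    dst_argb[i * 4 + 3] = 255u;
  }
}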
4430 | 4484 |
4431 #ifdef HAS_SOBELTOPLANEROW_SSE2 | 4485 #ifdef HAS_SOBELTOPLANEROW_SSE2 |
4432 // Adds Sobel X and Sobel Y and stores Sobel into a plane. | 4486 // Adds Sobel X and Sobel Y and stores Sobel into a plane. |
4433 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 4487 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, |
4434 uint8* dst_y, int width) { | 4488 const uint8* src_sobely, |
| 4489 uint8* dst_y, |
| 4490 int width) { |
4435 asm volatile ( | 4491 asm volatile ( |
4436 "sub %0,%1 \n" | 4492 "sub %0,%1 \n" |
4437 "pcmpeqb %%xmm5,%%xmm5 \n" | 4493 "pcmpeqb %%xmm5,%%xmm5 \n" |
4438 "pslld $0x18,%%xmm5 \n" | 4494 "pslld $0x18,%%xmm5 \n" |
4439 | 4495 |
4440 // 8 pixel loop. | 4496 // 8 pixel loop. |
4441 LABELALIGN | 4497 LABELALIGN |
4442 "1: \n" | 4498 "1: \n" |
4443 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4499 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
4444 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | 4500 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 |
(...skipping 13 matching lines...)
4458 ); | 4514 ); |
4459 } | 4515 } |
4460 #endif // HAS_SOBELTOPLANEROW_SSE2 | 4516 #endif // HAS_SOBELTOPLANEROW_SSE2 |
4461 | 4517 |
4462 #ifdef HAS_SOBELXYROW_SSE2 | 4518 #ifdef HAS_SOBELXYROW_SSE2 |
4463 // Mixes Sobel X, Sobel Y and Sobel into ARGB. | 4519 // Mixes Sobel X, Sobel Y and Sobel into ARGB. |
4464 // A = 255 | 4520 // A = 255 |
4465 // R = Sobel X | 4521 // R = Sobel X |
4466 // G = Sobel | 4522 // G = Sobel |
4467 // B = Sobel Y | 4523 // B = Sobel Y |
4468 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 4524 void SobelXYRow_SSE2(const uint8* src_sobelx, |
4469 uint8* dst_argb, int width) { | 4525 const uint8* src_sobely, |
| 4526 uint8* dst_argb, |
| 4527 int width) { |
4470 asm volatile ( | 4528 asm volatile ( |
4471 "sub %0,%1 \n" | 4529 "sub %0,%1 \n" |
4472 "pcmpeqb %%xmm5,%%xmm5 \n" | 4530 "pcmpeqb %%xmm5,%%xmm5 \n" |
4473 | 4531 |
4474 // 8 pixel loop. | 4532 // 8 pixel loop. |
4475 LABELALIGN | 4533 LABELALIGN |
4476 "1: \n" | 4534 "1: \n" |
4477 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4535 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
4478 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | 4536 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 |
4479 "lea " MEMLEA(0x10,0) ",%0 \n" | 4537 "lea " MEMLEA(0x10,0) ",%0 \n" |
(...skipping 25 matching lines...)
4505 : | 4563 : |
4506 : "memory", "cc", NACL_R14 | 4564 : "memory", "cc", NACL_R14 |
4507 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 4565 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
4508 ); | 4566 ); |
4509 } | 4567 } |
4510 #endif // HAS_SOBELXYROW_SSE2 | 4568 #endif // HAS_SOBELXYROW_SSE2 |
4511 | 4569 |
4512 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 | 4570 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 |
4513 // Creates a table of cumulative sums where each value is a sum of all values | 4571 // Creates a table of cumulative sums where each value is a sum of all values |
4514 // above and to the left of the value, inclusive of the value. | 4572 // above and to the left of the value, inclusive of the value. |
4515 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, | 4573 void ComputeCumulativeSumRow_SSE2(const uint8* row, |
4516 const int32* previous_cumsum, int width) { | 4574 int32* cumsum, |
| 4575 const int32* previous_cumsum, |
| 4576 int width) { |
4517 asm volatile ( | 4577 asm volatile ( |
4518 "pxor %%xmm0,%%xmm0 \n" | 4578 "pxor %%xmm0,%%xmm0 \n" |
4519 "pxor %%xmm1,%%xmm1 \n" | 4579 "pxor %%xmm1,%%xmm1 \n" |
4520 "sub $0x4,%3 \n" | 4580 "sub $0x4,%3 \n" |
4521 "jl 49f \n" | 4581 "jl 49f \n" |
4522 "test $0xf,%1 \n" | 4582 "test $0xf,%1 \n" |
4523 "jne 49f \n" | 4583 "jne 49f \n" |
4524 | 4584 |
4525 // 4 pixel loop. | 4585 // 4 pixel loop. |
4526 LABELALIGN | 4586 LABELALIGN |
(...skipping 56 matching lines...)
4583 "+r"(previous_cumsum), // %2 | 4643 "+r"(previous_cumsum), // %2 |
4584 "+r"(width) // %3 | 4644 "+r"(width) // %3 |
4585 : | 4645 : |
4586 : "memory", "cc" | 4646 : "memory", "cc" |
4587 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 4647 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
4588 ); | 4648 ); |
4589 } | 4649 } |
4590 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 | 4650 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 |
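// For reference, a plain-C sketch of the integral-image row: each entry is
// the running total of this row's bytes plus the entry directly above, per
// channel. CumulativeSumToAverageRow below then reads four corners of this
// table to average any axis-aligned box in constant time:
static void ComputeCumulativeSumRow_Sketch(const uint8* row, int32* cumsum,
                                           const int32* previous_cumsum,
                                           int width) {
  int32 sum[4] = {0, 0, 0, 0};
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];  // running total along this row
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}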
4591 | 4651 |
4592 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 | 4652 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
4593 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, | 4653 void CumulativeSumToAverageRow_SSE2(const int32* topleft, |
4594 int width, int area, uint8* dst, | 4654 const int32* botleft, |
| 4655 int width, |
| 4656 int area, |
| 4657 uint8* dst, |
4595 int count) { | 4658 int count) { |
4596 asm volatile ( | 4659 asm volatile ( |
4597 "movd %5,%%xmm5 \n" | 4660 "movd %5,%%xmm5 \n" |
4598 "cvtdq2ps %%xmm5,%%xmm5 \n" | 4661 "cvtdq2ps %%xmm5,%%xmm5 \n" |
4599 "rcpss %%xmm5,%%xmm4 \n" | 4662 "rcpss %%xmm5,%%xmm4 \n" |
4600 "pshufd $0x0,%%xmm4,%%xmm4 \n" | 4663 "pshufd $0x0,%%xmm4,%%xmm4 \n" |
4601 "sub $0x4,%3 \n" | 4664 "sub $0x4,%3 \n" |
4602 "jl 49f \n" | 4665 "jl 49f \n" |
4603 "cmpl $0x80,%5 \n" | 4666 "cmpl $0x80,%5 \n" |
4604 "ja 40f \n" | 4667 "ja 40f \n" |
(...skipping 111 matching lines...)
4716 "rm"(area) // %5 | 4779 "rm"(area) // %5 |
4717 : "memory", "cc", NACL_R14 | 4780 : "memory", "cc", NACL_R14 |
4718 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 4781 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
4719 ); | 4782 ); |
4720 } | 4783 } |
4721 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 | 4784 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
4722 | 4785 |
4723 #ifdef HAS_ARGBAFFINEROW_SSE2 | 4786 #ifdef HAS_ARGBAFFINEROW_SSE2 |
4724 // Copy ARGB pixels from source image with slope to a row of destination. | 4787 // Copy ARGB pixels from source image with slope to a row of destination. |
4725 LIBYUV_API | 4788 LIBYUV_API |
4726 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, | 4789 void ARGBAffineRow_SSE2(const uint8* src_argb, |
4727 uint8* dst_argb, const float* src_dudv, int width) { | 4790 int src_argb_stride, |
| 4791 uint8* dst_argb, |
| 4792 const float* src_dudv, |
| 4793 int width) { |
4728 intptr_t src_argb_stride_temp = src_argb_stride; | 4794 intptr_t src_argb_stride_temp = src_argb_stride; |
4729 intptr_t temp; | 4795 intptr_t temp; |
4730 asm volatile ( | 4796 asm volatile ( |
4731 "movq " MEMACCESS(3) ",%%xmm2 \n" | 4797 "movq " MEMACCESS(3) ",%%xmm2 \n" |
4732 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" | 4798 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" |
4733 "shl $0x10,%1 \n" | 4799 "shl $0x10,%1 \n" |
4734 "add $0x4,%1 \n" | 4800 "add $0x4,%1 \n" |
4735 "movd %1,%%xmm5 \n" | 4801 "movd %1,%%xmm5 \n" |
4736 "sub $0x4,%4 \n" | 4802 "sub $0x4,%4 \n" |
4737 "jl 49f \n" | 4803 "jl 49f \n" |
(...skipping 63 matching lines...)
4801 "=&r"(temp) // %5 | 4867 "=&r"(temp) // %5 |
4802 : | 4868 : |
4803 : "memory", "cc", NACL_R14 | 4869 : "memory", "cc", NACL_R14 |
4804 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 4870 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
4805 ); | 4871 ); |
4806 } | 4872 } |
4807 #endif // HAS_ARGBAFFINEROW_SSE2 | 4873 #endif // HAS_ARGBAFFINEROW_SSE2 |
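// For reference, a plain-C sketch of the affine copy: src_dudv packs the
// starting texture coordinate (u, v) followed by the per-pixel step
// (du, dv), and each destination pixel fetches the nearest source pixel
// (coordinates truncate, as cvttps2dq does). Illustrative only:
static void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
                                 uint8* dst_argb, const float* src_dudv,
                                 int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int x = (int)u;
    int y = (int)v;
    *(uint32*)(dst_argb + i * 4) =
        *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
    u += src_dudv[2];
    v += src_dudv[3];
  }
}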
4808 | 4874 |
4809 #ifdef HAS_INTERPOLATEROW_SSSE3 | 4875 #ifdef HAS_INTERPOLATEROW_SSSE3 |
4810 // Bilinear filter 16x2 -> 16x1 | 4876 // Bilinear filter 16x2 -> 16x1 |
4811 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 4877 void InterpolateRow_SSSE3(uint8* dst_ptr, |
4812 ptrdiff_t src_stride, int dst_width, | 4878 const uint8* src_ptr, |
| 4879 ptrdiff_t src_stride, |
| 4880 int dst_width, |
4813 int source_y_fraction) { | 4881 int source_y_fraction) { |
4814 asm volatile ( | 4882 asm volatile ( |
4815 "sub %1,%0 \n" | 4883 "sub %1,%0 \n" |
4816 "cmp $0x0,%3 \n" | 4884 "cmp $0x0,%3 \n" |
4817 "je 100f \n" | 4885 "je 100f \n" |
4818 "cmp $0x80,%3 \n" | 4886 "cmp $0x80,%3 \n" |
4819 "je 50f \n" | 4887 "je 50f \n" |
4820 | 4888 |
4821 "movd %3,%%xmm0 \n" | 4889 "movd %3,%%xmm0 \n" |
4822 "neg %3 \n" | 4890 "neg %3 \n" |
(...skipping 59 matching lines...)
4882 "+r"(source_y_fraction) // %3 | 4950 "+r"(source_y_fraction) // %3 |
4883 : "r"((intptr_t)(src_stride)) // %4 | 4951 : "r"((intptr_t)(src_stride)) // %4 |
4884 : "memory", "cc", "eax", NACL_R14 | 4952 : "memory", "cc", "eax", NACL_R14 |
4885 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 4953 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
4886 ); | 4954 ); |
4887 } | 4955 } |
4888 #endif // HAS_INTERPOLATEROW_SSSE3 | 4956 #endif // HAS_INTERPOLATEROW_SSSE3 |
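// For reference, a plain-C model of the two-row blend: each output byte
// mixes the rows by source_y_fraction / 256 (the asm special-cases 0, a
// copy, and 128, an average). Rounding here is a sketch and may differ by
// one from the SIMD path:
static void InterpolateRow_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                  ptrdiff_t src_stride, int dst_width,
                                  int source_y_fraction) {
  const uint8* src_ptr1 = src_ptr + src_stride;  // the row below
  int f1 = source_y_fraction;
  int f0 = 256 - f1;
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] = (uint8)((src_ptr[i] * f0 + src_ptr1[i] * f1 + 128) >> 8);
  }
}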
4889 | 4957 |
4890 #ifdef HAS_INTERPOLATEROW_AVX2 | 4958 #ifdef HAS_INTERPOLATEROW_AVX2 |
4891 // Bilinear filter 32x2 -> 32x1 | 4959 // Bilinear filter 32x2 -> 32x1 |
4892 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, | 4960 void InterpolateRow_AVX2(uint8* dst_ptr, |
4893 ptrdiff_t src_stride, int dst_width, | 4961 const uint8* src_ptr, |
| 4962 ptrdiff_t src_stride, |
| 4963 int dst_width, |
4894 int source_y_fraction) { | 4964 int source_y_fraction) { |
4895 asm volatile ( | 4965 asm volatile ( |
4896 "cmp $0x0,%3 \n" | 4966 "cmp $0x0,%3 \n" |
4897 "je 100f \n" | 4967 "je 100f \n" |
4898 "sub %1,%0 \n" | 4968 "sub %1,%0 \n" |
4899 "cmp $0x80,%3 \n" | 4969 "cmp $0x80,%3 \n" |
4900 "je 50f \n" | 4970 "je 50f \n" |
4901 | 4971 |
4902 "vmovd %3,%%xmm0 \n" | 4972 "vmovd %3,%%xmm0 \n" |
4903 "neg %3 \n" | 4973 "neg %3 \n" |
(...skipping 54 matching lines...)
4958 "+r"(source_y_fraction) // %3 | 5028 "+r"(source_y_fraction) // %3 |
4959 : "r"((intptr_t)(src_stride)) // %4 | 5029 : "r"((intptr_t)(src_stride)) // %4 |
4960 : "memory", "cc", "eax", NACL_R14 | 5030 : "memory", "cc", "eax", NACL_R14 |
4961 "xmm0", "xmm1", "xmm2", "xmm4", "xmm5" | 5031 "xmm0", "xmm1", "xmm2", "xmm4", "xmm5" |
4962 ); | 5032 ); |
4963 } | 5033 } |
4964 #endif // HAS_INTERPOLATEROW_AVX2 | 5034 #endif // HAS_INTERPOLATEROW_AVX2 |
4965 | 5035 |
4966 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 | 5036 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 |
4967 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 5037 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
4968 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 5038 void ARGBShuffleRow_SSSE3(const uint8* src_argb, |
4969 const uint8* shuffler, int width) { | 5039 uint8* dst_argb, |
| 5040 const uint8* shuffler, |
| 5041 int width) { |
4970 asm volatile ( | 5042 asm volatile ( |
4971 "movdqu " MEMACCESS(3) ",%%xmm5 \n" | 5043 "movdqu " MEMACCESS(3) ",%%xmm5 \n" |
4972 LABELALIGN | 5044 LABELALIGN |
4973 "1: \n" | 5045 "1: \n" |
4974 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 5046 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
4975 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 5047 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
4976 "lea " MEMLEA(0x20,0) ",%0 \n" | 5048 "lea " MEMLEA(0x20,0) ",%0 \n" |
4977 "pshufb %%xmm5,%%xmm0 \n" | 5049 "pshufb %%xmm5,%%xmm0 \n" |
4978 "pshufb %%xmm5,%%xmm1 \n" | 5050 "pshufb %%xmm5,%%xmm1 \n" |
4979 "movdqu %%xmm0," MEMACCESS(1) " \n" | 5051 "movdqu %%xmm0," MEMACCESS(1) " \n" |
4980 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | 5052 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
4981 "lea " MEMLEA(0x20,1) ",%1 \n" | 5053 "lea " MEMLEA(0x20,1) ",%1 \n" |
4982 "sub $0x8,%2 \n" | 5054 "sub $0x8,%2 \n" |
4983 "jg 1b \n" | 5055 "jg 1b \n" |
4984 : "+r"(src_argb), // %0 | 5056 : "+r"(src_argb), // %0 |
4985 "+r"(dst_argb), // %1 | 5057 "+r"(dst_argb), // %1 |
4986 "+r"(width) // %2 | 5058 "+r"(width) // %2 |
4987 : "r"(shuffler) // %3 | 5059 : "r"(shuffler) // %3 |
4988 : "memory", "cc" | 5060 : "memory", "cc" |
4989 , "xmm0", "xmm1", "xmm5" | 5061 , "xmm0", "xmm1", "xmm5" |
4990 ); | 5062 ); |
4991 } | 5063 } |
4992 #endif // HAS_ARGBSHUFFLEROW_SSSE3 | 5064 #endif // HAS_ARGBSHUFFLEROW_SSSE3 |
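// For reference, the shuffle in plain C: shuffler gives, for each byte
// position within an output pixel, the index of the source byte to copy;
// pshufb applies the same table 16 bytes at a time. A sketch:
static void ARGBShuffleRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  const uint8* shuffler, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_argb[i * 4 + 0] = src_argb[i * 4 + shuffler[0]];
    dst_argb[i * 4 + 1] = src_argb[i * 4 + shuffler[1]];
    dst_argb[i * 4 + 2] = src_argb[i * 4 + shuffler[2]];
    dst_argb[i * 4 + 3] = src_argb[i * 4 + shuffler[3]];
  }
}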
4993 | 5065 |
4994 #ifdef HAS_ARGBSHUFFLEROW_AVX2 | 5066 #ifdef HAS_ARGBSHUFFLEROW_AVX2 |
4995 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 5067 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
4996 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 5068 void ARGBShuffleRow_AVX2(const uint8* src_argb, |
4997 const uint8* shuffler, int width) { | 5069 uint8* dst_argb, |
| 5070 const uint8* shuffler, |
| 5071 int width) { |
4998 asm volatile ( | 5072 asm volatile ( |
4999 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" | 5073 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" |
5000 LABELALIGN | 5074 LABELALIGN |
5001 "1: \n" | 5075 "1: \n" |
5002 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 5076 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
5003 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 5077 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
5004 "lea " MEMLEA(0x40,0) ",%0 \n" | 5078 "lea " MEMLEA(0x40,0) ",%0 \n" |
5005 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" | 5079 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" |
5006 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" | 5080 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" |
5007 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | 5081 "vmovdqu %%ymm0," MEMACCESS(1) " \n" |
5008 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" | 5082 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" |
5009 "lea " MEMLEA(0x40,1) ",%1 \n" | 5083 "lea " MEMLEA(0x40,1) ",%1 \n" |
5010 "sub $0x10,%2 \n" | 5084 "sub $0x10,%2 \n" |
5011 "jg 1b \n" | 5085 "jg 1b \n" |
5012 "vzeroupper \n" | 5086 "vzeroupper \n" |
5013 : "+r"(src_argb), // %0 | 5087 : "+r"(src_argb), // %0 |
5014 "+r"(dst_argb), // %1 | 5088 "+r"(dst_argb), // %1 |
5015 "+r"(width) // %2 | 5089 "+r"(width) // %2 |
5016 : "r"(shuffler) // %3 | 5090 : "r"(shuffler) // %3 |
5017 : "memory", "cc" | 5091 : "memory", "cc" |
5018 , "xmm0", "xmm1", "xmm5" | 5092 , "xmm0", "xmm1", "xmm5" |
5019 ); | 5093 ); |
5020 } | 5094 } |
5021 #endif // HAS_ARGBSHUFFLEROW_AVX2 | 5095 #endif // HAS_ARGBSHUFFLEROW_AVX2 |
5022 | 5096 |
5023 #ifdef HAS_ARGBSHUFFLEROW_SSE2 | 5097 #ifdef HAS_ARGBSHUFFLEROW_SSE2 |
5024 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 5098 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
5025 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 5099 void ARGBShuffleRow_SSE2(const uint8* src_argb, |
5026 const uint8* shuffler, int width) { | 5100 uint8* dst_argb, |
| 5101 const uint8* shuffler, |
| 5102 int width) { |
5027 uintptr_t pixel_temp; | 5103 uintptr_t pixel_temp; |
5028 asm volatile ( | 5104 asm volatile ( |
5029 "pxor %%xmm5,%%xmm5 \n" | 5105 "pxor %%xmm5,%%xmm5 \n" |
5030 "mov " MEMACCESS(4) ",%k2 \n" | 5106 "mov " MEMACCESS(4) ",%k2 \n" |
5031 "cmp $0x3000102,%k2 \n" | 5107 "cmp $0x3000102,%k2 \n" |
5032 "je 3012f \n" | 5108 "je 3012f \n" |
5033 "cmp $0x10203,%k2 \n" | 5109 "cmp $0x10203,%k2 \n" |
5034 "je 123f \n" | 5110 "je 123f \n" |
5035 "cmp $0x30201,%k2 \n" | 5111 "cmp $0x30201,%k2 \n" |
5036 "je 321f \n" | 5112 "je 321f \n" |
(...skipping 100 matching lines...)
5137 : "memory", "cc", NACL_R14 | 5213 : "memory", "cc", NACL_R14 |
5138 "xmm0", "xmm1", "xmm5" | 5214 "xmm0", "xmm1", "xmm5" |
5139 ); | 5215 ); |
5140 } | 5216 } |
5141 #endif // HAS_ARGBSHUFFLEROW_SSE2 | 5217 #endif // HAS_ARGBSHUFFLEROW_SSE2 |
5142 | 5218 |
5143 #ifdef HAS_I422TOYUY2ROW_SSE2 | 5219 #ifdef HAS_I422TOYUY2ROW_SSE2 |
5144 void I422ToYUY2Row_SSE2(const uint8* src_y, | 5220 void I422ToYUY2Row_SSE2(const uint8* src_y, |
5145 const uint8* src_u, | 5221 const uint8* src_u, |
5146 const uint8* src_v, | 5222 const uint8* src_v, |
5147 uint8* dst_frame, int width) { | 5223 uint8* dst_frame, |
5148 asm volatile ( | 5224 int width) { |
| 5225 asm volatile ( |
5149 "sub %1,%2 \n" | 5226 "sub %1,%2 \n" |
5150 LABELALIGN | 5227 LABELALIGN |
5151 "1: \n" | 5228 "1: \n" |
5152 "movq " MEMACCESS(1) ",%%xmm2 \n" | 5229 "movq " MEMACCESS(1) ",%%xmm2 \n" |
5153 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 | 5230 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 |
5154 "lea " MEMLEA(0x8,1) ",%1 \n" | 5231 "lea " MEMLEA(0x8,1) ",%1 \n" |
5155 "punpcklbw %%xmm3,%%xmm2 \n" | 5232 "punpcklbw %%xmm3,%%xmm2 \n" |
5156 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 5233 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
5157 "lea " MEMLEA(0x10,0) ",%0 \n" | 5234 "lea " MEMLEA(0x10,0) ",%0 \n" |
5158 "movdqa %%xmm0,%%xmm1 \n" | 5235 "movdqa %%xmm0,%%xmm1 \n" |
(...skipping 13 matching lines...)
5172 : "memory", "cc", NACL_R14 | 5249 : "memory", "cc", NACL_R14 |
5173 "xmm0", "xmm1", "xmm2", "xmm3" | 5250 "xmm0", "xmm1", "xmm2", "xmm3" |
5174 ); | 5251 ); |
5175 } | 5252 } |
5176 #endif // HAS_I422TOYUY2ROW_SSE2 | 5253 #endif // HAS_I422TOYUY2ROW_SSE2 |
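// For reference, the YUY2 byte order in plain C: two Y samples share one U
// and one V, packed Y0 U Y1 V (the UYVY variant below packs U Y0 V Y1).
// A sketch for even widths; the odd-width tail is omitted:
static void I422ToYUY2Row_Sketch(const uint8* src_y, const uint8* src_u,
                                 const uint8* src_v, uint8* dst_frame,
                                 int width) {
  int i;
  for (i = 0; i < width - 1; i += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}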
5177 | 5254 |
5178 #ifdef HAS_I422TOUYVYROW_SSE2 | 5255 #ifdef HAS_I422TOUYVYROW_SSE2 |
5179 void I422ToUYVYRow_SSE2(const uint8* src_y, | 5256 void I422ToUYVYRow_SSE2(const uint8* src_y, |
5180 const uint8* src_u, | 5257 const uint8* src_u, |
5181 const uint8* src_v, | 5258 const uint8* src_v, |
5182 uint8* dst_frame, int width) { | 5259 uint8* dst_frame, |
5183 asm volatile ( | 5260 int width) { |
| 5261 asm volatile ( |
5184 "sub %1,%2 \n" | 5262 "sub %1,%2 \n" |
5185 LABELALIGN | 5263 LABELALIGN |
5186 "1: \n" | 5264 "1: \n" |
5187 "movq " MEMACCESS(1) ",%%xmm2 \n" | 5265 "movq " MEMACCESS(1) ",%%xmm2 \n" |
5188 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 | 5266 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 |
5189 "lea " MEMLEA(0x8,1) ",%1 \n" | 5267 "lea " MEMLEA(0x8,1) ",%1 \n" |
5190 "punpcklbw %%xmm3,%%xmm2 \n" | 5268 "punpcklbw %%xmm3,%%xmm2 \n" |
5191 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 5269 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
5192 "movdqa %%xmm2,%%xmm1 \n" | 5270 "movdqa %%xmm2,%%xmm1 \n" |
5193 "lea " MEMLEA(0x10,0) ",%0 \n" | 5271 "lea " MEMLEA(0x10,0) ",%0 \n" |
(...skipping 11 matching lines...)
5205 "+rm"(width) // %4 | 5283 "+rm"(width) // %4 |
5206 : | 5284 : |
5207 : "memory", "cc", NACL_R14 | 5285 : "memory", "cc", NACL_R14 |
5208 "xmm0", "xmm1", "xmm2", "xmm3" | 5286 "xmm0", "xmm1", "xmm2", "xmm3" |
5209 ); | 5287 ); |
5210 } | 5288 } |
5211 #endif // HAS_I422TOUYVYROW_SSE2 | 5289 #endif // HAS_I422TOUYVYROW_SSE2 |
5212 | 5290 |
5213 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 | 5291 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 |
5214 void ARGBPolynomialRow_SSE2(const uint8* src_argb, | 5292 void ARGBPolynomialRow_SSE2(const uint8* src_argb, |
5215 uint8* dst_argb, const float* poly, | 5293 uint8* dst_argb, |
| 5294 const float* poly, |
5216 int width) { | 5295 int width) { |
5217 asm volatile ( | 5296 asm volatile ( |
5218 "pxor %%xmm3,%%xmm3 \n" | 5297 "pxor %%xmm3,%%xmm3 \n" |
5219 | 5298 |
5220 // 2 pixel loop. | 5299 // 2 pixel loop. |
5221 LABELALIGN | 5300 LABELALIGN |
5222 "1: \n" | 5301 "1: \n" |
5223 "movq " MEMACCESS(0) ",%%xmm0 \n" | 5302 "movq " MEMACCESS(0) ",%%xmm0 \n" |
5224 "lea " MEMLEA(0x8,0) ",%0 \n" | 5303 "lea " MEMLEA(0x8,0) ",%0 \n" |
5225 "punpcklbw %%xmm3,%%xmm0 \n" | 5304 "punpcklbw %%xmm3,%%xmm0 \n" |
(...skipping 35 matching lines...)
5261 "+r"(width) // %2 | 5340 "+r"(width) // %2 |
5262 : "r"(poly) // %3 | 5341 : "r"(poly) // %3 |
5263 : "memory", "cc" | 5342 : "memory", "cc" |
5264 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 5343 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
5265 ); | 5344 ); |
5266 } | 5345 } |
5267 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 | 5346 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 |
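// For reference, a plain-C sketch of the cubic polynomial: poly holds 16
// floats, four coefficient vectors (constant, linear, quadratic, cubic)
// with one float per B,G,R,A channel, so channel c uses poly[c + 4k]:
static void ARGBPolynomialRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                     const float* poly, int width) {
  int i, c;
  for (i = 0; i < width; ++i) {
    for (c = 0; c < 4; ++c) {
      float v = (float)src_argb[i * 4 + c];
      float r = poly[c] + poly[c + 4] * v + poly[c + 8] * v * v +
                poly[c + 12] * v * v * v;
      if (r < 0.f) {
        r = 0.f;  // clamp to byte range, as the packed saturation does
      }
      if (r > 255.f) {
        r = 255.f;
      }
      dst_argb[i * 4 + c] = (uint8)r;
    }
  }
}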
5268 | 5347 |
5269 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 | 5348 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 |
5270 void ARGBPolynomialRow_AVX2(const uint8* src_argb, | 5349 void ARGBPolynomialRow_AVX2(const uint8* src_argb, |
5271 uint8* dst_argb, const float* poly, | 5350 uint8* dst_argb, |
| 5351 const float* poly, |
5272 int width) { | 5352 int width) { |
5273 asm volatile ( | 5353 asm volatile ( |
5274 "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" | 5354 "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" |
5275 "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" | 5355 "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" |
5276 "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" | 5356 "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" |
5277 "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" | 5357 "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" |
5278 | 5358 |
5279 // 2 pixel loop. | 5359 // 2 pixel loop. |
5280 LABELALIGN | 5360 LABELALIGN |
5281 "1: \n" | 5361 "1: \n" |
(...skipping 155 matching lines...)
5437 "+r"(width) // %2 | 5517 "+r"(width) // %2 |
5438 : | 5518 : |
5439 : "memory", "cc", | 5519 : "memory", "cc", |
5440 "xmm2", "xmm3" | 5520 "xmm2", "xmm3" |
5441 ); | 5521 ); |
5442 } | 5522 } |
5443 #endif // HAS_HALFFLOATROW_F16C | 5523 #endif // HAS_HALFFLOATROW_F16C |
5444 | 5524 |
5445 #ifdef HAS_ARGBCOLORTABLEROW_X86 | 5525 #ifdef HAS_ARGBCOLORTABLEROW_X86 |
5446 // Transform ARGB pixels with color table. | 5526 // Transform ARGB pixels with color table. |
5447 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, | 5527 void ARGBColorTableRow_X86(uint8* dst_argb, |
| 5528 const uint8* table_argb, |
5448 int width) { | 5529 int width) { |
5449 uintptr_t pixel_temp; | 5530 uintptr_t pixel_temp; |
5450 asm volatile ( | 5531 asm volatile ( |
5451 // 1 pixel loop. | 5532 // 1 pixel loop. |
5452 LABELALIGN | 5533 LABELALIGN |
5453 "1: \n" | 5534 "1: \n" |
5454 "movzb " MEMACCESS(0) ",%1 \n" | 5535 "movzb " MEMACCESS(0) ",%1 \n" |
5455 "lea " MEMLEA(0x4,0) ",%0 \n" | 5536 "lea " MEMLEA(0x4,0) ",%0 \n" |
5456 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 | 5537 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 |
5457 "mov %b1," MEMACCESS2(-0x4,0) " \n" | 5538 "mov %b1," MEMACCESS2(-0x4,0) " \n" |
(...skipping 39 matching lines...)
5497 : "+r"(dst_argb), // %0 | 5578 : "+r"(dst_argb), // %0 |
5498 "=&d"(pixel_temp), // %1 | 5579 "=&d"(pixel_temp), // %1 |
5499 "+r"(width) // %2 | 5580 "+r"(width) // %2 |
5500 : "r"(table_argb) // %3 | 5581 : "r"(table_argb) // %3 |
5501 : "memory", "cc"); | 5582 : "memory", "cc"); |
5502 } | 5583 } |
5503 #endif // HAS_RGBCOLORTABLEROW_X86 | 5584 #endif // HAS_RGBCOLORTABLEROW_X86 |
5504 | 5585 |
5505 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5586 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5506 // Transform RGB pixels with luma table. | 5587 // Transform RGB pixels with luma table. |
5507 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 5588 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, |
| 5589 uint8* dst_argb, |
5508 int width, | 5590 int width, |
5509 const uint8* luma, uint32 lumacoeff) { | 5591 const uint8* luma, |
| 5592 uint32 lumacoeff) { |
5510 uintptr_t pixel_temp; | 5593 uintptr_t pixel_temp; |
5511 uintptr_t table_temp; | 5594 uintptr_t table_temp; |
5512 asm volatile ( | 5595 asm volatile ( |
5513 "movd %6,%%xmm3 \n" | 5596 "movd %6,%%xmm3 \n" |
5514 "pshufd $0x0,%%xmm3,%%xmm3 \n" | 5597 "pshufd $0x0,%%xmm3,%%xmm3 \n" |
5515 "pcmpeqb %%xmm4,%%xmm4 \n" | 5598 "pcmpeqb %%xmm4,%%xmm4 \n" |
5516 "psllw $0x8,%%xmm4 \n" | 5599 "psllw $0x8,%%xmm4 \n" |
5517 "pxor %%xmm5,%%xmm5 \n" | 5600 "pxor %%xmm5,%%xmm5 \n" |
5518 | 5601 |
5519 // 4 pixel loop. | 5602 // 4 pixel loop. |
(...skipping 77 matching lines...)
5597 "+rm"(width) // %4 | 5680 "+rm"(width) // %4 |
5598 : "r"(luma), // %5 | 5681 : "r"(luma), // %5 |
5599 "rm"(lumacoeff) // %6 | 5682 "rm"(lumacoeff) // %6 |
5600 : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" | 5683 : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" |
5601 ); | 5684 ); |
5602 } | 5685 } |
5603 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5686 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5604 | 5687 |
5605 #endif // defined(__x86_64__) || defined(__i386__) | 5688 #endif // defined(__x86_64__) || defined(__i386__) |
5606 | 5689 |
5607 // clang-format on | |
5608 | |
5609 #ifdef __cplusplus | 5690 #ifdef __cplusplus |
5610 } // extern "C" | 5691 } // extern "C" |
5611 } // namespace libyuv | 5692 } // namespace libyuv |
5612 #endif | 5693 #endif |