OLD | NEW |
| (Empty) |
1 ; Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
2 ; Use of this source code is governed by a BSD-style license that can be | |
3 ; found in the LICENSE file. | |
4 | |
5 %include "media/base/simd/media_export.asm" | |
6 %include "third_party/x86inc/x86inc.asm" | |
7 | |
;
; This file uses SSE, SSE2, SSE3, and SSSE3, which are supported by all ATOM
; processors.
;
SECTION_TEXT
; Declare every instruction set named above. SSE2 must be listed because the
; macros below use SSE2 instructions (movdqa, pshufd, psrldq, ...).
CPU SSE, SSE2, SSE3, SSSE3
14 | |
;
; Aliases for the XMM registers that hold conversion constants. The macros in
; this file only ever read these registers; they must never appear as a
; destination operand.
;
; The four coefficient registers repeat one four-byte pattern in each of their
; four doublewords. The pattern lines up with the bytes of one pixel in memory
; order (B, G, R, A):
;
;   register       b[i]  b[i+1]  b[i+2]  b[i+3]     (i = 0, 4, 8, 12)
;   XMM_CONST_Y0    25      2      66      0
;   XMM_CONST_Y1     0    127       0      0
;   XMM_CONST_U    112    -74     -38      0
;   XMM_CONST_V    -18    -94     112      0
;
; Y0 and Y1 together give the green coefficient 2 + 127 = 129 used by CALC_Y.
;
; XMM_CONST_128 is added to 16-bit lanes by CALC_Y and CALC_UV as the value
; 128; it is loaded outside this file.
; NOTE(review): presumably every 16-bit lane holds 128 -- confirm in the .inc.
;
%define XMM_CONST_Y0    xmm7
%define XMM_CONST_Y1    xmm6
%define XMM_CONST_U     xmm5
%define XMM_CONST_V     xmm4
%define XMM_CONST_128   xmm3
30 | |
;
; LOAD_XMM %1 (xmm), %2 (imm32)
; Broadcasts a 32-bit immediate to all four doublewords of an XMM register:
;   %1.d[0] = %1.d[1] = %1.d[2] = %1.d[3] = %2;
; Clobbers TEMPd, which is used to carry the immediate into the XMM register.
;
%macro LOAD_XMM 2
  mov           TEMPd, %2               ; stage the immediate in a GPR
  movd          %1, TEMPd               ; %1.d[0] = %2, upper lanes = 0
  pshufd        %1, %1, 0               ; copy d[0] into d[1], d[2] and d[3]
%endmacro
41 | |
;
; UNPACKRGB %1 (xmm), %2 (imm8)
; Opens a one-byte gap at byte position %2 of %1 and fills it with zero.
; Bytes below %2 stay in place, bytes at and above %2 move up by one position,
; and the former top byte (b[15]) is dropped:
;   for (int i = 15; i > %2; --i) %1.b[i] = %1.b[i - 1];
;   %1.b[%2] = 0;
; Applied at the byte after a packed RGB triple, this turns that triple into
; a four-byte pixel whose fourth (alpha) byte is zero.
; Clobbers xmm1, so %1 must not be xmm1.
;
%macro UNPACKRGB 2
  movdqa        xmm1, %1                ; xmm1 = copy of the input
  psrldq        xmm1, %2                ; shift the low %2 bytes out ...
  pslldq        xmm1, %2                ; ... xmm1 = input with b[0..%2-1] = 0
  pxor          %1, xmm1                ; %1 keeps only b[0..%2-1]
  pslldq        xmm1, 1                 ; move the upper part up by one byte
  por           %1, xmm1                ; merge: b[%2] is left as zero
%endmacro
57 | |
;
; READ_ARGB %1 (xmm), %2 (imm)
; Reads %2 pixels (1, 2 or 4) from the source row and stores them in %1 as
; four-byte ARGB pixels, starting at byte 0 of %1.
;
; The pixels are read from ARGBq + WIDTHq * PIXELSIZE * 2, i.e. WIDTHq counts
; in units of two pixels.
;
;   PIXELSIZE == 4: the source already holds ARGB pixels; they are loaded as-is
;                   with unaligned loads.
;   PIXELSIZE == 3: the source holds packed RGB pixels; each one is widened to
;                   four bytes and its alpha byte is set to 0.
;
; Any other PIXELSIZE or pixel count is rejected at assembly time.
;
; Clobbers: TEMP (RGB paths), xmm1 (RGB paths; so %1 must not be xmm1).
; WIDTHq is modified temporarily in the one-pixel RGB path and restored.
;
%macro READ_ARGB 2

%if PIXELSIZE == 4

  ; ARGB source. The buffer is not assumed to be 16-byte aligned, so the
  ; four-pixel case uses movdqu.
%if %2 == 1
  movd          %1, DWORD [ARGBq + WIDTHq * 4 * 2]
%elif %2 == 2
  movq          %1, QWORD [ARGBq + WIDTHq * 4 * 2]
%elif %2 == 4
  movdqu        %1, DQWORD [ARGBq + WIDTHq * 4 * 2]
%else
%error unsupported number of pixels.
%endif

%elif PIXELSIZE == 3

  ; RGB source: read 3 * %2 bytes and expand them to ARGB.
%if %2 == 1
  ; One pixel (three bytes). TEMP has to receive the data, so it cannot also
  ; hold the scaled index; WIDTHq is scaled in place instead and its original
  ; value is parked in xmm1.
  MOVq          xmm1, WIDTHq

  ; WIDTHq *= 3, so that [ARGBq + WIDTHq * 2] is the pixel address.
  lea           WIDTHq, [WIDTHq + WIDTHq * 2]
  ; TEMPd = byte 2 << 16 | bytes 1..0. The top byte (alpha) stays zero.
  movzx         TEMPd, BYTE [ARGBq + WIDTHq * 2 + 2]
  shl           TEMPd, 16
  mov           TEMPw, WORD [ARGBq + WIDTHq * 2]
  movd          %1, TEMPd

  ; Put the original WIDTHq back.
  MOVq          WIDTHq, xmm1
%elif %2 == 2
  ; Two pixels (six bytes). TEMPq = WIDTHq * 3.
  mov           TEMPq, WIDTHq
  lea           TEMPq, [TEMPq + TEMPq * 2]
  ; Source bytes 0..3 go to register bytes 0..3.
  movd          %1, DWORD [ARGBq + TEMPq * 2]
  ; Source bytes 4..5 must follow directly at register bytes 4..5, which is
  ; word index 2. (Index 3 would place them at bytes 6..7 and leave a hole, so
  ; the second pixel would come out as B, 0, 0, G after UNPACKRGB.)
  pinsrw        %1, WORD [ARGBq + TEMPq * 2 + 4], 2

  ; Insert a zero alpha byte after the first pixel. The second pixel's alpha
  ; byte (byte 7) is already zero.
  UNPACKRGB     %1, 3
%elif %2 == 4
  ; Four pixels (twelve bytes). TEMPq = WIDTHq * 3.
  mov           TEMPq, WIDTHq
  lea           TEMPq, [TEMPq + TEMPq * 2]
  ; Load bytes 0..7 into %1 and bytes 8..11 into xmm1, then join them:
  ; %1.d[0..1] stay, %1.d[2..3] = xmm1.d[0..1].
  movq          %1, QWORD [ARGBq + TEMPq * 2]
  movd          xmm1, DWORD [ARGBq + TEMPq * 2 + 8]
  shufps        %1, xmm1, 01000100B

  ; Insert a zero alpha byte after each of the first three pixels. Each
  ; insertion shifts the rest up, so the positions are 3, 7 and 11. The fourth
  ; pixel's alpha byte (byte 15) is already zero.
  UNPACKRGB     %1, 3
  UNPACKRGB     %1, 4 + 3
  UNPACKRGB     %1, 4 + 4 + 3
%else
%error unsupported number of pixels.
%endif

%else
%error unsupported PIXELSIZE value.
%endif

%endmacro
133 | |
;
; CALC_Y %1 (xmm), %2 (xmm)
; Computes four Y bytes from the four ARGB pixels held in %2. For pixel n:
;   %1.b[n] = ToByte((25 * B(n) + 129 * G(n) + 66 * R(n) + 128) / 256 + 16);
; %2 is only read. Clobbers xmm2.
;
%macro CALC_Y 2
  ; pmaddubsw/phaddsw work with signed 16-bit saturation, so the weighted sum
  ; is built in two parts that each stay below 32768; the green coefficient
  ; 129 is split into 2 (XMM_CONST_Y0) and 127 (XMM_CONST_Y1).

  ; Part 1, per pixel n (bytes 4n .. 4n+3 of %2):
  ;   %1.w[n] = 25 * b[4n] + 2 * b[4n+1] + 66 * b[4n+2] + 0 * b[4n+3];
  movdqa        %1, %2
  pmaddubsw     %1, XMM_CONST_Y0        ; multiply bytes, add adjacent pairs
  phaddsw       %1, %1                  ; add the two pair sums of each pixel

  ; Part 2, per pixel n:
  ;   xmm2.w[n] = 0 * b[4n] + 127 * b[4n+1] + 0 * b[4n+2] + 0 * b[4n+3];
  movdqa        xmm2, %2
  pmaddubsw     xmm2, XMM_CONST_Y1
  phaddsw       xmm2, xmm2

  ; Combine, round, scale and add the offset:
  ;   %1.b[n] = ToByte((%1.w[n] + xmm2.w[n] + 128) / 256 + 16);
  paddw         %1, xmm2                ; full weighted sum (wrapping add)
  movdqa        xmm2, XMM_CONST_128     ; work on a copy of the constant
  paddw         %1, xmm2                ; + 128 for rounding
  psrlw         %1, 8                   ; / 256, logical shift
  psrlw         xmm2, 3                 ; 128 >> 3 = 16
  paddw         %1, xmm2                ; + 16
  packuswb      %1, %1                  ; words -> bytes, unsigned saturation
%endmacro
172 | |
;
; INIT_UV %1 (r32), %2 (reg) %3 (imm)
; When both SUBSAMPLING == 1 and LINE == 1, loads the bytes already stored at
; [%2 + WIDTHq] into %1, zero-extended, so that CALC_UV can average them with
; the newly computed values:
;   %3 == 1 or %3 == 2: one byte is loaded.
;   %3 == 4:            two bytes (one word) are loaded.
; Any other %3 is rejected at assembly time.
; For every other SUBSAMPLING/LINE combination this macro emits nothing (and
; %3 is not checked).
;
%macro INIT_UV 3

%if SUBSAMPLING == 1 && LINE == 1
%if %3 == 1 || %3 == 2
  movzx         %1, BYTE [%2 + WIDTHq]  ; one stored U (or V) value
%elif %3 == 4
  movzx         %1, WORD [%2 + WIDTHq]  ; two stored U (or V) values
%else
%error unsupported number of pixels.
%endif
%endif

%endmacro
189 | |
;
; CALC_UV %1 (xmm), %2 (xmm), %3 (xmm), %4 (r32)
; Computes two U (or V) bytes from the four ARGB pixels held in %2, using the
; coefficients in %3 (XMM_CONST_U or XMM_CONST_V).
;
; With S(n) denoting the weighted sum for pixel n,
;   %3 == XMM_CONST_U: S(n) = 112 * B(n) - 74 * G(n) -  38 * R(n)
;   %3 == XMM_CONST_V: S(n) = -18 * B(n) - 94 * G(n) + 112 * R(n)
; the result is
;   if (SUBSAMPLING) {
;     T(0) = (S(0) + S(1) + 1) / 2;   T(1) = (S(2) + S(3) + 1) / 2;
;   } else {
;     T(0) = S(0);                    T(1) = S(2);
;   }
;   %1.b[0] = ToByte((T(0) + 128) / 256 + 128);
;   %1.b[1] = ToByte((T(1) + 128) / 256 + 128);
; and, when SUBSAMPLING == 1 && LINE == 1, each byte is then averaged (with
; rounding) with the corresponding byte of %4.
;
; %2 and %3 are only read. Clobbers xmm2 when SUBSAMPLING == 1.
;
%macro CALC_UV 4
  ; Weighted sum per pixel:
  ;   for (int i = 0; i < 4; ++i) {
  ;     %1.w[i] = 0;
  ;     for (int j = 0; j < 4; ++j)
  ;       %1.w[i] += %3.b[i * 4 + j] * %2.b[i * 4 + j];
  ;   }
  movdqa        %1, %2
  pmaddubsw     %1, %3                  ; unsigned pixel byte * signed coefficient
  phaddsw       %1, %1                  ; add the two pair sums of each pixel

%if SUBSAMPLING == 1
  ; Average each pixel with its horizontal neighbour:
  ;   %1.w[0] = %1.w[1] = (%1.w[0] + %1.w[1] + 1) / 2;
  ;   %1.w[2] = %1.w[3] = (%1.w[2] + %1.w[3] + 1) / 2;
  ; NOTE(review): pavgw is an unsigned average, while these lanes hold signed
  ; sums. When the two sums differ in sign the result looks off by 0x8000 --
  ; confirm whether this is intended or compensated elsewhere.
  pshuflw       xmm2, %1, 10110001B     ; xmm2.w[0..3] = %1.w[1], w[0], w[3], w[2]
  pavgw         %1, xmm2
%endif

  ; Pick the words for pixels 0 and 2, round, scale and add the offset:
  ;   %1.b[0] = ToByte((%1.w[0] + 128) / 256 + 128);
  ;   %1.b[1] = ToByte((%1.w[2] + 128) / 256 + 128);
  pshuflw       %1, %1, 10001000B       ; %1.w[0..3] = w[0], w[2], w[0], w[2]
  paddw         %1, XMM_CONST_128       ; + 128 for rounding
  psraw         %1, 8                   ; / 256, arithmetic shift keeps the sign
  paddw         %1, XMM_CONST_128       ; + 128
  packuswb      %1, %1                  ; words -> bytes, unsigned saturation

%if SUBSAMPLING == 1 && LINE == 1
  ; Average with the values passed in %4:
  ;   %1.b[0] = (%1.b[0] + %4.b[0] + 1) / 2;
  ;   %1.b[1] = (%1.b[1] + %4.b[1] + 1) / 2;
  movd          xmm2, %4
  pavgb         %1, xmm2
%endif
%endmacro
241 | |
;
; extern "C" void ConvertARGBToYUVRow_SSSE3(const uint8_t* argb,
;                                           uint8_t* y,
;                                           uint8_t* u,
;                                           uint8_t* v,
;                                           ptrdiff_t width);
;
; Instantiates the shared body for four-byte (ARGB) input with no U/V
; averaging (SUBSAMPLING = 0, LINE = 0).
; NOTE(review): SYMBOL is presumably the name the included file gives to the
; generated function -- the .inc file is not visible here.
;
%define SYMBOL          ConvertARGBToYUVRow_SSSE3
%define PIXELSIZE       4
%define SUBSAMPLING     0
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"
254 | |
;
; extern "C" void ConvertRGBToYUVRow_SSSE3(const uint8_t* rgb,
;                                          uint8_t* y,
;                                          uint8_t* u,
;                                          uint8_t* v,
;                                          ptrdiff_t width);
;
; Instantiates the shared body for three-byte (packed RGB) input with no U/V
; averaging (SUBSAMPLING = 0, LINE = 0). READ_ARGB widens each RGB pixel to
; four bytes with a zero alpha byte.
;
%define SYMBOL          ConvertRGBToYUVRow_SSSE3
%define PIXELSIZE       3
%define SUBSAMPLING     0
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"
267 | |
;
; extern "C" void ConvertARGBToYUVEven_SSSE3(const uint8_t* argb,
;                                            uint8_t* y,
;                                            uint8_t* u,
;                                            uint8_t* v,
;                                            ptrdiff_t width);
;
; Instantiates the shared body for four-byte (ARGB) input with
; SUBSAMPLING = 1, so CALC_UV averages each pair of horizontally adjacent
; pixels. LINE = 0: no averaging with previously stored U/V values.
;
%define SYMBOL          ConvertARGBToYUVEven_SSSE3
%define PIXELSIZE       4
%define SUBSAMPLING     1
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"
280 | |
;
; extern "C" void ConvertARGBToYUVOdd_SSSE3(const uint8_t* argb,
;                                           uint8_t* y,
;                                           uint8_t* u,
;                                           uint8_t* v,
;                                           ptrdiff_t width);
;
; Instantiates the shared body for four-byte (ARGB) input with
; SUBSAMPLING = 1 and LINE = 1: CALC_UV averages each pair of horizontally
; adjacent pixels and then averages the result with the bytes that INIT_UV
; loaded beforehand.
; NOTE(review): those bytes are presumably the U/V values written for the
; preceding (even) line -- confirm against the .inc file and the callers.
;
%define SYMBOL          ConvertARGBToYUVOdd_SSSE3
%define PIXELSIZE       4
%define SUBSAMPLING     1
%define LINE            1
%include "convert_rgb_to_yuv_ssse3.inc"
293 | |
;
; extern "C" void ConvertRGBToYUVEven_SSSE3(const uint8_t* rgb,
;                                           uint8_t* y,
;                                           uint8_t* u,
;                                           uint8_t* v,
;                                           ptrdiff_t width);
;
; Instantiates the shared body for three-byte (packed RGB) input with
; SUBSAMPLING = 1, so CALC_UV averages each pair of horizontally adjacent
; pixels. LINE = 0: no averaging with previously stored U/V values.
;
%define SYMBOL          ConvertRGBToYUVEven_SSSE3
%define PIXELSIZE       3
%define SUBSAMPLING     1
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"
306 | |
;
; extern "C" void ConvertRGBToYUVOdd_SSSE3(const uint8_t* rgb,
;                                          uint8_t* y,
;                                          uint8_t* u,
;                                          uint8_t* v,
;                                          ptrdiff_t width);
;
; Instantiates the shared body for three-byte (packed RGB) input with
; SUBSAMPLING = 1 and LINE = 1: CALC_UV averages each pair of horizontally
; adjacent pixels and then averages the result with the bytes that INIT_UV
; loaded beforehand.
; NOTE(review): those bytes are presumably the U/V values written for the
; preceding (even) line -- confirm against the .inc file and the callers.
;
%define SYMBOL          ConvertRGBToYUVOdd_SSSE3
%define PIXELSIZE       3
%define SUBSAMPLING     1
%define LINE            1
%include "convert_rgb_to_yuv_ssse3.inc"
OLD | NEW |