OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license | |
5 * that can be found in the LICENSE file in the root of the source | |
6 * tree. An additional intellectual property rights grant can be found | |
7 * in the file PATENTS. All contributing project authors may | |
8 * be found in the AUTHORS file in the root of the source tree. | |
9 */ | |
10 | |
11 #include "libyuv/row.h" | |
12 | |
13 #ifdef __cplusplus | |
14 namespace libyuv { | |
15 extern "C" { | |
16 #endif | |
17 | |
18 // This module is for GCC x86 and x64. | |
19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) | |
20 | |
// Shuffle masks and multiply coefficients for the SSSE3 3/4 and 3/8
// downscalers below.  A value of 128 in a shuffle byte produces zero
// (pshufb sign-bit semantics).

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the >>2 in the 3/4 box filters.
// (The original comment here was a copy-paste of the kMadd21 one.)
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
94 | |
95 // GCC versions of row functions are verbatim conversions from Visual C. | |
96 // Generated using gcc disassembly on Visual C object file: | |
97 // objdump -D yuvscaler.obj >yuvscaler.txt | |
98 | |
// 1/2 horizontal point sample: each iteration reads 32 source bytes and
// writes 16, keeping the odd byte of every pair (psrlw $8 shifts it into
// the low byte of each word; packuswb repacks to bytes).
// src_stride is unused (single-row sampling).  The loop step assumes
// dst_width is a positive multiple of 16 -- TODO(review): confirm callers.
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
120 | |
121 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | |
122 uint8* dst_ptr, int dst_width) { | |
123 asm volatile ( | |
124 "pcmpeqb %%xmm5,%%xmm5 \n" | |
125 "psrlw $0x8,%%xmm5 \n" | |
126 | |
127 LABELALIGN | |
128 "1: \n" | |
129 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
130 "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" | |
131 "lea " MEMLEA(0x20,0) ",%0 \n" | |
132 "movdqa %%xmm0,%%xmm2 \n" | |
133 "psrlw $0x8,%%xmm0 \n" | |
134 "movdqa %%xmm1,%%xmm3 \n" | |
135 "psrlw $0x8,%%xmm1 \n" | |
136 "pand %%xmm5,%%xmm2 \n" | |
137 "pand %%xmm5,%%xmm3 \n" | |
138 "pavgw %%xmm2,%%xmm0 \n" | |
139 "pavgw %%xmm3,%%xmm1 \n" | |
140 "packuswb %%xmm1,%%xmm0 \n" | |
141 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
142 "lea " MEMLEA(0x10,1) ",%1 \n" | |
143 "sub $0x10,%2 \n" | |
144 "jg 1b \n" | |
145 : "+r"(src_ptr), // %0 | |
146 "+r"(dst_ptr), // %1 | |
147 "+r"(dst_width) // %2 | |
148 :: "memory", "cc", "xmm0", "xmm1", "xmm5" | |
149 ); | |
150 } | |
151 | |
// 1/2 box filter: averages a 2x2 block of source pixels per output byte.
// Reads 32 bytes each from two rows (src_ptr and src_ptr + src_stride)
// and writes 16 destination bytes per iteration.
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0x00ff word mask
    "psrlw     $0x8,%%xmm5                     \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // vertical average
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"  // horizontal average
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
188 | |
// 1/4 horizontal point sample: keeps byte 2 of every 4-byte group.
// Reads 32 source bytes and writes 8 destination bytes per iteration.
// src_stride is unused (single-row sampling).
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // mask 0x00ff0000 per dword
    "psrld     $0x18,%%xmm5                    \n"
    "pslld     $0x10,%%xmm5                    \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
216 | |
// 1/4 box filter: averages a 4x4 block of source pixels per output byte.
// Reads 32 bytes from each of four rows (stride 0,1,2,3) and writes
// 8 destination bytes per iteration.  Averaging is done pairwise with
// pavgb/pavgw (rounding average), not an exact 16-tap sum.
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  intptr_t stridex3 = 0;  // holds src_stride * 3, computed in the asm below
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = 0x00ff word mask
    "psrlw     $0x8,%%xmm7                     \n"
    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"  // %3 = src_stride * 3

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "pavgb     %%xmm2,%%xmm0                   \n"  // rows 0+1
    "pavgb     %%xmm3,%%xmm1                   \n"
    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
    MEMOPREG(movdqu,0x00,0,3,1,xmm4)           //  movdqu  (%0,%3,1),%%xmm4
    MEMOPREG(movdqu,0x10,0,3,1,xmm5)           //  movdqu  0x10(%0,%3,1),%%xmm5
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm4,%%xmm2                   \n"  // rows 2+3
    "pavgb     %%xmm2,%%xmm0                   \n"  // all four rows
    "pavgb     %%xmm5,%%xmm3                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm7,%%xmm2                   \n"
    "pand      %%xmm7,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"  // first horizontal pass
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm7,%%xmm2                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"  // second horizontal pass
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width), // %2
    "+r"(stridex3)   // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
  );
}
269 | |
// 3/4 horizontal point sample: reads 32 source bytes and writes 24
// destination bytes per iteration using three pshufb masks.
// src_stride is unused.  NOTE(review): the shuffle masks are loaded into
// xmm3/xmm4/xmm5 in a separate asm statement and assumed to survive into
// the loop asm -- this relies on the compiler not touching xmm registers
// between the two statements.
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm3                       \n"
    "movdqa    %1,%%xmm4                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kShuf0),  // %0
    "m"(kShuf1),  // %1
    "m"(kShuf2)   // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "palignr   $0x8,%%xmm0,%%xmm1              \n"  // bytes 8..23 of the pair
    "pshufb    %%xmm3,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
304 | |
// 3/4 box filter, variant that weights the two source rows equally
// (pavgb of row 0 and row 1), then applies the shuffle + pmaddubsw
// weighted horizontal filter with rounding (+2, >>2).
// Produces 24 output bytes from 32 input bytes per iteration.
// NOTE(review): constants are preloaded into xmm0..xmm5 by two separate
// asm statements and assumed preserved into the loop asm.
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"  // average the two rows
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"  // round
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"  // kMadd21 from memory
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
369 | |
// 3/4 box filter, variant that weights the rows 3:1 via a double pavgb
// (avg(row0, avg(row0, row1))), then the same shuffle + pmaddubsw
// horizontal filter as ScaleRowDown34_1_Box_SSSE3.
// Produces 24 output bytes from 32 input bytes per iteration.
// NOTE(review): constants preloaded into xmm0..xmm5 by separate asm
// statements and assumed preserved into the loop asm.
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );

  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"  // 3:1 weighted row blend
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"  // round
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"  // kMadd21 from memory
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
438 | |
// 3/8 point sample: selects bytes 0,3,6,8,11,14 from each 16-byte half
// via pshufb and merges the halves with paddusb (masks are disjoint).
// Reads 32 source bytes and writes 12 destination bytes per iteration.
// src_stride is unused.
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // merge disjoint lanes
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // 8 bytes
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"  // + 4 bytes = 12 total
    "lea       " MEMLEA(0xc,1) ",%1            \n"
    "sub       $0xc,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "m"(kShuf38a),   // %3
    "m"(kShuf38b)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}
467 | |
// 3/8 box filter over 2 rows: averages the two rows, gathers three
// source bytes per output pixel with the kShufAb masks, sums them, and
// scales by 1/3 (or 1/2) via pmulhuw with kScaleAb2.
// Produces 6 output bytes from 16 input bytes per iteration.
// NOTE(review): constants preloaded into xmm2..xmm5 in a separate asm
// statement and assumed preserved into the loop asm.
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm4                       \n"
    "movdqa    %3,%%xmm5                       \n"
  :
  : "m"(kShufAb0),   // %0
    "m"(kShufAb1),   // %1
    "m"(kShufAb2),   // %2
    "m"(kScaleAb2)   // %3
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "pavgb     %%xmm1,%%xmm0                   \n"  // vertical average
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pshufb    %%xmm2,%%xmm1                   \n"  // first byte of each triple
    "movdqa    %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"  // second byte
    "paddusw   %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"  // third byte
    "paddusw   %%xmm0,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"  // scale sums to averages
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm1                    \n"
    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "sub       $0x6,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
512 | |
// 3/8 box filter over 3 rows: widens three rows to words, sums them,
// folds each group of three adjacent word columns together with shifted
// adds, then scales by 1/9 (or 1/6) via pmulhuw with kScaleAc33.
// Produces 6 output bytes from 16 input bytes per iteration.
// NOTE(review): constants preloaded into xmm2..xmm5 in a separate asm
// statement and assumed preserved into the loop asm.
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm4                       \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for unpack
  :
  : "m"(kShufAc),    // %0
    "m"(kShufAc3),   // %1
    "m"(kScaleAc33)  // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"  // widen bytes to words
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"  // row 0 + row 1
    "paddusw   %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"  // + row 2
    "paddusw   %%xmm7,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"  // fold 3 columns into 1
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "pshufb    %%xmm3,%%xmm7                   \n"
    "paddusw   %%xmm7,%%xmm6                   \n"
    "pmulhuw   %%xmm4,%%xmm6                   \n"  // scale sums to averages
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movd      %%xmm6," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm6                    \n"
    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "sub       $0x6,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
575 | |
// Sums src_height rows of src_width bytes column-wise into 16-bit
// accumulators at dst_ptr (saturating adds).  Outer loop walks 16 columns
// at a time; inner loop walks the remaining src_height-1 rows.
// tmp_src saves the column's top pointer so the outer loop can advance it
// by 16 after the inner loop has strided down the rows.
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  int tmp_height = 0;
  intptr_t tmp_src = 0;
  asm volatile (
    "pxor      %%xmm4,%%xmm4                   \n"  // zero for unpack
    "sub       $0x1,%5                         \n"  // first row handled below

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "mov       %0,%3                           \n"  // remember column start
    "add       %6,%0                           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm4,%%xmm0                   \n"  // widen to words
    "punpckhbw %%xmm4,%%xmm1                   \n"
    "mov       %5,%2                           \n"
    "test      %2,%2                           \n"
    "je        3f                              \n"  // single-row case

    LABELALIGN
  "2:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "add       %6,%0                           \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm2                   \n"
    "punpckhbw %%xmm4,%%xmm3                   \n"
    "paddusw   %%xmm2,%%xmm0                   \n"  // accumulate rows
    "paddusw   %%xmm3,%%xmm1                   \n"
    "sub       $0x1,%2                         \n"
    "jg        2b                              \n"

    LABELALIGN
  "3:                                          \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x10,3) ",%0           \n"  // next 16 columns
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x10,%4                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(tmp_height),  // %2
    "+r"(tmp_src),     // %3
    "+r"(src_width),   // %4
    "+rm"(src_height)  // %5
  : "rm"((intptr_t)(src_stride))  // %6
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
626 | |
// Bilinear column filtering. SSSE3 version.
// Steps a 16.16 fixed-point x by dx per output pixel, loads the two
// neighbouring source bytes, and blends them with pmaddubsw using the
// 7-bit fraction (xor with xmm6 forms the 128-f / f weight pair).
// Two pixels per loop iteration; trailing odd pixel handled at 29:.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
  asm volatile (
    "movd      %6,%%xmm2                       \n"  // x
    "movd      %7,%%xmm3                       \n"  // dx
    "movl      $0x04040000,%k2                 \n"
    "movd      %k2,%%xmm5                      \n"  // fraction shuffle
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"  // 0x007f weight mask
    "pextrw    $0x1,%%xmm2,%k3                 \n"  // x0 = integer part of x
    "subl      $0x2,%5                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"  // xmm2 = {x, x+dx}
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"  // step = 2*dx
    "pextrw    $0x3,%%xmm2,%k4                 \n"  // x1

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"  // src[x0], src[x0+1]
    "psrlw     $0x9,%%xmm1                     \n"  // 7-bit fractions
    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
    "movd      %k2,%%xmm4                      \n"  // src[x1], src[x1+1]
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpcklwd %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm1                   \n"  // (128-f, f) weights
    "pmaddubsw %%xmm1,%%xmm0                   \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"  // next x0
    "pextrw    $0x3,%%xmm2,%k4                 \n"  // next x1
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%k2                      \n"
    "mov       %w2," MEMACCESS(0) "            \n"  // store 2 pixels
    "lea       " MEMLEA(0x2,0) ",%0            \n"
    "sub       $0x2,%5                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"  // remainder of 1 pixel
    "addl      $0x1,%5                         \n"
    "jl        99f                             \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm2                     \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "pxor      %%xmm6,%%xmm2                   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%k2                      \n"
    "mov       %b2," MEMACCESS(0) "            \n"  // store 1 pixel
  "99:                                         \n"
  : "+r"(dst_ptr),      // %0
    "+r"(src_ptr),      // %1
    "+a"(temp_pixel),   // %2
    "+r"(x0),           // %3
    "+r"(x1),           // %4
    "+rm"(dst_width)    // %5
  : "rm"(x),            // %6
    "rm"(dx)            // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
698 | |
// 2x horizontal upsample: duplicates each source byte.  Despite the
// original comment, each iteration reads 16 source bytes and writes 32
// (punpcklbw/punpckhbw with itself doubles every byte).
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // low 8 bytes doubled
    "punpckhbw %%xmm1,%%xmm1                   \n"  // high 8 bytes doubled
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"

  : "+r"(dst_ptr),   // %0
    "+r"(src_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
723 | |
// 1/2 ARGB point sample: keeps the odd pixel of each pair (shufps $0xdd
// selects dwords 1,3 from each source register).  Reads 8 ARGB pixels
// and writes 4 per iteration.  src_stride is unused.
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "shufps    $0xdd,%%xmm1,%%xmm0             \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
744 | |
745 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, | |
746 ptrdiff_t src_stride, | |
747 uint8* dst_argb, int dst_width) { | |
748 asm volatile ( | |
749 LABELALIGN | |
750 "1: \n" | |
751 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
752 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
753 "lea " MEMLEA(0x20,0) ",%0 \n" | |
754 "movdqa %%xmm0,%%xmm2 \n" | |
755 "shufps $0x88,%%xmm1,%%xmm0 \n" | |
756 "shufps $0xdd,%%xmm1,%%xmm2 \n" | |
757 "pavgb %%xmm2,%%xmm0 \n" | |
758 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
759 "lea " MEMLEA(0x10,1) ",%1 \n" | |
760 "sub $0x4,%2 \n" | |
761 "jg 1b \n" | |
762 : "+r"(src_argb), // %0 | |
763 "+r"(dst_argb), // %1 | |
764 "+r"(dst_width) // %2 | |
765 :: "memory", "cc", "xmm0", "xmm1" | |
766 ); | |
767 } | |
768 | |
// 1/2 ARGB box filter: averages a 2x2 block of pixels per output pixel.
// Reads 8 pixels from each of two rows and writes 4 pixels per iteration.
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // vertical average
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // odd pixels
    "pavgb     %%xmm2,%%xmm0                   \n"  // horizontal average
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
798 | |
// Reads 4 pixels at a time.
// Point-samples every src_stepx-th ARGB pixel: gathers 4 dwords at
// byte offsets 0, 4*stepx, 8*stepx, 12*stepx and packs them into one
// 16-byte store.  src_stride is unused.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;  // becomes src_stepx * 12 below
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"  // %1 = stepx * 4 bytes
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"  // %4 = stepx * 12 bytes
    LABELALIGN
  "1:                                          \n"
    "movd      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd  (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0                   \n"
    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd  (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd  (%0,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"  // advance 4 * step
    "punpckldq %%xmm3,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+r"(dst_width),      // %3
    "+r"(src_stepx_x12)   // %4
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
832 | |
// Blends four 2x2 to 4x1.
// Like ScaleARGBRowDownEven but box-filters: loads the 4 sampled pixels
// from row 0 and from row 1 (row1 = src + src_stride), averages the
// rows, then averages the horizontal pixel pairs.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride, int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;                  // becomes stepx * 12 below
  intptr_t row1 = (intptr_t)(src_stride);      // becomes src + stride below
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"  // %1 = stepx * 4 bytes
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"  // %4 = stepx * 12 bytes
    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"  // %5 = second row pointer

    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps  (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq    (%0,%1,2),%%xmm1
    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps  (%0,%4,1),%%xmm1
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "movq      " MEMACCESS(5) ",%%xmm2         \n"
    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps  (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq    (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps  (%5,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // vertical average
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // odd pixels
    "pavgb     %%xmm2,%%xmm0                   \n"  // horizontal average
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+rm"(dst_width),     // %3
    "+r"(src_stepx_x12),  // %4
    "+r"(row1)            // %5
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
878 | |
// ARGB column point sampling: steps a 16.16 fixed-point x by dx and
// gathers whole pixels (no filtering).  xmm2 holds four consecutive x
// values; pextrw extracts their integer parts into eax/edx (the "a"/"d"
// constrained x0/x1) for use as gather indices.  Handles 4 pixels per
// main-loop iteration with 2- and 1-pixel remainders at 49:/29:.
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;
  asm volatile (
    "movd      %5,%%xmm2                       \n"  // x
    "movd      %6,%%xmm3                       \n"  // dx
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "pshufd    $0x11,%%xmm3,%%xmm0             \n"  // {0, dx, 0, dx}
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"  // 2*dx
    "pshufd    $0x5,%%xmm3,%%xmm0              \n"  // {0, 0, 2dx, 2dx}
    "paddd     %%xmm0,%%xmm2                   \n"  // xmm2 = x+0..3*dx
    "paddd     %%xmm3,%%xmm3                   \n"  // 4*dx
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pextrw    $0x1,%%xmm2,%k0                 \n"  // x0
    "pextrw    $0x3,%%xmm2,%k1                 \n"  // x1
    "cmp       $0x0,%4                         \n"
    "jl        99f                             \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    LABELALIGN
  "40:                                         \n"  // 4 pixels per pass
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd  (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd  (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "pextrw    $0x7,%%xmm2,%k1                 \n"
    "paddd     %%xmm3,%%xmm2                   \n"  // advance x by 4*dx
    "punpckldq %%xmm1,%%xmm0                   \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd  (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd  (%3,%1,4),%%xmm4
    "pextrw    $0x1,%%xmm2,%k0                 \n"
    "pextrw    $0x3,%%xmm2,%k1                 \n"
    "punpckldq %%xmm4,%%xmm1                   \n"
    "punpcklqdq %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%4                         \n"
    "jge       40b                             \n"

  "49:                                         \n"  // remainder of 2
    "test      $0x2,%4                         \n"
    "je        29f                             \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd  (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd  (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x8,2) ",%2            \n"
  "29:                                         \n"  // remainder of 1
    "test      $0x1,%4                         \n"
    "je        99f                             \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd  (%3,%0,4),%%xmm0
    "movd      %%xmm0," MEMACCESS(2) "         \n"
  "99:                                         \n"
  : "+a"(x0),        // %0
    "+d"(x1),        // %1
    "+r"(dst_argb),  // %2
    "+r"(src_argb),  // %3
    "+r"(dst_width)  // %4
  : "rm"(x),         // %5
    "rm"(dx)         // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
945 | |
// Reads 4 pixels, duplicates them and writes 8 pixels (fixed 1:2 upscale).
// x and dx are accepted for signature parity with the other Cols functions
// but are not referenced by the asm.
// NOTE(review): the asm uses movdqu for all loads/stores, so no 16-byte
// alignment is actually required despite the historical comment here.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"  // load 4 ARGB pixels
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpckldq %%xmm0,%%xmm0                   \n"  // duplicate low 2 pixels
    "punpckhdq %%xmm1,%%xmm1                   \n"  // duplicate high 2 pixels
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x8,%2                         \n"  // wrote 8 output pixels
    "jg        1b                              \n"

  : "+r"(dst_argb),  // %0
    "+r"(src_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
971 | |
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw:
// interleaves each byte of the 1st pixel with the matching byte of the
// 2nd (b0,b1, g0,g1, r0,r1, a0,a1), for pixels 0/1 in the low 8 bytes
// and pixels 2/3 in the high 8 bytes.
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each: byte 0
// (first pixel's fraction) fills the low 8 bytes, byte 4 (second
// pixel's fraction) fills the high 8 bytes.
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
982 | |
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
// Horizontally blends each output ARGB pixel from the two source pixels
// bracketing its 16.16 fixed-point position.  f = bits 9..15 of the
// position (7-bit fraction); xmm1 words hold f, pxor with xmm6 (0x007f
// words) turns byte pairs into (127-f, f), and pmaddubsw against the
// interleaved pixel pair gives (src[i]*(127-f) + src[i+1]*f) >> 7.
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;  // integer source indices of the two pixels in flight
  // Load shuffle constants once; xmm4/xmm5 remain live into the next asm block.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm5                       \n"
  :
  : "m"(kShuffleColARGB),   // %0
    "m"(kShuffleFractions)  // %1
  );

  asm volatile (
    // xmm2 = positions {x, x+dx} (16.16), xmm3 = per-iteration step 2*dx,
    // xmm6 = 0x007f in every word (complement mask for the 7-bit fraction).
    "movd      %5,%%xmm2                       \n"
    "movd      %6,%%xmm3                       \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"  // words = 0x007f
    "pextrw    $0x1,%%xmm2,%k3                 \n"  // x0 = int part of x
    "sub       $0x2,%2                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"  // x + dx
    "punpckldq %%xmm0,%%xmm2                   \n"  // xmm2 = {x, x+dx}
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"  // step = 2*dx in both lanes
    "pextrw    $0x3,%%xmm2,%k4                 \n"  // x1 = int part of x+dx

    LABELALIGN
    "2:                                        \n"
    "movdqa    %%xmm2,%%xmm1                   \n"  // current positions
    "paddd     %%xmm3,%%xmm2                   \n"  // advance by 2*dx
    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
    "psrlw     $0x9,%%xmm1                     \n"  // 7-bit fractions in words
    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
    "pshufb    %%xmm5,%%xmm1                   \n"  // splat each fraction x8
    "pshufb    %%xmm4,%%xmm0                   \n"  // interleave pixel pairs
    "pxor      %%xmm6,%%xmm1                   \n"  // byte pairs -> (127-f, f)
    "pmaddubsw %%xmm1,%%xmm0                   \n"  // weighted blend
    "psrlw     $0x7,%%xmm0                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"  // indices for next iteration
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(0) "         \n"  // store 2 pixels
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x2,%2                         \n"
    "jge       2b                              \n"

    LABELALIGN
    "29:                                       \n"
    "add       $0x1,%2                         \n"  // odd final pixel?
    "jl        99f                             \n"
    "psrlw     $0x9,%%xmm2                     \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
    "pshufb    %%xmm5,%%xmm2                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm2                   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(0) "         \n"

    LABELALIGN
    "99:                                       \n"
  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+rm"(dst_width),  // %2
    "+r"(x0),          // %3
    "+r"(x1)           // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
1057 | |
// Divide num by div and return as 16.16 fixed point result.
// cdq/shld/shl build the 64-bit value (num << 16) in edx:eax; idiv then
// leaves the 32-bit quotient in eax (the "+a" operand).  The trailing
// "mov %0, %%eax" assembles to mov %eax,%eax — a no-op, since %0 is
// constrained to eax; kept byte-for-byte for fidelity with upstream.
// Caller must ensure div != 0 and the quotient fits in 32 bits, or idiv
// raises a divide exception.
int FixedDiv_X86(int num, int div) {
  asm volatile (
    "cdq                                       \n"  // edx:eax = sign-extended num
    "shld      $0x10,%%eax,%%edx               \n"  // edx = high half of num<<16
    "shl       $0x10,%%eax                     \n"  // eax = low half of num<<16
    "idiv      %1                              \n"  // edx:eax / div -> eax
    "mov       %0, %%eax                       \n"  // no-op (see above)
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx"
  );
  return num;
}
1072 | |
1073 // Divide num - 1 by div - 1 and return as 16.16 fixed point result. | |
1074 int FixedDiv1_X86(int num, int div) { | |
1075 asm volatile ( | |
1076 "cdq \n" | |
1077 "shld $0x10,%%eax,%%edx \n" | |
1078 "shl $0x10,%%eax \n" | |
1079 "sub $0x10001,%%eax \n" | |
1080 "sbb $0x0,%%edx \n" | |
1081 "sub $0x1,%1 \n" | |
1082 "idiv %1 \n" | |
1083 "mov %0, %%eax \n" | |
1084 : "+a"(num) // %0 | |
1085 : "c"(div) // %1 | |
1086 : "memory", "cc", "edx" | |
1087 ); | |
1088 return num; | |
1089 } | |
1090 | |
1091 #endif // defined(__x86_64__) || defined(__i386__) | |
1092 | |
1093 #ifdef __cplusplus | |
1094 } // extern "C" | |
1095 } // namespace libyuv | |
1096 #endif | |
OLD | NEW |