OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license | |
5 * that can be found in the LICENSE file in the root of the source | |
6 * tree. An additional intellectual property rights grant can be found | |
7 * in the file PATENTS. All contributing project authors may | |
8 * be found in the AUTHORS file in the root of the source tree. | |
9 */ | |
10 | |
11 #include "third_party/libyuv/include/libyuv/scale.h" | |
12 | |
13 #include <assert.h> | |
14 #include <string.h> | |
15 | |
16 #include "third_party/libyuv/include/libyuv/cpu_id.h" | |
17 #include "third_party/libyuv/source/row.h" | |
18 | |
19 #ifdef __cplusplus | |
20 namespace libyuv { | |
21 extern "C" { | |
22 #endif | |
23 | |
24 /* | |
25 * Note: Defining YUV_DISABLE_ASM allows to use c version. | |
26 */ | |
27 //#define YUV_DISABLE_ASM | |
28 | |
29 #if defined(_MSC_VER) | |
30 #define ALIGN16(var) __declspec(align(16)) var | |
31 #else | |
32 #define ALIGN16(var) var __attribute__((aligned(16))) | |
33 #endif | |
34 | |
35 // Note: A Neon reference manual | |
36 // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG
.html | |
37 // Note: Some SSE2 reference manuals | |
38 // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf | |
39 | |
40 // Set the following flag to true to revert to only | |
41 // using the reference implementation ScalePlaneBox(), and | |
42 // NOT the optimized versions. Useful for debugging and | |
43 // when comparing the quality of the resulting YUV planes | |
44 // as produced by the optimized and non-optimized versions. | |
45 | |
// When non-zero, the plain-C reference scalers (e.g. ScalePlaneBox) are used
// instead of the optimized NEON/SSE2 row functions. See the comment above.
static int use_reference_impl_ = 0;

// Select between the reference implementation and the optimized paths.
// use: non-zero forces the reference (unoptimized) implementation.
void SetUseReferenceImpl(int use) {
  use_reference_impl_ = use;
}
51 | |
52 // ScaleRowDown2Int also used by planar functions | |
53 | |
54 /** | |
55 * NEON downscalers with interpolation. | |
56 * | |
57 * Provided by Fritz Koenig | |
58 * | |
59 */ | |
60 | |
61 #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) | |
62 #define HAS_SCALEROWDOWN2_NEON | |
// Halves a row by point sampling: keeps the even-indexed pixels, drops the
// odd ones. Processes 16 output pixels (32 source pixels) per iteration;
// assumes dst_width is a multiple of 16 -- TODO confirm with callers.
// src_stride is unused (single-row, horizontal-only operation).
void ScaleRowDown2_NEON(const uint8* src_ptr, int src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
    "1:                                        \n"
    "vld2.u8    {q0,q1}, [%0]!                 \n"  // load even pixels into q0, odd into q1
    "vst1.u8    {q0}, [%1]!                    \n"  // store even pixels
    "subs       %2, %2, #16                    \n"  // 16 processed per loop
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst),              // %1
      "+r"(dst_width)         // %2
    :
    : "q0", "q1"              // Clobber List
  );
}
78 | |
// Halves a row with 2x2 box filtering: each output pixel is the rounded
// average of a 2x2 block spanning two adjacent source rows.
// Processes 16 output pixels per iteration.
void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    "add        %1, %0                         \n"  // change the stride to row 2 pointer
    "1:                                        \n"
    "vld1.u8    {q0,q1}, [%0]!                 \n"  // load row 1 and post increment
    "vld1.u8    {q2,q3}, [%1]!                 \n"  // load row 2 and post increment
    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
    "vpaddl.u8  q1, q1                         \n"
    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent, add row 1 to row 2
    "vpadal.u8  q1, q3                         \n"
    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2                     \n"
    "vst1.u8    {q0}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(src_stride),       // %1
      "+r"(dst),              // %2
      "+r"(dst_width)         // %3
    :
    : "q0", "q1", "q2", "q3"  // Clobber List
  );
}
103 | |
104 #define HAS_SCALEROWDOWN4_NEON | |
// Point samples a row down by 4: keeps 1 of every 4 source pixels.
// Writes 4 output pixels (reads 16 source bytes) per iteration.
// src_stride is unused (horizontal-only operation).
static void ScaleRowDown4_NEON(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "1:                                        \n"
    "vld2.u8    {d0, d1}, [%0]!                \n"  // even bytes -> d0, odd -> d1
    "vtrn.u8    d1, d0                         \n"
    "vshrn.u16  d0, q0, #8                     \n"  // keep 1 byte of each 4
    "vst1.u32   {d0[1]}, [%1]!                 \n"

    "subs       %2, #4                         \n"
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width)         // %2
    :
    : "q0", "q1", "memory", "cc"
  );
}
123 | |
124 static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, | |
125 uint8* dst_ptr, int dst_width) { | |
126 asm volatile ( | |
127 "add r4, %0, %3 \n" | |
128 "add r5, r4, %3 \n" | |
129 "add %3, r5, %3 \n" | |
130 "1: \n" | |
131 "vld1.u8 {q0}, [%0]! \n" // load up 16x4 block of in
put data | |
132 "vld1.u8 {q1}, [r4]! \n" | |
133 "vld1.u8 {q2}, [r5]! \n" | |
134 "vld1.u8 {q3}, [%3]! \n" | |
135 | |
136 "vpaddl.u8 q0, q0 \n" | |
137 "vpadal.u8 q0, q1 \n" | |
138 "vpadal.u8 q0, q2 \n" | |
139 "vpadal.u8 q0, q3 \n" | |
140 | |
141 "vpaddl.u16 q0, q0 \n" | |
142 | |
143 "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding | |
144 | |
145 "vmovn.u16 d0, q0 \n" | |
146 "vst1.u32 {d0[0]}, [%1]! \n" | |
147 | |
148 "subs %2, #4 \n" | |
149 "bhi 1b \n" | |
150 | |
151 : "+r"(src_ptr), // %0 | |
152 "+r"(dst_ptr), // %1 | |
153 "+r"(dst_width) // %2 | |
154 : "r"(src_stride) // %3 | |
155 : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" | |
156 ); | |
157 } | |
158 | |
159 #define HAS_SCALEROWDOWN34_NEON | |
// Down scale from 4 to 3 pixels. Uses the NEON multilane read/write to load
// every 4th pixel into a different register. Point samples 32 pixels to 24
// per iteration (the 4th lane, d3, overwrites d2 and is dropped).
// src_stride is unused (single-row operation).
static void ScaleRowDown34_NEON(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "1:                                        \n"
    "vld4.u8    {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    "vmov       d2, d3                         \n"  // order needs to be d0, d1, d2
    "vst3.u8    {d0, d1, d2}, [%1]!            \n"
    "subs       %2, #24                        \n"
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width)         // %2
    :
    : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}
179 | |
// 4->3 downscale with filtering: blends the two source rows with a 3:1
// weighting (3 * line_0 + line_1, rounded) before the horizontal 3/4
// filter. Produces 24 output pixels from 32 source pixels per iteration.
static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8    d24, #3                        \n"  // weight for the 3:1 blends
    "add        %3, %0                         \n"  // %3 -> src line 1
    "1:                                        \n"
    "vld4.u8    {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    "vld4.u8    {d4, d5, d6, d7}, [%3]!      \n" // src line 1

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8   q8, d4                         \n"
    "vmovl.u8   q9, d5                         \n"
    "vmovl.u8   q10, d6                        \n"
    "vmovl.u8   q11, d7                        \n"

    // 3 * line_0 + line_1
    "vmlal.u8   q8, d0, d24                    \n"
    "vmlal.u8   q9, d1, d24                    \n"
    "vmlal.u8   q10, d2, d24                   \n"
    "vmlal.u8   q11, d3, d24                   \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16 d0, q8, #2                    \n"
    "vqrshrn.u16 d1, q9, #2                    \n"
    "vqrshrn.u16 d2, q10, #2                   \n"
    "vqrshrn.u16 d3, q11, #2                   \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8   q8, d1                         \n"
    "vmlal.u8   q8, d0, d24                    \n"
    "vqrshrn.u16 d0, q8, #2                    \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8  d1, d1, d2                     \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8   q8, d2                         \n"
    "vmlal.u8   q8, d3, d24                    \n"
    "vqrshrn.u16 d2, q8, #2                    \n"

    "vst3.u8    {d0, d1, d2}, [%1]!            \n"

    "subs       %2, #24                        \n"
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width),        // %2
      "+r"(src_stride)        // %3
    :
    : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}
234 | |
235 static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride, | |
236 uint8* dst_ptr, int dst_width) { | |
237 asm volatile ( | |
238 "vmov.u8 d24, #3 \n" | |
239 "add %3, %0 \n" | |
240 "1: \n" | |
241 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 | |
242 "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 | |
243 | |
244 // average src line 0 with src line 1 | |
245 "vrhadd.u8 q0, q0, q2 \n" | |
246 "vrhadd.u8 q1, q1, q3 \n" | |
247 | |
248 // a0 = (src[0] * 3 + s[1] * 1) >> 2 | |
249 "vmovl.u8 q3, d1 \n" | |
250 "vmlal.u8 q3, d0, d24 \n" | |
251 "vqrshrn.u16 d0, q3, #2 \n" | |
252 | |
253 // a1 = (src[1] * 1 + s[2] * 1) >> 1 | |
254 "vrhadd.u8 d1, d1, d2 \n" | |
255 | |
256 // a2 = (src[2] * 1 + s[3] * 3) >> 2 | |
257 "vmovl.u8 q3, d2 \n" | |
258 "vmlal.u8 q3, d3, d24 \n" | |
259 "vqrshrn.u16 d2, q3, #2 \n" | |
260 | |
261 "vst3.u8 {d0, d1, d2}, [%1]! \n" | |
262 | |
263 "subs %2, #24 \n" | |
264 "bhi 1b \n" | |
265 : "+r"(src_ptr), // %0 | |
266 "+r"(dst_ptr), // %1 | |
267 "+r"(dst_width), // %2 | |
268 "+r"(src_stride) // %3 | |
269 : | |
270 : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" | |
271 ); | |
272 } | |
273 | |
274 #define HAS_SCALEROWDOWN38_NEON | |
// vtbl index table: picks 12 of 32 bytes (3 of every 8) for the 3/8
// point sampler.
const uint8 shuf38[16] __attribute__ ((aligned(16))) =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
// vtbl index table used by the filtering 3/8 scalers to gather the final
// 12 result bytes out of the three working registers.
const uint8 shuf38_2[16] __attribute__ ((aligned(16))) =
  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
// Fixed-point multiplier 65536/12; vqrdmulh doubles the product, so this
// acts as a divide-by-6 for sums of 6 pixels (2 columns x 3 rows).
const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
// Fixed-point multiplier 65536/18; with vqrdmulh doubling this acts as a
// divide-by-9 for sums of 9 pixels (3 columns x 3 rows).
const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
285 | |
// 32 -> 12: point samples at 3/8 scale using a vtbl table lookup with the
// shuf38 index table. src_stride is unused (single-row operation).
static void ScaleRowDown38_NEON(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.u8    {q3}, [%3]                     \n"  // q3 = shuf38 indices
    "1:                                        \n"
    "vld1.u8    {d0, d1, d2, d3}, [%0]!      \n"
    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
    "vst1.u8    {d4}, [%1]!                    \n"  // store 8 + 4 = 12 pixels
    "vst1.u32   {d5[0]}, [%1]!                 \n"
    "subs       %2, #12                        \n"
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width)         // %2
    : "r"(shuf38)             // %3
    : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}
306 | |
// 32x3 -> 12x1: 3/8 box filter over three source rows. Sums of 9 pixels
// (3 columns x 3 rows) are scaled by mult38_div9 and sums of 6 pixels
// (2 columns x 3 rows) by mult38_div6 via vqrdmulh (which doubles the
// product, making them effective divides by 9 and 6).
static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.u16   {q13}, [%4]                    \n"  // q13 = 65536/12
    "vld1.u8    {q14}, [%5]                    \n"  // q14 = shuf38_2
    "vld1.u8    {q15}, [%6]                    \n"  // q15 = 65536/18
    "add        r4, %0, %3, lsl #1             \n"  // r4 -> src line 2
    "add        %3, %0                         \n"  // %3 -> src line 1
    "1:                                        \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    "vld4.u8    {d0, d1, d2, d3}, [%0]!      \n"
    "vld4.u8    {d4, d5, d6, d7}, [%3]!      \n"
    "vld4.u8    {d16, d17, d18, d19}, [r4]!  \n"

    // Shuffle the input data around to align the data
    //  so adjacent data can be added.  0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8    d0, d1                         \n"
    "vtrn.u8    d4, d5                         \n"
    "vtrn.u8    d16, d17                       \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8    d2, d3                         \n"
    "vtrn.u8    d6, d7                         \n"
    "vtrn.u8    d18, d19                       \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8  q0, q0                         \n"
    "vpaddl.u8  q2, q2                         \n"
    "vpaddl.u8  q8, q8                         \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8  d3, d3                         \n"
    "vpaddl.u8  d7, d7                         \n"
    "vpaddl.u8  d19, d19                       \n"

    // combine source lines
    "vadd.u16   q0, q2                         \n"
    "vadd.u16   q0, q8                         \n"
    "vadd.u16   d4, d3, d7                     \n"
    "vadd.u16   d4, d19                        \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q13                      \n"
    "vmovn.u16  d4, q2                         \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg.  This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded.  Then do transposes
    //  to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8   q1, d2                         \n"
    "vmovl.u8   q3, d6                         \n"
    "vmovl.u8   q9, d18                        \n"

    // combine source lines
    "vadd.u16   q1, q3                         \n"
    "vadd.u16   q1, q9                         \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32   d2, d3                         \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16   d2, d3                         \n"

    // 0+1+2, 3+4+5
    "vadd.u16   q0, q1                         \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2.  So multiply by 65536 / n
    //  and take the upper 16 bits.
    "vqrdmulh.s16 q0, q15                      \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "vmov.u8    d2, d4                         \n"

    "vtbl.u8    d3, {d0, d1, d2}, d28          \n"
    "vtbl.u8    d4, {d0, d1, d2}, d29          \n"

    "vst1.u8    {d3}, [%1]!                    \n"
    "vst1.u32   {d4[0]}, [%1]!                 \n"
    "subs       %2, #12                        \n"
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width),        // %2
      "+r"(src_stride)        // %3
    : "r"(mult38_div6),       // %4
      "r"(shuf38_2),          // %5
      "r"(mult38_div9)        // %6
    : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
      "q13", "q14", "q15", "memory", "cc"
  );
}
415 | |
// 32x2 -> 12x1: 3/8 box filter over two source rows. Sums of 6 pixels
// (3 columns x 2 rows) are scaled by mult38_div6 via vqrdmulh (which
// doubles the product, making it an effective divide by 6); sums of
// 4 pixels (2 columns x 2 rows) use a plain rounding shift by 2.
static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.u16   {q13}, [%4]                    \n"  // q13 = 65536/12
    "vld1.u8    {q14}, [%5]                    \n"  // q14 = shuf38_2
    "add        %3, %0                         \n"  // %3 -> src line 1
    "1:                                        \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    "vld4.u8    {d0, d1, d2, d3}, [%0]!      \n"
    "vld4.u8    {d4, d5, d6, d7}, [%3]!      \n"

    // Shuffle the input data around to align the data
    //  so adjacent data can be added.  0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8    d0, d1                         \n"
    "vtrn.u8    d4, d5                         \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8    d2, d3                         \n"
    "vtrn.u8    d6, d7                         \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8  q0, q0                         \n"
    "vpaddl.u8  q2, q2                         \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8  d3, d3                         \n"
    "vpaddl.u8  d7, d7                         \n"

    // combine source lines
    "vadd.u16   q0, q2                         \n"
    "vadd.u16   d4, d3, d7                     \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16 d4, q2, #2                    \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg.  This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded.  Then do transposes
    //  to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8   q1, d2                         \n"
    "vmovl.u8   q3, d6                         \n"

    // combine source lines
    "vadd.u16   q1, q3                         \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32   d2, d3                         \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16   d2, d3                         \n"

    // 0+1+2, 3+4+5
    "vadd.u16   q0, q1                         \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2.  So multiply by 65536 / n
    //  and take the upper 16 bits.
    "vqrdmulh.s16 q0, q13                      \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "vmov.u8    d2, d4                         \n"

    "vtbl.u8    d3, {d0, d1, d2}, d28          \n"
    "vtbl.u8    d4, {d0, d1, d2}, d29          \n"

    "vst1.u8    {d3}, [%1]!                    \n"
    "vst1.u32   {d4[0]}, [%1]!                 \n"
    "subs       %2, #12                        \n"
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width),        // %2
      "+r"(src_stride)        // %3
    : "r"(mult38_div6),       // %4
      "r"(shuf38_2)           // %5
    : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}
508 | |
509 /** | |
510 * SSE2 downscalers with interpolation. | |
511 * | |
512 * Provided by Frank Barchard (fbarchard@google.com) | |
513 * | |
514 */ | |
515 | |
516 // Constants for SSE2 code | |
517 #elif (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && \ | |
518 !defined(YUV_DISABLE_ASM) | |
519 #if defined(_MSC_VER) | |
520 #define TALIGN16(t, var) __declspec(align(16)) t _ ## var | |
521 #elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && def
ined(__i386__) | |
522 #define TALIGN16(t, var) t var __attribute__((aligned(16))) | |
523 #else | |
524 #define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) | |
525 #endif | |
526 | |
527 #if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \ | |
528 defined(__i386__) | |
529 #define DECLARE_FUNCTION(name) \ | |
530 ".text \n" \ | |
531 ".globl _" #name " \n" \ | |
532 "_" #name ": \n" | |
533 #else | |
534 #define DECLARE_FUNCTION(name) \ | |
535 ".text \n" \ | |
536 ".global " #name " \n" \ | |
537 #name ": \n" | |
538 #endif | |
539 | |
540 | |
// Shuffle index and multiply-add coefficient tables for the x86 row scalers
// below. Note: TALIGN16 prepends an underscore to the symbol on most
// compilers (see macro above), so asm code references them as _shuf0 etc.

// Offsets for source bytes 0 to 9
//extern "C"
TALIGN16(const uint8, shuf0[16]) =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
//extern "C"
TALIGN16(const uint8, shuf1[16]) =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
//extern "C"
TALIGN16(const uint8, shuf2[16]) =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
//extern "C"
TALIGN16(const uint8, shuf01[16]) =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
//extern "C"
TALIGN16(const uint8, shuf11[16]) =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
//extern "C"
TALIGN16(const uint8, shuf21[16]) =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
//extern "C"
TALIGN16(const uint8, madd01[16]) =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
//extern "C"
TALIGN16(const uint8, madd11[16]) =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
//extern "C"
TALIGN16(const uint8, madd21[16]) =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 filters: (sum + 2) >> 2.
//extern "C"
TALIGN16(const int16, round34[8]) =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

// Gather bytes 0,3,6,8,11,14 into the low 6 bytes (3/8 point sample).
//extern "C"
TALIGN16(const uint8, shuf38a[16]) =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Same gather into bytes 6..11 so the two halves can be OR'd together.
//extern "C"
TALIGN16(const uint8, shuf38b[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
//extern "C"
TALIGN16(const uint8, shufac0[16]) =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
//extern "C"
TALIGN16(const uint8, shufac3[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
//extern "C"
TALIGN16(const uint16, scaleac3[8]) =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
//extern "C"
TALIGN16(const uint8, shufab0[16]) =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
//extern "C"
TALIGN16(const uint8, shufab1[16]) =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
//extern "C"
TALIGN16(const uint8, shufab2[16]) =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
//extern "C"
TALIGN16(const uint16, scaleab2[8]) =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
633 #endif | |
634 | |
635 #if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) && defined(_MSC_VER) | |
636 | |
637 #define HAS_SCALEROWDOWN2_SSE2 | |
// Reads 32 pixels, throws half away and writes 16 pixels (horizontal 1/2
// point sample: the 0x00ff00ff mask keeps the low/even byte of each word).
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5            // keep even bytes
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1            // pack 16 words down to 16 bytes
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop

    ret
  }
}
// Blends 32x2 rectangle to 16x1: each output pixel is the average of a 2x2
// box (rows averaged with pavgb, then adjacent columns with pavgw).
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8               // odd bytes as words
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5            // even bytes as words
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop

    pop        esi
    ret
  }
}
708 | |
709 #define HAS_SCALEROWDOWN4_SSE2 | |
// Point samples 32 pixels to 8 pixels: the 0x000000ff mask keeps 1 byte of
// each dword, then two pack steps narrow dwords to bytes.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x000000ff
    psrld      xmm5, 24

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi,  [esi + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 8
    ja         wloop

    popad
    ret
  }
}
741 | |
// Blends 32x4 rectangle to 8x1: averages 4 rows pairwise with pavgb, then
// averages columns twice (32 -> 16 -> 8 pixels) with the mask/pavgw trick.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8
    lea        edx, [ebx + ebx * 2]  // src_stride * 3

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    movdqa     xmm2, [esi + ebx]
    movdqa     xmm3, [esi + ebx + 16]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, [esi + ebx * 2]
    movdqa     xmm3, [esi + ebx * 2 + 16]
    movdqa     xmm4, [esi + edx]
    movdqa     xmm5, [esi + edx + 16]
    lea        esi, [esi + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 8
    ja         wloop

    popad
    ret
  }
}
799 | |
800 #define HAS_SCALEROWDOWN8_SSE2 | |
// Point samples 32 pixels to 4 pixels: the mask (all-ones >> 56) keeps the
// first byte of each 8, then three pack steps narrow qwords to bytes.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask isolating 1 src 8 bytes
    psrlq      xmm5, 56

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi,  [esi + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1            // 32->16
    packuswb   xmm0, xmm0            // 16->8
    packuswb   xmm0, xmm0            // 8->4
    movd       dword ptr [edi], xmm0
    lea        edi, [edi + 4]
    sub        ecx, 4
    ja         wloop

    popad
    ret
  }
}
833 | |
// Blends 32x8 rectangle to 4x1: averages 8 rows down to 1 with a pavgb
// tree, then psadbw sums each 8-byte group and a shift by 3 completes the
// average of 8 columns.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    lea        edx, [ebx + ebx * 2]  // src_stride * 3
    pxor       xmm7, xmm7            // zero, for psadbw

  wloop:
    movdqa     xmm0, [esi]           // average 8 rows to 1
    movdqa     xmm1, [esi + 16]
    movdqa     xmm2, [esi + ebx]
    movdqa     xmm3, [esi + ebx + 16]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    movdqa     xmm2, [esi + ebx * 2]
    movdqa     xmm3, [esi + ebx * 2 + 16]
    movdqa     xmm4, [esi + edx]
    movdqa     xmm5, [esi + edx + 16]
    lea        ebp, [esi + ebx * 4]  // ebp -> rows 4..7 (restored by popad)
    lea        esi, [esi + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, [ebp]
    movdqa     xmm3, [ebp + 16]
    movdqa     xmm4, [ebp + ebx]
    movdqa     xmm5, [ebp + ebx + 16]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    movdqa     xmm4, [ebp + ebx * 2]
    movdqa     xmm5, [ebp + ebx * 2 + 16]
    movdqa     xmm6, [ebp + edx]
    pavgb      xmm4, xmm6
    movdqa     xmm6, [ebp + edx + 16]
    pavgb      xmm5, xmm6
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    psadbw     xmm0, xmm7            // average 32 pixels to 4
    psadbw     xmm1, xmm7
    pshufd     xmm0, xmm0, 0xd8      // x1x0 -> xx01
    pshufd     xmm1, xmm1, 0x8d      // x3x2 -> 32xx
    por        xmm0, xmm1            //      -> 3201
    psrlw      xmm0, 3               // sum of 8 bytes -> average
    packuswb   xmm0, xmm0
    packuswb   xmm0, xmm0
    movd       dword ptr [edi], xmm0

    lea        edi, [edi + 4]
    sub        ecx, 4
    ja         wloop

    popad
    ret
  }
}
901 | |
902 #define HAS_SCALEROWDOWN34_SSSE3 | |
// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm3, _shuf0
    movdqa     xmm4, _shuf1
    movdqa     xmm5, _shuf2

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi,  [esi + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8         // xmm1 = source bytes 8..23
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edi], xmm0
    movq       qword ptr [edi + 8], xmm1
    movq       qword ptr [edi + 16], xmm2
    lea        edi, [edi + 24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}
942 | |
// Blends 32x2 rectangle to 24x1: rows are averaged 1:1 with pavgb, then the
// horizontal 3/4 filter is applied via pshufb + pmaddubsw with the shuf/madd
// tables, rounded by round34 and shifted by 2.
// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 round34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm2, _shuf01
    movdqa     xmm3, _shuf11
    movdqa     xmm4, _shuf21
    movdqa     xmm5, _madd01
    movdqa     xmm6, _madd11
    movdqa     xmm7, _round34

  wloop:
    movdqa     xmm0, [esi]           // pixels 0..7
    movdqa     xmm1, [esi+ebx]
    pavgb      xmm0, xmm1            // average the two rows
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7            // round: (x + 2) >> 2
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    movdqu     xmm0, [esi+8]         // pixels 8..15
    movdqu     xmm1, [esi+ebx+8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+8], xmm0
    movdqa     xmm0, [esi+16]        // pixels 16..23
    movdqa     xmm1, [esi+ebx+16]
    lea        esi, [esi+32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, _madd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+16], xmm0
    lea        edi, [edi+24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}
1013 | |
1014 // Note that movdqa+palign may be better than movdqu. | |
1015 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. | |
// 3/4 point downscaler with interpolation, 3:1 vertical weighting: the
// double pavgb gives row0 weight 3/4 and row1 weight 1/4
// (xmm1 = (r0+r1)/2, then xmm0 = (r0+xmm1)/2 ~= (3*r0+r1)/4), before the
// same 32 -> 24 horizontal filter as ScaleRowDown34_1_Int_SSSE3.
__declspec(naked)
static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov esi, [esp + 32 + 4]    // src_ptr
    mov ebx, [esp + 32 + 8]    // src_stride
    mov edi, [esp + 32 + 12]   // dst_ptr
    mov ecx, [esp + 32 + 16]   // dst_width
    movdqa xmm2, _shuf01
    movdqa xmm3, _shuf11
    movdqa xmm4, _shuf21
    movdqa xmm5, _madd01
    movdqa xmm6, _madd11
    movdqa xmm7, _round34

  wloop:
    movdqa xmm0, [esi]         // pixels 0..7
    movdqa xmm1, [esi+ebx]
    pavgb xmm1, xmm0           // xmm1 = (row0 + row1) / 2
    pavgb xmm0, xmm1           // xmm0 = (3*row0 + row1) / 4
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edi], xmm0
    movdqu xmm0, [esi+8]       // pixels 8..15 (unaligned load)
    movdqu xmm1, [esi+ebx+8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edi+8], xmm0
    movdqa xmm0, [esi+16]      // pixels 16..23
    movdqa xmm1, [esi+ebx+16]
    lea esi, [esi+32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, _madd21       // loaded late: all 8 xmm regs already in use
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edi+16], xmm0
    lea edi, [edi+24]
    sub ecx, 24                // 32 src pixels -> 24 dst pixels per pass
    ja wloop

    popad
    ret
  }
}
1073 | |
1074 #define HAS_SCALEROWDOWN38_SSSE3 | |
1075 // 3/8 point sampler | |
1076 | |
1077 // Scale 32 pixels to 12 | |
// 3/8 point sampler: 32 source pixels -> 12 output pixels, no vertical
// blend (src_stride is loaded but unused).  Each shuffle mask selects 6
// surviving pixels from its 16-byte half; presumably the masks place the
// selections in disjoint output lanes so paddusb merges them -- see the
// _shuf38a/_shuf38b tables defined elsewhere in this file.
__declspec(naked)
static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov esi, [esp + 32 + 4]    // src_ptr
    mov edx, [esp + 32 + 8]    // src_stride (unused by the point sampler)
    mov edi, [esp + 32 + 12]   // dst_ptr
    mov ecx, [esp + 32 + 16]   // dst_width
    movdqa xmm4, _shuf38a
    movdqa xmm5, _shuf38b

  xloop:
    movdqa xmm0, [esi]         // 16 pixels -> 0,1,2,3,4,5
    movdqa xmm1, [esi + 16]    // 16 pixels -> 6,7,8,9,10,11
    lea esi, [esi + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1

    movq qword ptr [edi], xmm0 // write 12 pixels (8 + 4)
    movhlps xmm1, xmm0
    movd [edi + 8], xmm1
    lea edi, [edi + 12]
    sub ecx, 12
    ja xloop

    popad
    ret
  }
}
1109 | |
1110 // Scale 16x3 pixels to 6x1 with interpolation | |
// Scale 16x3 pixels to 6x1 with interpolation: widens and sums three
// source rows into 16-bit words, adds each group of 3 adjacent horizontal
// sums (psrldq + paddusw twice), then multiplies by the _scaleac3
// reciprocals with pmulhuw to divide the 9- or 6-sample box sums.
// paddusw saturates rather than wraps, so overflowing sums clamp.
__declspec(naked)
static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov esi, [esp + 32 + 4]    // src_ptr
    mov edx, [esp + 32 + 8]    // src_stride
    mov edi, [esp + 32 + 12]   // dst_ptr
    mov ecx, [esp + 32 + 16]   // dst_width
    movdqa xmm4, _shufac0
    movdqa xmm5, _shufac3
    movdqa xmm6, _scaleac3
    pxor xmm7, xmm7            // zero register for byte -> word unpacking

  xloop:
    movdqa xmm0, [esi]         // sum up 3 rows into xmm0/1
    movdqa xmm2, [esi + edx]
    movhlps xmm1, xmm0
    movhlps xmm3, xmm2
    punpcklbw xmm0, xmm7
    punpcklbw xmm1, xmm7
    punpcklbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    paddusw xmm0, xmm2
    paddusw xmm1, xmm3
    movdqa xmm2, [esi + edx * 2]
    lea esi, [esi + 16]
    movhlps xmm3, xmm2
    punpcklbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    paddusw xmm0, xmm2
    paddusw xmm1, xmm3

    movdqa xmm2, xmm0          // 8 pixels -> 0,1,2 of xmm2
    psrldq xmm0, 2             // shift in the next word and add, twice:
    paddusw xmm2, xmm0         // each word becomes a sum of 3 neighbors
    psrldq xmm0, 2
    paddusw xmm2, xmm0
    pshufb xmm2, xmm4

    movdqa xmm3, xmm1          // 8 pixels -> 3,4,5 of xmm2
    psrldq xmm1, 2
    paddusw xmm3, xmm1
    psrldq xmm1, 2
    paddusw xmm3, xmm1
    pshufb xmm3, xmm5
    paddusw xmm2, xmm3

    pmulhuw xmm2, xmm6         // divide by 9,9,6, 9,9,6
    packuswb xmm2, xmm2

    movd [edi], xmm2           // write 6 pixels (4 + 2)
    pextrw eax, xmm2, 2
    mov [edi + 4], ax
    lea edi, [edi + 6]
    sub ecx, 6
    ja xloop

    popad
    ret
  }
}
1173 | |
1174 // Scale 16x2 pixels to 6x1 with interpolation | |
// Scale 16x2 pixels to 6x1 with interpolation: averages the two rows with
// pavgb, gathers each output's horizontal neighbors via three shuffle
// masks and saturating adds, then pmulhuw by the _scaleab2 reciprocals to
// divide by the 3,3,2 box sizes.
__declspec(naked)
static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov esi, [esp + 32 + 4]    // src_ptr
    mov edx, [esp + 32 + 8]    // src_stride
    mov edi, [esp + 32 + 12]   // dst_ptr
    mov ecx, [esp + 32 + 16]   // dst_width
    movdqa xmm4, _shufab0
    movdqa xmm5, _shufab1
    movdqa xmm6, _shufab2
    movdqa xmm7, _scaleab2

  xloop:
    movdqa xmm2, [esi]         // average 2 rows into xmm2
    pavgb xmm2, [esi + edx]
    lea esi, [esi + 16]

    movdqa xmm0, xmm2          // 16 pixels -> 0,1,2,3,4,5 of xmm0
    pshufb xmm0, xmm4
    movdqa xmm1, xmm2
    pshufb xmm1, xmm5
    paddusw xmm0, xmm1
    pshufb xmm2, xmm6
    paddusw xmm0, xmm2

    pmulhuw xmm0, xmm7         // divide by 3,3,2, 3,3,2
    packuswb xmm0, xmm0

    movd [edi], xmm0           // write 6 pixels (4 + 2)
    pextrw eax, xmm0, 2
    mov [edi + 4], ax
    lea edi, [edi + 6]
    sub ecx, 6
    ja xloop

    popad
    ret
  }
}
1216 | |
1217 #define HAS_SCALEADDROWS_SSE2 | |
1218 | |
// Reads 16xN bytes and produces 16 shorts at a time.
// Sums src_height rows of 16 source bytes into 16 uint16 column sums per
// pass, writing 32 bytes of word sums to dst_ptr per 16 input columns.
// The first row is loaded outside the inner loop and the inner loop is
// do-while style, so at least one additional row is always read.
// NOTE(review): that means src_height must be >= 2 -- confirm callers.
__declspec(naked)
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                              uint16* dst_ptr, int src_width,
                              int src_height) {
  __asm {
    pushad
    mov esi, [esp + 32 + 4]    // src_ptr
    mov edx, [esp + 32 + 8]    // src_stride
    mov edi, [esp + 32 + 12]   // dst_ptr
    mov ecx, [esp + 32 + 16]   // src_width
    mov ebx, [esp + 32 + 20]   // height
    pxor xmm5, xmm5            // zero register for byte -> word unpacking
    dec ebx                    // inner loop runs height - 1 times

  xloop:
    // first row
    movdqa xmm2, [esi]
    lea eax, [esi + edx]
    movhlps xmm3, xmm2
    mov ebp, ebx
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5

    // sum remaining rows (saturating adds: sums clamp at 65535)
    yloop:
    movdqa xmm0, [eax]         // read 16 pixels
    lea eax, [eax + edx]       // advance to next row
    movhlps xmm1, xmm0
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    paddusw xmm2, xmm0         // sum 16 words
    paddusw xmm3, xmm1
    sub ebp, 1
    ja yloop

    movdqa [edi], xmm2
    movdqa [edi + 16], xmm3
    lea edi, [edi + 32]
    lea esi, [esi + 16]

    sub ecx, 16
    ja xloop

    popad
    ret
  }
}
1267 | |
1268 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. | |
1269 #define HAS_SCALEFILTERROWS_SSE2 | |
// Blends two adjacent source rows into one output row, weighting the
// second row by source_y_fraction/256.  Fast paths: fraction 0 copies
// row 0 (xloop1), fraction 128 uses pavgb (xloop2); otherwise pixels are
// widened to 16 bits, multiplied by (256 - f) and f, summed and shifted.
// After each loop the last output byte is replicated one past the end,
// so dst_ptr receives dst_width + 1 bytes.
__declspec(naked)
static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                                 int src_stride, int dst_width,
                                 int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]   // dst_ptr
    mov esi, [esp + 8 + 8]   // src_ptr
    mov edx, [esp + 8 + 12]  // src_stride
    mov ecx, [esp + 8 + 16]  // dst_width
    mov eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    cmp eax, 0
    je xloop1                // fraction 0: plain copy of row 0
    cmp eax, 128
    je xloop2                // fraction 1/2: pavgb the two rows

    movd xmm6, eax           // xmm6 = y fraction
    punpcklwd xmm6, xmm6     // broadcast to all 8 word lanes
    pshufd xmm6, xmm6, 0
    neg eax                  // xmm5 = 256 - y fraction
    add eax, 256
    movd xmm5, eax
    punpcklwd xmm5, xmm5
    pshufd xmm5, xmm5, 0
    pxor xmm7, xmm7          // zero register for byte -> word unpacking

  xloop:
    movdqa xmm0, [esi]
    movdqa xmm2, [esi + edx]
    lea esi, [esi + 16]
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    punpcklbw xmm0, xmm7
    punpcklbw xmm2, xmm7
    punpckhbw xmm1, xmm7
    punpckhbw xmm3, xmm7
    pmullw xmm0, xmm5        // scale row 0
    pmullw xmm1, xmm5
    pmullw xmm2, xmm6        // scale row 1
    pmullw xmm3, xmm6
    paddusw xmm0, xmm2       // sum rows
    paddusw xmm1, xmm3
    psrlw xmm0, 8            // divide by 256 and pack back to bytes
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqa [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 16
    ja xloop

    mov al, [edi - 1]        // duplicate last pixel one past the end
    mov [edi], al
    pop edi
    pop esi
    ret

  xloop1:                    // fraction == 0: copy row 0 unchanged
    movdqa xmm0, [esi]
    lea esi, [esi + 16]
    movdqa [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 16
    ja xloop1

    mov al, [edi - 1]
    mov [edi], al
    pop edi
    pop esi
    ret

  xloop2:                    // fraction == 128: average the two rows
    movdqa xmm0, [esi]
    movdqa xmm2, [esi + edx]
    lea esi, [esi + 16]
    pavgb xmm0, xmm2
    movdqa [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 16
    ja xloop2

    mov al, [edi - 1]
    mov [edi], al
    pop edi
    pop esi
    ret
  }
}
1358 | |
1359 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. | |
1360 #define HAS_SCALEFILTERROWS_SSSE3 | |
// SSSE3 row blend: same contract as ScaleFilterRows_SSE2 but halves the
// fraction to 7 bits and uses pmaddubsw on interleaved row bytes, so each
// output is (r0*(128-f) + r1*f) >> 7.  Fast paths for fraction 0 (copy)
// and 64 (pavgb).  Also writes dst_width + 1 bytes (last pixel repeated).
__declspec(naked)
static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                  int src_stride, int dst_width,
                                  int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]   // dst_ptr
    mov esi, [esp + 8 + 8]   // src_ptr
    mov edx, [esp + 8 + 12]  // src_stride
    mov ecx, [esp + 8 + 16]  // dst_width
    mov eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    shr eax, 1               // reduce to 7 bits (0..127) for pmaddubsw
    cmp eax, 0
    je xloop1                // fraction 0: plain copy of row 0
    cmp eax, 64
    je xloop2                // fraction 1/2: pavgb the two rows

    mov ah,al                // build byte pair: ah = f, al = 128 - f
    neg al
    add al, 128
    movd xmm5, eax
    punpcklwd xmm5, xmm5     // broadcast (128-f, f) to all byte pairs
    pshufd xmm5, xmm5, 0

  xloop:
    movdqa xmm0, [esi]
    movdqa xmm2, [esi + edx]
    lea esi, [esi + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2     // interleave row0/row1 bytes
    punpckhbw xmm1, xmm2
    pmaddubsw xmm0, xmm5     // r0*(128-f) + r1*f per pixel
    pmaddubsw xmm1, xmm5
    psrlw xmm0, 7            // divide by 128 and pack back to bytes
    psrlw xmm1, 7
    packuswb xmm0, xmm1
    movdqa [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 16
    ja xloop

    mov al, [edi - 1]        // duplicate last pixel one past the end
    mov [edi], al
    pop edi
    pop esi
    ret

  xloop1:                    // fraction == 0: copy row 0 unchanged
    movdqa xmm0, [esi]
    lea esi, [esi + 16]
    movdqa [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 16
    ja xloop1

    mov al, [edi - 1]
    mov [edi], al
    pop edi
    pop esi
    ret

  xloop2:                    // fraction == 1/2: average the two rows
    movdqa xmm0, [esi]
    movdqa xmm2, [esi + edx]
    lea esi, [esi + 16]
    pavgb xmm0, xmm2
    movdqa [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 16
    ja xloop2

    mov al, [edi - 1]
    mov [edi], al
    pop edi
    pop esi
    ret

  }
}
1441 | |
1442 // Note that movdqa+palign may be better than movdqu. | |
1443 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. | |
// Horizontal-only 3/4 filter: 32 source pixels -> 24 output pixels using
// the same _shuf*/_madd*/_round34 tables as the Int variants, but with no
// vertical blend.  Only eax/ecx/edx (caller-saved) are used, so no
// registers need preserving.
__declspec(naked)
static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                    int dst_width) {
  __asm {
    mov edx, [esp + 4]       // dst_ptr
    mov eax, [esp + 8]       // src_ptr
    mov ecx, [esp + 12]      // dst_width
    movdqa xmm1, _round34
    movdqa xmm2, _shuf01
    movdqa xmm3, _shuf11
    movdqa xmm4, _shuf21
    movdqa xmm5, _madd01
    movdqa xmm6, _madd11
    movdqa xmm7, _madd21

  wloop:
    movdqa xmm0, [eax]       // pixels 0..7
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm1
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax+8]     // pixels 8..15 (unaligned load)
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm1
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx+8], xmm0
    movdqa xmm0, [eax+16]    // pixels 16..23
    lea eax, [eax+32]
    pshufb xmm0, xmm4
    pmaddubsw xmm0, xmm7
    paddsw xmm0, xmm1
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx+16], xmm0
    lea edx, [edx+24]
    sub ecx, 24              // 32 src pixels -> 24 dst pixels per pass
    ja wloop
    ret
  }
}
1488 | |
1489 #elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) | |
1490 | |
1491 // GCC versions of row functions are verbatim conversions from Visual C. | |
1492 // Generated using gcc disassembly on Visual C object file: | |
1493 // objdump -D yuvscaler.obj >yuvscaler.txt | |
1494 #define HAS_SCALEROWDOWN2_SSE2 | |
// Point-samples every other pixel: pcmpeqb + psrlw builds a 0x00FF word
// mask that keeps the even-indexed bytes, and packuswb compresses the
// surviving words back to bytes.  32 src bytes -> 16 dst bytes per pass.
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"   // xmm5 = all ones
    "psrlw $0x8,%%xmm5 \n"       // -> 0x00FF in every word lane
    "1:"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "pand %%xmm5,%%xmm0 \n"      // keep pixels 0,2,4,... as words
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
);
}
1518 | |
// Box-averages 2x2 pixel blocks: pavgb blends the two rows, then each
// horizontal pair is averaged with rounding by splitting odd bytes
// (psrlw 8) from even bytes (pand with 0x00FF) and combining with pavgw.
// 32 src bytes per row -> 16 dst bytes per pass.
static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"        // xmm5 = 0x00FF word mask
    "psrlw $0x8,%%xmm5 \n"
    "1:"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa (%0,%3,1),%%xmm2 \n"      // second row (src_ptr + stride)
    "movdqa 0x10(%0,%3,1),%%xmm3 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"          // vertical average of the rows
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"            // odd-indexed pixels
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"           // even-indexed pixels
    "pand %%xmm5,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"          // horizontal average with rounding
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
);
}
1552 | |
1553 #define HAS_SCALEROWDOWN4_SSE2 | |
// Point-samples every fourth pixel: psrld 0x18 turns the all-ones mask
// into 0x000000FF per dword, keeping byte 0 of each group of 4; two
// packuswb passes compress to bytes.  32 src bytes -> 8 dst bytes.
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"   // xmm5 = all ones
    "psrld $0x18,%%xmm5 \n"      // -> 0x000000FF in every dword lane
    "1:"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "pand %%xmm5,%%xmm0 \n"      // keep pixels 0,4,8,...
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
);
}
1578 | |
// Box-averages 4x4 pixel blocks: pairwise pavgb of the 4 rows (a tree of
// averages, so rounding error is bounded but not a true sum/16), then the
// same odd/even split + pavgw trick as ScaleRowDown2Int applied twice to
// average 4 horizontal neighbors.  %3 holds src_stride * 3.
// NOTE(review): xmm0-xmm5 are modified but only xmm6/xmm7 appear in the
// x86_64 clobber list -- relies on the compiler not holding live values
// in the lower registers; consider listing them.
static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  intptr_t temp = 0;
  asm volatile (
    "pcmpeqb %%xmm7,%%xmm7 \n"        // xmm7 = 0x00FF word mask
    "psrlw $0x8,%%xmm7 \n"
    "lea (%4,%4,2),%3 \n"             // %3 = src_stride * 3
    "1:"
    "movdqa (%0),%%xmm0 \n"           // rows 0 and 1
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa (%0,%4,1),%%xmm2 \n"
    "movdqa 0x10(%0,%4,1),%%xmm3 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa (%0,%4,2),%%xmm2 \n"      // rows 2 and 3
    "movdqa 0x10(%0,%4,2),%%xmm3 \n"
    "movdqa (%0,%3,1),%%xmm4 \n"
    "movdqa 0x10(%0,%3,1),%%xmm5 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm4,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"          // vertical average of all 4 rows
    "pavgb %%xmm5,%%xmm3 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"         // first horizontal halving
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm7,%%xmm2 \n"
    "pand %%xmm7,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"         // second horizontal halving
    "psrlw $0x8,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(temp)        // %3
  : "r"((intptr_t)(src_stride))    // %4
  : "memory", "cc"
#if defined(__x86_64__)
    , "xmm6", "xmm7"
#endif
);
}
1631 | |
1632 #define HAS_SCALEROWDOWN8_SSE2 | |
// Point-samples every eighth pixel: psrlq 0x38 leaves 0x00000000000000FF
// per qword (byte 0 of each group of 8); three packuswb passes compress
// the 32 masked src bytes down to 4 dst bytes.
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"   // xmm5 = all ones
    "psrlq $0x38,%%xmm5 \n"      // -> 0x..00FF in every qword lane
    "1:"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "pand %%xmm5,%%xmm0 \n"      // keep pixels 0,8,16,24
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0,(%1) \n"
    "lea 0x4(%1),%1 \n"
    "sub $0x4,%2 \n"
    "ja 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
);
}
1658 | |
1659 #if defined(__i386__) | |
// Box-averages 8x8 pixel blocks to single pixels: cascaded pavgb reduces
// the 8 rows to one averaged row pair, psadbw sums each group of 8 bytes
// horizontally against zero (xmm7), and psrlw 3 divides by 8.  Written as
// a naked global asm body (cdecl args at 0x24(%esp) after pusha saves 32
// bytes plus the 4-byte return address); %ebp is repurposed as a row
// pointer for rows 4..7.
void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                           uint8* dst_ptr, int dst_width);
  asm(
    DECLARE_FUNCTION(ScaleRowDown8Int_SSE2)
    "pusha \n"                             // save all GP registers
    "mov 0x24(%esp),%esi \n"               // src_ptr
    "mov 0x28(%esp),%ebx \n"               // src_stride
    "mov 0x2c(%esp),%edi \n"               // dst_ptr
    "mov 0x30(%esp),%ecx \n"               // dst_width
    "lea (%ebx,%ebx,2),%edx \n"            // edx = src_stride * 3
    "pxor %xmm7,%xmm7 \n"                  // zero, for psadbw

"1:"
    "movdqa (%esi),%xmm0 \n"               // rows 0..3 averaged into xmm0/1
    "movdqa 0x10(%esi),%xmm1 \n"
    "movdqa (%esi,%ebx,1),%xmm2 \n"
    "movdqa 0x10(%esi,%ebx,1),%xmm3 \n"
    "pavgb %xmm2,%xmm0 \n"
    "pavgb %xmm3,%xmm1 \n"
    "movdqa (%esi,%ebx,2),%xmm2 \n"
    "movdqa 0x10(%esi,%ebx,2),%xmm3 \n"
    "movdqa (%esi,%edx,1),%xmm4 \n"
    "movdqa 0x10(%esi,%edx,1),%xmm5 \n"
    "lea (%esi,%ebx,4),%ebp \n"            // ebp -> row 4
    "lea 0x20(%esi),%esi \n"
    "pavgb %xmm4,%xmm2 \n"
    "pavgb %xmm5,%xmm3 \n"
    "pavgb %xmm2,%xmm0 \n"
    "pavgb %xmm3,%xmm1 \n"
    "movdqa 0x0(%ebp),%xmm2 \n"            // rows 4..7 averaged into xmm2/3
    "movdqa 0x10(%ebp),%xmm3 \n"
    "movdqa 0x0(%ebp,%ebx,1),%xmm4 \n"
    "movdqa 0x10(%ebp,%ebx,1),%xmm5 \n"
    "pavgb %xmm4,%xmm2 \n"
    "pavgb %xmm5,%xmm3 \n"
    "movdqa 0x0(%ebp,%ebx,2),%xmm4 \n"
    "movdqa 0x10(%ebp,%ebx,2),%xmm5 \n"
    "movdqa 0x0(%ebp,%edx,1),%xmm6 \n"
    "pavgb %xmm6,%xmm4 \n"
    "movdqa 0x10(%ebp,%edx,1),%xmm6 \n"
    "pavgb %xmm6,%xmm5 \n"
    "pavgb %xmm4,%xmm2 \n"
    "pavgb %xmm5,%xmm3 \n"
    "pavgb %xmm2,%xmm0 \n"                 // combine both row groups
    "pavgb %xmm3,%xmm1 \n"
    "psadbw %xmm7,%xmm0 \n"                // horizontal sum of each 8 bytes
    "psadbw %xmm7,%xmm1 \n"
    "pshufd $0xd8,%xmm0,%xmm0 \n"          // gather the four sums
    "pshufd $0x8d,%xmm1,%xmm1 \n"
    "por %xmm1,%xmm0 \n"
    "psrlw $0x3,%xmm0 \n"                  // divide sums by 8
    "packuswb %xmm0,%xmm0 \n"
    "packuswb %xmm0,%xmm0 \n"
    "movd %xmm0,(%edi) \n"                 // write 4 pixels
    "lea 0x4(%edi),%edi \n"
    "sub $0x4,%ecx \n"
    "ja 1b \n"
    "popa \n"
    "ret \n"
);
1720 | |
1721 // fpic is used for magiccam plugin | |
1722 #if !defined(__PIC__) | |
1723 #define HAS_SCALEROWDOWN34_SSSE3 | |
// GCC (AT&T syntax) translation of the MSVC 3/4 point sampler: 32 src
// pixels -> 24 dst pixels via palignr + three pshufb gathers.  References
// the _shuf* globals by absolute address, which is why this block is
// excluded when building position-independent code (see the __PIC__ guard
// above).
void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                          uint8* dst_ptr, int dst_width);
  asm(
    DECLARE_FUNCTION(ScaleRowDown34_SSSE3)
    "pusha \n"                       // save all GP registers
    "mov 0x24(%esp),%esi \n"         // src_ptr
    "mov 0x2c(%esp),%edi \n"         // dst_ptr (src_stride is unused)
    "mov 0x30(%esp),%ecx \n"         // dst_width
    "movdqa _shuf0,%xmm3 \n"
    "movdqa _shuf1,%xmm4 \n"
    "movdqa _shuf2,%xmm5 \n"

"1:"
    "movdqa (%esi),%xmm0 \n"
    "movdqa 0x10(%esi),%xmm2 \n"
    "lea 0x20(%esi),%esi \n"
    "movdqa %xmm2,%xmm1 \n"
    "palignr $0x8,%xmm0,%xmm1 \n"    // xmm1 = bytes 8..23 of the 32
    "pshufb %xmm3,%xmm0 \n"
    "pshufb %xmm4,%xmm1 \n"
    "pshufb %xmm5,%xmm2 \n"
    "movq %xmm0,(%edi) \n"           // write 24 pixels as 3 x 8 bytes
    "movq %xmm1,0x8(%edi) \n"
    "movq %xmm2,0x10(%edi) \n"
    "lea 0x18(%edi),%edi \n"
    "sub $0x18,%ecx \n"
    "ja 1b \n"
    "popa \n"
    "ret \n"
);
1754 | |
// GCC translation of ScaleRowDown34_1_Int_SSSE3: averages two rows 1:1
// with pavgb, then filters 32 src pixels to 24 with the _shuf*/_madd*
// tables and _round34 rounding (psrlw 2).  %ebp carries src_stride
// (safe: pusha saved the frame pointer).
void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width);
  asm(
    DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3)
    "pusha \n"                        // save all GP registers
    "mov 0x24(%esp),%esi \n"          // src_ptr
    "mov 0x28(%esp),%ebp \n"          // src_stride
    "mov 0x2c(%esp),%edi \n"          // dst_ptr
    "mov 0x30(%esp),%ecx \n"          // dst_width
    "movdqa _shuf01,%xmm2 \n"
    "movdqa _shuf11,%xmm3 \n"
    "movdqa _shuf21,%xmm4 \n"
    "movdqa _madd01,%xmm5 \n"
    "movdqa _madd11,%xmm6 \n"
    "movdqa _round34,%xmm7 \n"

"1:"
    "movdqa (%esi),%xmm0 \n"          // pixels 0..7
    "movdqa (%esi,%ebp),%xmm1 \n"
    "pavgb %xmm1,%xmm0 \n"            // vertical 1:1 average
    "pshufb %xmm2,%xmm0 \n"
    "pmaddubsw %xmm5,%xmm0 \n"
    "paddsw %xmm7,%xmm0 \n"
    "psrlw $0x2,%xmm0 \n"
    "packuswb %xmm0,%xmm0 \n"
    "movq %xmm0,(%edi) \n"
    "movdqu 0x8(%esi),%xmm0 \n"       // pixels 8..15 (unaligned)
    "movdqu 0x8(%esi,%ebp),%xmm1 \n"
    "pavgb %xmm1,%xmm0 \n"
    "pshufb %xmm3,%xmm0 \n"
    "pmaddubsw %xmm6,%xmm0 \n"
    "paddsw %xmm7,%xmm0 \n"
    "psrlw $0x2,%xmm0 \n"
    "packuswb %xmm0,%xmm0 \n"
    "movq %xmm0,0x8(%edi) \n"
    "movdqa 0x10(%esi),%xmm0 \n"      // pixels 16..23
    "movdqa 0x10(%esi,%ebp),%xmm1 \n"
    "lea 0x20(%esi),%esi \n"
    "pavgb %xmm1,%xmm0 \n"
    "pshufb %xmm4,%xmm0 \n"
    "movdqa _madd21,%xmm1 \n"         // loaded late: all 8 xmm regs in use
    "pmaddubsw %xmm1,%xmm0 \n"
    "paddsw %xmm7,%xmm0 \n"
    "psrlw $0x2,%xmm0 \n"
    "packuswb %xmm0,%xmm0 \n"
    "movq %xmm0,0x10(%edi) \n"
    "lea 0x18(%edi),%edi \n"
    "sub $0x18,%ecx \n"               // 32 src pixels -> 24 dst pixels
    "ja 1b \n"

    "popa \n"
    "ret \n"
);
1808 | |
// GCC translation of ScaleRowDown34_0_Int_SSSE3: the double pavgb weights
// row0 3/4 and row1 1/4 (xmm1 = (r0+r1)/2, xmm0 = (r0+xmm1)/2), then the
// same 32 -> 24 horizontal filter as the 1:1 variant above.
void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width);
  asm(
    DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3)
    "pusha \n"                        // save all GP registers
    "mov 0x24(%esp),%esi \n"          // src_ptr
    "mov 0x28(%esp),%ebp \n"          // src_stride
    "mov 0x2c(%esp),%edi \n"          // dst_ptr
    "mov 0x30(%esp),%ecx \n"          // dst_width
    "movdqa _shuf01,%xmm2 \n"
    "movdqa _shuf11,%xmm3 \n"
    "movdqa _shuf21,%xmm4 \n"
    "movdqa _madd01,%xmm5 \n"
    "movdqa _madd11,%xmm6 \n"
    "movdqa _round34,%xmm7 \n"

"1:"
    "movdqa (%esi),%xmm0 \n"          // pixels 0..7
    "movdqa (%esi,%ebp,1),%xmm1 \n"
    "pavgb %xmm0,%xmm1 \n"            // xmm1 = (row0 + row1) / 2
    "pavgb %xmm1,%xmm0 \n"            // xmm0 = (3*row0 + row1) / 4
    "pshufb %xmm2,%xmm0 \n"
    "pmaddubsw %xmm5,%xmm0 \n"
    "paddsw %xmm7,%xmm0 \n"
    "psrlw $0x2,%xmm0 \n"
    "packuswb %xmm0,%xmm0 \n"
    "movq %xmm0,(%edi) \n"
    "movdqu 0x8(%esi),%xmm0 \n"       // pixels 8..15 (unaligned)
    "movdqu 0x8(%esi,%ebp,1),%xmm1 \n"
    "pavgb %xmm0,%xmm1 \n"
    "pavgb %xmm1,%xmm0 \n"
    "pshufb %xmm3,%xmm0 \n"
    "pmaddubsw %xmm6,%xmm0 \n"
    "paddsw %xmm7,%xmm0 \n"
    "psrlw $0x2,%xmm0 \n"
    "packuswb %xmm0,%xmm0 \n"
    "movq %xmm0,0x8(%edi) \n"
    "movdqa 0x10(%esi),%xmm0 \n"      // pixels 16..23
    "movdqa 0x10(%esi,%ebp,1),%xmm1 \n"
    "lea 0x20(%esi),%esi \n"
    "pavgb %xmm0,%xmm1 \n"
    "pavgb %xmm1,%xmm0 \n"
    "pshufb %xmm4,%xmm0 \n"
    "movdqa _madd21,%xmm1 \n"         // loaded late: all 8 xmm regs in use
    "pmaddubsw %xmm1,%xmm0 \n"
    "paddsw %xmm7,%xmm0 \n"
    "psrlw $0x2,%xmm0 \n"
    "packuswb %xmm0,%xmm0 \n"
    "movq %xmm0,0x10(%edi) \n"
    "lea 0x18(%edi),%edi \n"
    "sub $0x18,%ecx \n"               // 32 src pixels -> 24 dst pixels
    "ja 1b \n"
    "popa \n"
    "ret \n"
);
1864 | |
1865 #define HAS_SCALEROWDOWN38_SSSE3 | |
// GCC translation of the 3/8 point sampler: 32 src pixels -> 12 dst
// pixels via two pshufb gathers merged with paddusb; src_stride (edx) is
// loaded but unused.
void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                          uint8* dst_ptr, int dst_width);
  asm(
    DECLARE_FUNCTION(ScaleRowDown38_SSSE3)
    "pusha \n"                     // save all GP registers
    "mov 0x24(%esp),%esi \n"       // src_ptr
    "mov 0x28(%esp),%edx \n"       // src_stride (unused)
    "mov 0x2c(%esp),%edi \n"       // dst_ptr
    "mov 0x30(%esp),%ecx \n"       // dst_width
    "movdqa _shuf38a ,%xmm4 \n"
    "movdqa _shuf38b ,%xmm5 \n"

"1:"
    "movdqa (%esi),%xmm0 \n"       // 16 pixels -> outputs 0..5
    "movdqa 0x10(%esi),%xmm1 \n"   // 16 pixels -> outputs 6..11
    "lea 0x20(%esi),%esi \n"
    "pshufb %xmm4,%xmm0 \n"
    "pshufb %xmm5,%xmm1 \n"
    "paddusb %xmm1,%xmm0 \n"
    "movq %xmm0,(%edi) \n"         // write 12 pixels (8 + 4)
    "movhlps %xmm0,%xmm1 \n"
    "movd %xmm1,0x8(%edi) \n"
    "lea 0xc(%edi),%edi \n"
    "sub $0xc,%ecx \n"
    "ja 1b \n"
    "popa \n"
    "ret \n"
);
1894 | |
// GCC translation of ScaleRowDown38_3_Int_SSSE3: sums 3 rows as 16-bit
// words, folds each group of 3 adjacent horizontal sums together
// (psrldq + paddusw twice), then pmulhuw by _scaleac3 reciprocals to
// divide by the 9,9,6 box sizes.  Writes 6 pixels (4 + 2) per pass.
void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width);
  asm(
    DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3)
    "pusha \n"                        // save all GP registers
    "mov 0x24(%esp),%esi \n"          // src_ptr
    "mov 0x28(%esp),%edx \n"          // src_stride
    "mov 0x2c(%esp),%edi \n"          // dst_ptr
    "mov 0x30(%esp),%ecx \n"          // dst_width
    "movdqa _shufac0,%xmm4 \n"
    "movdqa _shufac3,%xmm5 \n"
    "movdqa _scaleac3,%xmm6 \n"
    "pxor %xmm7,%xmm7 \n"             // zero, for byte -> word unpacking

"1:"
    "movdqa (%esi),%xmm0 \n"          // sum 3 rows into xmm0/xmm1
    "movdqa (%esi,%edx,1),%xmm2 \n"
    "movhlps %xmm0,%xmm1 \n"
    "movhlps %xmm2,%xmm3 \n"
    "punpcklbw %xmm7,%xmm0 \n"
    "punpcklbw %xmm7,%xmm1 \n"
    "punpcklbw %xmm7,%xmm2 \n"
    "punpcklbw %xmm7,%xmm3 \n"
    "paddusw %xmm2,%xmm0 \n"
    "paddusw %xmm3,%xmm1 \n"
    "movdqa (%esi,%edx,2),%xmm2 \n"
    "lea 0x10(%esi),%esi \n"
    "movhlps %xmm2,%xmm3 \n"
    "punpcklbw %xmm7,%xmm2 \n"
    "punpcklbw %xmm7,%xmm3 \n"
    "paddusw %xmm2,%xmm0 \n"
    "paddusw %xmm3,%xmm1 \n"
    "movdqa %xmm0,%xmm2 \n"           // fold 3 horizontal neighbors
    "psrldq $0x2,%xmm0 \n"
    "paddusw %xmm0,%xmm2 \n"
    "psrldq $0x2,%xmm0 \n"
    "paddusw %xmm0,%xmm2 \n"
    "pshufb %xmm4,%xmm2 \n"
    "movdqa %xmm1,%xmm3 \n"
    "psrldq $0x2,%xmm1 \n"
    "paddusw %xmm1,%xmm3 \n"
    "psrldq $0x2,%xmm1 \n"
    "paddusw %xmm1,%xmm3 \n"
    "pshufb %xmm5,%xmm3 \n"
    "paddusw %xmm3,%xmm2 \n"
    "pmulhuw %xmm6,%xmm2 \n"          // divide by 9,9,6, 9,9,6
    "packuswb %xmm2,%xmm2 \n"
    "movd %xmm2,(%edi) \n"            // write 6 pixels (4 + 2)
    "pextrw $0x2,%xmm2,%eax \n"
    "mov %ax,0x4(%edi) \n"
    "lea 0x6(%edi),%edi \n"
    "sub $0x6,%ecx \n"
    "ja 1b \n"
    "popa \n"
    "ret \n"
);
1951 | |
// GCC translation of ScaleRowDown38_2_Int_SSSE3: pavgb blends the two
// rows, three shuffle masks gather each output's horizontal neighbors,
// and pmulhuw by _scaleab2 divides by the 3,3,2 box sizes.  Writes 6
// pixels (4 + 2) per pass.
void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width);
  asm(
    DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3)
    "pusha \n"                        // save all GP registers
    "mov 0x24(%esp),%esi \n"          // src_ptr
    "mov 0x28(%esp),%edx \n"          // src_stride
    "mov 0x2c(%esp),%edi \n"          // dst_ptr
    "mov 0x30(%esp),%ecx \n"          // dst_width
    "movdqa _shufab0,%xmm4 \n"
    "movdqa _shufab1,%xmm5 \n"
    "movdqa _shufab2,%xmm6 \n"
    "movdqa _scaleab2,%xmm7 \n"

"1:"
    "movdqa (%esi),%xmm2 \n"          // average 2 rows into xmm2
    "pavgb (%esi,%edx,1),%xmm2 \n"
    "lea 0x10(%esi),%esi \n"
    "movdqa %xmm2,%xmm0 \n"           // 16 pixels -> outputs 0..5
    "pshufb %xmm4,%xmm0 \n"
    "movdqa %xmm2,%xmm1 \n"
    "pshufb %xmm5,%xmm1 \n"
    "paddusw %xmm1,%xmm0 \n"
    "pshufb %xmm6,%xmm2 \n"
    "paddusw %xmm2,%xmm0 \n"
    "pmulhuw %xmm7,%xmm0 \n"          // divide by 3,3,2, 3,3,2
    "packuswb %xmm0,%xmm0 \n"
    "movd %xmm0,(%edi) \n"            // write 6 pixels (4 + 2)
    "pextrw $0x2,%xmm0,%eax \n"
    "mov %ax,0x4(%edi) \n"
    "lea 0x6(%edi),%edi \n"
    "sub $0x6,%ecx \n"
    "ja 1b \n"
    "popa \n"
    "ret \n"
);
1988 #endif // __PIC__ | |
1989 | |
1990 #define HAS_SCALEADDROWS_SSE2 | |
// GCC translation of ScaleAddRows_SSE2: sums src_height rows of 16 bytes
// into 16 uint16 column sums (saturating).  The first row is loaded in
// loop "1:" and loop "2:" is do-while style (ebp = height - 1), so at
// least one additional row is always read.
// NOTE(review): requires src_height >= 2 -- confirm callers.
void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                       uint16* dst_ptr, int src_width,
                       int src_height);
  asm(
    DECLARE_FUNCTION(ScaleAddRows_SSE2)
    "pusha \n"                        // save all GP registers
    "mov 0x24(%esp),%esi \n"          // src_ptr
    "mov 0x28(%esp),%edx \n"          // src_stride
    "mov 0x2c(%esp),%edi \n"          // dst_ptr
    "mov 0x30(%esp),%ecx \n"          // src_width
    "mov 0x34(%esp),%ebx \n"          // src_height
    "pxor %xmm5,%xmm5 \n"             // zero, for byte -> word unpacking

"1:"
    "movdqa (%esi),%xmm2 \n"          // first row
    "lea (%esi,%edx,1),%eax \n"
    "movhlps %xmm2,%xmm3 \n"
    "lea -0x1(%ebx),%ebp \n"          // inner loop runs height - 1 times
    "punpcklbw %xmm5,%xmm2 \n"
    "punpcklbw %xmm5,%xmm3 \n"

"2:"
    "movdqa (%eax),%xmm0 \n"          // read 16 pixels of the next row
    "lea (%eax,%edx,1),%eax \n"
    "movhlps %xmm0,%xmm1 \n"
    "punpcklbw %xmm5,%xmm0 \n"
    "punpcklbw %xmm5,%xmm1 \n"
    "paddusw %xmm0,%xmm2 \n"          // saturating word sums
    "paddusw %xmm1,%xmm3 \n"
    "sub $0x1,%ebp \n"
    "ja 2b \n"

    "movdqa %xmm2,(%edi) \n"          // store 16 word sums
    "movdqa %xmm3,0x10(%edi) \n"
    "lea 0x20(%edi),%edi \n"
    "lea 0x10(%esi),%esi \n"
    "sub $0x10,%ecx \n"
    "ja 1b \n"
    "popa \n"
    "ret \n"
);
2032 | |
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
// Blends two adjacent source rows with an 8.8 fixed-point vertical
// fraction (0..256).  Special cases: fraction 0 copies row 0; fraction
// 0x80 uses pavgb for an exact 50/50 average.  All paths duplicate the
// last output byte one position past dst_width (callers rely on the
// extra pixel).  Standalone cdecl routine in top-level asm.
#define HAS_SCALEFILTERROWS_SSE2
void ScaleFilterRows_SSE2(uint8* dst_ptr,
                          const uint8* src_ptr, int src_stride,
                          int dst_width, int source_y_fraction);
asm(
  DECLARE_FUNCTION(ScaleFilterRows_SSE2)
    // After two pushes + return address, args start at esp+0xc.
    "push %esi \n"
    "push %edi \n"
    "mov 0xc(%esp),%edi \n"   // edi = dst_ptr
    "mov 0x10(%esp),%esi \n"  // esi = src_ptr
    "mov 0x14(%esp),%edx \n"  // edx = src_stride
    "mov 0x18(%esp),%ecx \n"  // ecx = dst_width
    "mov 0x1c(%esp),%eax \n"  // eax = source_y_fraction
    "cmp $0x0,%eax \n"
    "je 2f \n"                // fraction 0: plain copy
    "cmp $0x80,%eax \n"
    "je 3f \n"                // fraction 128: pavgb
    // Broadcast fraction into xmm6 and (256 - fraction) into xmm5.
    "movd %eax,%xmm6 \n"
    "punpcklwd %xmm6,%xmm6 \n"
    "pshufd $0x0,%xmm6,%xmm6 \n"
    "neg %eax \n"
    "add $0x100,%eax \n"
    "movd %eax,%xmm5 \n"
    "punpcklwd %xmm5,%xmm5 \n"
    "pshufd $0x0,%xmm5,%xmm5 \n"
    "pxor %xmm7,%xmm7 \n"

    // General case: (row0 * (256-f) + row1 * f) >> 8 on 16 pixels.
    "1:"
    "movdqa (%esi),%xmm0 \n"
    "movdqa (%esi,%edx,1),%xmm2 \n"
    "lea 0x10(%esi),%esi \n"
    "movdqa %xmm0,%xmm1 \n"
    "movdqa %xmm2,%xmm3 \n"
    "punpcklbw %xmm7,%xmm0 \n"
    "punpcklbw %xmm7,%xmm2 \n"
    "punpckhbw %xmm7,%xmm1 \n"
    "punpckhbw %xmm7,%xmm3 \n"
    "pmullw %xmm5,%xmm0 \n"
    "pmullw %xmm5,%xmm1 \n"
    "pmullw %xmm6,%xmm2 \n"
    "pmullw %xmm6,%xmm3 \n"
    "paddusw %xmm2,%xmm0 \n"
    "paddusw %xmm3,%xmm1 \n"
    "psrlw $0x8,%xmm0 \n"
    "psrlw $0x8,%xmm1 \n"
    "packuswb %xmm1,%xmm0 \n"
    "movdqa %xmm0,(%edi) \n"
    "lea 0x10(%edi),%edi \n"
    "sub $0x10,%ecx \n"
    "ja 1b \n"
    "mov -0x1(%edi),%al \n"   // replicate last pixel one past the end
    "mov %al,(%edi) \n"
    "pop %edi \n"
    "pop %esi \n"
    "ret \n"

    // Fraction 0: copy row 0 unchanged.
    "2:"
    "movdqa (%esi),%xmm0 \n"
    "lea 0x10(%esi),%esi \n"
    "movdqa %xmm0,(%edi) \n"
    "lea 0x10(%edi),%edi \n"
    "sub $0x10,%ecx \n"
    "ja 2b \n"

    "mov -0x1(%edi),%al \n"
    "mov %al,(%edi) \n"
    "pop %edi \n"
    "pop %esi \n"
    "ret \n"

    // Fraction 128: exact average of the two rows.
    "3:"
    "movdqa (%esi),%xmm0 \n"
    "movdqa (%esi,%edx,1),%xmm2 \n"
    "lea 0x10(%esi),%esi \n"
    "pavgb %xmm2,%xmm0 \n"
    "movdqa %xmm0,(%edi) \n"
    "lea 0x10(%edi),%edi \n"
    "sub $0x10,%ecx \n"
    "ja 3b \n"

    "mov -0x1(%edi),%al \n"
    "mov %al,(%edi) \n"
    "pop %edi \n"
    "pop %esi \n"
    "ret \n"
);
2120 | |
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
// Same contract as ScaleFilterRows_SSE2, but the fraction is halved to
// 0..128 and both weights are packed into one byte pair (al/ah) so a
// single pmaddubsw blends interleaved row0/row1 bytes.  Duplicates the
// last output byte one position past dst_width.
#define HAS_SCALEFILTERROWS_SSSE3
void ScaleFilterRows_SSSE3(uint8* dst_ptr,
                           const uint8* src_ptr, int src_stride,
                           int dst_width, int source_y_fraction);
asm(
  DECLARE_FUNCTION(ScaleFilterRows_SSSE3)
    "push %esi \n"
    "push %edi \n"
    "mov 0xc(%esp),%edi \n"   // edi = dst_ptr
    "mov 0x10(%esp),%esi \n"  // esi = src_ptr
    "mov 0x14(%esp),%edx \n"  // edx = src_stride
    "mov 0x18(%esp),%ecx \n"  // ecx = dst_width
    "mov 0x1c(%esp),%eax \n"  // eax = source_y_fraction
    "shr %eax \n"             // 7-bit fraction (0..128)
    "cmp $0x0,%eax \n"
    "je 2f \n"                // fraction 0: plain copy
    "cmp $0x40,%eax \n"
    "je 3f \n"                // fraction 64 (= 1/2): pavgb
    // Pack (128 - f) in al and f in ah, broadcast to all word lanes.
    "mov %al,%ah \n"
    "neg %al \n"
    "add $0x80,%al \n"
    "movd %eax,%xmm5 \n"
    "punpcklwd %xmm5,%xmm5 \n"
    "pshufd $0x0,%xmm5,%xmm5 \n"

    // General case: interleave rows and use pmaddubsw, >> 7.
    "1:"
    "movdqa (%esi),%xmm0 \n"
    "movdqa (%esi,%edx,1),%xmm2 \n"
    "lea 0x10(%esi),%esi \n"
    "movdqa %xmm0,%xmm1 \n"
    "punpcklbw %xmm2,%xmm0 \n"
    "punpckhbw %xmm2,%xmm1 \n"
    "pmaddubsw %xmm5,%xmm0 \n"
    "pmaddubsw %xmm5,%xmm1 \n"
    "psrlw $0x7,%xmm0 \n"
    "psrlw $0x7,%xmm1 \n"
    "packuswb %xmm1,%xmm0 \n"
    "movdqa %xmm0,(%edi) \n"
    "lea 0x10(%edi),%edi \n"
    "sub $0x10,%ecx \n"
    "ja 1b \n"
    "mov -0x1(%edi),%al \n"   // replicate last pixel one past the end
    "mov %al,(%edi) \n"
    "pop %edi \n"
    "pop %esi \n"
    "ret \n"

    // Fraction 0: copy row 0 unchanged.
    "2:"
    "movdqa (%esi),%xmm0 \n"
    "lea 0x10(%esi),%esi \n"
    "movdqa %xmm0,(%edi) \n"
    "lea 0x10(%edi),%edi \n"
    "sub $0x10,%ecx \n"
    "ja 2b \n"
    "mov -0x1(%edi),%al \n"
    "mov %al,(%edi) \n"
    "pop %edi \n"
    "pop %esi \n"
    "ret \n"

    // Fraction 1/2: exact average of the two rows.
    "3:"
    "movdqa (%esi),%xmm0 \n"
    "movdqa (%esi,%edx,1),%xmm2 \n"
    "lea 0x10(%esi),%esi \n"
    "pavgb %xmm2,%xmm0 \n"
    "movdqa %xmm0,(%edi) \n"
    "lea 0x10(%edi),%edi \n"
    "sub $0x10,%ecx \n"
    "ja 3b \n"
    "mov -0x1(%edi),%al \n"
    "mov %al,(%edi) \n"
    "pop %edi \n"
    "pop %esi \n"
    "ret \n"
);
2197 | |
2198 #elif defined(__x86_64__) | |
// Scales 32 source columns x 8 rows down to 4 output pixels (1/8 in both
// directions).  Approximates the 8x8 box average with a pavgb reduction
// tree across the 8 rows, then psadbw to sum each 8-byte group
// horizontally and psrlw $3 to divide by 8.  x86-64 only.
// NOTE(review): xmm0-xmm5 are modified but only xmm6/xmm7 appear in the
// clobber list — this relies on all xmm registers being caller-saved in
// the SysV x86-64 ABI; verify if ported.
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile (
  "lea (%3,%3,2),%%r10 \n"          // r10 = src_stride * 3
  "pxor %%xmm7,%%xmm7 \n"
"1:"
  // Average rows 0-3 into xmm0/xmm1 (32 columns).
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "movdqa (%0,%3,1),%%xmm2 \n"
  "movdqa 0x10(%0,%3,1),%%xmm3 \n"
  "pavgb %%xmm2,%%xmm0 \n"
  "pavgb %%xmm3,%%xmm1 \n"
  "movdqa (%0,%3,2),%%xmm2 \n"
  "movdqa 0x10(%0,%3,2),%%xmm3 \n"
  "movdqa (%0,%%r10,1),%%xmm4 \n"
  "movdqa 0x10(%0,%%r10,1),%%xmm5 \n"
  "lea (%0,%3,4),%%r11 \n"          // r11 = start of rows 4-7
  "lea 0x20(%0),%0 \n"
  "pavgb %%xmm4,%%xmm2 \n"
  "pavgb %%xmm5,%%xmm3 \n"
  "pavgb %%xmm2,%%xmm0 \n"
  "pavgb %%xmm3,%%xmm1 \n"
  // Average rows 4-7 and fold into the running average.
  "movdqa 0x0(%%r11),%%xmm2 \n"
  "movdqa 0x10(%%r11),%%xmm3 \n"
  "movdqa 0x0(%%r11,%3,1),%%xmm4 \n"
  "movdqa 0x10(%%r11,%3,1),%%xmm5 \n"
  "pavgb %%xmm4,%%xmm2 \n"
  "pavgb %%xmm5,%%xmm3 \n"
  "movdqa 0x0(%%r11,%3,2),%%xmm4 \n"
  "movdqa 0x10(%%r11,%3,2),%%xmm5 \n"
  "movdqa 0x0(%%r11,%%r10,1),%%xmm6 \n"
  "pavgb %%xmm6,%%xmm4 \n"
  "movdqa 0x10(%%r11,%%r10,1),%%xmm6 \n"
  "pavgb %%xmm6,%%xmm5 \n"
  "pavgb %%xmm4,%%xmm2 \n"
  "pavgb %%xmm5,%%xmm3 \n"
  "pavgb %%xmm2,%%xmm0 \n"
  "pavgb %%xmm3,%%xmm1 \n"
  // Horizontal sum of each 8-byte group, divide by 8, pack to 4 bytes.
  "psadbw %%xmm7,%%xmm0 \n"
  "psadbw %%xmm7,%%xmm1 \n"
  "pshufd $0xd8,%%xmm0,%%xmm0 \n"
  "pshufd $0x8d,%%xmm1,%%xmm1 \n"
  "por %%xmm1,%%xmm0 \n"
  "psrlw $0x3,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "movd %%xmm0,(%1) \n"
  "lea 0x4(%1),%1 \n"
  "sub $0x4,%2 \n"
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"((intptr_t)(src_stride)) // %3
  : "memory", "cc", "r10", "r11", "xmm6", "xmm7"
);
}
2256 | |
#define HAS_SCALEROWDOWN34_SSSE3
// Point-samples 32 source pixels down to 24 (3/4 width, no filtering)
// using three pshufb selection masks (_shuf0/1/2).  x86-64 only.
static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  asm volatile (
  "movdqa (%3),%%xmm3 \n"   // selection mask for output bytes 0-7
  "movdqa (%4),%%xmm4 \n"   // selection mask for output bytes 8-15
  "movdqa (%5),%%xmm5 \n"   // selection mask for output bytes 16-23
"1:"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm2 \n"
  "lea 0x20(%0),%0 \n"
  "movdqa %%xmm2,%%xmm1 \n"
  "palignr $0x8,%%xmm0,%%xmm1 \n"   // xmm1 = middle 16 bytes of the pair
  "pshufb %%xmm3,%%xmm0 \n"
  "pshufb %%xmm4,%%xmm1 \n"
  "pshufb %%xmm5,%%xmm2 \n"
  "movq %%xmm0,(%1) \n"
  "movq %%xmm1,0x8(%1) \n"
  "movq %%xmm2,0x10(%1) \n"
  "lea 0x18(%1),%1 \n"
  "sub $0x18,%2 \n"                 // 24 output pixels per iteration
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"(_shuf0), // %3
    "r"(_shuf1), // %4
    "r"(_shuf2) // %5
  : "memory", "cc"
);
}
2288 | |
// 3/4 horizontal scale with vertical filtering: averages the two source
// rows 1:1 (pavgb), then applies the 3/4 shuffle + pmaddubsw weights with
// rounding.  Produces 24 output pixels per loop.  x86-64 only.
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  asm volatile (
  "movdqa (%4),%%xmm2 \n" // _shuf01
  "movdqa (%5),%%xmm3 \n" // _shuf11
  "movdqa (%6),%%xmm4 \n" // _shuf21
  "movdqa (%7),%%xmm5 \n" // _madd01
  "movdqa (%8),%%xmm6 \n" // _madd11
  "movdqa (%9),%%xmm7 \n" // _round34
  "movdqa (%10),%%xmm8 \n" // _madd21
"1:"
  // First 8 output pixels.
  "movdqa (%0),%%xmm0 \n"
  "movdqa (%0,%3),%%xmm1 \n"
  "pavgb %%xmm1,%%xmm0 \n"          // 1:1 vertical blend
  "pshufb %%xmm2,%%xmm0 \n"
  "pmaddubsw %%xmm5,%%xmm0 \n"
  "paddsw %%xmm7,%%xmm0 \n"         // round
  "psrlw $0x2,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "movq %%xmm0,(%1) \n"
  // Middle 8 output pixels (unaligned source offset 8).
  "movdqu 0x8(%0),%%xmm0 \n"
  "movdqu 0x8(%0,%3),%%xmm1 \n"
  "pavgb %%xmm1,%%xmm0 \n"
  "pshufb %%xmm3,%%xmm0 \n"
  "pmaddubsw %%xmm6,%%xmm0 \n"
  "paddsw %%xmm7,%%xmm0 \n"
  "psrlw $0x2,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "movq %%xmm0,0x8(%1) \n"
  // Last 8 output pixels.
  "movdqa 0x10(%0),%%xmm0 \n"
  "movdqa 0x10(%0,%3),%%xmm1 \n"
  "lea 0x20(%0),%0 \n"
  "pavgb %%xmm1,%%xmm0 \n"
  "pshufb %%xmm4,%%xmm0 \n"
  "pmaddubsw %%xmm8,%%xmm0 \n"
  "paddsw %%xmm7,%%xmm0 \n"
  "psrlw $0x2,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "movq %%xmm0,0x10(%1) \n"
  "lea 0x18(%1),%1 \n"
  "sub $0x18,%2 \n"
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"((intptr_t)(src_stride)), // %3
    "r"(_shuf01), // %4
    "r"(_shuf11), // %5
    "r"(_shuf21), // %6
    "r"(_madd01), // %7
    "r"(_madd11), // %8
    "r"(_round34), // %9
    "r"(_madd21) // %10
  : "memory", "cc", "xmm6", "xmm7", "xmm8"
);
}
2345 | |
// 3/4 horizontal scale with vertical filtering biased 3:1 toward row 0:
// the double pavgb (row1 = avg(row0,row1); row0 = avg(row0,row1))
// approximates a 3/4 : 1/4 vertical blend.  Otherwise identical in
// structure to ScaleRowDown34_1_Int_SSSE3.  x86-64 only.
static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  asm volatile (
  "movdqa (%4),%%xmm2 \n" // _shuf01
  "movdqa (%5),%%xmm3 \n" // _shuf11
  "movdqa (%6),%%xmm4 \n" // _shuf21
  "movdqa (%7),%%xmm5 \n" // _madd01
  "movdqa (%8),%%xmm6 \n" // _madd11
  "movdqa (%9),%%xmm7 \n" // _round34
  "movdqa (%10),%%xmm8 \n" // _madd21
"1:"
  "movdqa (%0),%%xmm0 \n"
  "movdqa (%0,%3,1),%%xmm1 \n"
  "pavgb %%xmm0,%%xmm1 \n"          // two pavgb steps => 3:1 weighting
  "pavgb %%xmm1,%%xmm0 \n"
  "pshufb %%xmm2,%%xmm0 \n"
  "pmaddubsw %%xmm5,%%xmm0 \n"
  "paddsw %%xmm7,%%xmm0 \n"
  "psrlw $0x2,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "movq %%xmm0,(%1) \n"
  "movdqu 0x8(%0),%%xmm0 \n"
  "movdqu 0x8(%0,%3,1),%%xmm1 \n"
  "pavgb %%xmm0,%%xmm1 \n"
  "pavgb %%xmm1,%%xmm0 \n"
  "pshufb %%xmm3,%%xmm0 \n"
  "pmaddubsw %%xmm6,%%xmm0 \n"
  "paddsw %%xmm7,%%xmm0 \n"
  "psrlw $0x2,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "movq %%xmm0,0x8(%1) \n"
  "movdqa 0x10(%0),%%xmm0 \n"
  "movdqa 0x10(%0,%3,1),%%xmm1 \n"
  "lea 0x20(%0),%0 \n"
  "pavgb %%xmm0,%%xmm1 \n"
  "pavgb %%xmm1,%%xmm0 \n"
  "pshufb %%xmm4,%%xmm0 \n"
  "pmaddubsw %%xmm8,%%xmm0 \n"
  "paddsw %%xmm7,%%xmm0 \n"
  "psrlw $0x2,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "movq %%xmm0,0x10(%1) \n"
  "lea 0x18(%1),%1 \n"
  "sub $0x18,%2 \n"
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"((intptr_t)(src_stride)), // %3
    "r"(_shuf01), // %4
    "r"(_shuf11), // %5
    "r"(_shuf21), // %6
    "r"(_madd01), // %7
    "r"(_madd11), // %8
    "r"(_round34), // %9
    "r"(_madd21) // %10
  : "memory", "cc", "xmm6", "xmm7", "xmm8"
);
}
2405 | |
#define HAS_SCALEROWDOWN38_SSSE3
// Point-samples 32 source pixels down to 12 (3/8 width, no filtering)
// by shuffling each 16-byte half with _shuf38a/_shuf38b into disjoint
// lanes and combining with paddusb.  x86-64 only.
static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  asm volatile (
  "movdqa (%3),%%xmm4 \n"
  "movdqa (%4),%%xmm5 \n"
"1:"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "lea 0x20(%0),%0 \n"
  "pshufb %%xmm4,%%xmm0 \n"
  "pshufb %%xmm5,%%xmm1 \n"
  "paddusb %%xmm1,%%xmm0 \n"        // masks select disjoint lanes
  "movq %%xmm0,(%1) \n"             // 8 bytes + 4 bytes = 12 pixels
  "movhlps %%xmm0,%%xmm1 \n"
  "movd %%xmm1,0x8(%1) \n"
  "lea 0xc(%1),%1 \n"
  "sub $0xc,%2 \n"
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"(_shuf38a), // %3
    "r"(_shuf38b) // %4
  : "memory", "cc"
);
}
2433 | |
// 3/8 horizontal scale averaging 3 source rows: sums the rows into
// 16-bit lanes, does the horizontal 3-pixel adds via psrldq/paddusw,
// then multiplies by a fixed-point reciprocal table (_scaleac3) with
// pmulhuw.  Emits 6 output pixels per loop.  x86-64 only.
static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  asm volatile (
  "movdqa (%4),%%xmm4 \n"   // _shufac0
  "movdqa (%5),%%xmm5 \n"   // _shufac3
  "movdqa (%6),%%xmm6 \n"   // _scaleac3: reciprocal multipliers
  "pxor %%xmm7,%%xmm7 \n"
"1:"
  // Vertical sum of 3 rows into word lanes (xmm0 low, xmm1 high).
  "movdqa (%0),%%xmm0 \n"
  "movdqa (%0,%3,1),%%xmm2 \n"
  "movhlps %%xmm0,%%xmm1 \n"
  "movhlps %%xmm2,%%xmm3 \n"
  "punpcklbw %%xmm7,%%xmm0 \n"
  "punpcklbw %%xmm7,%%xmm1 \n"
  "punpcklbw %%xmm7,%%xmm2 \n"
  "punpcklbw %%xmm7,%%xmm3 \n"
  "paddusw %%xmm2,%%xmm0 \n"
  "paddusw %%xmm3,%%xmm1 \n"
  "movdqa (%0,%3,2),%%xmm2 \n"
  "lea 0x10(%0),%0 \n"
  "movhlps %%xmm2,%%xmm3 \n"
  "punpcklbw %%xmm7,%%xmm2 \n"
  "punpcklbw %%xmm7,%%xmm3 \n"
  "paddusw %%xmm2,%%xmm0 \n"
  "paddusw %%xmm3,%%xmm1 \n"
  // Horizontal sum of each 3-pixel group via shifted self-adds.
  "movdqa %%xmm0,%%xmm2 \n"
  "psrldq $0x2,%%xmm0 \n"
  "paddusw %%xmm0,%%xmm2 \n"
  "psrldq $0x2,%%xmm0 \n"
  "paddusw %%xmm0,%%xmm2 \n"
  "pshufb %%xmm4,%%xmm2 \n"
  "movdqa %%xmm1,%%xmm3 \n"
  "psrldq $0x2,%%xmm1 \n"
  "paddusw %%xmm1,%%xmm3 \n"
  "psrldq $0x2,%%xmm1 \n"
  "paddusw %%xmm1,%%xmm3 \n"
  "pshufb %%xmm5,%%xmm3 \n"
  "paddusw %%xmm3,%%xmm2 \n"
  "pmulhuw %%xmm6,%%xmm2 \n"        // divide by 9 (or 6 for the tail)
  "packuswb %%xmm2,%%xmm2 \n"
  "movd %%xmm2,(%1) \n"             // 4 bytes + 2 bytes = 6 pixels
  "pextrw $0x2,%%xmm2,%%eax \n"
  "mov %%ax,0x4(%1) \n"
  "lea 0x6(%1),%1 \n"
  "sub $0x6,%2 \n"
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"((intptr_t)(src_stride)), // %3
    "r"(_shufac0), // %4
    "r"(_shufac3), // %5
    "r"(_scaleac3) // %6
  : "memory", "cc", "rax", "xmm6", "xmm7"
);
}
2490 | |
// 3/8 horizontal scale averaging 2 source rows: pavgb the two rows, then
// three pshufb masks gather the contributing pixels into word lanes,
// paddusw sums them, and pmulhuw applies the reciprocal table
// (_scaleab2).  Emits 6 output pixels per loop.  x86-64 only.
static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  asm volatile (
  "movdqa (%4),%%xmm4 \n"   // _shufab0
  "movdqa (%5),%%xmm5 \n"   // _shufab1
  "movdqa (%6),%%xmm6 \n"   // _shufab2
  "movdqa (%7),%%xmm7 \n"   // _scaleab2: reciprocal multipliers
"1:"
  "movdqa (%0),%%xmm2 \n"
  "pavgb (%0,%3,1),%%xmm2 \n"       // 1:1 vertical average
  "lea 0x10(%0),%0 \n"
  "movdqa %%xmm2,%%xmm0 \n"
  "pshufb %%xmm4,%%xmm0 \n"
  "movdqa %%xmm2,%%xmm1 \n"
  "pshufb %%xmm5,%%xmm1 \n"
  "paddusw %%xmm1,%%xmm0 \n"
  "pshufb %%xmm6,%%xmm2 \n"
  "paddusw %%xmm2,%%xmm0 \n"
  "pmulhuw %%xmm7,%%xmm0 \n"        // divide by 3 (or 2 for the tail)
  "packuswb %%xmm0,%%xmm0 \n"
  "movd %%xmm0,(%1) \n"             // 4 bytes + 2 bytes = 6 pixels
  "pextrw $0x2,%%xmm0,%%eax \n"
  "mov %%ax,0x4(%1) \n"
  "lea 0x6(%1),%1 \n"
  "sub $0x6,%2 \n"
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"((intptr_t)(src_stride)), // %3
    "r"(_shufab0), // %4
    "r"(_shufab1), // %5
    "r"(_shufab2), // %6
    "r"(_scaleab2) // %7
  : "memory", "cc", "rax", "xmm6", "xmm7"
);
}
2528 | |
#define HAS_SCALEADDROWS_SSE2
// Sums src_height rows of 8-bit pixels into a row of 16-bit sums,
// 16 columns per outer iteration.  x86-64 twin of the 32-bit asm
// version above; r10 walks rows, r11 counts remaining rows.
// NOTE(review): paddusw saturates, so sums clamp at 65535.
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                              uint16* dst_ptr, int src_width,
                              int src_height) {
  asm volatile (
  "pxor %%xmm5,%%xmm5 \n"           // zero register for unpacking
"1:"
  "movdqa (%0),%%xmm2 \n"           // first row seeds the accumulators
  "lea (%0,%4,1),%%r10 \n"
  "movhlps %%xmm2,%%xmm3 \n"
  "lea -0x1(%3),%%r11 \n"           // src_height - 1 rows remain
  "punpcklbw %%xmm5,%%xmm2 \n"
  "punpcklbw %%xmm5,%%xmm3 \n"

"2:"
  "movdqa (%%r10),%%xmm0 \n"
  "lea (%%r10,%4,1),%%r10 \n"
  "movhlps %%xmm0,%%xmm1 \n"
  "punpcklbw %%xmm5,%%xmm0 \n"
  "punpcklbw %%xmm5,%%xmm1 \n"
  "paddusw %%xmm0,%%xmm2 \n"
  "paddusw %%xmm1,%%xmm3 \n"
  "sub $0x1,%%r11 \n"
  "ja 2b \n"

  "movdqa %%xmm2,(%1) \n"           // 16 uint16 sums per iteration
  "movdqa %%xmm3,0x10(%1) \n"
  "lea 0x20(%1),%1 \n"
  "lea 0x10(%0),%0 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(src_width), // %2
    "+r"(src_height) // %3
  : "r"((intptr_t)(src_stride)) // %4
  : "memory", "cc", "r10", "r11"
);
}
2568 | |
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
// Blends two adjacent rows with an 8.8 fixed-point fraction (0..256).
// Three paths: fraction 0 copies row 0; fraction 128 uses pavgb; any
// other fraction does the full pmullw weighted blend.  Every path
// duplicates the final output byte one position past dst_width.
#define HAS_SCALEFILTERROWS_SSE2
static void ScaleFilterRows_SSE2(uint8* dst_ptr,
                                 const uint8* src_ptr, int src_stride,
                                 int dst_width, int source_y_fraction) {
  if (source_y_fraction == 0) {
    // Pure copy of the top row.
    asm volatile (
    "1:"
      "movdqa (%1),%%xmm0 \n"
      "lea 0x10(%1),%1 \n"
      "movdqa %%xmm0,(%0) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x10,%2 \n"
      "ja 1b \n"
      "mov -0x1(%0),%%al \n"          // replicate last pixel past the end
      "mov %%al,(%0) \n"
      : "+r"(dst_ptr), // %0
        "+r"(src_ptr), // %1
        "+r"(dst_width) // %2
      :
      : "memory", "cc", "rax"
    );
    return;
  } else if (source_y_fraction == 128) {
    // Exact 50/50 average of the two rows.
    asm volatile (
    "1:"
      "movdqa (%1),%%xmm0 \n"
      "movdqa (%1,%3,1),%%xmm2 \n"
      "lea 0x10(%1),%1 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "movdqa %%xmm0,(%0) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x10,%2 \n"
      "ja 1b \n"
      "mov -0x1(%0),%%al \n"
      "mov %%al,(%0) \n"
      : "+r"(dst_ptr), // %0
        "+r"(src_ptr), // %1
        "+r"(dst_width) // %2
      : "r"((intptr_t)(src_stride)) // %3
      : "memory", "cc", "rax"
    );
    return;
  } else {
    // General blend: (row0 * (256-f) + row1 * f) >> 8 per pixel.
    asm volatile (
      "mov %3,%%eax \n"
      "movd %%eax,%%xmm6 \n"          // xmm6 = f in every word lane
      "punpcklwd %%xmm6,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "neg %%eax \n"
      "add $0x100,%%eax \n"
      "movd %%eax,%%xmm5 \n"          // xmm5 = 256 - f in every word lane
      "punpcklwd %%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "pxor %%xmm7,%%xmm7 \n"
    "1:"
      "movdqa (%1),%%xmm0 \n"
      "movdqa (%1,%4,1),%%xmm2 \n"
      "lea 0x10(%1),%1 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "punpcklbw %%xmm7,%%xmm0 \n"
      "punpcklbw %%xmm7,%%xmm2 \n"
      "punpckhbw %%xmm7,%%xmm1 \n"
      "punpckhbw %%xmm7,%%xmm3 \n"
      "pmullw %%xmm5,%%xmm0 \n"
      "pmullw %%xmm5,%%xmm1 \n"
      "pmullw %%xmm6,%%xmm2 \n"
      "pmullw %%xmm6,%%xmm3 \n"
      "paddusw %%xmm2,%%xmm0 \n"
      "paddusw %%xmm3,%%xmm1 \n"
      "psrlw $0x8,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,(%0) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x10,%2 \n"
      "ja 1b \n"
      "mov -0x1(%0),%%al \n"
      "mov %%al,(%0) \n"
      : "+r"(dst_ptr), // %0
        "+r"(src_ptr), // %1
        "+r"(dst_width), // %2
        "+r"(source_y_fraction) // %3
      : "r"((intptr_t)(src_stride)) // %4
      : "memory", "cc", "rax", "xmm6", "xmm7"
    );
  }
  return;
}
2659 | |
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
// Same contract as ScaleFilterRows_SSE2, but the fraction is halved to
// 0..128 and both weights packed into one byte pair so a single
// pmaddubsw blends interleaved row0/row1 bytes.  Duplicates the final
// output byte one position past dst_width.
#define HAS_SCALEFILTERROWS_SSSE3
static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
                                  const uint8* src_ptr, int src_stride,
                                  int dst_width, int source_y_fraction) {
  source_y_fraction >>= 1;          // 7-bit fraction (0..128)
  if (source_y_fraction == 0) {
    // Pure copy of the top row.
    asm volatile (
    "1:"
      "movdqa (%1),%%xmm0 \n"
      "lea 0x10(%1),%1 \n"
      "movdqa %%xmm0,(%0) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x10,%2 \n"
      "ja 1b \n"
      "mov -0x1(%0),%%al \n"          // replicate last pixel past the end
      "mov %%al,(%0) \n"
      : "+r"(dst_ptr), // %0
        "+r"(src_ptr), // %1
        "+r"(dst_width) // %2
      :
      : "memory", "cc", "rax"
    );
    return;
  } else if (source_y_fraction == 64) {
    // Exact 50/50 average of the two rows.
    asm volatile (
    "1:"
      "movdqa (%1),%%xmm0 \n"
      "movdqa (%1,%3,1),%%xmm2 \n"
      "lea 0x10(%1),%1 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "movdqa %%xmm0,(%0) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x10,%2 \n"
      "ja 1b \n"
      "mov -0x1(%0),%%al \n"
      "mov %%al,(%0) \n"
      : "+r"(dst_ptr), // %0
        "+r"(src_ptr), // %1
        "+r"(dst_width) // %2
      : "r"((intptr_t)(src_stride)) // %3
      : "memory", "cc", "rax"
    );
    return;
  } else {
    // General blend via pmaddubsw with packed (128-f, f) byte weights.
    asm volatile (
      "mov %3,%%eax \n"
      "mov %%al,%%ah \n"              // ah = f
      "neg %%al \n"
      "add $0x80,%%al \n"             // al = 128 - f
      "movd %%eax,%%xmm5 \n"
      "punpcklwd %%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "1:"
      "movdqa (%1),%%xmm0 \n"
      "movdqa (%1,%4,1),%%xmm2 \n"
      "lea 0x10(%1),%1 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"
      "punpckhbw %%xmm2,%%xmm1 \n"
      "pmaddubsw %%xmm5,%%xmm0 \n"
      "pmaddubsw %%xmm5,%%xmm1 \n"
      "psrlw $0x7,%%xmm0 \n"
      "psrlw $0x7,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,(%0) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x10,%2 \n"
      "ja 1b \n"
      "mov -0x1(%0),%%al \n"
      "mov %%al,(%0) \n"
      : "+r"(dst_ptr), // %0
        "+r"(src_ptr), // %1
        "+r"(dst_width), // %2
        "+r"(source_y_fraction) // %3
      : "r"((intptr_t)(src_stride)) // %4
      : "memory", "cc", "rax"
    );
  }
  return;
}
2741 #endif | |
2742 #endif | |
2743 | |
2744 // CPU agnostic row functions | |
2745 static void ScaleRowDown2_C(const uint8* src_ptr, int src_stride, | |
2746 uint8* dst, int dst_width) { | |
2747 int x; | |
2748 for (x = 0; x < dst_width; ++x) { | |
2749 *dst++ = *src_ptr; | |
2750 src_ptr += 2; | |
2751 } | |
2752 } | |
2753 | |
2754 static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, | |
2755 uint8* dst, int dst_width) { | |
2756 int x; | |
2757 for (x = 0; x < dst_width; ++x) { | |
2758 *dst++ = (src_ptr[0] + src_ptr[1] + | |
2759 src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2; | |
2760 src_ptr += 2; | |
2761 } | |
2762 } | |
2763 | |
2764 static void ScaleRowDown4_C(const uint8* src_ptr, int src_stride, | |
2765 uint8* dst, int dst_width) { | |
2766 int x; | |
2767 for (x = 0; x < dst_width; ++x) { | |
2768 *dst++ = *src_ptr; | |
2769 src_ptr += 4; | |
2770 } | |
2771 } | |
2772 | |
2773 static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride, | |
2774 uint8* dst, int dst_width) { | |
2775 int x; | |
2776 for (x = 0; x < dst_width; ++x) { | |
2777 *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + | |
2778 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + | |
2779 src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + | |
2780 src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + | |
2781 src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + | |
2782 src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + | |
2783 src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + | |
2784 8) >> 4; | |
2785 src_ptr += 4; | |
2786 } | |
2787 } | |
2788 | |
2789 // 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down. | |
2790 // Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu. | |
// The following 2 lines cause a compile error on Windows, so #define is used.
2792 //static const int kMaxOutputWidth = 640; | |
2793 //static const int kMaxRow12 = 1280; //kMaxOutputWidth * 2; | |
2794 #define kMaxOutputWidth 640 | |
2795 #define kMaxRow12 1280 | |
2796 | |
2797 static void ScaleRowDown8_C(const uint8* src_ptr, int src_stride, | |
2798 uint8* dst, int dst_width) { | |
2799 int x; | |
2800 for (x = 0; x < dst_width; ++x) { | |
2801 *dst++ = *src_ptr; | |
2802 src_ptr += 8; | |
2803 } | |
2804 } | |
2805 | |
2806 // Note calling code checks width is less than max and if not | |
2807 // uses ScaleRowDown8_C instead. | |
2808 static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride, | |
2809 uint8* dst, int dst_width) { | |
2810 ALIGN16(uint8 src_row[kMaxRow12 * 2]); | |
2811 assert(dst_width <= kMaxOutputWidth); | |
2812 ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2); | |
2813 ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride, | |
2814 src_row + kMaxOutputWidth, | |
2815 dst_width * 2); | |
2816 ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width); | |
2817 } | |
2818 | |
2819 static void ScaleRowDown34_C(const uint8* src_ptr, int src_stride, | |
2820 uint8* dst, int dst_width) { | |
2821 uint8* dend; | |
2822 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2823 dend = dst + dst_width; | |
2824 do { | |
2825 dst[0] = src_ptr[0]; | |
2826 dst[1] = src_ptr[1]; | |
2827 dst[2] = src_ptr[3]; | |
2828 dst += 3; | |
2829 src_ptr += 4; | |
2830 } while (dst < dend); | |
2831 } | |
2832 | |
2833 // Filter rows 0 and 1 together, 3 : 1 | |
2834 static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride, | |
2835 uint8* d, int dst_width) { | |
2836 uint8* dend; | |
2837 const uint8* s; | |
2838 const uint8* t; | |
2839 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2840 dend = d + dst_width; | |
2841 s = src_ptr; | |
2842 t = src_ptr + src_stride; | |
2843 do { | |
2844 uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; | |
2845 uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; | |
2846 uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; | |
2847 uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; | |
2848 uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; | |
2849 uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; | |
2850 d[0] = (a0 * 3 + b0 + 2) >> 2; | |
2851 d[1] = (a1 * 3 + b1 + 2) >> 2; | |
2852 d[2] = (a2 * 3 + b2 + 2) >> 2; | |
2853 d += 3; | |
2854 s += 4; | |
2855 t += 4; | |
2856 } while (d < dend); | |
2857 } | |
2858 | |
2859 // Filter rows 1 and 2 together, 1 : 1 | |
2860 static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride, | |
2861 uint8* d, int dst_width) { | |
2862 uint8* dend; | |
2863 const uint8* s; | |
2864 const uint8* t; | |
2865 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2866 dend = d + dst_width; | |
2867 s = src_ptr; | |
2868 t = src_ptr + src_stride; | |
2869 do { | |
2870 uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; | |
2871 uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; | |
2872 uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; | |
2873 uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; | |
2874 uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; | |
2875 uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; | |
2876 d[0] = (a0 + b0 + 1) >> 1; | |
2877 d[1] = (a1 + b1 + 1) >> 1; | |
2878 d[2] = (a2 + b2 + 1) >> 1; | |
2879 d += 3; | |
2880 s += 4; | |
2881 t += 4; | |
2882 } while (d < dend); | |
2883 } | |
2884 | |
2885 #if defined(HAS_SCALEFILTERROWS_SSE2) | |
2886 // Filter row to 3/4 | |
2887 static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, | |
2888 int dst_width) { | |
2889 uint8* dend; | |
2890 const uint8* s; | |
2891 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2892 dend = dst_ptr + dst_width; | |
2893 s = src_ptr; | |
2894 do { | |
2895 dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; | |
2896 dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; | |
2897 dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2; | |
2898 dst_ptr += 3; | |
2899 s += 4; | |
2900 } while (dst_ptr < dend); | |
2901 } | |
2902 #endif | |
2903 | |
2904 static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, | |
2905 int dst_width, int dx) { | |
2906 int x = 0; | |
2907 int j; | |
2908 for (j = 0; j < dst_width; ++j) { | |
2909 int xi = x >> 16; | |
2910 int xf1 = x & 0xffff; | |
2911 int xf0 = 65536 - xf1; | |
2912 | |
2913 *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16; | |
2914 x += dx; | |
2915 } | |
2916 } | |
2917 | |
// A "static const int" declaration does not work on Windows here; use #define.
2919 //static const int kMaxInputWidth = 2560; | |
2920 #define kMaxInputWidth 2560 | |
2921 #if defined(HAS_SCALEFILTERROWS_SSE2) | |
2922 #define HAS_SCALEROWDOWN34_SSE2 | |
2923 // Filter rows 0 and 1 together, 3 : 1 | |
2924 static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride, | |
2925 uint8* dst_ptr, int dst_width) { | |
2926 ALIGN16(uint8 row[kMaxInputWidth]); | |
2927 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2928 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4); | |
2929 ScaleFilterCols34_C(dst_ptr, row, dst_width); | |
2930 } | |
2931 | |
2932 // Filter rows 1 and 2 together, 1 : 1 | |
2933 static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride, | |
2934 uint8* dst_ptr, int dst_width) { | |
2935 ALIGN16(uint8 row[kMaxInputWidth]); | |
2936 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2937 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); | |
2938 ScaleFilterCols34_C(dst_ptr, row, dst_width); | |
2939 } | |
2940 #endif | |
2941 | |
2942 static void ScaleRowDown38_C(const uint8* src_ptr, int src_stride, | |
2943 uint8* dst, int dst_width) { | |
2944 int x; | |
2945 assert(dst_width % 3 == 0); | |
2946 for (x = 0; x < dst_width; x += 3) { | |
2947 dst[0] = src_ptr[0]; | |
2948 dst[1] = src_ptr[3]; | |
2949 dst[2] = src_ptr[6]; | |
2950 dst += 3; | |
2951 src_ptr += 8; | |
2952 } | |
2953 } | |
2954 | |
2955 // 8x3 -> 3x1 | |
2956 static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride, | |
2957 uint8* dst_ptr, int dst_width) { | |
2958 int i; | |
2959 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2960 for (i = 0; i < dst_width; i+=3) { | |
2961 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + | |
2962 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + | |
2963 src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + | |
2964 src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * | |
2965 (65536 / 9) >> 16; | |
2966 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + | |
2967 src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + | |
2968 src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + | |
2969 src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * | |
2970 (65536 / 9) >> 16; | |
2971 dst_ptr[2] = (src_ptr[6] + src_ptr[7] + | |
2972 src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + | |
2973 src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) * | |
2974 (65536 / 6) >> 16; | |
2975 src_ptr += 8; | |
2976 dst_ptr += 3; | |
2977 } | |
2978 } | |
2979 | |
2980 // 8x2 -> 3x1 | |
2981 static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride, | |
2982 uint8* dst_ptr, int dst_width) { | |
2983 int i; | |
2984 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2985 for (i = 0; i < dst_width; i+=3) { | |
2986 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + | |
2987 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + | |
2988 src_ptr[src_stride + 2]) * (65536 / 6) >> 16; | |
2989 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + | |
2990 src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + | |
2991 src_ptr[src_stride + 5]) * (65536 / 6) >> 16; | |
2992 dst_ptr[2] = (src_ptr[6] + src_ptr[7] + | |
2993 src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) * | |
2994 (65536 / 4) >> 16; | |
2995 src_ptr += 8; | |
2996 dst_ptr += 3; | |
2997 } | |
2998 } | |
2999 | |
3000 // C version 8x2 -> 8x1 | |
3001 static void ScaleFilterRows_C(uint8* dst_ptr, | |
3002 const uint8* src_ptr, int src_stride, | |
3003 int dst_width, int source_y_fraction) { | |
3004 int y1_fraction; | |
3005 int y0_fraction; | |
3006 const uint8* src_ptr1; | |
3007 uint8* end; | |
3008 assert(dst_width > 0); | |
3009 y1_fraction = source_y_fraction; | |
3010 y0_fraction = 256 - y1_fraction; | |
3011 src_ptr1 = src_ptr + src_stride; | |
3012 end = dst_ptr + dst_width; | |
3013 do { | |
3014 dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; | |
3015 dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; | |
3016 dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; | |
3017 dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; | |
3018 dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; | |
3019 dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; | |
3020 dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; | |
3021 dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; | |
3022 src_ptr += 8; | |
3023 src_ptr1 += 8; | |
3024 dst_ptr += 8; | |
3025 } while (dst_ptr < end); | |
3026 dst_ptr[0] = dst_ptr[-1]; | |
3027 } | |
3028 | |
3029 void ScaleAddRows_C(const uint8* src_ptr, int src_stride, | |
3030 uint16* dst_ptr, int src_width, int src_height) { | |
3031 int x,y; | |
3032 assert(src_width > 0); | |
3033 assert(src_height > 0); | |
3034 for (x = 0; x < src_width; ++x) { | |
3035 const uint8* s = src_ptr + x; | |
3036 int sum = 0; | |
3037 for (y = 0; y < src_height; ++y) { | |
3038 sum += s[0]; | |
3039 s += src_stride; | |
3040 } | |
3041 dst_ptr[x] = sum; | |
3042 } | |
3043 } | |
3044 | |
3045 /** | |
3046 * Scale plane, 1/2 | |
3047 * | |
3048 * This is an optimized version for scaling down a plane to 1/2 of | |
3049 * its original size. | |
3050 * | |
3051 */ | |
3052 static void ScalePlaneDown2(int src_width, int src_height, | |
3053 int dst_width, int dst_height, | |
3054 int src_stride, int dst_stride, | |
3055 const uint8* src_ptr, uint8* dst_ptr, | |
3056 FilterModeEnum filtering) { | |
3057 void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride, | |
3058 uint8* dst_ptr, int dst_width); | |
3059 assert(IS_ALIGNED(src_width, 2)); | |
3060 assert(IS_ALIGNED(src_height, 2)); | |
3061 | |
3062 #if defined(HAS_SCALEROWDOWN2_NEON) | |
3063 if (TestCpuFlag(kCpuHasNEON) && | |
3064 IS_ALIGNED(dst_width, 16)) { | |
3065 ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; | |
3066 } else | |
3067 #endif | |
3068 #if defined(HAS_SCALEROWDOWN2_SSE2) | |
3069 if (TestCpuFlag(kCpuHasSSE2) && | |
3070 IS_ALIGNED(dst_width, 16) && | |
3071 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && | |
3072 IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { | |
3073 ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; | |
3074 } else | |
3075 #endif | |
3076 { | |
3077 ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C; | |
3078 } | |
3079 | |
3080 { | |
3081 int y; | |
3082 for (y = 0; y < dst_height; ++y) { | |
3083 ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); | |
3084 src_ptr += (src_stride << 1); | |
3085 dst_ptr += dst_stride; | |
3086 } | |
3087 } | |
3088 } | |
3089 | |
3090 /** | |
3091 * Scale plane, 1/4 | |
3092 * | |
3093 * This is an optimized version for scaling down a plane to 1/4 of | |
3094 * its original size. | |
3095 */ | |
3096 static void ScalePlaneDown4(int src_width, int src_height, | |
3097 int dst_width, int dst_height, | |
3098 int src_stride, int dst_stride, | |
3099 const uint8* src_ptr, uint8* dst_ptr, | |
3100 FilterModeEnum filtering) { | |
3101 void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride, | |
3102 uint8* dst_ptr, int dst_width); | |
3103 assert(IS_ALIGNED(src_width, 4)); | |
3104 assert(IS_ALIGNED(src_height, 4)); | |
3105 | |
3106 #if defined(HAS_SCALEROWDOWN4_NEON) | |
3107 if (TestCpuFlag(kCpuHasNEON) && | |
3108 IS_ALIGNED(dst_width, 4)) { | |
3109 ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON; | |
3110 } else | |
3111 #endif | |
3112 #if defined(HAS_SCALEROWDOWN4_SSE2) | |
3113 if (TestCpuFlag(kCpuHasSSE2) && | |
3114 IS_ALIGNED(dst_width, 8) && | |
3115 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && | |
3116 IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { | |
3117 ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; | |
3118 } else | |
3119 #endif | |
3120 { | |
3121 ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C; | |
3122 } | |
3123 | |
3124 { | |
3125 int y; | |
3126 for (y = 0; y < dst_height; ++y) { | |
3127 ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); | |
3128 src_ptr += (src_stride << 2); | |
3129 dst_ptr += dst_stride; | |
3130 } | |
3131 } | |
3132 } | |
3133 | |
3134 /** | |
3135 * Scale plane, 1/8 | |
3136 * | |
3137 * This is an optimized version for scaling down a plane to 1/8 | |
3138 * of its original size. | |
3139 * | |
3140 */ | |
3141 static void ScalePlaneDown8(int src_width, int src_height, | |
3142 int dst_width, int dst_height, | |
3143 int src_stride, int dst_stride, | |
3144 const uint8* src_ptr, uint8* dst_ptr, | |
3145 FilterModeEnum filtering) { | |
3146 void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride, | |
3147 uint8* dst_ptr, int dst_width); | |
3148 assert(IS_ALIGNED(src_width, 8)); | |
3149 assert(IS_ALIGNED(src_height, 8)); | |
3150 | |
3151 #if defined(HAS_SCALEROWDOWN8_SSE2) | |
3152 if (TestCpuFlag(kCpuHasSSE2) && | |
3153 IS_ALIGNED(dst_width, 4) && | |
3154 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && | |
3155 IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { | |
3156 ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; | |
3157 } else | |
3158 #endif | |
3159 { | |
3160 ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ? | |
3161 ScaleRowDown8Int_C : ScaleRowDown8_C; | |
3162 } | |
3163 | |
3164 { | |
3165 int y; | |
3166 for (y = 0; y < dst_height; ++y) { | |
3167 ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width); | |
3168 src_ptr += (src_stride << 3); | |
3169 dst_ptr += dst_stride; | |
3170 } | |
3171 } | |
3172 } | |
3173 | |
3174 /** | |
3175 * Scale plane down, 3/4 | |
3176 * | |
3177 * Provided by Frank Barchard (fbarchard@google.com) | |
3178 * | |
3179 */ | |
3180 static void ScalePlaneDown34(int src_width, int src_height, | |
3181 int dst_width, int dst_height, | |
3182 int src_stride, int dst_stride, | |
3183 const uint8* src_ptr, uint8* dst_ptr, | |
3184 FilterModeEnum filtering) { | |
3185 void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride, | |
3186 uint8* dst_ptr, int dst_width); | |
3187 void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride, | |
3188 uint8* dst_ptr, int dst_width); | |
3189 assert(dst_width % 3 == 0); | |
3190 #if defined(HAS_SCALEROWDOWN34_NEON) | |
3191 if (TestCpuFlag(kCpuHasNEON) && | |
3192 (dst_width % 24 == 0)) { | |
3193 if (!filtering) { | |
3194 ScaleRowDown34_0 = ScaleRowDown34_NEON; | |
3195 ScaleRowDown34_1 = ScaleRowDown34_NEON; | |
3196 } else { | |
3197 ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON; | |
3198 ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON; | |
3199 } | |
3200 } else | |
3201 #endif | |
3202 | |
3203 #if defined(HAS_SCALEROWDOWN34_SSSE3) | |
3204 if (TestCpuFlag(kCpuHasSSSE3) && | |
3205 (dst_width % 24 == 0) && | |
3206 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && | |
3207 IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { | |
3208 if (!filtering) { | |
3209 ScaleRowDown34_0 = ScaleRowDown34_SSSE3; | |
3210 ScaleRowDown34_1 = ScaleRowDown34_SSSE3; | |
3211 } else { | |
3212 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; | |
3213 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; | |
3214 } | |
3215 } else | |
3216 #endif | |
3217 #if defined(HAS_SCALEROWDOWN34_SSE2) | |
3218 if (TestCpuFlag(kCpuHasSSE2) && | |
3219 (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && | |
3220 IS_ALIGNED(dst_stride, 8) && | |
3221 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) && | |
3222 filtering) { | |
3223 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; | |
3224 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; | |
3225 } else | |
3226 #endif | |
3227 { | |
3228 if (!filtering) { | |
3229 ScaleRowDown34_0 = ScaleRowDown34_C; | |
3230 ScaleRowDown34_1 = ScaleRowDown34_C; | |
3231 } else { | |
3232 ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; | |
3233 ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; | |
3234 } | |
3235 } | |
3236 { | |
3237 int src_row = 0; | |
3238 int y; | |
3239 for (y = 0; y < dst_height; ++y) { | |
3240 switch (src_row) { | |
3241 case 0: | |
3242 ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); | |
3243 break; | |
3244 | |
3245 case 1: | |
3246 ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); | |
3247 break; | |
3248 | |
3249 case 2: | |
3250 ScaleRowDown34_0(src_ptr + src_stride, -src_stride, | |
3251 dst_ptr, dst_width); | |
3252 break; | |
3253 } | |
3254 ++src_row; | |
3255 src_ptr += src_stride; | |
3256 dst_ptr += dst_stride; | |
3257 if (src_row >= 3) { | |
3258 src_ptr += src_stride; | |
3259 src_row = 0; | |
3260 } | |
3261 } | |
3262 } | |
3263 } | |
3264 | |
3265 /** | |
3266 * Scale plane, 3/8 | |
3267 * | |
3268 * This is an optimized version for scaling down a plane to 3/8 | |
3269 * of its original size. | |
3270 * | |
3271 * Reduces 16x3 to 6x1 | |
3272 */ | |
3273 static void ScalePlaneDown38(int src_width, int src_height, | |
3274 int dst_width, int dst_height, | |
3275 int src_stride, int dst_stride, | |
3276 const uint8* src_ptr, uint8* dst_ptr, | |
3277 FilterModeEnum filtering) { | |
3278 void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride, | |
3279 uint8* dst_ptr, int dst_width); | |
3280 void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride, | |
3281 uint8* dst_ptr, int dst_width); | |
3282 assert(dst_width % 3 == 0); | |
3283 #if defined(HAS_SCALEROWDOWN38_NEON) | |
3284 if (TestCpuFlag(kCpuHasNEON) && | |
3285 (dst_width % 12 == 0)) { | |
3286 if (!filtering) { | |
3287 ScaleRowDown38_3 = ScaleRowDown38_NEON; | |
3288 ScaleRowDown38_2 = ScaleRowDown38_NEON; | |
3289 } else { | |
3290 ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON; | |
3291 ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON; | |
3292 } | |
3293 } else | |
3294 #endif | |
3295 | |
3296 #if defined(HAS_SCALEROWDOWN38_SSSE3) | |
3297 if (TestCpuFlag(kCpuHasSSSE3) && | |
3298 (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && | |
3299 IS_ALIGNED(dst_stride, 8) && | |
3300 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { | |
3301 if (!filtering) { | |
3302 ScaleRowDown38_3 = ScaleRowDown38_SSSE3; | |
3303 ScaleRowDown38_2 = ScaleRowDown38_SSSE3; | |
3304 } else { | |
3305 ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3; | |
3306 ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; | |
3307 } | |
3308 } else | |
3309 #endif | |
3310 { | |
3311 if (!filtering) { | |
3312 ScaleRowDown38_3 = ScaleRowDown38_C; | |
3313 ScaleRowDown38_2 = ScaleRowDown38_C; | |
3314 } else { | |
3315 ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; | |
3316 ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; | |
3317 } | |
3318 } | |
3319 { | |
3320 int src_row = 0; | |
3321 int y; | |
3322 for (y = 0; y < dst_height; ++y) { | |
3323 switch (src_row) { | |
3324 case 0: | |
3325 case 1: | |
3326 ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); | |
3327 src_ptr += src_stride * 3; | |
3328 ++src_row; | |
3329 break; | |
3330 | |
3331 case 2: | |
3332 ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); | |
3333 src_ptr += src_stride * 2; | |
3334 src_row = 0; | |
3335 break; | |
3336 } | |
3337 dst_ptr += dst_stride; | |
3338 } | |
3339 } | |
3340 } | |
3341 | |
3342 __inline static uint32 SumBox(int iboxwidth, int iboxheight, | |
3343 int src_stride, const uint8* src_ptr) { | |
3344 int x, y; | |
3345 uint32 sum; | |
3346 assert(iboxwidth > 0); | |
3347 assert(iboxheight > 0); | |
3348 sum = 0u; | |
3349 for (y = 0; y < iboxheight; ++y) { | |
3350 for (x = 0; x < iboxwidth; ++x) { | |
3351 sum += src_ptr[x]; | |
3352 } | |
3353 src_ptr += src_stride; | |
3354 } | |
3355 return sum; | |
3356 } | |
3357 | |
3358 static void ScalePlaneBoxRow(int dst_width, int boxheight, | |
3359 int dx, int src_stride, | |
3360 const uint8* src_ptr, uint8* dst_ptr) { | |
3361 int x = 0; | |
3362 int i; | |
3363 for (i = 0; i < dst_width; ++i) { | |
3364 int ix = x >> 16; | |
3365 int boxwidth; | |
3366 x += dx; | |
3367 boxwidth = (x >> 16) - ix; | |
3368 *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / | |
3369 (boxwidth * boxheight); | |
3370 } | |
3371 } | |
3372 | |
3373 __inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { | |
3374 uint32 sum; | |
3375 int x; | |
3376 assert(iboxwidth > 0); | |
3377 sum = 0u; | |
3378 for (x = 0; x < iboxwidth; ++x) { | |
3379 sum += src_ptr[x]; | |
3380 } | |
3381 return sum; | |
3382 } | |
3383 | |
// Horizontal pass of the box filter for fractional scale factors: each
// output pixel averages either minboxwidth or minboxwidth + 1 columns of
// the 16 bit row sums, selected by how many whole source columns the
// 16.16 position crosses.
static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
                            const uint16* src_ptr, uint8* dst_ptr) {
  // Precomputed 16.16 reciprocals of the two possible box areas.
  int scaletbl[2];
  int minboxwidth = (dx >> 16);
  scaletbl[0] = 65536 / (minboxwidth * boxheight);
  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
  {
    // Biased pointer so scaleptr[boxwidth] maps onto
    // scaletbl[boxwidth - minboxwidth] (boxwidth is always minboxwidth or
    // minboxwidth + 1).  NOTE(review): forming a pointer outside the array
    // is technically undefined in C, though it works on the targeted
    // compilers; confirm before reuse.
    int *scaleptr = scaletbl - minboxwidth;
    int x = 0;
    int i;
    for (i = 0; i < dst_width; ++i) {
      int ix = x >> 16;
      int boxwidth;
      x += dx;
      boxwidth = (x >> 16) - ix;
      *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
    }
  }
}
3403 | |
3404 static void ScaleAddCols1_C(int dst_width, int boxheight, int dx, | |
3405 const uint16* src_ptr, uint8* dst_ptr) { | |
3406 int boxwidth = (dx >> 16); | |
3407 int scaleval = 65536 / (boxwidth * boxheight); | |
3408 int x = 0; | |
3409 int i; | |
3410 for (i = 0; i < dst_width; ++i) { | |
3411 *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; | |
3412 x += boxwidth; | |
3413 } | |
3414 } | |
3415 | |
3416 /** | |
3417 * Scale plane down to any dimensions, with interpolation. | |
3418 * (boxfilter). | |
3419 * | |
3420 * Same method as SimpleScale, which is fixed point, outputting | |
3421 * one pixel of destination using fixed point (16.16) to step | |
3422 * through source, sampling a box of pixel with simple | |
3423 * averaging. | |
3424 */ | |
3425 static void ScalePlaneBox(int src_width, int src_height, | |
3426 int dst_width, int dst_height, | |
3427 int src_stride, int dst_stride, | |
3428 const uint8* src_ptr, uint8* dst_ptr) { | |
3429 int dx, dy; | |
3430 assert(dst_width > 0); | |
3431 assert(dst_height > 0); | |
3432 dy = (src_height << 16) / dst_height; | |
3433 dx = (src_width << 16) / dst_width; | |
3434 if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) || | |
3435 dst_height * 2 > src_height) { | |
3436 uint8* dst = dst_ptr; | |
3437 int dy = (src_height << 16) / dst_height; | |
3438 int dx = (src_width << 16) / dst_width; | |
3439 int y = 0; | |
3440 int j; | |
3441 for (j = 0; j < dst_height; ++j) { | |
3442 int iy = y >> 16; | |
3443 const uint8* const src = src_ptr + iy * src_stride; | |
3444 int boxheight; | |
3445 y += dy; | |
3446 if (y > (src_height << 16)) { | |
3447 y = (src_height << 16); | |
3448 } | |
3449 boxheight = (y >> 16) - iy; | |
3450 ScalePlaneBoxRow(dst_width, boxheight, | |
3451 dx, src_stride, | |
3452 src, dst); | |
3453 | |
3454 dst += dst_stride; | |
3455 } | |
3456 } else { | |
3457 ALIGN16(uint16 row[kMaxInputWidth]); | |
3458 void (*ScaleAddRows)(const uint8* src_ptr, int src_stride, | |
3459 uint16* dst_ptr, int src_width, int src_height); | |
3460 void (*ScaleAddCols)(int dst_width, int boxheight, int dx, | |
3461 const uint16* src_ptr, uint8* dst_ptr); | |
3462 #if defined(HAS_SCALEADDROWS_SSE2) | |
3463 if (TestCpuFlag(kCpuHasSSE2) && | |
3464 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && | |
3465 IS_ALIGNED(src_width, 16)) { | |
3466 ScaleAddRows = ScaleAddRows_SSE2; | |
3467 } else | |
3468 #endif | |
3469 { | |
3470 ScaleAddRows = ScaleAddRows_C; | |
3471 } | |
3472 if (dx & 0xffff) { | |
3473 ScaleAddCols = ScaleAddCols2_C; | |
3474 } else { | |
3475 ScaleAddCols = ScaleAddCols1_C; | |
3476 } | |
3477 | |
3478 { | |
3479 int y = 0; | |
3480 int j; | |
3481 for (j = 0; j < dst_height; ++j) { | |
3482 int iy = y >> 16; | |
3483 const uint8* const src = src_ptr + iy * src_stride; | |
3484 int boxheight; | |
3485 y += dy; | |
3486 if (y > (src_height << 16)) { | |
3487 y = (src_height << 16); | |
3488 } | |
3489 boxheight = (y >> 16) - iy; | |
3490 ScaleAddRows(src, src_stride, row, src_width, boxheight); | |
3491 ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr); | |
3492 dst_ptr += dst_stride; | |
3493 } | |
3494 } | |
3495 } | |
3496 } | |
3497 | |
3498 /** | |
3499 * Scale plane to/from any dimensions, with interpolation. | |
3500 */ | |
3501 static void ScalePlaneBilinearSimple(int src_width, int src_height, | |
3502 int dst_width, int dst_height, | |
3503 int src_stride, int dst_stride, | |
3504 const uint8* src_ptr, uint8* dst_ptr) { | |
3505 int i, j; | |
3506 uint8* dst = dst_ptr; | |
3507 int dx = (src_width << 16) / dst_width; | |
3508 int dy = (src_height << 16) / dst_height; | |
3509 int maxx = ((src_width - 1) << 16) - 1; | |
3510 int maxy = ((src_height - 1) << 16) - 1; | |
3511 int y = (dst_height < src_height) ? 32768 : | |
3512 (src_height << 16) / dst_height - 32768; | |
3513 for (i = 0; i < dst_height; ++i) { | |
3514 int cy = (y < 0) ? 0 : y; | |
3515 int yi = cy >> 16; | |
3516 int yf = cy & 0xffff; | |
3517 const uint8* const src = src_ptr + yi * src_stride; | |
3518 int x = (dst_width < src_width) ? 32768 : | |
3519 (src_width << 16) / dst_width - 32768; | |
3520 for (j = 0; j < dst_width; ++j) { | |
3521 int cx = (x < 0) ? 0 : x; | |
3522 int xi = cx >> 16; | |
3523 int xf = cx & 0xffff; | |
3524 int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16; | |
3525 int r1 = (src[xi + src_stride] * (65536 - xf) + | |
3526 src[xi + src_stride + 1] * xf) >> 16; | |
3527 *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16; | |
3528 x += dx; | |
3529 if (x > maxx) | |
3530 x = maxx; | |
3531 } | |
3532 dst += dst_stride - dst_width; | |
3533 y += dy; | |
3534 if (y > maxy) | |
3535 y = maxy; | |
3536 } | |
3537 } | |
3538 | |
3539 /** | |
3540 * Scale plane to/from any dimensions, with bilinear | |
3541 * interpolation. | |
3542 */ | |
3543 static void ScalePlaneBilinear(int src_width, int src_height, | |
3544 int dst_width, int dst_height, | |
3545 int src_stride, int dst_stride, | |
3546 const uint8* src_ptr, uint8* dst_ptr) { | |
3547 int dy; | |
3548 int dx; | |
3549 assert(dst_width > 0); | |
3550 assert(dst_height > 0); | |
3551 dy = (src_height << 16) / dst_height; | |
3552 dx = (src_width << 16) / dst_width; | |
3553 if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) { | |
3554 ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, | |
3555 src_stride, dst_stride, src_ptr, dst_ptr); | |
3556 | |
3557 } else { | |
3558 ALIGN16(uint8 row[kMaxInputWidth + 1]); | |
3559 void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, | |
3560 int src_stride, | |
3561 int dst_width, int source_y_fraction); | |
3562 void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, | |
3563 int dst_width, int dx); | |
3564 #if defined(HAS_SCALEFILTERROWS_SSSE3) | |
3565 if (TestCpuFlag(kCpuHasSSSE3) && | |
3566 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && | |
3567 IS_ALIGNED(src_width, 16)) { | |
3568 ScaleFilterRows = ScaleFilterRows_SSSE3; | |
3569 } else | |
3570 #endif | |
3571 #if defined(HAS_SCALEFILTERROWS_SSE2) | |
3572 if (TestCpuFlag(kCpuHasSSE2) && | |
3573 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && | |
3574 IS_ALIGNED(src_width, 16)) { | |
3575 ScaleFilterRows = ScaleFilterRows_SSE2; | |
3576 } else | |
3577 #endif | |
3578 { | |
3579 ScaleFilterRows = ScaleFilterRows_C; | |
3580 } | |
3581 ScaleFilterCols = ScaleFilterCols_C; | |
3582 | |
3583 { | |
3584 int y = 0; | |
3585 int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows. | |
3586 int j; | |
3587 for (j = 0; j < dst_height; ++j) { | |
3588 int iy = y >> 16; | |
3589 int fy = (y >> 8) & 255; | |
3590 const uint8* const src = src_ptr + iy * src_stride; | |
3591 ScaleFilterRows(row, src, src_stride, src_width, fy); | |
3592 ScaleFilterCols(dst_ptr, row, dst_width, dx); | |
3593 dst_ptr += dst_stride; | |
3594 y += dy; | |
3595 if (y > maxy) { | |
3596 y = maxy; | |
3597 } | |
3598 } | |
3599 } | |
3600 } | |
3601 } | |
3602 | |
3603 /** | |
3604 * Scale plane to/from any dimensions, without interpolation. | |
3605 * Fixed point math is used for performance: The upper 16 bits | |
3606 * of x and dx is the integer part of the source position and | |
3607 * the lower 16 bits are the fixed decimal part. | |
3608 */ | |
3609 static void ScalePlaneSimple(int src_width, int src_height, | |
3610 int dst_width, int dst_height, | |
3611 int src_stride, int dst_stride, | |
3612 const uint8* src_ptr, uint8* dst_ptr) { | |
3613 uint8* dst = dst_ptr; | |
3614 int dx = (src_width << 16) / dst_width; | |
3615 int y; | |
3616 for (y = 0; y < dst_height; ++y) { | |
3617 const uint8* const src = src_ptr + (y * src_height / dst_height) * | |
3618 src_stride; | |
3619 // TODO(fbarchard): Round X coordinate by setting x=0x8000. | |
3620 int x = 0; | |
3621 int i; | |
3622 for (i = 0; i < dst_width; ++i) { | |
3623 *dst++ = src[x >> 16]; | |
3624 x += dx; | |
3625 } | |
3626 dst += dst_stride - dst_width; | |
3627 } | |
3628 } | |
3629 | |
3630 /** | |
3631 * Scale plane to/from any dimensions. | |
3632 */ | |
3633 static void ScalePlaneAnySize(int src_width, int src_height, | |
3634 int dst_width, int dst_height, | |
3635 int src_stride, int dst_stride, | |
3636 const uint8* src_ptr, uint8* dst_ptr, | |
3637 FilterModeEnum filtering) { | |
3638 if (!filtering) { | |
3639 ScalePlaneSimple(src_width, src_height, dst_width, dst_height, | |
3640 src_stride, dst_stride, src_ptr, dst_ptr); | |
3641 } else { | |
3642 // fall back to non-optimized version | |
3643 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, | |
3644 src_stride, dst_stride, src_ptr, dst_ptr); | |
3645 } | |
3646 } | |
3647 | |
3648 /** | |
3649 * Scale plane down, any size | |
3650 * | |
3651 * This is an optimized version for scaling down a plane to any size. | |
3652 * The current implementation is ~10 times faster compared to the | |
3653 * reference implementation for e.g. XGA->LowResPAL | |
3654 * | |
3655 */ | |
3656 static void ScalePlaneDown(int src_width, int src_height, | |
3657 int dst_width, int dst_height, | |
3658 int src_stride, int dst_stride, | |
3659 const uint8* src_ptr, uint8* dst_ptr, | |
3660 FilterModeEnum filtering) { | |
3661 if (!filtering) { | |
3662 ScalePlaneSimple(src_width, src_height, dst_width, dst_height, | |
3663 src_stride, dst_stride, src_ptr, dst_ptr); | |
3664 } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) { | |
3665 // between 1/2x and 1x use bilinear | |
3666 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, | |
3667 src_stride, dst_stride, src_ptr, dst_ptr); | |
3668 } else { | |
3669 ScalePlaneBox(src_width, src_height, dst_width, dst_height, | |
3670 src_stride, dst_stride, src_ptr, dst_ptr); | |
3671 } | |
3672 } | |
3673 | |
3674 /** | |
3675 * Copy plane, no scaling | |
3676 * | |
3677 * This simply copies the given plane without scaling. | |
3678 * The current implementation is ~115 times faster | |
3679 * compared to the reference implementation. | |
3680 * | |
3681 */ | |
3682 static void CopyPlane(int src_width, int src_height, | |
3683 int dst_width, int dst_height, | |
3684 int src_stride, int dst_stride, | |
3685 const uint8* src_ptr, uint8* dst_ptr) { | |
3686 if (src_stride == src_width && dst_stride == dst_width) { | |
3687 // All contiguous, so can use REALLY fast path. | |
3688 memcpy(dst_ptr, src_ptr, src_width * src_height); | |
3689 } else { | |
3690 // Not all contiguous; must copy scanlines individually | |
3691 const uint8* src = src_ptr; | |
3692 uint8* dst = dst_ptr; | |
3693 int i; | |
3694 for (i = 0; i < src_height; ++i) { | |
3695 memcpy(dst, src, src_width); | |
3696 dst += dst_stride; | |
3697 src += src_stride; | |
3698 } | |
3699 } | |
3700 } | |
3701 | |
// Dispatches one plane to the most specialized scaler available for the
// requested ratio.  use_ref forces the generic ScalePlaneDown path for
// comparison/testing.
static void ScalePlane(const uint8* src, int src_stride,
                       int src_width, int src_height,
                       uint8* dst, int dst_stride,
                       int dst_width, int dst_height,
                       FilterModeEnum filtering, int use_ref) {
  // Use specialized scales to improve performance for common resolutions.
  // For example, all the 1/2 scalings will use ScalePlaneDown2()
  if (dst_width == src_width && dst_height == src_height) {
    // Straight copy.
    CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
              dst_stride, src, dst);
  } else if (dst_width <= src_width && dst_height <= src_height) {
    // Scale down.
    if (use_ref) {
      // For testing, allow the optimized versions to be disabled.
      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src, dst, filtering);
    } else if (4 * dst_width == 3 * src_width &&
               4 * dst_height == 3 * src_height) {
      // optimized, 3/4
      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
    } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
      // optimized, 1/2
      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    // 3/8 rounded up for odd sized chroma height.
    } else if (8 * dst_width == 3 * src_width &&
               dst_height == ((src_height * 3 + 7) / 8)) {
      // optimized, 3/8
      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
    } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
      // optimized, 1/4
      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
      // optimized, 1/8
      ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    } else {
      // Arbitrary downsample
      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src, dst, filtering);
    }
  } else {
    // Arbitrary scale up and/or down.
    ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
  }
}
3753 | |
3754 /** | |
3755 * Scale a plane. | |
3756 * | |
3757 * This function in turn calls a scaling function | |
3758 * suitable for handling the desired resolutions. | |
3759 * | |
3760 */ | |
3761 | |
3762 int I420Scale(const uint8* src_y, int src_stride_y, | |
3763 const uint8* src_u, int src_stride_u, | |
3764 const uint8* src_v, int src_stride_v, | |
3765 int src_width, int src_height, | |
3766 uint8* dst_y, int dst_stride_y, | |
3767 uint8* dst_u, int dst_stride_u, | |
3768 uint8* dst_v, int dst_stride_v, | |
3769 int dst_width, int dst_height, | |
3770 FilterModeEnum filtering) { | |
3771 if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || | |
3772 !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { | |
3773 return -1; | |
3774 } | |
3775 // Negative height means invert the image. | |
3776 if (src_height < 0) { | |
3777 int halfheight; | |
3778 src_height = -src_height; | |
3779 halfheight = (src_height + 1) >> 1; | |
3780 src_y = src_y + (src_height - 1) * src_stride_y; | |
3781 src_u = src_u + (halfheight - 1) * src_stride_u; | |
3782 src_v = src_v + (halfheight - 1) * src_stride_v; | |
3783 src_stride_y = -src_stride_y; | |
3784 src_stride_u = -src_stride_u; | |
3785 src_stride_v = -src_stride_v; | |
3786 } | |
3787 { | |
3788 int src_halfwidth = (src_width + 1) >> 1; | |
3789 int src_halfheight = (src_height + 1) >> 1; | |
3790 int dst_halfwidth = (dst_width + 1) >> 1; | |
3791 int dst_halfheight = (dst_height + 1) >> 1; | |
3792 | |
3793 ScalePlane(src_y, src_stride_y, src_width, src_height, | |
3794 dst_y, dst_stride_y, dst_width, dst_height, | |
3795 filtering, use_reference_impl_); | |
3796 ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, | |
3797 dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, | |
3798 filtering, use_reference_impl_); | |
3799 ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, | |
3800 dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, | |
3801 filtering, use_reference_impl_); | |
3802 } | |
3803 return 0; | |
3804 } | |
3805 | |
// Deprecated api
// Same as I420Scale but with a different argument order and a boolean
// interpolate flag (box filter when set) instead of a FilterModeEnum.
// Returns 0 on success, -1 on invalid arguments.
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
          int src_stride_y, int src_stride_u, int src_stride_v,
          int src_width, int src_height,
          uint8* dst_y, uint8* dst_u, uint8* dst_v,
          int dst_stride_y, int dst_stride_u, int dst_stride_v,
          int dst_width, int dst_height,
          int interpolate) {
  // src_height == 0 is rejected but negative is allowed (inversion below).
  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (src_height < 0) {
    int halfheight;
    src_height = -src_height;
    halfheight = (src_height + 1) >> 1;
    // Point at the last row of each plane and walk upward via negative
    // strides.
    src_y = src_y + (src_height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }
  {
    // Chroma planes are half size, rounded up for odd dimensions.
    int src_halfwidth = (src_width + 1) >> 1;
    int src_halfheight = (src_height + 1) >> 1;
    int dst_halfwidth = (dst_width + 1) >> 1;
    int dst_halfheight = (dst_height + 1) >> 1;
    FilterModeEnum filtering = interpolate ? kFilterBox : kFilterNone;

    ScalePlane(src_y, src_stride_y, src_width, src_height,
               dst_y, dst_stride_y, dst_width, dst_height,
               filtering, use_reference_impl_);
    ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
               dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
               filtering, use_reference_impl_);
    ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
               dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
               filtering, use_reference_impl_);
  }
  return 0;
}
3849 | |
// Deprecated api
// Scales a packed I420 buffer (Y plane followed by U then V) into a packed
// destination, vertically centered by leaving dst_yoffset rows blank at the
// top (and, via aheight, the bottom).  Returns 0 on success, -1 on invalid
// arguments (propagates Scale()'s result otherwise).
int ScaleOffset(const uint8* src, int src_width, int src_height,
                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
                int interpolate) {
  if (!src || src_width <= 0 || src_height <= 0 ||
      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
      dst_yoffset >= dst_height) {
    return -1;
  }
  dst_yoffset = dst_yoffset & ~1;  // chroma requires offset to multiple of 2.
  {
    // Chroma planes are half size, rounded up for odd dimensions.
    int src_halfwidth = (src_width + 1) >> 1;
    int src_halfheight = (src_height + 1) >> 1;
    int dst_halfwidth = (dst_width + 1) >> 1;
    int dst_halfheight = (dst_height + 1) >> 1;
    int aheight = dst_height - dst_yoffset * 2;  // actual output height
    // Plane base pointers within the packed I420 layout (Y, then U, then V).
    const uint8* const src_y = src;
    const uint8* const src_u = src + src_width * src_height;
    const uint8* const src_v = src + src_width * src_height +
                               src_halfwidth * src_halfheight;
    // Destination pointers advanced past the dst_yoffset blank rows
    // (dst_yoffset / 2 rows for the half-height chroma planes).
    uint8* dst_y = dst + dst_yoffset * dst_width;
    uint8* dst_u = dst + dst_width * dst_height +
                   (dst_yoffset >> 1) * dst_halfwidth;
    uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
                   (dst_yoffset >> 1) * dst_halfwidth;
    return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
                 src_width, src_height, dst_y, dst_u, dst_v, dst_width,
                 dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
  }
}
3880 | |
3881 #ifdef __cplusplus | |
3882 } // extern "C" | |
3883 } // namespace libyuv | |
3884 #endif | |
OLD | NEW |