Chromium Code Reviews

Side by Side Diff: source/libvpx/third_party/libyuv/source/scale_win.cc

Issue 341293003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 6 months ago
1 /*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "third_party/libyuv/include/libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for Visual C x86.
19 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
20
21 // Offsets for source bytes 0 to 9
22 static uvec8 kShuf0 =
23 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
24
25 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
26 static uvec8 kShuf1 =
27 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
28
29 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
30 static uvec8 kShuf2 =
31 { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
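
Each byte in the kShuf* tables above and below is a pshufb control: it selects that source byte, and any value with the high bit set (128 here) zeroes the output lane. A minimal scalar model of that semantic, for reference only (the headers declared here are assumed by the later C sketches in this file as well):

#include <stddef.h>
#include <stdint.h>

// Scalar model of pshufb as used with the kShuf* tables: each control
// byte picks a source byte; a control byte with the high bit set
// (e.g. 128) produces zero in that output lane.
static void pshufb_model(uint8_t dst[16], const uint8_t src[16],
                         const uint8_t shuf[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (shuf[i] & 0x80) ? 0 : src[shuf[i] & 15];
  }
}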
32
33 // Offsets for source bytes 0 to 10
34 static uvec8 kShuf01 =
35 { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
36
37 // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
38 static uvec8 kShuf11 =
39 { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
40
41 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
42 static uvec8 kShuf21 =
43 { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
44
45 // Coefficients for source bytes 0 to 10
46 static uvec8 kMadd01 =
47 { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
48
49 // Coefficients for source bytes 10 to 21
50 static uvec8 kMadd11 =
51 { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
52
53 // Coefficients for source bytes 21 to 31
54 static uvec8 kMadd21 =
55 { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
56
57 // Rounding constant for the 3/4 box filters (added before the >> 2)
58 static vec16 kRound34 =
59 { 2, 2, 2, 2, 2, 2, 2, 2 };
60
61 static uvec8 kShuf38a =
62 { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
63
64 static uvec8 kShuf38b =
65 { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
66
67 // Arrange words 0,3,6 into 0,1,2
68 static uvec8 kShufAc =
69 { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
70
71 // Arrange words 0,3,6 into 3,4,5
72 static uvec8 kShufAc3 =
73 { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
74
75 // Scaling values for boxes of 3x3 and 2x3
76 static uvec16 kScaleAc33 =
77 { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
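
pmulhuw returns the high 16 bits of an unsigned 16x16 multiply, so multiplying a box sum by 65536/9 (or 65536/6) approximates dividing it by 9 (or 6). A one-line scalar model of that reciprocal trick (the name is illustrative):

// (sum * (65536 / 9)) >> 16 ~= sum / 9; 65536/9 truncates to 7281, so
// the result can be one less than a true divide for some sums.
static uint16_t DivBy9Model(uint16_t sum) {
  return (uint16_t)(((uint32_t)sum * (65536 / 9)) >> 16);
}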
78
79 // Arrange first value for pixels 0,1,2,3,4,5
80 static uvec8 kShufAb0 =
81 { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
82
83 // Arrange second value for pixels 0,1,2,3,4,5
84 static uvec8 kShufAb1 =
85 { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
86
87 // Arrange third value for pixels 0,1,2,3,4,5
88 static uvec8 kShufAb2 =
89 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
90
91 // Scaling values for boxes of 3x2 and 2x2
92 static uvec16 kScaleAb2 =
93 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
94
95 // Reads 32 pixels, throws half away and writes 16 pixels.
96 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
97 __declspec(naked) __declspec(align(16))
98 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
99 uint8* dst_ptr, int dst_width) {
100 __asm {
101 mov eax, [esp + 4] // src_ptr
102 // src_stride ignored
103 mov edx, [esp + 12] // dst_ptr
104 mov ecx, [esp + 16] // dst_width
105
106 align 4
107 wloop:
108 movdqa xmm0, [eax]
109 movdqa xmm1, [eax + 16]
110 lea eax, [eax + 32]
111 psrlw xmm0, 8 // isolate odd pixels.
112 psrlw xmm1, 8
113 packuswb xmm0, xmm1
114 sub ecx, 16
115 movdqa [edx], xmm0
116 lea edx, [edx + 16]
117 jg wloop
118
119 ret
120 }
121 }
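
psrlw by 8 keeps the high byte of each 16-bit lane, i.e. the odd source pixels, and packuswb recompacts them to bytes. A scalar sketch of the same result (stdint types instead of libyuv's uint8 typedef; not claimed to match the project's C fallback byte-for-byte):

// Scalar equivalent: keep every second (odd) source pixel.
void ScaleRowDown2_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                            uint8_t* dst_ptr, int dst_width) {
  (void)src_stride;  // ignored, as in the SSE2 version
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];
  }
}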
122
123 // Blends 32x1 rectangle to 16x1.
124 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
125 __declspec(naked) __declspec(align(16))
126 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
127 uint8* dst_ptr, int dst_width) {
128 __asm {
129 mov eax, [esp + 4] // src_ptr
130 // src_stride ignored
131 mov edx, [esp + 12] // dst_ptr
132 mov ecx, [esp + 16] // dst_width
133 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
134 psrlw xmm5, 8
135
136 align 4
137 wloop:
138 movdqa xmm0, [eax]
139 movdqa xmm1, [eax + 16]
140 lea eax, [eax + 32]
141
142 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
143 psrlw xmm0, 8
144 movdqa xmm3, xmm1
145 psrlw xmm1, 8
146 pand xmm2, xmm5
147 pand xmm3, xmm5
148 pavgw xmm0, xmm2
149 pavgw xmm1, xmm3
150 packuswb xmm0, xmm1
151
152 sub ecx, 16
153 movdqa [edx], xmm0
154 lea edx, [edx + 16]
155 jg wloop
156
157 ret
158 }
159 }
160
161 // Blends 32x2 rectangle to 16x1.
162 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
163 __declspec(naked) __declspec(align(16))
164 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
165 uint8* dst_ptr, int dst_width) {
166 __asm {
167 push esi
168 mov eax, [esp + 4 + 4] // src_ptr
169 mov esi, [esp + 4 + 8] // src_stride
170 mov edx, [esp + 4 + 12] // dst_ptr
171 mov ecx, [esp + 4 + 16] // dst_width
172 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
173 psrlw xmm5, 8
174
175 align 4
176 wloop:
177 movdqa xmm0, [eax]
178 movdqa xmm1, [eax + 16]
179 movdqa xmm2, [eax + esi]
180 movdqa xmm3, [eax + esi + 16]
181 lea eax, [eax + 32]
182 pavgb xmm0, xmm2 // average rows
183 pavgb xmm1, xmm3
184
185 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
186 psrlw xmm0, 8
187 movdqa xmm3, xmm1
188 psrlw xmm1, 8
189 pand xmm2, xmm5
190 pand xmm3, xmm5
191 pavgw xmm0, xmm2
192 pavgw xmm1, xmm3
193 packuswb xmm0, xmm1
194
195 sub ecx, 16
196 movdqa [edx], xmm0
197 lea edx, [edx + 16]
198 jg wloop
199
200 pop esi
201 ret
202 }
203 }
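
Both averaging stages use pavg, which rounds half up: avg(a, b) = (a + b + 1) >> 1. A scalar sketch of this routine's cascaded averaging; it can differ by 1 from a true 4-sample mean because each stage rounds:

static uint8_t AvgRoundUp(uint8_t a, uint8_t b) {  // pavgb semantics
  return (uint8_t)((a + b + 1) >> 1);
}

// Scalar sketch of the 32x2 -> 16x1 box: average the two rows first
// (pavgb), then each even/odd column pair (pavgw on widened bytes).
void ScaleRowDown2Box_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                               uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    uint8_t r0 = AvgRoundUp(s[x * 2], t[x * 2]);
    uint8_t r1 = AvgRoundUp(s[x * 2 + 1], t[x * 2 + 1]);
    dst_ptr[x] = AvgRoundUp(r0, r1);
  }
}

The Linear variant above is the same with a single row (drop the t[] reads).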
204
205 // Reads 32 pixels, throws half away and writes 16 pixels.
206 // Alignment requirement: none; uses unaligned loads and stores.
207 __declspec(naked) __declspec(align(16))
208 void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
209 ptrdiff_t src_stride,
210 uint8* dst_ptr, int dst_width) {
211 __asm {
212 mov eax, [esp + 4] // src_ptr
213 // src_stride ignored
214 mov edx, [esp + 12] // dst_ptr
215 mov ecx, [esp + 16] // dst_width
216
217 align 4
218 wloop:
219 movdqu xmm0, [eax]
220 movdqu xmm1, [eax + 16]
221 lea eax, [eax + 32]
222 psrlw xmm0, 8 // isolate odd pixels.
223 psrlw xmm1, 8
224 packuswb xmm0, xmm1
225 sub ecx, 16
226 movdqu [edx], xmm0
227 lea edx, [edx + 16]
228 jg wloop
229
230 ret
231 }
232 }
233
234 // Blends 32x1 rectangle to 16x1.
235 // Alignment requirement: none; uses unaligned loads and stores.
236 __declspec(naked) __declspec(align(16))
237 void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
238 ptrdiff_t src_stride,
239 uint8* dst_ptr, int dst_width) {
240 __asm {
241 mov eax, [esp + 4] // src_ptr
242 // src_stride ignored
243 mov edx, [esp + 12] // dst_ptr
244 mov ecx, [esp + 16] // dst_width
245 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
246 psrlw xmm5, 8
247
248 align 4
249 wloop:
250 movdqu xmm0, [eax]
251 movdqu xmm1, [eax + 16]
252 lea eax, [eax + 32]
253
254 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
255 psrlw xmm0, 8
256 movdqa xmm3, xmm1
257 psrlw xmm1, 8
258 pand xmm2, xmm5
259 pand xmm3, xmm5
260 pavgw xmm0, xmm2
261 pavgw xmm1, xmm3
262 packuswb xmm0, xmm1
263
264 sub ecx, 16
265 movdqu [edx], xmm0
266 lea edx, [edx + 16]
267 jg wloop
268
269 ret
270 }
271 }
272
273 // Blends 32x2 rectangle to 16x1.
274 // Alignment requirement: none; uses unaligned loads and stores.
275 __declspec(naked) __declspec(align(16))
276 void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
277 ptrdiff_t src_stride,
278 uint8* dst_ptr, int dst_width) {
279 __asm {
280 push esi
281 mov eax, [esp + 4 + 4] // src_ptr
282 mov esi, [esp + 4 + 8] // src_stride
283 mov edx, [esp + 4 + 12] // dst_ptr
284 mov ecx, [esp + 4 + 16] // dst_width
285 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
286 psrlw xmm5, 8
287
288 align 4
289 wloop:
290 movdqu xmm0, [eax]
291 movdqu xmm1, [eax + 16]
292 movdqu xmm2, [eax + esi]
293 movdqu xmm3, [eax + esi + 16]
294 lea eax, [eax + 32]
295 pavgb xmm0, xmm2 // average rows
296 pavgb xmm1, xmm3
297
298 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
299 psrlw xmm0, 8
300 movdqa xmm3, xmm1
301 psrlw xmm1, 8
302 pand xmm2, xmm5
303 pand xmm3, xmm5
304 pavgw xmm0, xmm2
305 pavgw xmm1, xmm3
306 packuswb xmm0, xmm1
307
308 sub ecx, 16
309 movdqu [edx], xmm0
310 lea edx, [edx + 16]
311 jg wloop
312
313 pop esi
314 ret
315 }
316 }
317
318 // Point samples 32 pixels to 8 pixels.
319 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
320 __declspec(naked) __declspec(align(16))
321 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
322 uint8* dst_ptr, int dst_width) {
323 __asm {
324 mov eax, [esp + 4] // src_ptr
325 // src_stride ignored
326 mov edx, [esp + 12] // dst_ptr
327 mov ecx, [esp + 16] // dst_width
328 pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
329 psrld xmm5, 24
330 pslld xmm5, 16
331
332 align 4
333 wloop:
334 movdqa xmm0, [eax]
335 movdqa xmm1, [eax + 16]
336 lea eax, [eax + 32]
337 pand xmm0, xmm5
338 pand xmm1, xmm5
339 packuswb xmm0, xmm1
340 psrlw xmm0, 8
341 packuswb xmm0, xmm0
342 sub ecx, 8
343 movq qword ptr [edx], xmm0
344 lea edx, [edx + 8]
345 jg wloop
346
347 ret
348 }
349 }
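
The 0x00ff0000 mask keeps byte 2 of every dword, and the packuswb / psrlw / packuswb sequence compacts those survivors, so the net effect is point sampling every 4th pixel at offset 2. Scalar sketch:

// Scalar equivalent: point sample every 4th pixel (offset 2 in each
// group of 4), matching the mask-and-pack sequence above.
void ScaleRowDown4_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                            uint8_t* dst_ptr, int dst_width) {
  (void)src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 4 + 2];
  }
}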
350
351 // Blends 32x4 rectangle to 8x1.
352 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
353 __declspec(naked) __declspec(align(16))
354 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
355 uint8* dst_ptr, int dst_width) {
356 __asm {
357 push esi
358 push edi
359 mov eax, [esp + 8 + 4] // src_ptr
360 mov esi, [esp + 8 + 8] // src_stride
361 mov edx, [esp + 8 + 12] // dst_ptr
362 mov ecx, [esp + 8 + 16] // dst_width
363 lea edi, [esi + esi * 2] // src_stride * 3
364 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
365 psrlw xmm7, 8
366
367 align 4
368 wloop:
369 movdqa xmm0, [eax]
370 movdqa xmm1, [eax + 16]
371 movdqa xmm2, [eax + esi]
372 movdqa xmm3, [eax + esi + 16]
373 pavgb xmm0, xmm2 // average rows
374 pavgb xmm1, xmm3
375 movdqa xmm2, [eax + esi * 2]
376 movdqa xmm3, [eax + esi * 2 + 16]
377 movdqa xmm4, [eax + edi]
378 movdqa xmm5, [eax + edi + 16]
379 lea eax, [eax + 32]
380 pavgb xmm2, xmm4
381 pavgb xmm3, xmm5
382 pavgb xmm0, xmm2
383 pavgb xmm1, xmm3
384
385 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
386 psrlw xmm0, 8
387 movdqa xmm3, xmm1
388 psrlw xmm1, 8
389 pand xmm2, xmm7
390 pand xmm3, xmm7
391 pavgw xmm0, xmm2
392 pavgw xmm1, xmm3
393 packuswb xmm0, xmm1
394
395 movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
396 psrlw xmm0, 8
397 pand xmm2, xmm7
398 pavgw xmm0, xmm2
399 packuswb xmm0, xmm0
400
401 sub ecx, 8
402 movq qword ptr [edx], xmm0
403 lea edx, [edx + 8]
404 jg wloop
405
406 pop edi
407 pop esi
408 ret
409 }
410 }
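
The intent is a 4x4 box mean; the SSE2 code builds it from a tree of pavg operations (rows 0+1, rows 2+3, both pairs, then two column stages), each of which rounds, so it can deviate slightly from the exact mean sketched here:

// Scalar sketch of the 32x4 -> 8x1 box filter as an exact 4x4 mean
// with round-to-nearest; the pavg tree above approximates this.
void ScaleRowDown4Box_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                               uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int r = 0; r < 4; ++r) {
      for (int c = 0; c < 4; ++c) {
        sum += src_ptr[r * src_stride + x * 4 + c];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);
  }
}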
411
412 // Point samples 32 pixels to 24 pixels.
413 // Produces three 8 byte values from 32 source bytes, which are
414 // shuffled into place to do the scaling.
415
416 // Note that movdqa+palign may be better than movdqu.
417 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
418 __declspec(naked) __declspec(align(16))
419 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
420 uint8* dst_ptr, int dst_width) {
421 __asm {
422 mov eax, [esp + 4] // src_ptr
423 // src_stride ignored
424 mov edx, [esp + 12] // dst_ptr
425 mov ecx, [esp + 16] // dst_width
426 movdqa xmm3, kShuf0
427 movdqa xmm4, kShuf1
428 movdqa xmm5, kShuf2
429
430 align 4
431 wloop:
432 movdqa xmm0, [eax]
433 movdqa xmm1, [eax + 16]
434 lea eax, [eax + 32]
435 movdqa xmm2, xmm1
436 palignr xmm1, xmm0, 8
437 pshufb xmm0, xmm3
438 pshufb xmm1, xmm4
439 pshufb xmm2, xmm5
440 movq qword ptr [edx], xmm0
441 movq qword ptr [edx + 8], xmm1
442 movq qword ptr [edx + 16], xmm2
443 lea edx, [edx + 24]
444 sub ecx, 24
445 jg wloop
446
447 ret
448 }
449 }
450
451 // Blends 32x2 rectangle to 24x1
452 // Produces three 8 byte values from two rows of 32 source bytes,
453 // which are shuffled and weighted to do the scaling.
454
455 // Register usage:
456 // xmm0 src_row 0
457 // xmm1 src_row 1
458 // xmm2 shuf 0
459 // xmm3 shuf 1
460 // xmm4 shuf 2
461 // xmm5 madd 0
462 // xmm6 madd 1
463 // xmm7 kRound34
464
465 // Note that movdqa+palign may be better than movdqu.
466 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
467 __declspec(naked) __declspec(align(16))
468 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
469 ptrdiff_t src_stride,
470 uint8* dst_ptr, int dst_width) {
471 __asm {
472 push esi
473 mov eax, [esp + 4 + 4] // src_ptr
474 mov esi, [esp + 4 + 8] // src_stride
475 mov edx, [esp + 4 + 12] // dst_ptr
476 mov ecx, [esp + 4 + 16] // dst_width
477 movdqa xmm2, kShuf01
478 movdqa xmm3, kShuf11
479 movdqa xmm4, kShuf21
480 movdqa xmm5, kMadd01
481 movdqa xmm6, kMadd11
482 movdqa xmm7, kRound34
483
484 align 4
485 wloop:
486 movdqa xmm0, [eax] // pixels 0..7
487 movdqa xmm1, [eax + esi]
488 pavgb xmm0, xmm1
489 pshufb xmm0, xmm2
490 pmaddubsw xmm0, xmm5
491 paddsw xmm0, xmm7
492 psrlw xmm0, 2
493 packuswb xmm0, xmm0
494 movq qword ptr [edx], xmm0
495 movdqu xmm0, [eax + 8] // pixels 8..15
496 movdqu xmm1, [eax + esi + 8]
497 pavgb xmm0, xmm1
498 pshufb xmm0, xmm3
499 pmaddubsw xmm0, xmm6
500 paddsw xmm0, xmm7
501 psrlw xmm0, 2
502 packuswb xmm0, xmm0
503 movq qword ptr [edx + 8], xmm0
504 movdqa xmm0, [eax + 16] // pixels 16..23
505 movdqa xmm1, [eax + esi + 16]
506 lea eax, [eax + 32]
507 pavgb xmm0, xmm1
508 pshufb xmm0, xmm4
509 movdqa xmm1, kMadd21
510 pmaddubsw xmm0, xmm1
511 paddsw xmm0, xmm7
512 psrlw xmm0, 2
513 packuswb xmm0, xmm0
514 sub ecx, 24
515 movq qword ptr [edx + 16], xmm0
516 lea edx, [edx + 24]
517 jg wloop
518
519 pop esi
520 ret
521 }
522 }
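
After the vertical pavgb, each group of 4 source pixels becomes 3 outputs: pshufb pairs the pixels per kShuf01/kShuf11/kShuf21, pmaddubsw applies the kMadd* weights (3:1, 2:2, 1:3), and kRound34 biases by 2 before the >> 2. A scalar model of one group:

// Scalar model of one horizontal 4 -> 3 group in the 3/4 box filters.
static void ScaleRowDown34Group(const uint8_t s[4], uint8_t d[3]) {
  d[0] = (uint8_t)((s[0] * 3 + s[1] * 1 + 2) >> 2);
  d[1] = (uint8_t)((s[1] * 2 + s[2] * 2 + 2) >> 2);
  d[2] = (uint8_t)((s[2] * 1 + s[3] * 3 + 2) >> 2);
}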
523
524 // Note that movdqa+palign may be better than movdqu.
525 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
526 __declspec(naked) __declspec(align(16))
527 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
528 ptrdiff_t src_stride,
529 uint8* dst_ptr, int dst_width) {
530 __asm {
531 push esi
532 mov eax, [esp + 4 + 4] // src_ptr
533 mov esi, [esp + 4 + 8] // src_stride
534 mov edx, [esp + 4 + 12] // dst_ptr
535 mov ecx, [esp + 4 + 16] // dst_width
536 movdqa xmm2, kShuf01
537 movdqa xmm3, kShuf11
538 movdqa xmm4, kShuf21
539 movdqa xmm5, kMadd01
540 movdqa xmm6, kMadd11
541 movdqa xmm7, kRound34
542
543 align 4
544 wloop:
545 movdqa xmm0, [eax] // pixels 0..7
546 movdqa xmm1, [eax + esi]
547 pavgb xmm1, xmm0
548 pavgb xmm0, xmm1
549 pshufb xmm0, xmm2
550 pmaddubsw xmm0, xmm5
551 paddsw xmm0, xmm7
552 psrlw xmm0, 2
553 packuswb xmm0, xmm0
554 movq qword ptr [edx], xmm0
555 movdqu xmm0, [eax + 8] // pixels 8..15
556 movdqu xmm1, [eax + esi + 8]
557 pavgb xmm1, xmm0
558 pavgb xmm0, xmm1
559 pshufb xmm0, xmm3
560 pmaddubsw xmm0, xmm6
561 paddsw xmm0, xmm7
562 psrlw xmm0, 2
563 packuswb xmm0, xmm0
564 movq qword ptr [edx + 8], xmm0
565 movdqa xmm0, [eax + 16] // pixels 16..23
566 movdqa xmm1, [eax + esi + 16]
567 lea eax, [eax + 32]
568 pavgb xmm1, xmm0
569 pavgb xmm0, xmm1
570 pshufb xmm0, xmm4
571 movdqa xmm1, kMadd21
572 pmaddubsw xmm0, xmm1
573 paddsw xmm0, xmm7
574 psrlw xmm0, 2
575 packuswb xmm0, xmm0
576 sub ecx, 24
577 movq qword ptr [edx + 16], xmm0
578 lea edx, [edx + 24]
579 jg wloop
580
581 pop esi
582 ret
583 }
584 }
585
586 // 3/8 point sampler
587
588 // Scale 32 pixels to 12
589 __declspec(naked) __declspec(align(16))
590 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
591 uint8* dst_ptr, int dst_width) {
592 __asm {
593 mov eax, [esp + 4] // src_ptr
594 // src_stride ignored
595 mov edx, [esp + 12] // dst_ptr
596 mov ecx, [esp + 16] // dst_width
597 movdqa xmm4, kShuf38a
598 movdqa xmm5, kShuf38b
599
600 align 4
601 xloop:
602 movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
603 movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
604 lea eax, [eax + 32]
605 pshufb xmm0, xmm4
606 pshufb xmm1, xmm5
607 paddusb xmm0, xmm1
608
609 sub ecx, 12
610 movq qword ptr [edx], xmm0 // write 12 pixels
611 movhlps xmm1, xmm0
612 movd [edx + 8], xmm1
613 lea edx, [edx + 12]
614 jg xloop
615
616 ret
617 }
618 }
619
620 // Scale 16x3 pixels to 6x1 with interpolation
621 __declspec(naked) __declspec(align(16))
622 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
623 ptrdiff_t src_stride,
624 uint8* dst_ptr, int dst_width) {
625 __asm {
626 push esi
627 mov eax, [esp + 4 + 4] // src_ptr
628 mov esi, [esp + 4 + 8] // src_stride
629 mov edx, [esp + 4 + 12] // dst_ptr
630 mov ecx, [esp + 4 + 16] // dst_width
631 movdqa xmm2, kShufAc
632 movdqa xmm3, kShufAc3
633 movdqa xmm4, kScaleAc33
634 pxor xmm5, xmm5
635
636 align 4
637 xloop:
638 movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
639 movdqa xmm6, [eax + esi]
640 movhlps xmm1, xmm0
641 movhlps xmm7, xmm6
642 punpcklbw xmm0, xmm5
643 punpcklbw xmm1, xmm5
644 punpcklbw xmm6, xmm5
645 punpcklbw xmm7, xmm5
646 paddusw xmm0, xmm6
647 paddusw xmm1, xmm7
648 movdqa xmm6, [eax + esi * 2]
649 lea eax, [eax + 16]
650 movhlps xmm7, xmm6
651 punpcklbw xmm6, xmm5
652 punpcklbw xmm7, xmm5
653 paddusw xmm0, xmm6
654 paddusw xmm1, xmm7
655
656 movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
657 psrldq xmm0, 2
658 paddusw xmm6, xmm0
659 psrldq xmm0, 2
660 paddusw xmm6, xmm0
661 pshufb xmm6, xmm2
662
663 movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
664 psrldq xmm1, 2
665 paddusw xmm7, xmm1
666 psrldq xmm1, 2
667 paddusw xmm7, xmm1
668 pshufb xmm7, xmm3
669 paddusw xmm6, xmm7
670
671 pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
672 packuswb xmm6, xmm6
673
674 sub ecx, 6
675 movd [edx], xmm6 // write 6 pixels
676 psrlq xmm6, 16
677 movd [edx + 2], xmm6
678 lea edx, [edx + 6]
679 jg xloop
680
681 pop esi
682 ret
683 }
684 }
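
The three rows are summed into 16-bit words, the psrldq/paddusw pairs fold each group of 3 columns into one word, and pmulhuw with kScaleAc33 divides by 9 (or by 6 for the third output of each group, which only covers 2 columns). A scalar sketch of one output pixel (w is 3, or 2 at a group boundary):

// Scalar sketch of one 3-row box in ScaleRowDown38_3_Box: sum a 3xw
// neighborhood, then divide via the 65536/(3*w) reciprocal high-word
// multiply that pmulhuw + kScaleAc33 performs.
static uint8_t Box3Rows(const uint8_t* src, ptrdiff_t stride, int w) {
  uint32_t sum = 0;
  for (int r = 0; r < 3; ++r)
    for (int c = 0; c < w; ++c)
      sum += src[r * stride + c];
  return (uint8_t)((sum * (65536 / (3 * w))) >> 16);
}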
685
686 // Scale 16x2 pixels to 6x1 with interpolation
687 __declspec(naked) __declspec(align(16))
688 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
689 ptrdiff_t src_stride,
690 uint8* dst_ptr, int dst_width) {
691 __asm {
692 push esi
693 mov eax, [esp + 4 + 4] // src_ptr
694 mov esi, [esp + 4 + 8] // src_stride
695 mov edx, [esp + 4 + 12] // dst_ptr
696 mov ecx, [esp + 4 + 16] // dst_width
697 movdqa xmm2, kShufAb0
698 movdqa xmm3, kShufAb1
699 movdqa xmm4, kShufAb2
700 movdqa xmm5, kScaleAb2
701
702 align 4
703 xloop:
704 movdqa xmm0, [eax] // average 2 rows into xmm0
705 pavgb xmm0, [eax + esi]
706 lea eax, [eax + 16]
707
708 movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
709 pshufb xmm1, xmm2
710 movdqa xmm6, xmm0
711 pshufb xmm6, xmm3
712 paddusw xmm1, xmm6
713 pshufb xmm0, xmm4
714 paddusw xmm1, xmm0
715
716 pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
717 packuswb xmm1, xmm1
718
719 sub ecx, 6
720 movd [edx], xmm1 // write 6 pixels
721 psrlq xmm1, 16
722 movd [edx + 2], xmm1
723 lea edx, [edx + 6]
724 jg xloop
725
726 pop esi
727 ret
728 }
729 }
730
731 // Reads 16xN bytes and produces 16 shorts at a time.
732 // TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
733 __declspec(naked) __declspec(align(16))
734 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
735 uint16* dst_ptr, int src_width,
736 int src_height) {
737 __asm {
738 push esi
739 push edi
740 push ebx
741 push ebp
742 mov esi, [esp + 16 + 4] // src_ptr
743 mov edx, [esp + 16 + 8] // src_stride
744 mov edi, [esp + 16 + 12] // dst_ptr
745 mov ecx, [esp + 16 + 16] // src_width
746 mov ebx, [esp + 16 + 20] // src_height
747 pxor xmm4, xmm4
748 dec ebx
749
750 align 4
751 xloop:
752 // first row
753 movdqa xmm0, [esi]
754 lea eax, [esi + edx]
755 movdqa xmm1, xmm0
756 punpcklbw xmm0, xmm4
757 punpckhbw xmm1, xmm4
758 lea esi, [esi + 16]
759 mov ebp, ebx
760 test ebp, ebp
761 je ydone
762
763 // sum remaining rows
764 align 4
765 yloop:
766 movdqa xmm2, [eax] // read 16 pixels
767 lea eax, [eax + edx] // advance to next row
768 movdqa xmm3, xmm2
769 punpcklbw xmm2, xmm4
770 punpckhbw xmm3, xmm4
771 paddusw xmm0, xmm2 // sum 16 words
772 paddusw xmm1, xmm3
773 sub ebp, 1
774 jg yloop
775
776 align 4
777 ydone:
778 movdqa [edi], xmm0
779 movdqa [edi + 16], xmm1
780 lea edi, [edi + 32]
781
782 sub ecx, 16
783 jg xloop
784
785 pop ebp
786 pop ebx
787 pop edi
788 pop esi
789 ret
790 }
791 }
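
A scalar sketch of the same accumulation (the asm uses paddusw, which saturates at 65535; plain addition is shown on the assumption the sums stay in range):

// Column-wise sum of src_height rows into 16-bit accumulators.
void ScaleAddRows_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                           uint16_t* dst_ptr, int src_width,
                           int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint32_t sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += src_ptr[y * src_stride + x];
    }
    dst_ptr[x] = (uint16_t)sum;
  }
}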
792
793 // Bilinear column filtering. SSSE3 version.
794 // TODO(fbarchard): Port to Neon
795 // TODO(fbarchard): Switch the following:
796 // xor ebx, ebx
797 // mov bx, word ptr [esi + eax] // 2 source x0 pixels
798 // To
799 // movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
800 // when drmemory bug fixed.
801 // https://code.google.com/p/drmemory/issues/detail?id=1396
802
803 __declspec(naked) __declspec(align(16))
804 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
805 int dst_width, int x, int dx) {
806 __asm {
807 push ebx
808 push esi
809 push edi
810 mov edi, [esp + 12 + 4] // dst_ptr
811 mov esi, [esp + 12 + 8] // src_ptr
812 mov ecx, [esp + 12 + 12] // dst_width
813 movd xmm2, [esp + 12 + 16] // x
814 movd xmm3, [esp + 12 + 20] // dx
815 mov eax, 0x04040000 // shuffle to line up fractions with pixel.
816 movd xmm5, eax
817 pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
818 psrlw xmm6, 9
819 pextrw eax, xmm2, 1 // get x0 integer. preroll
820 sub ecx, 2
821 jl xloop29
822
823 movdqa xmm0, xmm2 // x1 = x0 + dx
824 paddd xmm0, xmm3
825 punpckldq xmm2, xmm0 // x0 x1
826 punpckldq xmm3, xmm3 // dx dx
827 paddd xmm3, xmm3 // dx * 2, dx * 2
828 pextrw edx, xmm2, 3 // get x1 integer. preroll
829
830 // 2 Pixel loop.
831 align 4
832 xloop2:
833 movdqa xmm1, xmm2 // x0, x1 fractions.
834 paddd xmm2, xmm3 // x += dx
835 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
836 movd xmm0, ebx
837 psrlw xmm1, 9 // 7 bit fractions.
838 movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
839 movd xmm4, ebx
840 pshufb xmm1, xmm5 // 0011
841 punpcklwd xmm0, xmm4
842 pxor xmm1, xmm6 // 0..7f and 7f..0
843 pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels.
844 pextrw eax, xmm2, 1 // get x0 integer. next iteration.
845 pextrw edx, xmm2, 3 // get x1 integer. next iteration.
846 psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
847 packuswb xmm0, xmm0 // 8 bits, 2 pixels.
848 movd ebx, xmm0
849 mov [edi], bx
850 lea edi, [edi + 2]
851 sub ecx, 2 // 2 pixels
852 jge xloop2
853
854 align 4
855 xloop29:
856
857 add ecx, 2 - 1
858 jl xloop99
859
860 // 1 pixel remainder
861 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
862 movd xmm0, ebx
863 psrlw xmm2, 9 // 7 bit fractions.
864 pshufb xmm2, xmm5 // 0011
865 pxor xmm2, xmm6 // 0..7f and 7f..0
866 pmaddubsw xmm0, xmm2 // 16 bit
867 psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
868 packuswb xmm0, xmm0 // 8 bits
869 movd ebx, xmm0
870 mov [edi], bl
871
872 align 4
873 xloop99:
874
875 pop edi
876 pop esi
877 pop ebx
878 ret
879 }
880 }
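
x and dx are 16.16 fixed point: the integer part indexes the source, and the top 7 fraction bits become pmaddubsw blend weights (the pxor with 0x7f produces the inverse weight, so the asm effectively uses 127 - f rather than the 128 - f shown in this sketch):

// Scalar model of the bilinear column filter with 16.16 x/dx.
void ScaleFilterCols_C_sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                              int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;              // integer source position
    int f = (x >> 9) & 0x7f;       // top 7 bits of the fraction
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)((a * (128 - f) + b * f) >> 7);
    x += dx;
  }
}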
881
882 // Reads 16 pixels, duplicates them and writes 32 pixels.
883 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
884 __declspec(naked) __declspec(align(16))
885 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
886 int dst_width, int x, int dx) {
887 __asm {
888 mov edx, [esp + 4] // dst_ptr
889 mov eax, [esp + 8] // src_ptr
890 mov ecx, [esp + 12] // dst_width
891
892 align 4
893 wloop:
894 movdqa xmm0, [eax]
895 lea eax, [eax + 16]
896 movdqa xmm1, xmm0
897 punpcklbw xmm0, xmm0
898 punpckhbw xmm1, xmm1
899 sub ecx, 32
900 movdqa [edx], xmm0
901 movdqa [edx + 16], xmm1
902 lea edx, [edx + 32]
903 jg wloop
904
905 ret
906 }
907 }
908
909 // Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)
910 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
911 __declspec(naked) __declspec(align(16))
912 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
913 ptrdiff_t src_stride,
914 uint8* dst_argb, int dst_width) {
915 __asm {
916 mov eax, [esp + 4] // src_argb
917 // src_stride ignored
918 mov edx, [esp + 12] // dst_argb
919 mov ecx, [esp + 16] // dst_width
920
921 align 4
922 wloop:
923 movdqa xmm0, [eax]
924 movdqa xmm1, [eax + 16]
925 lea eax, [eax + 32]
926 shufps xmm0, xmm1, 0xdd
927 sub ecx, 4
928 movdqa [edx], xmm0
929 lea edx, [edx + 16]
930 jg wloop
931
932 ret
933 }
934 }
935
936 // Blends 8x1 rectangle to 4x1.
937 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
938 __declspec(naked) __declspec(align(16))
939 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
940 ptrdiff_t src_stride,
941 uint8* dst_argb, int dst_width) {
942 __asm {
943 mov eax, [esp + 4] // src_argb
944 // src_stride ignored
945 mov edx, [esp + 12] // dst_argb
946 mov ecx, [esp + 16] // dst_width
947
948 align 4
949 wloop:
950 movdqa xmm0, [eax]
951 movdqa xmm1, [eax + 16]
952 lea eax, [eax + 32]
953 movdqa xmm2, xmm0
954 shufps xmm0, xmm1, 0x88 // even pixels
955 shufps xmm2, xmm1, 0xdd // odd pixels
956 pavgb xmm0, xmm2
957 sub ecx, 4
958 movdqa [edx], xmm0
959 lea edx, [edx + 16]
960 jg wloop
961
962 ret
963 }
964 }
965
966 // Blends 8x2 rectangle to 4x1.
967 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
968 __declspec(naked) __declspec(align(16))
969 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
970 ptrdiff_t src_stride,
971 uint8* dst_argb, int dst_width) {
972 __asm {
973 push esi
974 mov eax, [esp + 4 + 4] // src_argb
975 mov esi, [esp + 4 + 8] // src_stride
976 mov edx, [esp + 4 + 12] // dst_argb
977 mov ecx, [esp + 4 + 16] // dst_width
978
979 align 4
980 wloop:
981 movdqa xmm0, [eax]
982 movdqa xmm1, [eax + 16]
983 movdqa xmm2, [eax + esi]
984 movdqa xmm3, [eax + esi + 16]
985 lea eax, [eax + 32]
986 pavgb xmm0, xmm2 // average rows
987 pavgb xmm1, xmm3
988 movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
989 shufps xmm0, xmm1, 0x88 // even pixels
990 shufps xmm2, xmm1, 0xdd // odd pixels
991 pavgb xmm0, xmm2
992 sub ecx, 4
993 movdqa [edx], xmm0
994 lea edx, [edx + 16]
995 jg wloop
996
997 pop esi
998 ret
999 }
1000 }
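
Because an ARGB pixel is one dword, shufps with 0x88/0xdd gathers whole even/odd pixels, and pavgb then averages them per byte channel. A scalar sketch with pavgb's round-up at both stages:

// Scalar sketch of the ARGB 2x2 box: average two rows, then the
// even/odd pixel pair, independently for each of the 4 channels.
void ScaleARGBRowDown2Box_C_sketch(const uint8_t* src_argb,
                                   ptrdiff_t src_stride,
                                   uint8_t* dst_argb, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    for (int ch = 0; ch < 4; ++ch) {
      const uint8_t* p = src_argb + x * 8 + ch;
      int r0 = (p[0] + p[src_stride] + 1) >> 1;      // left column
      int r1 = (p[4] + p[src_stride + 4] + 1) >> 1;  // right column
      dst_argb[x * 4 + ch] = (uint8_t)((r0 + r1 + 1) >> 1);
    }
  }
}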
1001
1002 // Reads 4 pixels at a time.
1003 // Alignment requirement: dst_argb 16 byte aligned.
1004 __declspec(naked) __declspec(align(16))
1005 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
1006 int src_stepx,
1007 uint8* dst_argb, int dst_width) {
1008 __asm {
1009 push ebx
1010 push edi
1011 mov eax, [esp + 8 + 4] // src_argb
1012 // src_stride ignored
1013 mov ebx, [esp + 8 + 12] // src_stepx
1014 mov edx, [esp + 8 + 16] // dst_argb
1015 mov ecx, [esp + 8 + 20] // dst_width
1016 lea ebx, [ebx * 4]
1017 lea edi, [ebx + ebx * 2]
1018
1019 align 4
1020 wloop:
1021 movd xmm0, [eax]
1022 movd xmm1, [eax + ebx]
1023 punpckldq xmm0, xmm1
1024 movd xmm2, [eax + ebx * 2]
1025 movd xmm3, [eax + edi]
1026 lea eax, [eax + ebx * 4]
1027 punpckldq xmm2, xmm3
1028 punpcklqdq xmm0, xmm2
1029 sub ecx, 4
1030 movdqa [edx], xmm0
1031 lea edx, [edx + 16]
1032 jg wloop
1033
1034 pop edi
1035 pop ebx
1036 ret
1037 }
1038 }
1039
1040 // Blends four 2x2 to 4x1.
1041 // Alignment requirement: dst_argb 16 byte aligned.
1042 __declspec(naked) __declspec(align(16))
1043 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
1044 ptrdiff_t src_stride,
1045 int src_stepx,
1046 uint8* dst_argb, int dst_width) {
1047 __asm {
1048 push ebx
1049 push esi
1050 push edi
1051 mov eax, [esp + 12 + 4] // src_argb
1052 mov esi, [esp + 12 + 8] // src_stride
1053 mov ebx, [esp + 12 + 12] // src_stepx
1054 mov edx, [esp + 12 + 16] // dst_argb
1055 mov ecx, [esp + 12 + 20] // dst_width
1056 lea esi, [eax + esi] // row1 pointer
1057 lea ebx, [ebx * 4]
1058 lea edi, [ebx + ebx * 2]
1059
1060 align 4
1061 wloop:
1062 movq xmm0, qword ptr [eax] // row0 4 pairs
1063 movhps xmm0, qword ptr [eax + ebx]
1064 movq xmm1, qword ptr [eax + ebx * 2]
1065 movhps xmm1, qword ptr [eax + edi]
1066 lea eax, [eax + ebx * 4]
1067 movq xmm2, qword ptr [esi] // row1 4 pairs
1068 movhps xmm2, qword ptr [esi + ebx]
1069 movq xmm3, qword ptr [esi + ebx * 2]
1070 movhps xmm3, qword ptr [esi + edi]
1071 lea esi, [esi + ebx * 4]
1072 pavgb xmm0, xmm2 // average rows
1073 pavgb xmm1, xmm3
1074 movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
1075 shufps xmm0, xmm1, 0x88 // even pixels
1076 shufps xmm2, xmm1, 0xdd // odd pixels
1077 pavgb xmm0, xmm2
1078 sub ecx, 4
1079 movdqa [edx], xmm0
1080 lea edx, [edx + 16]
1081 jg wloop
1082
1083 pop edi
1084 pop esi
1085 pop ebx
1086 ret
1087 }
1088 }
1089
1090 // Column scaling unfiltered. SSE2 version.
1091 __declspec(naked) __declspec(align(16))
1092 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
1093 int dst_width, int x, int dx) {
1094 __asm {
1095 push edi
1096 push esi
1097 mov edi, [esp + 8 + 4] // dst_argb
1098 mov esi, [esp + 8 + 8] // src_argb
1099 mov ecx, [esp + 8 + 12] // dst_width
1100 movd xmm2, [esp + 8 + 16] // x
1101 movd xmm3, [esp + 8 + 20] // dx
1102
1103 pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
1104 pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
1105 paddd xmm2, xmm0
1106 paddd xmm3, xmm3 // 0, 0, 0, dx * 2
1107 pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
1108 paddd xmm2, xmm0 // x3 x2 x1 x0
1109 paddd xmm3, xmm3 // 0, 0, 0, dx * 4
1110 pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
1111
1112 pextrw eax, xmm2, 1 // get x0 integer.
1113 pextrw edx, xmm2, 3 // get x1 integer.
1114
1115 cmp ecx, 0
1116 jle xloop99
1117 sub ecx, 4
1118 jl xloop49
1119
1120 // 4 Pixel loop.
1121 align 4
1122 xloop4:
1123 movd xmm0, [esi + eax * 4] // 1 source x0 pixels
1124 movd xmm1, [esi + edx * 4] // 1 source x1 pixels
1125 pextrw eax, xmm2, 5 // get x2 integer.
1126 pextrw edx, xmm2, 7 // get x3 integer.
1127 paddd xmm2, xmm3 // x += dx
1128 punpckldq xmm0, xmm1 // x0 x1
1129
1130 movd xmm1, [esi + eax * 4] // 1 source x2 pixels
1131 movd xmm4, [esi + edx * 4] // 1 source x3 pixels
1132 pextrw eax, xmm2, 1 // get x0 integer. next iteration.
1133 pextrw edx, xmm2, 3 // get x1 integer. next iteration.
1134 punpckldq xmm1, xmm4 // x2 x3
1135 punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
1136 sub ecx, 4 // 4 pixels
1137 movdqu [edi], xmm0
1138 lea edi, [edi + 16]
1139 jge xloop4
1140
1141 align 4
1142 xloop49:
1143 test ecx, 2
1144 je xloop29
1145
1146 // 2 Pixels.
1147 movd xmm0, [esi + eax * 4] // 1 source x0 pixels
1148 movd xmm1, [esi + edx * 4] // 1 source x1 pixels
1149 pextrw eax, xmm2, 5 // get x2 integer.
1150 punpckldq xmm0, xmm1 // x0 x1
1151
1152 movq qword ptr [edi], xmm0
1153 lea edi, [edi + 8]
1154
1155 xloop29:
1156 test ecx, 1
1157 je xloop99
1158
1159 // 1 Pixel.
1160 movd xmm0, [esi + eax * 4] // 1 source x2 pixels
1161 movd dword ptr [edi], xmm0
1162 align 4
1163 xloop99:
1164
1165 pop esi
1166 pop edi
1167 ret
1168 }
1169 }
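
Unfiltered ARGB column scaling is nearest-neighbor on whole 4-byte pixels, with the same 16.16 x/dx stepping as the filtered versions:

// Scalar equivalent: copy the source pixel at x >> 16, step x by dx.
void ScaleARGBCols_C_sketch(uint8_t* dst_argb, const uint8_t* src_argb,
                            int dst_width, int x, int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  for (int j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}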
1170
1171 // Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
1172 // TODO(fbarchard): Port to Neon
1173
1174 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
1175 static uvec8 kShuffleColARGB = {
1176 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
1177 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
1178 };
1179
1180 // Shuffle table for duplicating 2 fractions into 8 bytes each
1181 static uvec8 kShuffleFractions = {
1182 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
1183 };
1184
1185 __declspec(naked) __declspec(align(16))
1186 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
1187 int dst_width, int x, int dx) {
1188 __asm {
1189 push esi
1190 push edi
1191 mov edi, [esp + 8 + 4] // dst_argb
1192 mov esi, [esp + 8 + 8] // src_argb
1193 mov ecx, [esp + 8 + 12] // dst_width
1194 movd xmm2, [esp + 8 + 16] // x
1195 movd xmm3, [esp + 8 + 20] // dx
1196 movdqa xmm4, kShuffleColARGB
1197 movdqa xmm5, kShuffleFractions
1198 pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
1199 psrlw xmm6, 9
1200 pextrw eax, xmm2, 1 // get x0 integer. preroll
1201 sub ecx, 2
1202 jl xloop29
1203
1204 movdqa xmm0, xmm2 // x1 = x0 + dx
1205 paddd xmm0, xmm3
1206 punpckldq xmm2, xmm0 // x0 x1
1207 punpckldq xmm3, xmm3 // dx dx
1208 paddd xmm3, xmm3 // dx * 2, dx * 2
1209 pextrw edx, xmm2, 3 // get x1 integer. preroll
1210
1211 // 2 Pixel loop.
1212 align 4
1213 xloop2:
1214 movdqa xmm1, xmm2 // x0, x1 fractions.
1215 paddd xmm2, xmm3 // x += dx
1216 movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
1217 psrlw xmm1, 9 // 7 bit fractions.
1218 movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
1219 pshufb xmm1, xmm5 // 0000000011111111
1220 pshufb xmm0, xmm4 // arrange pixels into pairs
1221 pxor xmm1, xmm6 // 0..7f and 7f..0
1222 pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
1223 pextrw eax, xmm2, 1 // get x0 integer. next iteration.
1224 pextrw edx, xmm2, 3 // get x1 integer. next iteration.
1225 psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
1226 packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
1227 movq qword ptr [edi], xmm0
1228 lea edi, [edi + 8]
1229 sub ecx, 2 // 2 pixels
1230 jge xloop2
1231
1232 align 4
1233 xloop29:
1234
1235 add ecx, 2 - 1
1236 jl xloop99
1237
1238 // 1 pixel remainder
1239 psrlw xmm2, 9 // 7 bit fractions.
1240 movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
1241 pshufb xmm2, xmm5 // 00000000
1242 pshufb xmm0, xmm4 // arrange pixels into pairs
1243 pxor xmm2, xmm6 // 0..7f and 7f..0
1244 pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
1245 psrlw xmm0, 7
1246 packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
1247 movd [edi], xmm0
1248
1249 align 4
1250 xloop99:
1251
1252 pop edi
1253 pop esi
1254 ret
1255 }
1256 }
1257
1258 // Reads 4 pixels, duplicates them and writes 8 pixels.
1259 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
1260 __declspec(naked) __declspec(align(16))
1261 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
1262 int dst_width, int x, int dx) {
1263 __asm {
1264 mov edx, [esp + 4] // dst_argb
1265 mov eax, [esp + 8] // src_argb
1266 mov ecx, [esp + 12] // dst_width
1267
1268 align 4
1269 wloop:
1270 movdqa xmm0, [eax]
1271 lea eax, [eax + 16]
1272 movdqa xmm1, xmm0
1273 punpckldq xmm0, xmm0
1274 punpckhdq xmm1, xmm1
1275 sub ecx, 8
1276 movdqa [edx], xmm0
1277 movdqa [edx + 16], xmm1
1278 lea edx, [edx + 32]
1279 jg wloop
1280
1281 ret
1282 }
1283 }
1284
1285 // Divide num by div and return as 16.16 fixed point result.
1286 __declspec(naked) __declspec(align(16))
1287 int FixedDiv_X86(int num, int div) {
1288 __asm {
1289 mov eax, [esp + 4] // num
1290 cdq // extend num to 64 bits
1291 shld edx, eax, 16 // 32.16
1292 shl eax, 16
1293 idiv dword ptr [esp + 8]
1294 ret
1295 }
1296 }
1297
1298 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
1299 __declspec(naked) __declspec(align(16))
1300 int FixedDiv1_X86(int num, int div) {
1301 __asm {
1302 mov eax, [esp + 4] // num
1303 mov ecx, [esp + 8] // denom
1304 cdq // extend num to 64 bits
1305 shld edx, eax, 16 // 32.16
1306 shl eax, 16
1307 sub eax, 0x00010001
1308 sbb edx, 0
1309 sub ecx, 1
1310 idiv ecx
1311 ret
1312 }
1313 }
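
Both helpers form a 64-bit dividend so the 16-bit shift cannot overflow: cdq/shld build the high half and idiv performs the 64/32 divide. Scalar equivalents:

// (num << 16) / div, as a 16.16 fixed point value.
int FixedDiv_C_sketch(int num, int div) {
  return (int)((((int64_t)num) << 16) / div);
}

// ((num << 16) - 0x00010001) / (div - 1): approximately (num - 1) /
// (div - 1) in 16.16 fixed point, matching the sub/sbb sequence above.
int FixedDiv1_C_sketch(int num, int div) {
  return (int)(((((int64_t)num) << 16) - 0x00010001) / (div - 1));
}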
1314
1315 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
1316
1317 #ifdef __cplusplus
1318 } // extern "C"
1319 } // namespace libyuv
1320 #endif