Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(227)

Side by Side Diff: source/libvpx/vpx_scale/win32/scaleopt.c

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 11
12 /**************************************************************************** 12 /****************************************************************************
13 * 13 *
14 * Module Title : scaleopt.cpp 14 * Module Title : scaleopt.cpp
15 * 15 *
16 * Description : Optimized scaling functions 16 * Description : Optimized scaling functions
17 * 17 *
18 ****************************************************************************/ 18 ****************************************************************************/
19 #include "pragmas.h" 19 #include "pragmas.h"
20 20
21
22
23 /**************************************************************************** 21 /****************************************************************************
24 * Module Statics 22 * Module Statics
25 ****************************************************************************/ 23 ****************************************************************************/
26 __declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 };
27 __declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
28 __declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
29 __declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
30 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 }; 24 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
31 __declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
32 __declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 };
33 __declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
34 __declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
35 __declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 };
36 __declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 };
37
38
39 25
40 #include "vpx_scale/vpxscale.h" 26 #include "vpx_scale/vpxscale.h"
41 #include "vpx_mem/vpx_mem.h" 27 #include "vpx_mem/vpx_mem.h"
42 28
43 /****************************************************************************
44 *
45 * ROUTINE : horizontal_line_3_5_scale_mmx
46 *
47 * INPUTS : const unsigned char *source :
48 * unsigned int source_width :
49 * unsigned char *dest :
50 * unsigned int dest_width :
51 *
52 * OUTPUTS : None.
53 *
54 * RETURNS : void
55 *
56 * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
57 *
58 * SPECIAL NOTES : None.
59 *
60 ****************************************************************************/
61 static
62 void horizontal_line_3_5_scale_mmx
63 (
64 const unsigned char *source,
65 unsigned int source_width,
66 unsigned char *dest,
67 unsigned int dest_width
68 ) {
69 (void) dest_width;
70
71 __asm {
72
73 push ebx
74
75 mov esi, source
76 mov edi, dest
77
78 mov ecx, source_width
79 lea edx, [esi+ecx-3];
80
81 movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx
82 movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx
83
84 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
85 pxor mm7, mm7 // clear mm7
86
87 horiz_line_3_5_loop:
88
89 mov eax, DWORD PTR [esi] // eax = 00 01 02 03
90 mov ebx, eax
91
92 and ebx, 0xffff00 // ebx = xx 01 02 xx
93 mov ecx, eax // ecx = 00 01 02 03
94
95 and eax, 0xffff0000 // eax = xx xx 02 03
96 xor ecx, eax // ecx = 00 01 xx xx
97
98 shr ebx, 8 // ebx = 01 02 xx xx
99 or eax, ebx // eax = 01 02 02 03
100
101 shl ebx, 16 // ebx = xx xx 01 02
102 movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx
103
104 or ebx, ecx // ebx = 00 01 01 02
105 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx
106
107 movd mm0, ebx // mm0 = 00 01 01 02
108 pmullw mm1, mm6 //
109
110 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
111 pmullw mm0, mm5 //
112
113 mov [edi], ebx // writeoutput 00 xx xx xx
114 add esi, 3
115
116 add edi, 5
117 paddw mm0, mm1
118
119 paddw mm0, mm4
120 psrlw mm0, 8
121
122 cmp esi, edx
123 packuswb mm0, mm7
124
125 movd DWORD Ptr [edi-4], mm0
126 jl horiz_line_3_5_loop
127
128 // Exit:
129 mov eax, DWORD PTR [esi] // eax = 00 01 02 03
130 mov ebx, eax
131
132 and ebx, 0xffff00 // ebx = xx 01 02 xx
133 mov ecx, eax // ecx = 00 01 02 03
134
135 and eax, 0xffff0000 // eax = xx xx 02 03
136 xor ecx, eax // ecx = 00 01 xx xx
137
138 shr ebx, 8 // ebx = 01 02 xx xx
139 or eax, ebx // eax = 01 02 02 03
140
141 shl eax, 8 // eax = xx 01 02 02
142 and eax, 0xffff0000 // eax = xx xx 02 02
143
144 or eax, ebx // eax = 01 02 02 02
145
146 shl ebx, 16 // ebx = xx xx 01 02
147 movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx
148
149 or ebx, ecx // ebx = 00 01 01 02
150 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx
151
152 movd mm0, ebx // mm0 = 00 01 01 02
153 pmullw mm1, mm6 //
154
155 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
156 pmullw mm0, mm5 //
157
158 mov [edi], ebx // writeoutput 00 xx xx xx
159 paddw mm0, mm1
160
161 paddw mm0, mm4
162 psrlw mm0, 8
163
164 packuswb mm0, mm7
165 movd DWORD Ptr [edi+1], mm0
166
167 pop ebx
168
169 }
170
171 }
172
173
174 /****************************************************************************
175 *
176 * ROUTINE : horizontal_line_4_5_scale_mmx
177 *
178 * INPUTS : const unsigned char *source :
179 * unsigned int source_width :
180 * unsigned char *dest :
181 * unsigned int dest_width :
182 *
183 * OUTPUTS : None.
184 *
185 * RETURNS : void
186 *
187 * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
188 *
189 * SPECIAL NOTES : None.
190 *
191 ****************************************************************************/
192 static
193 void horizontal_line_4_5_scale_mmx
194 (
195 const unsigned char *source,
196 unsigned int source_width,
197 unsigned char *dest,
198 unsigned int dest_width
199 ) {
200 (void)dest_width;
201
202 __asm {
203
204 mov esi, source
205 mov edi, dest
206
207 mov ecx, source_width
208 lea edx, [esi+ecx-8];
209
210 movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx
211 movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx
212
213 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
214 pxor mm7, mm7 // clear mm7
215
216 horiz_line_4_5_loop:
217
218 movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07
219 movq mm1, QWORD PTR [esi+1]; // mm1 = 01 02 03 04 05 06 07 08
220
221 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
222 movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08
223
224 movd DWORD PTR [edi], mm0 // write output 00 xx xx xx
225 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
226
227 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
228 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
229
230 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
231 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
232
233 movd DWORD PTR [edi+5], mm2 // write output 05 xx xx xx
234 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
235
236 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
237 pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51
238
239 paddw mm0, mm1 // added round values
240 paddw mm0, mm4
241
242 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
243 packuswb mm0, mm7
244
245 movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04
246 add edi, 10
247
248 add esi, 8
249 paddw mm2, mm3 //
250
251 paddw mm2, mm4 // added round values
252 cmp esi, edx
253
254 psrlw mm2, 8
255 packuswb mm2, mm7
256
257 movd DWORD PTR [edi-4], mm2 // writeoutput 06 07 08 09
258 jl horiz_line_4_5_loop
259
260 // Exit:
261 movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07
262 movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
263
264 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
265 psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
266
267 movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00
268 pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00
269
270 psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07
271 por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
272
273 movq mm3, mm1
274
275 movd DWORD PTR [edi], mm0 // write output 00 xx xx xx
276 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
277
278 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
279 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
280
281 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
282 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
283
284 movd DWORD PTR [edi+5], mm2 // write output 05 xx xx xx
285 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
286
287 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 07 xx
288 pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51
289
290 paddw mm0, mm1 // added round values
291 paddw mm0, mm4
292
293 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
294 packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx
295
296 movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04
297 paddw mm2, mm3 //
298
299 paddw mm2, mm4 // added round values
300 psrlw mm2, 8
301
302 packuswb mm2, mm7
303 movd DWORD PTR [edi+6], mm2 // writeoutput 06 07 08 09
304
305
306 }
307 }
308
309 /****************************************************************************
310 *
311 * ROUTINE : vertical_band_4_5_scale_mmx
312 *
313 * INPUTS : unsigned char *dest :
314 * unsigned int dest_pitch :
315 * unsigned int dest_width :
316 *
317 * OUTPUTS : None.
318 *
319 * RETURNS : void
320 *
321 * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
322 *
323 * SPECIAL NOTES : The routine uses the first line of the band below
324 * the current band. The function also has a "C" only
325 * version.
326 *
327 ****************************************************************************/
328 static
329 void vertical_band_4_5_scale_mmx
330 (
331 unsigned char *dest,
332 unsigned int dest_pitch,
333 unsigned int dest_width
334 ) {
335 __asm {
336
337 mov esi, dest // Get the source and destination pointer
338 mov ecx, dest_pitch // Get the pitch size
339
340 lea edi, [esi+ecx*2] // two lines below
341 add edi, ecx // three lines below
342
343 pxor mm7, mm7 // clear out mm7
344 mov edx, dest_width // Loop counter
345
346 vs_4_5_loop:
347
348 movq mm0, QWORD ptr [esi] // src[0];
349 movq mm1, QWORD ptr [esi+ecx] // src[1];
350
351 movq mm2, mm0 // Make a copy
352 punpcklbw mm0, mm7 // unpack low to word
353
354 movq mm5, one_fifth
355 punpckhbw mm2, mm7 // unpack high to word
356
357 pmullw mm0, mm5 // a * 1/5
358
359 movq mm3, mm1 // make a copy
360 punpcklbw mm1, mm7 // unpack low to word
361
362 pmullw mm2, mm5 // a * 1/5
363 movq mm6, four_fifths // constant: mm6 = 4/5
364
365 movq mm4, mm1 // copy of low b
366 pmullw mm4, mm6 // b * 4/5
367
368 punpckhbw mm3, mm7 // unpack high to word
369 movq mm5, mm3 // copy of high b
370
371 pmullw mm5, mm6 // b * 4/5
372 paddw mm0, mm4 // a * 1/5 + b * 4/5
373
374 paddw mm2, mm5 // a * 1/5 + b * 4/5
375 paddw mm0, round_values // + 128
376
377 paddw mm2, round_values // + 128
378 psrlw mm0, 8
379
380 psrlw mm2, 8
381 packuswb mm0, mm2 // des [1]
382
383 movq QWORD ptr [esi+ecx], mm0 // write des[1]
384 movq mm0, [esi+ecx*2] // mm0 = src[2]
385
386 // mm1, mm3 --- Src[1]
387 // mm0 --- Src[2]
388 // mm7 for unpacking
389
390 movq mm5, two_fifths
391 movq mm2, mm0 // make a copy
392
393 pmullw mm1, mm5 // b * 2/5
394 movq mm6, three_fifths
395
396
397 punpcklbw mm0, mm7 // unpack low to word
398 pmullw mm3, mm5 // b * 2/5
399
400 movq mm4, mm0 // make copy of c
401 punpckhbw mm2, mm7 // unpack high to word
402
403 pmullw mm4, mm6 // c * 3/5
404 movq mm5, mm2
405
406 pmullw mm5, mm6 // c * 3/5
407 paddw mm1, mm4 // b * 2/5 + c * 3/5
408
409 paddw mm3, mm5 // b * 2/5 + c * 3/5
410 paddw mm1, round_values // + 128
411
412 paddw mm3, round_values // + 128
413 psrlw mm1, 8
414
415 psrlw mm3, 8
416 packuswb mm1, mm3 // des[2]
417
418 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
419 movq mm1, [edi] // mm1=Src[3];
420
421 // mm0, mm2 --- Src[2]
422 // mm1 --- Src[3]
423 // mm6 --- 3/5
424 // mm7 for unpacking
425
426 pmullw mm0, mm6 // c * 3/5
427 movq mm5, two_fifths // mm5 = 2/5
428
429 movq mm3, mm1 // make a copy
430 pmullw mm2, mm6 // c * 3/5
431
432 punpcklbw mm1, mm7 // unpack low
433 movq mm4, mm1 // make a copy
434
435 punpckhbw mm3, mm7 // unpack high
436 pmullw mm4, mm5 // d * 2/5
437
438 movq mm6, mm3 // make a copy
439 pmullw mm6, mm5 // d * 2/5
440
441 paddw mm0, mm4 // c * 3/5 + d * 2/5
442 paddw mm2, mm6 // c * 3/5 + d * 2/5
443
444 paddw mm0, round_values // + 128
445 paddw mm2, round_values // + 128
446
447 psrlw mm0, 8
448 psrlw mm2, 8
449
450 packuswb mm0, mm2 // des[3]
451 movq QWORD ptr [edi], mm0 // write des[3]
452
453 // mm1, mm3 --- Src[3]
454 // mm7 -- cleared for unpacking
455
456 movq mm0, [edi+ecx*2] // mm0, Src[0] of the next group
457
458 movq mm5, four_fifths // mm5 = 4/5
459 pmullw mm1, mm5 // d * 4/5
460
461 movq mm6, one_fifth // mm6 = 1/5
462 movq mm2, mm0 // make a copy
463
464 pmullw mm3, mm5 // d * 4/5
465 punpcklbw mm0, mm7 // unpack low
466
467 pmullw mm0, mm6 // an * 1/5
468 punpckhbw mm2, mm7 // unpack high
469
470 paddw mm1, mm0 // d * 4/5 + an * 1/5
471 pmullw mm2, mm6 // an * 1/5
472
473 paddw mm3, mm2 // d * 4/5 + an * 1/5
474 paddw mm1, round_values // + 128
475
476 paddw mm3, round_values // + 128
477 psrlw mm1, 8
478
479 psrlw mm3, 8
480 packuswb mm1, mm3 // des[4]
481
482 movq QWORD ptr [edi+ecx], mm1 // write des[4]
483
484 add edi, 8
485 add esi, 8
486
487 sub edx, 8
488 jg vs_4_5_loop
489 }
490 }
491
492 /****************************************************************************
493 *
494 * ROUTINE : last_vertical_band_4_5_scale_mmx
495 *
496 * INPUTS : unsigned char *dest :
497 * unsigned int dest_pitch :
498 * unsigned int dest_width :
499 *
500 * OUTPUTS : None.
501 *
502 * RETURNS : None
503 *
504 * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
505 *
506 * SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx, this routine does not
507 * read the band below the current band: des[4] is a copy
508 * of Src[3]. The function also has a "C" only version.
509 *
510 ****************************************************************************/
511 static
512 void last_vertical_band_4_5_scale_mmx
513 (
514 unsigned char *dest,
515 unsigned int dest_pitch,
516 unsigned int dest_width
517 ) {
518 __asm {
519 mov esi, dest // Get the source and destination pointer
520 mov ecx, dest_pitch // Get the pitch size
521
522 lea edi, [esi+ecx*2] // two lines below
523 add edi, ecx // three lines below
524
525 pxor mm7, mm7 // clear out mm7
526 mov edx, dest_width // Loop counter
527
528 last_vs_4_5_loop:
529
530 movq mm0, QWORD ptr [esi] // src[0];
531 movq mm1, QWORD ptr [esi+ecx] // src[1];
532
533 movq mm2, mm0 // Make a copy
534 punpcklbw mm0, mm7 // unpack low to word
535
536 movq mm5, one_fifth
537 punpckhbw mm2, mm7 // unpack high to word
538
539 pmullw mm0, mm5 // a * 1/5
540
541 movq mm3, mm1 // make a copy
542 punpcklbw mm1, mm7 // unpack low to word
543
544 pmullw mm2, mm5 // a * 1/5
545 movq mm6, four_fifths // constant: mm6 = 4/5
546
547 movq mm4, mm1 // copy of low b
548 pmullw mm4, mm6 // b * 4/5
549
550 punpckhbw mm3, mm7 // unpack high to word
551 movq mm5, mm3 // copy of high b
552
553 pmullw mm5, mm6 // b * 4/5
554 paddw mm0, mm4 // a * 1/5 + b * 4/5
555
556 paddw mm2, mm5 // a * 1/5 + b * 4/5
557 paddw mm0, round_values // + 128
558
559 paddw mm2, round_values // + 128
560 psrlw mm0, 8
561
562 psrlw mm2, 8
563 packuswb mm0, mm2 // des [1]
564
565 movq QWORD ptr [esi+ecx], mm0 // write des[1]
566 movq mm0, [esi+ecx*2] // mm0 = src[2]
567
568 // mm1, mm3 --- Src[1]
569 // mm0 --- Src[2]
570 // mm7 for unpacking
571
572 movq mm5, two_fifths
573 movq mm2, mm0 // make a copy
574
575 pmullw mm1, mm5 // b * 2/5
576 movq mm6, three_fifths
577
578
579 punpcklbw mm0, mm7 // unpack low to word
580 pmullw mm3, mm5 // b * 2/5
581
582 movq mm4, mm0 // make copy of c
583 punpckhbw mm2, mm7 // unpack high to word
584
585 pmullw mm4, mm6 // c * 3/5
586 movq mm5, mm2
587
588 pmullw mm5, mm6 // c * 3/5
589 paddw mm1, mm4 // b * 2/5 + c * 3/5
590
591 paddw mm3, mm5 // b * 2/5 + c * 3/5
592 paddw mm1, round_values // + 128
593
594 paddw mm3, round_values // + 128
595 psrlw mm1, 8
596
597 psrlw mm3, 8
598 packuswb mm1, mm3 // des[2]
599
600 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
601 movq mm1, [edi] // mm1=Src[3];
602
603 movq QWORD ptr [edi+ecx], mm1 // write des[4];
604
605 // mm0, mm2 --- Src[2]
606 // mm1 --- Src[3]
607 // mm6 --- 3/5
608 // mm7 for unpacking
609
610 pmullw mm0, mm6 // c * 3/5
611 movq mm5, two_fifths // mm5 = 2/5
612
613 movq mm3, mm1 // make a copy
614 pmullw mm2, mm6 // c * 3/5
615
616 punpcklbw mm1, mm7 // unpack low
617 movq mm4, mm1 // make a copy
618
619 punpckhbw mm3, mm7 // unpack high
620 pmullw mm4, mm5 // d * 2/5
621
622 movq mm6, mm3 // make a copy
623 pmullw mm6, mm5 // d * 2/5
624
625 paddw mm0, mm4 // c * 3/5 + d * 2/5
626 paddw mm2, mm6 // c * 3/5 + d * 2/5
627
628 paddw mm0, round_values // + 128
629 paddw mm2, round_values // + 128
630
631 psrlw mm0, 8
632 psrlw mm2, 8
633
634 packuswb mm0, mm2 // des[3]
635 movq QWORD ptr [edi], mm0 // write des[3]
636
637 // mm1, mm3 --- Src[3]
638 // mm7 -- cleared for unpacking
639 add edi, 8
640 add esi, 8
641
642 sub edx, 8
643 jg last_vs_4_5_loop
644 }
645 }
646
647 /****************************************************************************
648 *
649 * ROUTINE : vertical_band_3_5_scale_mmx
650 *
651 * INPUTS : unsigned char *dest :
652 * unsigned int dest_pitch :
653 * unsigned int dest_width :
654 *
655 * OUTPUTS : None.
656 *
657 * RETURNS : void
658 *
659 * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
660 *
661 * SPECIAL NOTES : The routine uses the first line of the band below
662 * the current band. The function also has a "C" only
663 * version.
664 *
665 ****************************************************************************/
666 static
667 void vertical_band_3_5_scale_mmx
668 (
669 unsigned char *dest,
670 unsigned int dest_pitch,
671 unsigned int dest_width
672 ) {
673 __asm {
674 mov esi, dest // Get the source and destination pointer
675 mov ecx, dest_pitch // Get the pitch size
676
677 lea edi, [esi+ecx*2] // two lines below
678 add edi, ecx // three lines below
679
680 pxor mm7, mm7 // clear out mm7
681 mov edx, dest_width // Loop counter
682
683 vs_3_5_loop:
684
685 movq mm0, QWORD ptr [esi] // src[0];
686 movq mm1, QWORD ptr [esi+ecx] // src[1];
687
688 movq mm2, mm0 // Make a copy
689 punpcklbw mm0, mm7 // unpack low to word
690
691 movq mm5, two_fifths // mm5 = 2/5
692 punpckhbw mm2, mm7 // unpack high to word
693
694 pmullw mm0, mm5 // a * 2/5
695
696 movq mm3, mm1 // make a copy
697 punpcklbw mm1, mm7 // unpack low to word
698
699 pmullw mm2, mm5 // a * 2/5
700 movq mm6, three_fifths // mm6 = 3/5
701
702 movq mm4, mm1 // copy of low b
703 pmullw mm4, mm6 // b * 3/5
704
705 punpckhbw mm3, mm7 // unpack high to word
706 movq mm5, mm3 // copy of high b
707
708 pmullw mm5, mm6 // b * 3/5
709 paddw mm0, mm4 // a * 2/5 + b * 3/5
710
711 paddw mm2, mm5 // a * 2/5 + b * 3/5
712 paddw mm0, round_values // + 128
713
714 paddw mm2, round_values // + 128
715 psrlw mm0, 8
716
717 psrlw mm2, 8
718 packuswb mm0, mm2 // des [1]
719
720 movq QWORD ptr [esi+ecx], mm0 // write des[1]
721 movq mm0, [esi+ecx*2] // mm0 = src[2]
722
723 // mm1, mm3 --- Src[1]
724 // mm0 --- Src[2]
725 // mm7 for unpacking
726
727 movq mm4, mm1 // b low
728 pmullw mm1, four_fifths // b * 4/5 low
729
730 movq mm5, mm3 // b high
731 pmullw mm3, four_fifths // b * 4/5 high
732
733 movq mm2, mm0 // c
734 pmullw mm4, one_fifth // b * 1/5
735
736 punpcklbw mm0, mm7 // c low
737 pmullw mm5, one_fifth // b * 1/5
738
739 movq mm6, mm0 // make copy of c low
740 punpckhbw mm2, mm7 // c high
741
742 pmullw mm6, one_fifth // c * 1/5 low
743 movq mm7, mm2 // make copy of c high
744
745 pmullw mm7, one_fifth // c * 1/5 high
746 paddw mm1, mm6 // b * 4/5 + c * 1/5 low
747
748 paddw mm3, mm7 // b * 4/5 + c * 1/5 high
749 movq mm6, mm0 // make copy of c low
750
751 pmullw mm6, four_fifths // c * 4/5 low
752 movq mm7, mm2 // make copy of c high
753
754 pmullw mm7, four_fifths // c * 4/5 high
755
756 paddw mm4, mm6 // b * 1/5 + c * 4/5 low
757 paddw mm5, mm7 // b * 1/5 + c * 4/5 high
758
759 paddw mm1, round_values // + 128
760 paddw mm3, round_values // + 128
761
762 psrlw mm1, 8
763 psrlw mm3, 8
764
765 packuswb mm1, mm3 // des[2]
766 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
767
768 paddw mm4, round_values // + 128
769 paddw mm5, round_values // + 128
770
771 psrlw mm4, 8
772 psrlw mm5, 8
773
774 packuswb mm4, mm5 // des[3]
775 movq QWORD ptr [edi], mm4 // write des[3]
776
777 // mm0, mm2 --- Src[2] (c low, c high)
778
779 pxor mm7, mm7 // clear mm7 for unpacking
780 movq mm1, [edi+ecx*2] // mm1 = Src[0] of the next group
781
782 movq mm5, three_fifths // mm5 = 3/5
783 pmullw mm0, mm5 // c * 3/5
784
785 movq mm6, two_fifths // mm6 = 2/5
786 movq mm3, mm1 // make a copy
787
788 pmullw mm2, mm5 // c * 3/5
789 punpcklbw mm1, mm7 // unpack low
790
791 pmullw mm1, mm6 // an * 2/5
792 punpckhbw mm3, mm7 // unpack high
793
794 paddw mm0, mm1 // c * 3/5 + an * 2/5
795 pmullw mm3, mm6 // an * 2/5
796
797 paddw mm2, mm3 // c * 3/5 + an * 2/5
798 paddw mm0, round_values // + 128
799
800 paddw mm2, round_values // + 128
801 psrlw mm0, 8
802
803 psrlw mm2, 8
804 packuswb mm0, mm2 // des[4]
805
806 movq QWORD ptr [edi+ecx], mm0 // write des[4]
807
808 add edi, 8
809 add esi, 8
810
811 sub edx, 8
812 jg vs_3_5_loop
813 }
814 }
815
816 /****************************************************************************
817 *
818 * ROUTINE : last_vertical_band_3_5_scale_mmx
819 *
820 * INPUTS : unsigned char *dest :
821 * unsigned int dest_pitch :
822 * unsigned int dest_width :
823 *
824 * OUTPUTS : None.
825 *
826 * RETURNS : void
827 *
828 * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
829 *
830 * SPECIAL NOTES : Unlike vertical_band_3_5_scale_mmx, this routine does not
831 * read the band below the current band: des[4] is a copy
832 * of src[2]. The function also has a "C" only version.
833 *
834 ****************************************************************************/
835 static
836 void last_vertical_band_3_5_scale_mmx
837 (
838 unsigned char *dest,
839 unsigned int dest_pitch,
840 unsigned int dest_width
841 ) {
842 __asm {
843 mov esi, dest // Get the source and destination pointer
844 mov ecx, dest_pitch // Get the pitch size
845
846 lea edi, [esi+ecx*2] // two lines below
847 add edi, ecx // three lines below
848
849 pxor mm7, mm7 // clear out mm7
850 mov edx, dest_width // Loop counter
851
852
853 last_vs_3_5_loop:
854
855 movq mm0, QWORD ptr [esi] // src[0];
856 movq mm1, QWORD ptr [esi+ecx] // src[1];
857
858 movq mm2, mm0 // Make a copy
859 punpcklbw mm0, mm7 // unpack low to word
860
861 movq mm5, two_fifths // mm5 = 2/5
862 punpckhbw mm2, mm7 // unpack high to word
863
864 pmullw mm0, mm5 // a * 2/5
865
866 movq mm3, mm1 // make a copy
867 punpcklbw mm1, mm7 // unpack low to word
868
869 pmullw mm2, mm5 // a * 2/5
870 movq mm6, three_fifths // mm6 = 3/5
871
872 movq mm4, mm1 // copy of low b
873 pmullw mm4, mm6 // b * 3/5
874
875 punpckhbw mm3, mm7 // unpack high to word
876 movq mm5, mm3 // copy of high b
877
878 pmullw mm5, mm6 // b * 3/5
879 paddw mm0, mm4 // a * 2/5 + b * 3/5
880
881 paddw mm2, mm5 // a * 2/5 + b * 3/5
882 paddw mm0, round_values // + 128
883
884 paddw mm2, round_values // + 128
885 psrlw mm0, 8
886
887 psrlw mm2, 8
888 packuswb mm0, mm2 // des [1]
889
890 movq QWORD ptr [esi+ecx], mm0 // write des[1]
891 movq mm0, [esi+ecx*2] // mm0 = src[2]
892
893
894
895 // mm1, mm3 --- Src[1]
896 // mm0 --- Src[2]
897 // mm7 for unpacking
898
899 movq mm4, mm1 // b low
900 pmullw mm1, four_fifths // b * 4/5 low
901
902 movq QWORD ptr [edi+ecx], mm0 // write des[4]
903
904 movq mm5, mm3 // b high
905 pmullw mm3, four_fifths // b * 4/5 high
906
907 movq mm2, mm0 // c
908 pmullw mm4, one_fifth // b * 1/5
909
910 punpcklbw mm0, mm7 // c low
911 pmullw mm5, one_fifth // b * 1/5
912
913 movq mm6, mm0 // make copy of c low
914 punpckhbw mm2, mm7 // c high
915
916 pmullw mm6, one_fifth // c * 1/5 low
917 movq mm7, mm2 // make copy of c high
918
919 pmullw mm7, one_fifth // c * 1/5 high
920 paddw mm1, mm6 // b * 4/5 + c * 1/5 low
921
922 paddw mm3, mm7 // b * 4/5 + c * 1/5 high
923 movq mm6, mm0 // make copy of c low
924
925 pmullw mm6, four_fifths // c * 4/5 low
926 movq mm7, mm2 // make copy of c high
927
928 pmullw mm7, four_fifths // c * 4/5 high
929
930 paddw mm4, mm6 // b * 1/5 + c * 4/5 low
931 paddw mm5, mm7 // b * 1/5 + c * 4/5 high
932
933 paddw mm1, round_values // + 128
934 paddw mm3, round_values // + 128
935
936 psrlw mm1, 8
937 psrlw mm3, 8
938
939 packuswb mm1, mm3 // des[2]
940 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
941
942 paddw mm4, round_values // + 128
943 paddw mm5, round_values // + 128
944
945 psrlw mm4, 8
946 psrlw mm5, 8
947
948 packuswb mm4, mm5 // des[3]
949 movq QWORD ptr [edi], mm4 // write des[3]
950
951 // mm0, mm2 --- Src[2] (c low, c high)
952
953 add edi, 8
954 add esi, 8
955
956 sub edx, 8
957 jg last_vs_3_5_loop
958 }
959 }
960
961 /****************************************************************************
962 *
963 * ROUTINE : vertical_band_1_2_scale_mmx
964 *
965 * INPUTS : unsigned char *dest :
966 * unsigned int dest_pitch :
967 * unsigned int dest_width :
968 *
969 * OUTPUTS : None.
970 *
971 * RETURNS : void
972 *
973 * FUNCTION : 1 to 2 up-scaling of a band of pixels.
974 *
975 * SPECIAL NOTES : The routine uses the first line of the band below
976 * the current band. The function also has a "C" only
977 * version.
978 *
979 ****************************************************************************/
980 static
981 void vertical_band_1_2_scale_mmx
982 (
983 unsigned char *dest,
984 unsigned int dest_pitch,
985 unsigned int dest_width
986 ) {
987 __asm {
988
989 mov esi, dest // Get the source and destination pointer
990 mov ecx, dest_pitch // Get the pitch size
991
992 pxor mm7, mm7 // clear out mm7
993 mov edx, dest_width // Loop counter
994
995 vs_1_2_loop:
996
997 movq mm0, [esi] // get Src[0]
998 movq mm1, [esi + ecx * 2] // get Src[1]
999
1000 movq mm2, mm0 // make copy before unpack
1001 movq mm3, mm1 // make copy before unpack
1002
1003 punpcklbw mm0, mm7 // low Src[0]
1004 movq mm6, four_ones // mm6= 1, 1, 1, 1
1005
1006 punpcklbw mm1, mm7 // low Src[1]
1007 paddw mm0, mm1 // low (a + b)
1008
1009 punpckhbw mm2, mm7 // high Src[0]
1010 paddw mm0, mm6 // low (a + b + 1)
1011
1012 punpckhbw mm3, mm7
1013 paddw mm2, mm3 // high (a + b )
1014
1015 psraw mm0, 1 // low (a + b +1 )/2
1016 paddw mm2, mm6 // high (a + b + 1)
1017
1018 psraw mm2, 1 // high (a + b + 1)/2
1019 packuswb mm0, mm2 // pack results
1020
1021 movq [esi+ecx], mm0 // write out eight bytes
1022 add esi, 8
1023
1024 sub edx, 8
1025 jg vs_1_2_loop
1026 }
1027
1028 }
1029
1030 /****************************************************************************
1031 *
1032 * ROUTINE : last_vertical_band_1_2_scale_mmx
1033 *
1034 * INPUTS : unsigned char *dest :
1035 * unsigned int dest_pitch :
1036 * unsigned int dest_width :
1037 *
1038 * OUTPUTS : None.
1039 *
1040 * RETURNS : void
1041 *
1042 * FUNCTION : 1 to 2 up-scaling of band of pixels.
1043 *
1044 * SPECIAL NOTES : Unlike vertical_band_1_2_scale_mmx, this routine does not
1045 * read the band below the current band: the line at dest is
1046 * copied to the line at dest + dest_pitch. The function also has a "C" only version.
1047 *
1048 ****************************************************************************/
1049 static
1050 void last_vertical_band_1_2_scale_mmx
1051 (
1052 unsigned char *dest,
1053 unsigned int dest_pitch,
1054 unsigned int dest_width
1055 ) {
1056 __asm {
1057 mov esi, dest // Get the source and destination pointer
1058 mov ecx, dest_pitch // Get the pitch size
1059
1060 mov edx, dest_width // Loop counter
1061
1062 last_vs_1_2_loop:
1063
1064 movq mm0, [esi] // get Src[0]
1065 movq [esi+ecx], mm0 // write out eight bytes
1066
1067 add esi, 8
1068 sub edx, 8
1069
1070 jg last_vs_1_2_loop
1071 }
1072 }
1073
1074 /****************************************************************************
1075 *
1076 * ROUTINE : horizontal_line_1_2_scale_mmx
1077 *
1078 * INPUTS : const unsigned char *source :
1079 * unsigned int source_width :
1080 * unsigned char *dest :
1081 * unsigned int dest_width :
1082 *
1083 * OUTPUTS : None.
1084 *
1085 * RETURNS : void
1086 *
1087 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
1088 *
1089 * SPECIAL NOTES : None.
1090 *
1091 ****************************************************************************/
1092 static
1093 void horizontal_line_1_2_scale_mmx
1094 (
1095 const unsigned char *source,
1096 unsigned int source_width,
1097 unsigned char *dest,
1098 unsigned int dest_width
1099 ) {
1100 (void) dest_width;
1101
1102 __asm {
1103 mov esi, source
1104 mov edi, dest
1105
1106 pxor mm7, mm7
1107 movq mm6, four_ones
1108
1109 mov ecx, source_width
1110
1111 hs_1_2_loop:
1112
1113 movq mm0, [esi]
1114 movq mm1, [esi+1]
1115
1116 movq mm2, mm0
1117 movq mm3, mm1
1118
1119 movq mm4, mm0
1120 punpcklbw mm0, mm7
1121
1122 punpcklbw mm1, mm7
1123 paddw mm0, mm1
1124
1125 paddw mm0, mm6
1126 punpckhbw mm2, mm7
1127
1128 punpckhbw mm3, mm7
1129 paddw mm2, mm3
1130
1131 paddw mm2, mm6
1132 psraw mm0, 1
1133
1134 psraw mm2, 1
1135 packuswb mm0, mm2
1136
1137 movq mm2, mm4
1138 punpcklbw mm2, mm0
1139
1140 movq [edi], mm2
1141 punpckhbw mm4, mm0
1142
1143 movq [edi+8], mm4
1144 add esi, 8
1145
1146 add edi, 16
1147 sub ecx, 8
1148
1149 cmp ecx, 8
1150 jg hs_1_2_loop
1151
1152 // last eight pixel
1153
1154 movq mm0, [esi]
1155 movq mm1, mm0
1156
1157 movq mm2, mm0
1158 movq mm3, mm1
1159
1160 psrlq mm1, 8
1161 psrlq mm3, 56
1162
1163 psllq mm3, 56
1164 por mm1, mm3
1165
1166 movq mm3, mm1
1167 movq mm4, mm0
1168
1169 punpcklbw mm0, mm7
1170 punpcklbw mm1, mm7
1171
1172 paddw mm0, mm1
1173 paddw mm0, mm6
1174
1175 punpckhbw mm2, mm7
1176 punpckhbw mm3, mm7
1177
1178 paddw mm2, mm3
1179 paddw mm2, mm6
1180
1181 psraw mm0, 1
1182 psraw mm2, 1
1183
1184 packuswb mm0, mm2
1185 movq mm2, mm4
1186
1187 punpcklbw mm2, mm0
1188 movq [edi], mm2
1189
1190 punpckhbw mm4, mm0
1191 movq [edi+8], mm4
1192 }
1193 }
1194
1195
1196
1197
1198
1199 __declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 }; 29 __declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 };
1200 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 }; 30 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 };
1201 31
1202 32
1203 /**************************************************************************** 33 /****************************************************************************
1204 * 34 *
1205 * ROUTINE : horizontal_line_5_4_scale_mmx 35 * ROUTINE : horizontal_line_5_4_scale_mmx
1206 * 36 *
1207 * INPUTS : const unsigned char *source : Pointer to source data. 37 * INPUTS : const unsigned char *source : Pointer to source data.
1208 * unsigned int source_width : Stride of source. 38 * unsigned int source_width : Stride of source.
(...skipping 469 matching lines...) Expand 10 before | Expand all | Expand 10 after
1678 cmp esi, ecx 508 cmp esi, ecx
1679 jl vs_2_1_i_loop 509 jl vs_2_1_i_loop
1680 510
1681 } 511 }
1682 } 512 }
1683 513
1684 514
1685 515
1686 void 516 void
1687 register_mmxscalers(void) { 517 register_mmxscalers(void) {
1688 vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx;
1689 vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx;
1690 vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;
1691 vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx;
1692 vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx;
1693 vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;
1694 vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx;
1695 vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx;
1696 vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;
1697
1698 vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c;
1699 vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c;
1700 vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
1701 vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c;
1702 vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c;
1703 vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
1704
1705
1706
1707 vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx; 518 vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
1708 vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx; 519 vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
1709 vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx; 520 vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
1710 vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx; 521 vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
1711 vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx; 522 vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
1712 vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx; 523 vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
1713 vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx; 524 vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
1714
1715
1716
1717
1718 } 525 }
OLDNEW
« libvpx.gyp ('K') | « source/libvpx/vpx_scale/vpxscale.h ('k') | source/libvpx/vpxdec.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698