| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 | 11 |
| 12 /**************************************************************************** | 12 /**************************************************************************** |
| 13 * | 13 * |
| 14 * Module Title : scaleopt.cpp | 14 * Module Title : scaleopt.cpp |
| 15 * | 15 * |
| 16 * Description : Optimized scaling functions | 16 * Description : Optimized scaling functions |
| 17 * | 17 * |
| 18 ****************************************************************************/ | 18 ****************************************************************************/ |
| 19 #include "pragmas.h" | 19 #include "pragmas.h" |
| 20 | 20 |
| 21 | |
| 22 | |
| 23 /**************************************************************************** | 21 /**************************************************************************** |
| 24 * Module Statics | 22 * Module Statics |
| 25 ****************************************************************************/ | 23 ****************************************************************************/ |
| 26 __declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 5
1 }; /* 51/256 ~ 1/5; products are later shifted right by 8 */ | |
| 27 __declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102
, 102 }; /* 102/256 ~ 2/5 */ | |
| 28 __declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 1
54, 154 }; /* 154/256 ~ 3/5 */ | |
| 29 __declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 20
5, 205 }; /* 205/256 ~ 4/5 */ | |
| 30 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 1
28, 128 }; | 24 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 1
28, 128 }; |
| 31 __declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1}; /* +1 before the halving shift in the 1:2 band scaler */ | |
| 32 __declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,
51 }; /* per-lane weights for the right-hand pixel in 4:5 horizontal scaling */ | |
| 33 __declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154,
205 }; /* per-lane weights for the left-hand pixel in 4:5 horizontal scaling */ | |
| 34 __declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0,
255, 0}; /* keeps only byte 6 of a quadword */ | |
| 35 __declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205,
102 }; /* per-lane weights for the right-hand pixel in 3:5 horizontal scaling */ | |
| 36 __declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51,
154 }; /* per-lane weights for the left-hand pixel in 3:5 horizontal scaling */ | |
| 37 | |
| 38 | |
| 39 | 25 |
| 40 #include "vpx_scale/vpxscale.h" | 26 #include "vpx_scale/vpxscale.h" |
| 41 #include "vpx_mem/vpx_mem.h" | 27 #include "vpx_mem/vpx_mem.h" |
| 42 | 28 |
| 43 /**************************************************************************** | |
| 44 * | |
| 45 * ROUTINE : horizontal_line_3_5_scale_mmx | |
| 46 * | |
| 47 * INPUTS : const unsigned char *source : first pixel of the line to scale | |
| 48 * unsigned int source_width : number of source pixels in the line | |
| 49 * unsigned char *dest : where the scaled pixels are stored | |
| 50 * unsigned int dest_width : unused | |
| 51 * | |
| 52 * OUTPUTS : 5 bytes are stored at dest for every 3 bytes consumed from source. | |
| 53 * | |
| 54 * RETURNS : void | |
| 55 * | |
| 56 * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels. | |
| 57 * | |
| 58 * SPECIAL NOTES : The final group repeats its last pixel instead of using the | |
| 59 * pixel to its right. Every group is fetched with a 4-byte load. | |
| 60 ****************************************************************************/ | |
| 61 static | |
| 62 void horizontal_line_3_5_scale_mmx | |
| 63 ( | |
| 64 const unsigned char *source, | |
| 65 unsigned int source_width, | |
| 66 unsigned char *dest, | |
| 67 unsigned int dest_width | |
| 68 ) { | |
| 69 (void) dest_width; | |
| 70 | |
| 71 __asm { | |
| 72 | |
| 73 push ebx | |
| 74 | |
| 75 mov esi, source | |
| 76 mov edi, dest | |
| 77 /* edx = source + source_width - 3; the loop below ends once esi reaches it */ | |
| 78 mov ecx, source_width | |
| 79 lea edx, [esi+ecx-3]; | |
| 80 | |
| 81 movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx (weights for the left pixel) | |
| 82 movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx (weights for the right pixel) | |
| 83 | |
| 84 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx | |
| 85 pxor mm7, mm7 // clear mm7 | |
| 86 | |
| 87 horiz_line_3_5_loop: | |
| 88 | |
| 89 mov eax, DWORD PTR [esi] // eax = 00 01 02 03 | |
| 90 mov ebx, eax | |
| 91 | |
| 92 and ebx, 0xffff00 // ebx = xx 01 02 xx | |
| 93 mov ecx, eax // ecx = 00 01 02 03 | |
| 94 | |
| 95 and eax, 0xffff0000 // eax = xx xx 02 03 | |
| 96 xor ecx, eax // ecx = 00 01 xx xx | |
| 97 | |
| 98 shr ebx, 8 // ebx = 01 02 xx xx | |
| 99 or eax, ebx // eax = 01 02 02 03 | |
| 100 | |
| 101 shl ebx, 16 // ebx = xx xx 01 02 | |
| 102 movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx | |
| 103 | |
| 104 or ebx, ecx // ebx = 00 01 01 02 | |
| 105 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx | |
| 106 | |
| 107 movd mm0, ebx // mm0 = 00 01 01 02 | |
| 108 pmullw mm1, mm6 // right-hand pixels times their weights | |
| 109 | |
| 110 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx | |
| 111 pmullw mm0, mm5 // left-hand pixels times their weights | |
| 112 | |
| 113 mov [edi], ebx // write output 00 xx xx xx (bytes 1..3 are overwritten below) | |
| 114 add esi, 3 | |
| 115 | |
| 116 add edi, 5 | |
| 117 paddw mm0, mm1 | |
| 118 | |
| 119 paddw mm0, mm4 | |
| 120 psrlw mm0, 8 | |
| 121 | |
| 122 cmp esi, edx | |
| 123 packuswb mm0, mm7 | |
| 124 | |
| 125 movd DWORD Ptr [edi-4], mm0 | |
| 126 jl horiz_line_3_5_loop | |
| 127 | |
| 128 // Exit: last group, source byte 02 stands in for the missing right neighbour | |
| 129 mov eax, DWORD PTR [esi] // eax = 00 01 02 03 | |
| 130 mov ebx, eax | |
| 131 | |
| 132 and ebx, 0xffff00 // ebx = xx 01 02 xx | |
| 133 mov ecx, eax // ecx = 00 01 02 03 | |
| 134 | |
| 135 and eax, 0xffff0000 // eax = xx xx 02 03 | |
| 136 xor ecx, eax // ecx = 00 01 xx xx | |
| 137 | |
| 138 shr ebx, 8 // ebx = 01 02 xx xx | |
| 139 or eax, ebx // eax = 01 02 02 03 | |
| 140 | |
| 141 shl eax, 8 // eax = xx 01 02 02 | |
| 142 and eax, 0xffff0000 // eax = xx xx 02 02 | |
| 143 | |
| 144 or eax, ebx // eax = 01 02 02 02 | |
| 145 | |
| 146 shl ebx, 16 // ebx = xx xx 01 02 | |
| 147 movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx | |
| 148 | |
| 149 or ebx, ecx // ebx = 00 01 01 02 | |
| 150 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx | |
| 151 | |
| 152 movd mm0, ebx // mm0 = 00 01 01 02 | |
| 153 pmullw mm1, mm6 // right-hand pixels times their weights | |
| 154 | |
| 155 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx | |
| 156 pmullw mm0, mm5 // left-hand pixels times their weights | |
| 157 | |
| 158 mov [edi], ebx // write output 00 xx xx xx (bytes 1..3 are overwritten below) | |
| 159 paddw mm0, mm1 | |
| 160 | |
| 161 paddw mm0, mm4 | |
| 162 psrlw mm0, 8 | |
| 163 | |
| 164 packuswb mm0, mm7 | |
| 165 movd DWORD Ptr [edi+1], mm0 | |
| 166 | |
| 167 pop ebx | |
| 168 | |
| 169 } | |
| 170 | |
| 171 } | |
| 172 | |
| 173 | |
| 174 /**************************************************************************** | |
| 175 * | |
| 176 * ROUTINE : horizontal_line_4_5_scale_mmx | |
| 177 * | |
| 178 * INPUTS : const unsigned char *source : first pixel of the line to scale | |
| 179 * unsigned int source_width : number of source pixels in the line | |
| 180 * unsigned char *dest : where the scaled pixels are stored | |
| 181 * unsigned int dest_width : unused | |
| 182 * | |
| 183 * OUTPUTS : 10 bytes are stored at dest for every 8 bytes consumed from source. | |
| 184 * | |
| 185 * RETURNS : void | |
| 186 * | |
| 187 * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels. | |
| 188 * | |
| 189 * SPECIAL NOTES : The loop also loads 8 bytes from source+1; the final group | |
| 190 * repeats pixel 07 instead of loading a ninth pixel. | |
| 191 ****************************************************************************/ | |
| 192 static | |
| 193 void horizontal_line_4_5_scale_mmx | |
| 194 ( | |
| 195 const unsigned char *source, | |
| 196 unsigned int source_width, | |
| 197 unsigned char *dest, | |
| 198 unsigned int dest_width | |
| 199 ) { | |
| 200 (void)dest_width; | |
| 201 | |
| 202 __asm { | |
| 203 | |
| 204 mov esi, source | |
| 205 mov edi, dest | |
| 206 /* edx = source + source_width - 8; the loop below ends once esi reaches it */ | |
| 207 mov ecx, source_width | |
| 208 lea edx, [esi+ecx-8]; | |
| 209 | |
| 210 movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx | |
| 211 movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx | |
| 212 | |
| 213 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx | |
| 214 pxor mm7, mm7 // clear mm7 | |
| 215 | |
| 216 horiz_line_4_5_loop: | |
| 217 | |
| 218 movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06
07 | |
| 219 movq mm1, QWORD PTR [esi+1]; // mm1 = 01 02 03 04 05 06 07
08 | |
| 220 | |
| 221 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07 | |
| 222 movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08 | |
| 223 | |
| 224 movd DWORD PTR [edi], mm0 // write output 00 xx xx xx | |
| 225 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx | |
| 226 | |
| 227 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx | |
| 228 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205 | |
| 229 | |
| 230 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51 | |
| 231 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx | |
| 232 | |
| 233 movd DWORD PTR [edi+5], mm2 // write output byte 5 (source 04); bytes 6..8 are overwritten below | |
| 234 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205 | |
| 235 | |
| 236 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx | |
| 237 pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51 | |
| 238 | |
| 239 paddw mm0, mm1 // sum of the two weighted neighbours | |
| 240 paddw mm0, mm4 // added round values | |
| 241 | |
| 242 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx | |
| 243 packuswb mm0, mm7 | |
| 244 | |
| 245 movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04 | |
| 246 add edi, 10 | |
| 247 | |
| 248 add esi, 8 | |
| 249 paddw mm2, mm3 // sum of the two weighted neighbours | |
| 250 | |
| 251 paddw mm2, mm4 // added round values | |
| 252 cmp esi, edx | |
| 253 | |
| 254 psrlw mm2, 8 | |
| 255 packuswb mm2, mm7 | |
| 256 | |
| 257 movd DWORD PTR [edi-4], mm2 // write output 06 07 08 09 | |
| 258 jl horiz_line_4_5_loop | |
| 259 | |
| 260 // Exit: last group, pixel 07 is duplicated in place of pixel 08 | |
| 261 movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07 | |
| 262 movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07 | |
| 263 | |
| 264 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07 | |
| 265 psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00 | |
| 266 | |
| 267 movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00 | |
| 268 pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00 | |
| 269 | |
| 270 psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07 | |
| 271 por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07 | |
| 272 | |
| 273 movq mm3, mm1 | |
| 274 | |
| 275 movd DWORD PTR [edi], mm0 // write output 00 xx xx xx | |
| 276 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx | |
| 277 | |
| 278 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx | |
| 279 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205 | |
| 280 | |
| 281 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51 | |
| 282 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx | |
| 283 | |
| 284 movd DWORD PTR [edi+5], mm2 // write output byte 5 (source 04); bytes 6..8 are overwritten below | |
| 285 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205 | |
| 286 | |
| 287 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 07 xx | |
| 288 pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51 | |
| 289 | |
| 290 paddw mm0, mm1 // sum of the two weighted neighbours | |
| 291 paddw mm0, mm4 // added round values | |
| 292 | |
| 293 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx | |
| 294 packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx | |
| 295 | |
| 296 movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04 | |
| 297 paddw mm2, mm3 // sum of the two weighted neighbours | |
| 298 | |
| 299 paddw mm2, mm4 // added round values | |
| 300 psrlw mm2, 8 | |
| 301 | |
| 302 packuswb mm2, mm7 | |
| 303 movd DWORD PTR [edi+6], mm2 // write output 06 07 08 09 | |
| 304 | |
| 305 | |
| 306 } | |
| 307 } | |
| 308 | |
| 309 /**************************************************************************** | |
| 310 * | |
| 311 * ROUTINE : vertical_band_4_5_scale_mmx | |
| 312 * | |
| 313 * INPUTS : unsigned char *dest : top row of the band, scaled in place | |
| 314 * unsigned int dest_pitch : byte offset from one row to the next | |
| 315 * unsigned int dest_width : columns to process, 8 per loop pass | |
| 316 * | |
| 317 * OUTPUTS : Rows 1 to 4 of the band are overwritten with blended rows. | |
| 318 * | |
| 319 * RETURNS : void | |
| 320 * | |
| 321 * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels. | |
| 322 * | |
| 323 * SPECIAL NOTES : The routine uses the first line of the band below | |
| 324 * the current band. The function also has a "C" only | |
| 325 * version. | |
| 326 * | |
| 327 ****************************************************************************/ | |
| 328 static | |
| 329 void vertical_band_4_5_scale_mmx | |
| 330 ( | |
| 331 unsigned char *dest, | |
| 332 unsigned int dest_pitch, | |
| 333 unsigned int dest_width | |
| 334 ) { | |
| 335 __asm { | |
| 336 | |
| 337 mov esi, dest // Get the source and destinatio
n pointer | |
| 338 mov ecx, dest_pitch // Get the pitch size | |
| 339 | |
| 340 lea edi, [esi+ecx*2] // two lines below | |
| 341 add edi, ecx // three lines below | |
| 342 | |
| 343 pxor mm7, mm7 // clear out mm7 | |
| 344 mov edx, dest_width // Loop counter | |
| 345 | |
| 346 vs_4_5_loop: | |
| 347 | |
| 348 movq mm0, QWORD ptr [esi] // src[0]; | |
| 349 movq mm1, QWORD ptr [esi+ecx] // src[1]; | |
| 350 | |
| 351 movq mm2, mm0 // Make a copy | |
| 352 punpcklbw mm0, mm7 // unpack low to word | |
| 353 | |
| 354 movq mm5, one_fifth | |
| 355 punpckhbw mm2, mm7 // unpack high to word | |
| 356 | |
| 357 pmullw mm0, mm5 // a * 1/5 | |
| 358 | |
| 359 movq mm3, mm1 // make a copy | |
| 360 punpcklbw mm1, mm7 // unpack low to word | |
| 361 | |
| 362 pmullw mm2, mm5 // a * 1/5 | |
| 363 movq mm6, four_fifths // constant | |
| 364 | |
| 365 movq mm4, mm1 // copy of low b | |
| 366 pmullw mm4, mm6 // b * 4/5 | |
| 367 | |
| 368 punpckhbw mm3, mm7 // unpack high to word | |
| 369 movq mm5, mm3 // copy of high b | |
| 370 | |
| 371 pmullw mm5, mm6 // b * 4/5 | |
| 372 paddw mm0, mm4 // a * 1/5 + b * 4/5 | |
| 373 | |
| 374 paddw mm2, mm5 // a * 1/5 + b * 4/5 | |
| 375 paddw mm0, round_values // + 128 | |
| 376 | |
| 377 paddw mm2, round_values // + 128 | |
| 378 psrlw mm0, 8 | |
| 379 | |
| 380 psrlw mm2, 8 | |
| 381 packuswb mm0, mm2 // des [1] | |
| 382 | |
| 383 movq QWORD ptr [esi+ecx], mm0 // write des[1] | |
| 384 movq mm0, [esi+ecx*2] // mm0 = src[2] | |
| 385 | |
| 386 // mm1, mm3 --- Src[1] | |
| 387 // mm0 --- Src[2] | |
| 388 // mm7 for unpacking | |
| 389 | |
| 390 movq mm5, two_fifths | |
| 391 movq mm2, mm0 // make a copy | |
| 392 | |
| 393 pmullw mm1, mm5 // b * 2/5 | |
| 394 movq mm6, three_fifths | |
| 395 | |
| 396 | |
| 397 punpcklbw mm0, mm7 // unpack low to word | |
| 398 pmullw mm3, mm5 // b * 2/5 | |
| 399 | |
| 400 movq mm4, mm0 // make copy of c | |
| 401 punpckhbw mm2, mm7 // unpack high to word | |
| 402 | |
| 403 pmullw mm4, mm6 // c * 3/5 | |
| 404 movq mm5, mm2 | |
| 405 | |
| 406 pmullw mm5, mm6 // c * 3/5 | |
| 407 paddw mm1, mm4 // b * 2/5 + c * 3/5 | |
| 408 | |
| 409 paddw mm3, mm5 // b * 2/5 + c * 3/5 | |
| 410 paddw mm1, round_values // + 128 | |
| 411 | |
| 412 paddw mm3, round_values // + 128 | |
| 413 psrlw mm1, 8 | |
| 414 | |
| 415 psrlw mm3, 8 | |
| 416 packuswb mm1, mm3 // des[2] | |
| 417 | |
| 418 movq QWORD ptr [esi+ecx*2], mm1 // write des[2] | |
| 419 movq mm1, [edi] // mm1=Src[3]; | |
| 420 | |
| 421 // mm0, mm2 --- Src[2] | |
| 422 // mm1 --- Src[3] | |
| 423 // mm6 --- 3/5 | |
| 424 // mm7 for unpacking | |
| 425 | |
| 426 pmullw mm0, mm6 // c * 3/5 | |
| 427 movq mm5, two_fifths // mm5 = 2/5 | |
| 428 | |
| 429 movq mm3, mm1 // make a copy | |
| 430 pmullw mm2, mm6 // c * 3/5 | |
| 431 | |
| 432 punpcklbw mm1, mm7 // unpack low | |
| 433 movq mm4, mm1 // make a copy | |
| 434 | |
| 435 punpckhbw mm3, mm7 // unpack high | |
| 436 pmullw mm4, mm5 // d * 2/5 | |
| 437 | |
| 438 movq mm6, mm3 // make a copy | |
| 439 pmullw mm6, mm5 // d * 2/5 | |
| 440 | |
| 441 paddw mm0, mm4 // c * 3/5 + d * 2/5 | |
| 442 paddw mm2, mm6 // c * 3/5 + d * 2/5 | |
| 443 | |
| 444 paddw mm0, round_values // + 128 | |
| 445 paddw mm2, round_values // + 128 | |
| 446 | |
| 447 psrlw mm0, 8 | |
| 448 psrlw mm2, 8 | |
| 449 | |
| 450 packuswb mm0, mm2 // des[3] | |
| 451 movq QWORD ptr [edi], mm0 // write des[3] | |
| 452 | |
| 453 // mm1, mm3 --- Src[3] | |
| 454 // mm7 -- cleared for unpacking | |
| 455 | |
| 456 movq mm0, [edi+ecx*2] // mm0, Src[0] of the next group (five rows below dest) | |
| 457 | |
| 458 movq mm5, four_fifths // mm5 = 4/5 | |
| 459 pmullw mm1, mm5 // d * 4/5 | |
| 460 | |
| 461 movq mm6, one_fifth // mm6 = 1/5 | |
| 462 movq mm2, mm0 // make a copy | |
| 463 | |
| 464 pmullw mm3, mm5 // d * 4/5 | |
| 465 punpcklbw mm0, mm7 // unpack low | |
| 466 | |
| 467 pmullw mm0, mm6 // an * 1/5 | |
| 468 punpckhbw mm2, mm7 // unpack high | |
| 469 | |
| 470 paddw mm1, mm0 // d * 4/5 + an * 1/5 | |
| 471 pmullw mm2, mm6 // an * 1/5 | |
| 472 | |
| 473 paddw mm3, mm2 // d * 4/5 + an * 1/5 | |
| 474 paddw mm1, round_values // + 128 | |
| 475 | |
| 476 paddw mm3, round_values // + 128 | |
| 477 psrlw mm1, 8 | |
| 478 | |
| 479 psrlw mm3, 8 | |
| 480 packuswb mm1, mm3 // des[4] | |
| 481 | |
| 482 movq QWORD ptr [edi+ecx], mm1 // write des[4] | |
| 483 | |
| 484 add edi, 8 | |
| 485 add esi, 8 | |
| 486 | |
| 487 sub edx, 8 | |
| 488 jg vs_4_5_loop | |
| 489 } | |
| 490 } | |
| 491 | |
| 492 /**************************************************************************** | |
| 493 * | |
| 494 * ROUTINE : last_vertical_band_4_5_scale_mmx | |
| 495 * | |
| 496 * INPUTS : unsigned char *dest : top row of the band, scaled in place | |
| 497 * unsigned int dest_pitch : byte offset from one row to the next | |
| 498 * unsigned int dest_width : columns to process, 8 per loop pass | |
| 499 * | |
| 500 * OUTPUTS : Rows 1 to 4 of the band are overwritten. | |
| 501 * | |
| 502 * RETURNS : None | |
| 503 * | |
| 504 * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image. | |
| 505 * | |
| 506 * SPECIAL NOTES : No row below the band is read: des[4] is a plain | |
| 507 * copy of Src[3]. The function also has an "C" only | |
| 508 * version. | |
| 509 * | |
| 510 ****************************************************************************/ | |
| 511 static | |
| 512 void last_vertical_band_4_5_scale_mmx | |
| 513 ( | |
| 514 unsigned char *dest, | |
| 515 unsigned int dest_pitch, | |
| 516 unsigned int dest_width | |
| 517 ) { | |
| 518 __asm { | |
| 519 mov esi, dest // Get the source and destinatio
n pointer | |
| 520 mov ecx, dest_pitch // Get the pitch size | |
| 521 | |
| 522 lea edi, [esi+ecx*2] // two lines below | |
| 523 add edi, ecx // three lines below | |
| 524 | |
| 525 pxor mm7, mm7 // clear out mm7 | |
| 526 mov edx, dest_width // Loop counter | |
| 527 | |
| 528 last_vs_4_5_loop: | |
| 529 | |
| 530 movq mm0, QWORD ptr [esi] // src[0]; | |
| 531 movq mm1, QWORD ptr [esi+ecx] // src[1]; | |
| 532 | |
| 533 movq mm2, mm0 // Make a copy | |
| 534 punpcklbw mm0, mm7 // unpack low to word | |
| 535 | |
| 536 movq mm5, one_fifth | |
| 537 punpckhbw mm2, mm7 // unpack high to word | |
| 538 | |
| 539 pmullw mm0, mm5 // a * 1/5 | |
| 540 | |
| 541 movq mm3, mm1 // make a copy | |
| 542 punpcklbw mm1, mm7 // unpack low to word | |
| 543 | |
| 544 pmullw mm2, mm5 // a * 1/5 | |
| 545 movq mm6, four_fifths // constant | |
| 546 | |
| 547 movq mm4, mm1 // copy of low b | |
| 548 pmullw mm4, mm6 // b * 4/5 | |
| 549 | |
| 550 punpckhbw mm3, mm7 // unpack high to word | |
| 551 movq mm5, mm3 // copy of high b | |
| 552 | |
| 553 pmullw mm5, mm6 // b * 4/5 | |
| 554 paddw mm0, mm4 // a * 1/5 + b * 4/5 | |
| 555 | |
| 556 paddw mm2, mm5 // a * 1/5 + b * 4/5 | |
| 557 paddw mm0, round_values // + 128 | |
| 558 | |
| 559 paddw mm2, round_values // + 128 | |
| 560 psrlw mm0, 8 | |
| 561 | |
| 562 psrlw mm2, 8 | |
| 563 packuswb mm0, mm2 // des [1] | |
| 564 | |
| 565 movq QWORD ptr [esi+ecx], mm0 // write des[1] | |
| 566 movq mm0, [esi+ecx*2] // mm0 = src[2] | |
| 567 | |
| 568 // mm1, mm3 --- Src[1] | |
| 569 // mm0 --- Src[2] | |
| 570 // mm7 for unpacking | |
| 571 | |
| 572 movq mm5, two_fifths | |
| 573 movq mm2, mm0 // make a copy | |
| 574 | |
| 575 pmullw mm1, mm5 // b * 2/5 | |
| 576 movq mm6, three_fifths | |
| 577 | |
| 578 | |
| 579 punpcklbw mm0, mm7 // unpack low to word | |
| 580 pmullw mm3, mm5 // b * 2/5 | |
| 581 | |
| 582 movq mm4, mm0 // make copy of c | |
| 583 punpckhbw mm2, mm7 // unpack high to word | |
| 584 | |
| 585 pmullw mm4, mm6 // c * 3/5 | |
| 586 movq mm5, mm2 | |
| 587 | |
| 588 pmullw mm5, mm6 // c * 3/5 | |
| 589 paddw mm1, mm4 // b * 2/5 + c * 3/5 | |
| 590 | |
| 591 paddw mm3, mm5 // b * 2/5 + c * 3/5 | |
| 592 paddw mm1, round_values // + 128 | |
| 593 | |
| 594 paddw mm3, round_values // + 128 | |
| 595 psrlw mm1, 8 | |
| 596 | |
| 597 psrlw mm3, 8 | |
| 598 packuswb mm1, mm3 // des[2] | |
| 599 | |
| 600 movq QWORD ptr [esi+ecx*2], mm1 // write des[2] | |
| 601 movq mm1, [edi] // mm1=Src[3]; | |
| 602 | |
| 603 movq QWORD ptr [edi+ecx], mm1 // write des[4] = unmodified Src[3] | |
| 604 | |
| 605 // mm0, mm2 --- Src[2] | |
| 606 // mm1 --- Src[3] | |
| 607 // mm6 --- 3/5 | |
| 608 // mm7 for unpacking | |
| 609 | |
| 610 pmullw mm0, mm6 // c * 3/5 | |
| 611 movq mm5, two_fifths // mm5 = 2/5 | |
| 612 | |
| 613 movq mm3, mm1 // make a copy | |
| 614 pmullw mm2, mm6 // c * 3/5 | |
| 615 | |
| 616 punpcklbw mm1, mm7 // unpack low | |
| 617 movq mm4, mm1 // make a copy | |
| 618 | |
| 619 punpckhbw mm3, mm7 // unpack high | |
| 620 pmullw mm4, mm5 // d * 2/5 | |
| 621 | |
| 622 movq mm6, mm3 // make a copy | |
| 623 pmullw mm6, mm5 // d * 2/5 | |
| 624 | |
| 625 paddw mm0, mm4 // c * 3/5 + d * 2/5 | |
| 626 paddw mm2, mm6 // c * 3/5 + d * 2/5 | |
| 627 | |
| 628 paddw mm0, round_values // + 128 | |
| 629 paddw mm2, round_values // + 128 | |
| 630 | |
| 631 psrlw mm0, 8 | |
| 632 psrlw mm2, 8 | |
| 633 | |
| 634 packuswb mm0, mm2 // des[3] | |
| 635 movq QWORD ptr [edi], mm0 // write des[3] | |
| 636 | |
| 637 // mm1, mm3 --- Src[3] | |
| 638 // mm7 -- cleared for unpacking | |
| 639 add edi, 8 | |
| 640 add esi, 8 | |
| 641 | |
| 642 sub edx, 8 | |
| 643 jg last_vs_4_5_loop | |
| 644 } | |
| 645 } | |
| 646 | |
| 647 /**************************************************************************** | |
| 648 * | |
| 649 * ROUTINE : vertical_band_3_5_scale_mmx | |
| 650 * | |
| 651 * INPUTS : unsigned char *dest : top row of the band, scaled in place | |
| 652 * unsigned int dest_pitch : byte offset from one row to the next | |
| 653 * unsigned int dest_width : columns to process, 8 per loop pass | |
| 654 * | |
| 655 * OUTPUTS : Rows 1 to 4 of the band are overwritten with blended rows. | |
| 656 * | |
| 657 * RETURNS : void | |
| 658 * | |
| 659 * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels. | |
| 660 * | |
| 661 * SPECIAL NOTES : The routine uses the first line of the band below | |
| 662 * the current band. The function also has an "C" only | |
| 663 * version. | |
| 664 * | |
| 665 ****************************************************************************/ | |
| 666 static | |
| 667 void vertical_band_3_5_scale_mmx | |
| 668 ( | |
| 669 unsigned char *dest, | |
| 670 unsigned int dest_pitch, | |
| 671 unsigned int dest_width | |
| 672 ) { | |
| 673 __asm { | |
| 674 mov esi, dest // Get the source and destinatio
n pointer | |
| 675 mov ecx, dest_pitch // Get the pitch size | |
| 676 | |
| 677 lea edi, [esi+ecx*2] // two lines below | |
| 678 add edi, ecx // three lines below | |
| 679 | |
| 680 pxor mm7, mm7 // clear out mm7 | |
| 681 mov edx, dest_width // Loop counter | |
| 682 | |
| 683 vs_3_5_loop: | |
| 684 | |
| 685 movq mm0, QWORD ptr [esi] // src[0]; | |
| 686 movq mm1, QWORD ptr [esi+ecx] // src[1]; | |
| 687 | |
| 688 movq mm2, mm0 // Make a copy | |
| 689 punpcklbw mm0, mm7 // unpack low to word | |
| 690 | |
| 691 movq mm5, two_fifths // mm5 = 2/5 | |
| 692 punpckhbw mm2, mm7 // unpack high to word | |
| 693 | |
| 694 pmullw mm0, mm5 // a * 2/5 | |
| 695 | |
| 696 movq mm3, mm1 // make a copy | |
| 697 punpcklbw mm1, mm7 // unpack low to word | |
| 698 | |
| 699 pmullw mm2, mm5 // a * 2/5 | |
| 700 movq mm6, three_fifths // mm6 = 3/5 | |
| 701 | |
| 702 movq mm4, mm1 // copy of low b | |
| 703 pmullw mm4, mm6 // b * 3/5 | |
| 704 | |
| 705 punpckhbw mm3, mm7 // unpack high to word | |
| 706 movq mm5, mm3 // copy of high b | |
| 707 | |
| 708 pmullw mm5, mm6 // b * 3/5 | |
| 709 paddw mm0, mm4 // a * 2/5 + b * 3/5 | |
| 710 | |
| 711 paddw mm2, mm5 // a * 2/5 + b * 3/5 | |
| 712 paddw mm0, round_values // + 128 | |
| 713 | |
| 714 paddw mm2, round_values // + 128 | |
| 715 psrlw mm0, 8 | |
| 716 | |
| 717 psrlw mm2, 8 | |
| 718 packuswb mm0, mm2 // des [1] | |
| 719 | |
| 720 movq QWORD ptr [esi+ecx], mm0 // write des[1] | |
| 721 movq mm0, [esi+ecx*2] // mm0 = src[2] | |
| 722 | |
| 723 // mm1, mm3 --- Src[1] | |
| 724 // mm0 --- Src[2] | |
| 725 // mm7 for unpacking | |
| 726 | |
| 727 movq mm4, mm1 // b low | |
| 728 pmullw mm1, four_fifths // b * 4/5 low | |
| 729 | |
| 730 movq mm5, mm3 // b high | |
| 731 pmullw mm3, four_fifths // b * 4/5 high | |
| 732 | |
| 733 movq mm2, mm0 // c | |
| 734 pmullw mm4, one_fifth // b * 1/5 | |
| 735 | |
| 736 punpcklbw mm0, mm7 // c low | |
| 737 pmullw mm5, one_fifth // b * 1/5 | |
| 738 | |
| 739 movq mm6, mm0 // make copy of c low | |
| 740 punpckhbw mm2, mm7 // c high | |
| 741 | |
| 742 pmullw mm6, one_fifth // c * 1/5 low | |
| 743 movq mm7, mm2 // make copy of c high (mm7 is scratch until it is cleared again below) | |
| 744 | |
| 745 pmullw mm7, one_fifth // c * 1/5 high | |
| 746 paddw mm1, mm6 // b * 4/5 + c * 1/5 low | |
| 747 | |
| 748 paddw mm3, mm7 // b * 4/5 + c * 1/5 high | |
| 749 movq mm6, mm0 // make copy of c low | |
| 750 | |
| 751 pmullw mm6, four_fifths // c * 4/5 low | |
| 752 movq mm7, mm2 // make copy of c high | |
| 753 | |
| 754 pmullw mm7, four_fifths // c * 4/5 high | |
| 755 | |
| 756 paddw mm4, mm6 // b * 1/5 + c * 4/5 low | |
| 757 paddw mm5, mm7 // b * 1/5 + c * 4/5 high | |
| 758 | |
| 759 paddw mm1, round_values // + 128 | |
| 760 paddw mm3, round_values // + 128 | |
| 761 | |
| 762 psrlw mm1, 8 | |
| 763 psrlw mm3, 8 | |
| 764 | |
| 765 packuswb mm1, mm3 // des[2] | |
| 766 movq QWORD ptr [esi+ecx*2], mm1 // write des[2] | |
| 767 | |
| 768 paddw mm4, round_values // + 128 | |
| 769 paddw mm5, round_values // + 128 | |
| 770 | |
| 771 psrlw mm4, 8 | |
| 772 psrlw mm5, 8 | |
| 773 | |
| 774 packuswb mm4, mm5 // des[3] | |
| 775 movq QWORD ptr [edi], mm4 // write des[3] | |
| 776 | |
| 777 // mm0, mm2 --- Src[2] (c), unpacked low / high | |
| 778 | |
| 779 pxor mm7, mm7 // clear mm7 for unpacking | |
| 780 movq mm1, [edi+ecx*2] // mm1 = Src[0] of the next grou
p | |
| 781 | |
| 782 movq mm5, three_fifths // mm5 = 3/5 | |
| 783 pmullw mm0, mm5 // c * 3/5 | |
| 784 | |
| 785 movq mm6, two_fifths // mm6 = 2/5 | |
| 786 movq mm3, mm1 // make a copy | |
| 787 | |
| 788 pmullw mm2, mm5 // c * 3/5 | |
| 789 punpcklbw mm1, mm7 // unpack low | |
| 790 | |
| 791 pmullw mm1, mm6 // an * 2/5 | |
| 792 punpckhbw mm3, mm7 // unpack high | |
| 793 | |
| 794 paddw mm0, mm1 // c * 3/5 + an * 2/5 | |
| 795 pmullw mm3, mm6 // an * 2/5 | |
| 796 | |
| 797 paddw mm2, mm3 // c * 3/5 + an * 2/5 | |
| 798 paddw mm0, round_values // + 128 | |
| 799 | |
| 800 paddw mm2, round_values // + 128 | |
| 801 psrlw mm0, 8 | |
| 802 | |
| 803 psrlw mm2, 8 | |
| 804 packuswb mm0, mm2 // des[4] | |
| 805 | |
| 806 movq QWORD ptr [edi+ecx], mm0 // write des[4] | |
| 807 | |
| 808 add edi, 8 | |
| 809 add esi, 8 | |
| 810 | |
| 811 sub edx, 8 | |
| 812 jg vs_3_5_loop | |
| 813 } | |
| 814 } | |
| 815 | |
| 816 /**************************************************************************** | |
| 817 * | |
| 818 * ROUTINE : last_vertical_band_3_5_scale_mmx | |
| 819 * | |
| 820 * INPUTS : unsigned char *dest : top row of the band, scaled in place | |
| 821 * unsigned int dest_pitch : byte offset from one row to the next | |
| 822 * unsigned int dest_width : columns to process, 8 per loop pass | |
| 823 * | |
| 824 * OUTPUTS : Rows 1 to 4 of the band are overwritten. | |
| 825 * | |
| 826 * RETURNS : void | |
| 827 * | |
| 828 * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band, reading no row below it. | |
| 829 * | |
| 830 * SPECIAL NOTES : No row below the band is read: des[4] is a plain | |
| 831 * copy of Src[2]. mm7 doubles as scratch inside the loop and | |
| 832 * is zeroed again before every loop-back. | |
| 833 * | |
| 834 ****************************************************************************/ | |
| 835 static | |
| 836 void last_vertical_band_3_5_scale_mmx | |
| 837 ( | |
| 838 unsigned char *dest, | |
| 839 unsigned int dest_pitch, | |
| 840 unsigned int dest_width | |
| 841 ) { | |
| 842 __asm { | |
| 843 mov esi, dest // Get the source and destinatio
n pointer | |
| 844 mov ecx, dest_pitch // Get the pitch size | |
| 845 | |
| 846 lea edi, [esi+ecx*2] // two lines below | |
| 847 add edi, ecx // three lines below | |
| 848 | |
| 849 pxor mm7, mm7 // clear out mm7 | |
| 850 mov edx, dest_width // Loop counter | |
| 851 | |
| 852 | |
| 853 last_vs_3_5_loop: | |
| 854 | |
| 855 movq mm0, QWORD ptr [esi] // src[0]; | |
| 856 movq mm1, QWORD ptr [esi+ecx] // src[1]; | |
| 857 | |
| 858 movq mm2, mm0 // Make a copy | |
| 859 punpcklbw mm0, mm7 // unpack low to word | |
| 860 | |
| 861 movq mm5, two_fifths // mm5 = 2/5 | |
| 862 punpckhbw mm2, mm7 // unpack high to word | |
| 863 | |
| 864 pmullw mm0, mm5 // a * 2/5 | |
| 865 | |
| 866 movq mm3, mm1 // make a copy | |
| 867 punpcklbw mm1, mm7 // unpack low to word | |
| 868 | |
| 869 pmullw mm2, mm5 // a * 2/5 | |
| 870 movq mm6, three_fifths // mm6 = 3/5 | |
| 871 | |
| 872 movq mm4, mm1 // copy of low b | |
| 873 pmullw mm4, mm6 // b * 3/5 | |
| 874 | |
| 875 punpckhbw mm3, mm7 // unpack high to word | |
| 876 movq mm5, mm3 // copy of high b | |
| 877 | |
| 878 pmullw mm5, mm6 // b * 3/5 | |
| 879 paddw mm0, mm4 // a * 2/5 + b * 3/5 | |
| 880 | |
| 881 paddw mm2, mm5 // a * 2/5 + b * 3/5 | |
| 882 paddw mm0, round_values // + 128 | |
| 883 | |
| 884 paddw mm2, round_values // + 128 | |
| 885 psrlw mm0, 8 | |
| 886 | |
| 887 psrlw mm2, 8 | |
| 888 packuswb mm0, mm2 // des [1] | |
| 889 | |
| 890 movq QWORD ptr [esi+ecx], mm0 // write des[1] | |
| 891 movq mm0, [esi+ecx*2] // mm0 = src[2] | |
| 892 | |
| 893 | |
| 894 | |
| 895 // mm1, mm3 --- Src[1] | |
| 896 // mm0 --- Src[2] | |
| 897 // mm7 for unpacking | |
| 898 | |
| 899 movq mm4, mm1 // b low | |
| 900 pmullw mm1, four_fifths // b * 4/5 low | |
| 901 | |
| 902 movq QWORD ptr [edi+ecx], mm0 // write des[4] = unmodified Src[2] | |
| 903 | |
| 904 movq mm5, mm3 // b high | |
| 905 pmullw mm3, four_fifths // b * 4/5 high | |
| 906 | |
| 907 movq mm2, mm0 // c | |
| 908 pmullw mm4, one_fifth // b * 1/5 | |
| 909 | |
| 910 punpcklbw mm0, mm7 // c low | |
| 911 pmullw mm5, one_fifth // b * 1/5 | |
| 912 | |
| 913 movq mm6, mm0 // make copy of c low | |
| 914 punpckhbw mm2, mm7 // c high | |
| 915 | |
| 916 pmullw mm6, one_fifth // c * 1/5 low | |
| 917 movq mm7, mm2 // make copy of c high (mm7 is scratch from here on) | |
| 918 | |
| 919 pmullw mm7, one_fifth // c * 1/5 high | |
| 920 paddw mm1, mm6 // b * 4/5 + c * 1/5 low | |
| 921 | |
| 922 paddw mm3, mm7 // b * 4/5 + c * 1/5 high | |
| 923 movq mm6, mm0 // make copy of c low | |
| 924 | |
| 925 pmullw mm6, four_fifths // c * 4/5 low | |
| 926 movq mm7, mm2 // make copy of c high | |
| 927 | |
| 928 pmullw mm7, four_fifths // c * 4/5 high | |
| 929 | |
| 930 paddw mm4, mm6 // b * 1/5 + c * 4/5 low | |
| 931 paddw mm5, mm7 // b * 1/5 + c * 4/5 high | |
| 932 | |
| 933 paddw mm1, round_values // + 128 | |
| 934 paddw mm3, round_values // + 128 | |
| 935 | |
| 936 psrlw mm1, 8 | |
| 937 psrlw mm3, 8 | |
| 938 | |
| 939 packuswb mm1, mm3 // des[2] | |
| 940 movq QWORD ptr [esi+ecx*2], mm1 // write des[2] | |
| 941 | |
| 942 paddw mm4, round_values // + 128 | |
| 943 paddw mm5, round_values // + 128 | |
| 944 | |
| 945 psrlw mm4, 8 | |
| 946 psrlw mm5, 8 | |
| 947 | |
| 948 packuswb mm4, mm5 // des[3] | |
| 949 movq QWORD ptr [edi], mm4 // write des[3] | |
| 950 | |
| 951 // mm0, mm2 --- Src[2] (c), not needed any more | |
| 952 pxor mm7, mm7 // mm7 held c * 4/5: zero it again so the next pass unpacks correctly | |
| 953 add edi, 8 | |
| 954 add esi, 8 | |
| 955 | |
| 956 sub edx, 8 | |
| 957 jg last_vs_3_5_loop | |
| 958 } | |
| 959 } | |
| 960 | |
| 961 /**************************************************************************** | |
| 962 * | |
| 963 * ROUTINE : vertical_band_1_2_scale_mmx | |
| 964 * | |
| 965 * INPUTS : unsigned char *dest : upper of the two rows being averaged | |
| 966 * unsigned int dest_pitch : byte offset from one row to the next | |
| 967 * unsigned int dest_width : columns to process, 8 per loop pass | |
| 968 * | |
| 969 * OUTPUTS : The row at dest + dest_pitch is overwritten with the averages. | |
| 970 * | |
| 971 * RETURNS : void | |
| 972 * | |
| 973 * FUNCTION : 1 to 2 up-scaling of a band of pixels. | |
| 974 * | |
| 975 * SPECIAL NOTES : The routine uses the first line of the band below | |
| 976 * the current band. The function also has an "C" only | |
| 977 * version. | |
| 978 * | |
| 979 ****************************************************************************/ | |
| 980 static | |
| 981 void vertical_band_1_2_scale_mmx | |
| 982 ( | |
| 983 unsigned char *dest, | |
| 984 unsigned int dest_pitch, | |
| 985 unsigned int dest_width | |
| 986 ) { | |
| 987 __asm { | |
| 988 | |
| 989 mov esi, dest // Get the source and destinatio
n pointer | |
| 990 mov ecx, dest_pitch // Get the pitch size | |
| 991 | |
| 992 pxor mm7, mm7 // clear out mm7 | |
| 993 mov edx, dest_width // Loop counter | |
| 994 | |
| 995 vs_1_2_loop: | |
| 996 | |
| 997 movq mm0, [esi] // get Src[0] | |
| 998 movq mm1, [esi + ecx * 2] // get Src[1] (two rows below Src[0]) | |
| 999 | |
| 1000 movq mm2, mm0 // make copy before unpack | |
| 1001 movq mm3, mm1 // make copy before unpack | |
| 1002 | |
| 1003 punpcklbw mm0, mm7 // low Src[0] | |
| 1004 movq mm6, four_ones // mm6= 1, 1, 1, 1 | |
| 1005 | |
| 1006 punpcklbw mm1, mm7 // low Src[1] | |
| 1007 paddw mm0, mm1 // low (a + b) | |
| 1008 | |
| 1009 punpckhbw mm2, mm7 // high Src[0] | |
| 1010 paddw mm0, mm6 // low (a + b + 1) | |
| 1011 | |
| 1012 punpckhbw mm3, mm7 // high Src[1] | |
| 1013 paddw mm2, mm3 // high (a + b ) | |
| 1014 | |
| 1015 psraw mm0, 1 // low (a + b +1 )/2 | |
| 1016 paddw mm2, mm6 // high (a + b + 1) | |
| 1017 | |
| 1018 psraw mm2, 1 // high (a + b + 1)/2 | |
| 1019 packuswb mm0, mm2 // pack results | |
| 1020 | |
| 1021 movq [esi+ecx], mm0 // write out eight bytes to the row between Src[0] and Src[1] | |
| 1022 add esi, 8 | |
| 1023 | |
| 1024 sub edx, 8 | |
| 1025 jg vs_1_2_loop | |
| 1026 } | |
| 1027 | |
| 1028 } | |
| 1029 | |
| 1030 /**************************************************************************** | |
| 1031 * | |
| 1032 * ROUTINE : last_vertical_band_1_2_scale_mmx | |
| 1033 * | |
| 1034 * INPUTS : unsigned char *dest : Address of the row to duplicate; the copy is stored at dest + dest_pitch. | |
| 1035 * unsigned int dest_pitch : Byte offset from one row to the next. | |
| 1036 * unsigned int dest_width : Number of bytes per row to copy; consumed 8 at a time, and one group of 8 is always copied before the count is tested. | |
| 1037 * | |
| 1038 * OUTPUTS : None. | |
| 1039 * | |
| 1040 * RETURNS : void | |
| 1041 * | |
| 1042 * FUNCTION : 1 to 2 up-scaling of the last band of pixels: the row at dest is copied unchanged into the row at dest + dest_pitch. | |
| 1043 * | |
| 1044 * SPECIAL NOTES : No row other than the one at dest is read, so nothing | |
| 1045 * is interpolated here. The function also has a "C" only | |
| 1046 * version. | |
| 1047 * | |
| 1048 ****************************************************************************/ | |
| 1049 static | |
| 1050 void last_vertical_band_1_2_scale_mmx | |
| 1051 ( | |
| 1052 unsigned char *dest, | |
| 1053 unsigned int dest_pitch, | |
| 1054 unsigned int dest_width | |
| 1055 ) { | |
| 1056 __asm { | |
| 1057 mov esi, dest // Get the source and destinatio
n pointer | |
| 1058 mov ecx, dest_pitch // Get the pitch size | |
| 1059 | |
| 1060 mov edx, dest_width // Loop counter (bytes left in the row) | |
| 1061 | |
| 1062 last_vs_1_2_loop: | |
| 1063 | |
| 1064 movq mm0, [esi] // get Src[0]: 8 bytes of the row at dest | |
| 1065 movq [esi+ecx], mm0 // write the same eight bytes one pitch below | |
| 1066 | |
| 1067 add esi, 8 /* step to the next 8 columns */ | |
| 1068 sub edx, 8 /* 8 fewer bytes to go */ | |
| 1069 | |
| 1070 jg last_vs_1_2_loop /* repeat while the remaining count is positive */ | |
| 1071 } | |
| 1072 } | |
| 1073 | |
| 1074 /**************************************************************************** | |
| 1075 * | |
| 1076 * ROUTINE : horizontal_line_1_2_scale_mmx | |
| 1077 * | |
| 1078 * INPUTS : const unsigned char *source : First source pixel of the line. | |
| 1079 * unsigned int source_width : Number of source bytes to scale (used as the loop count). | |
| 1080 * unsigned char *dest : Start of the output line; two bytes are written per source byte. | |
| 1081 * unsigned int dest_width : Unused. | |
| 1082 * | |
| 1083 * OUTPUTS : None. | |
| 1084 * | |
| 1085 * RETURNS : void | |
| 1086 * | |
| 1087 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels. Output is p[0], (p[0]+p[1]+1)>>1, p[1], (p[1]+p[2]+1)>>1, ...; the last pixel of the final group is paired with itself. | |
| 1088 * | |
| 1089 * SPECIAL NOTES : Pixels are handled in groups of 8. The loop body runs once before source_width is tested and the final group always follows, so at least 16 source bytes are read and 32 destination bytes written. | |
| 1090 * | |
| 1091 ****************************************************************************/ | |
| 1092 static | |
| 1093 void horizontal_line_1_2_scale_mmx | |
| 1094 ( | |
| 1095 const unsigned char *source, | |
| 1096 unsigned int source_width, | |
| 1097 unsigned char *dest, | |
| 1098 unsigned int dest_width | |
| 1099 ) { | |
| 1100 (void) dest_width; /* parameter is not used by this routine */ | |
| 1101 | |
| 1102 __asm { | |
| 1103 mov esi, source /* esi = read pointer */ | |
| 1104 mov edi, dest /* edi = write pointer */ | |
| 1105 | |
| 1106 pxor mm7, mm7 /* mm7 = 0, used to zero-extend bytes to words */ | |
| 1107 movq mm6, four_ones /* rounding term added before the shift by 1 */ | |
| 1108 | |
| 1109 mov ecx, source_width /* ecx = source bytes left */ | |
| 1110 | |
| 1111 hs_1_2_loop: | |
| 1112 | |
| 1113 movq mm0, [esi] /* p[0..7] */ | |
| 1114 movq mm1, [esi+1] /* p[1..8]: each pixel's right-hand neighbour */ | |
| 1115 | |
| 1116 movq mm2, mm0 /* copy of p for the high four bytes */ | |
| 1117 movq mm3, mm1 /* copy of the neighbours for the high four bytes */ | |
| 1118 | |
| 1119 movq mm4, mm0 /* untouched p, needed later for interleaving */ | |
| 1120 punpcklbw mm0, mm7 /* low p as four words */ | |
| 1121 | |
| 1122 punpcklbw mm1, mm7 /* low neighbours as four words */ | |
| 1123 paddw mm0, mm1 /* low (p[i] + p[i+1]) */ | |
| 1124 | |
| 1125 paddw mm0, mm6 /* low (p[i] + p[i+1] + 1) */ | |
| 1126 punpckhbw mm2, mm7 /* high p as four words */ | |
| 1127 | |
| 1128 punpckhbw mm3, mm7 /* high neighbours as four words */ | |
| 1129 paddw mm2, mm3 /* high (p[i] + p[i+1]) */ | |
| 1130 | |
| 1131 paddw mm2, mm6 /* high (p[i] + p[i+1] + 1) */ | |
| 1132 psraw mm0, 1 /* low (p[i] + p[i+1] + 1)/2 */ | |
| 1133 | |
| 1134 psraw mm2, 1 /* high (p[i] + p[i+1] + 1)/2 */ | |
| 1135 packuswb mm0, mm2 /* mm0 = the eight interpolated bytes */ | |
| 1136 | |
| 1137 movq mm2, mm4 | |
| 1138 punpcklbw mm2, mm0 /* p0, avg0, p1, avg1, p2, avg2, p3, avg3 */ | |
| 1139 | |
| 1140 movq [edi], mm2 /* first 8 output bytes */ | |
| 1141 punpckhbw mm4, mm0 /* p4, avg4, p5, avg5, p6, avg6, p7, avg7 */ | |
| 1142 | |
| 1143 movq [edi+8], mm4 /* next 8 output bytes */ | |
| 1144 add esi, 8 /* 8 source bytes consumed */ | |
| 1145 | |
| 1146 add edi, 16 /* 16 output bytes produced */ | |
| 1147 sub ecx, 8 | |
| 1148 | |
| 1149 cmp ecx, 8 | |
| 1150 jg hs_1_2_loop /* loop while more than 8 source bytes remain */ | |
| 1151 | |
| 1152 // last eight pixels: nothing is read past p[7], so p[7] is paired with itself | |
| 1153 | |
| 1154 movq mm0, [esi] /* p[0..7] of the final group */ | |
| 1155 movq mm1, mm0 | |
| 1156 | |
| 1157 movq mm2, mm0 /* copy of p for the high four bytes */ | |
| 1158 movq mm3, mm1 | |
| 1159 | |
| 1160 psrlq mm1, 8 /* p[1..7] followed by a zero byte */ | |
| 1161 psrlq mm3, 56 /* p[7] alone in the lowest byte */ | |
| 1162 | |
| 1163 psllq mm3, 56 /* p[7] alone in the highest byte */ | |
| 1164 por mm1, mm3 /* neighbours = p[1..7], p[7] */ | |
| 1165 | |
| 1166 movq mm3, mm1 /* copy of the neighbours for the high four bytes */ | |
| 1167 movq mm4, mm0 /* untouched p, needed later for interleaving */ | |
| 1168 | |
| 1169 punpcklbw mm0, mm7 | |
| 1170 punpcklbw mm1, mm7 | |
| 1171 | |
| 1172 paddw mm0, mm1 /* low (p[i] + neighbour) */ | |
| 1173 paddw mm0, mm6 /* low (p[i] + neighbour + 1) */ | |
| 1174 | |
| 1175 punpckhbw mm2, mm7 | |
| 1176 punpckhbw mm3, mm7 | |
| 1177 | |
| 1178 paddw mm2, mm3 /* high (p[i] + neighbour) */ | |
| 1179 paddw mm2, mm6 /* high (p[i] + neighbour + 1) */ | |
| 1180 | |
| 1181 psraw mm0, 1 | |
| 1182 psraw mm2, 1 | |
| 1183 | |
| 1184 packuswb mm0, mm2 /* mm0 = the eight interpolated bytes */ | |
| 1185 movq mm2, mm4 | |
| 1186 | |
| 1187 punpcklbw mm2, mm0 /* p0, avg0, ... p3, avg3 */ | |
| 1188 movq [edi], mm2 | |
| 1189 | |
| 1190 punpckhbw mm4, mm0 /* p4, avg4, ... p7, avg7 */ | |
| 1191 movq [edi+8], mm4 | |
| 1192 } | |
| 1193 } | |
| 1194 | |
| 1195 | |
| 1196 | |
| 1197 | |
| 1198 | |
| 1199 __declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128,
192 }; /* four 16-bit lanes; each lane plus the matching const54_1 lane is 256 */ | 29 __declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128,
192 }; /* four 16-bit lanes; each lane plus the matching const54_1 lane is 256 */ |
| 1200 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,
64 }; /* presumably the complementary blend weights to const54_2 for the 5_4 scaler -- confirm at the use site */ | 30 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,
64 }; /* presumably the complementary blend weights to const54_2 for the 5_4 scaler -- confirm at the use site */ |
| 1201 | 31 |
| 1202 | 32 |
| 1203 /**************************************************************************** | 33 /**************************************************************************** |
| 1204 * | 34 * |
| 1205 * ROUTINE : horizontal_line_5_4_scale_mmx | 35 * ROUTINE : horizontal_line_5_4_scale_mmx |
| 1206 * | 36 * |
| 1207 * INPUTS : const unsigned char *source : Pointer to source data. | 37 * INPUTS : const unsigned char *source : Pointer to source data. |
| 1208 * unsigned int source_width : Stride of source. | 38 * unsigned int source_width : Stride of source. |
| (...skipping 469 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1678 cmp esi, ecx | 508 cmp esi, ecx |
| 1679 jl vs_2_1_i_loop | 509 jl vs_2_1_i_loop |
| 1680 | 510 |
| 1681 } | 511 } |
| 1682 } | 512 } |
| 1683 | 513 |
| 1684 | 514 |
| 1685 /* Assign scaler implementations to the vp8_* function pointers: *_mmx routines for 1_2, 3_5, 4_5, 5_4, 5_3 and 2_1, vp8cx_*_c routines for 3_4 and 2_3. */ | 515 /* Assign the *_mmx 5_4, 5_3 and 2_1 scalers to the vp8_* function pointers. */ |
| 1686 void | 516 void |
| 1687 register_mmxscalers(void) { | 517 register_mmxscalers(void) { |
| 1688 vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx; | |
| 1689 vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx; | |
| 1690 vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx; | |
| 1691 vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx; | |
| 1692 vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx; | |
| 1693 vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx; | |
| 1694 vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx; | |
| 1695 vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx; | |
| 1696 vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx; | |
| 1697 /* 3_4 and 2_3 are assigned vp8cx_*_c functions rather than *_mmx ones */ | |
| 1698 vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c; | |
| 1699 vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c; | |
| 1700 vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c; | |
| 1701 vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c; | |
| 1702 vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c; | |
| 1703 vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c; | |
| 1704 | |
| 1705 | |
| 1706 | |
| 1707 vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx; | 518 vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx; |
| 1708 vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx; | 519 vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx; |
| 1709 vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx; | 520 vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx; |
| 1710 vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx; | 521 vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx; |
| 1711 vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx; | 522 vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx; |
| 1712 vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx; | 523 vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx; |
| 1713 vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx; | 524 vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx; |
| 1714 | |
| 1715 | |
| 1716 | |
| 1717 | |
| 1718 } | 525 } |
| OLD | NEW |