OLD | NEW |
1 ; | 1 ; |
2 ; jcclrss2.asm - colorspace conversion (SSE2) | 2 ; jcclrss2.asm - colorspace conversion (SSE2) |
3 ; | 3 ; |
4 ; x86 SIMD extension for IJG JPEG library | 4 ; x86 SIMD extension for IJG JPEG library |
5 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 5 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
6 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 6 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
7 ; | 7 ; |
8 ; This file should be assembled with NASM (Netwide Assembler), | 8 ; This file should be assembled with NASM (Netwide Assembler), |
9 ; can *not* be assembled with Microsoft's MASM or any compatible | 9 ; can *not* be assembled with Microsoft's MASM or any compatible |
10 ; assembler (including Borland's Turbo Assembler). | 10 ; assembler (including Borland's Turbo Assembler). |
(...skipping 275 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
286 punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) | 286 punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) |
287 punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) | 287 punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) |
288 | 288 |
289 punpcklbw xmmF,xmmH | 289 punpcklbw xmmF,xmmH |
290 punpckhbw xmmH,xmmH | 290 punpckhbw xmmH,xmmH |
291 psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) | 291 psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) |
292 psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) | 292 psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) |
293 | 293 |
294 %endif ; RGB_PIXELSIZE ; --------------- | 294 %endif ; RGB_PIXELSIZE ; --------------- |
295 | 295 |
| 296 %if PREMULTIPLY == 1 ; --------------- |
| 297 ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE, xmm6=A(02468ACE)=AE |
| 298 ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO, xmm7=A(13579BDF)=AO |
| 299 |
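| 300 ; Unpremultiply the even-pixel R, G, B (xmm0, xmm2, xmm4) by the even-pixel alpha (xmm6). NOTE(review): pmaxuw and packusdw, used in this PREMULTIPLY section, are SSE4.1 instructions, not SSE2 -- confirm the CPU dispatch for this file. |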
| 301 movdqa [wk(0)], xmm1 |
| 302 movdqa [wk(1)], xmm3 |
| 303 movdqa [wk(2)], xmm5 |
| 304 movdqa [wk(3)], xmm7 |
| 305 |
| 306 ; for (int i = 0; i < 8; ++i) |
| 307 ; xmm6.word[i] = xmm6.word[i] ? xmm6.word[i] : 255; // a[i] ? a[i] : 255; |
| 308 pxor xmm1, xmm1 |
| 309 pcmpeqw xmm1, xmm6 |
| 310 psrlw xmm1, 8 |
| 311 pmaxuw xmm6, xmm1 |
| 312 |
| 313 ; xmm0.dword[i] = r[i]; xmm2.dword[i] = g[i]; xmm4.dword[i] = b[i]; xmm6.dword[i] = a[i]; |
| 314 ; xmm1.dword[i] = r[4+i]; xmm3.dword[i] = g[4+i]; xmm5.dword[i] = b[4+i]; xmm7.dword[i] = a[4+i]; |
| 315 movdqa xmm1, xmm0 |
| 316 movdqa xmm3, xmm2 |
| 317 movdqa xmm5, xmm4 |
| 318 movdqa xmm7, xmm6 |
| 319 |
| 320 movdqa [wk(4)], xmm1 |
| 321 pxor xmm1, xmm1 |
| 322 punpcklwd xmm0, xmm1 |
| 323 punpcklwd xmm2, xmm1 |
| 324 punpcklwd xmm4, xmm1 |
| 325 punpcklwd xmm6, xmm1 |
| 326 movdqa xmm1, [wk(4)] |
| 327 |
| 328 movdqa [wk(4)], xmm0 |
| 329 pxor xmm0, xmm0 |
| 330 punpckhwd xmm1, xmm0 |
| 331 punpckhwd xmm3, xmm0 |
| 332 punpckhwd xmm5, xmm0 |
| 333 punpckhwd xmm7, xmm0 |
| 334 movdqa xmm0, [wk(4)]; |
| 335 |
| 336 ; for (int i = 0; i < 4; ++i) { |
| 337 ; xmm0.float[i] = (float)xmm0.dword[i]; // (float)r[i]; |
| 338 ; xmm2.float[i] = (float)xmm2.dword[i]; // (float)g[i]; |
| 339 ; xmm4.float[i] = (float)xmm4.dword[i]; // (float)b[i]; |
| 340 ; xmm6.float[i] = (float)xmm6.dword[i]; // (float)a[i]; |
| 341 ; xmm1.float[i] = (float)xmm1.dword[i]; // (float)r[4+i]; |
| 342 ; xmm3.float[i] = (float)xmm3.dword[i]; // (float)g[4+i]; |
| 343 ; xmm5.float[i] = (float)xmm5.dword[i]; // (float)b[4+i]; |
| 344 ; xmm7.float[i] = (float)xmm7.dword[i]; // (float)a[4+i]; |
| 345 ; } |
| 346 cvtdq2ps xmm0, xmm0 |
| 347 cvtdq2ps xmm2, xmm2 |
| 348 cvtdq2ps xmm4, xmm4 |
| 349 cvtdq2ps xmm6, xmm6 |
| 350 cvtdq2ps xmm1, xmm1 |
| 351 cvtdq2ps xmm3, xmm3 |
| 352 cvtdq2ps xmm5, xmm5 |
| 353 cvtdq2ps xmm7, xmm7 |
| 354 |
| 355 ; for (int i = 0; i < 4; ++i) { |
| 356 ; xmm6.float[i] = 255.0 / xmm6.float[i]; // 255.0 / (float)a[i]; |
| 357 ; xmm7.float[i] = 255.0 / xmm7.float[i]; // 255.0 / (float)a[4+i]; |
| 358 ; } |
| 359 rcpps xmm6, xmm6 |
| 360 rcpps xmm7, xmm7 |
| 361 mulps xmm6, [GOTOFF(eax,PF_255)] |
| 362 mulps xmm7, [GOTOFF(eax,PF_255)] |
| 363 |
| 364 ; for (int i = 0; i < 4; ++i) { |
| 365 ; xmm0.word[i] = (uint16)(xmm0.float[i] * xmm6.float[i]); // (uint16)((float)r[i] * 255.0 / (float)a[i]); |
| 366 ; xmm2.word[i] = (uint16)(xmm2.float[i] * xmm6.float[i]); // (uint16)((float)g[i] * 255.0 / (float)a[i]); |
| 367 ; xmm4.word[i] = (uint16)(xmm4.float[i] * xmm6.float[i]); // (uint16)((float)b[i] * 255.0 / (float)a[i]); |
| 368 ; xmm0.word[4+i] = (uint16)(xmm1.float[i] * xmm7.float[i]); // (uint16)((float)r[4+i] * 255.0 / (float)a[4+i]); |
| 369 ; xmm2.word[4+i] = (uint16)(xmm3.float[i] * xmm7.float[i]); // (uint16)((float)g[4+i] * 255.0 / (float)a[4+i]); |
| 370 ; xmm4.word[4+i] = (uint16)(xmm5.float[i] * xmm7.float[i]); // (uint16)((float)b[4+i] * 255.0 / (float)a[4+i]); |
| 371 ; } |
| 372 mulps xmm0, xmm6 |
| 373 cvtps2dq xmm0, xmm0 |
| 374 mulps xmm1, xmm7 |
| 375 cvtps2dq xmm1, xmm1 |
| 376 packusdw xmm0, xmm1 |
| 377 |
| 378 mulps xmm2, xmm6 |
| 379 cvtps2dq xmm2, xmm2 |
| 380 mulps xmm3, xmm7 |
| 381 cvtps2dq xmm3, xmm3 |
| 382 packusdw xmm2, xmm3 |
| 383 |
| 384 mulps xmm4, xmm6 |
| 385 cvtps2dq xmm4, xmm4 |
| 386 mulps xmm5, xmm7 |
| 387 cvtps2dq xmm5, xmm5 |
| 388 packusdw xmm4, xmm5 |
| 389 |
| 390 movdqa xmm1, [wk(0)] |
| 391 movdqa xmm3, [wk(1)] |
| 392 movdqa xmm5, [wk(2)] |
| 393 movdqa xmm7, [wk(3)] |
| 394 |
| 395 ; Unpremultiply the odd-pixel R, G, B (xmm1, xmm3, xmm5) by the odd-pixel alpha (xmm7). |
| 396 movdqa [wk(0)], xmm0 |
| 397 movdqa [wk(1)], xmm2 |
| 398 movdqa [wk(2)], xmm4 |
| 399 movdqa [wk(3)], xmm6 |
| 400 |
| 401 ; for (int i = 0; i < 8; ++i) |
| 402 ; xmm7.word[i] = xmm7.word[i] ? xmm7.word[i] : 255; // a[i] ? a[i] : 255; |
| 403 pxor xmm0, xmm0 |
| 404 pcmpeqw xmm0, xmm7 |
| 405 psrlw xmm0, 8 |
| 406 pmaxuw xmm7, xmm0 |
| 407 |
| 408 ; xmm1.dword[i] = r[i]; xmm3.dword[i] = g[i]; xmm5.dword[i] = b[i]; xmm7.dword[i] = a[i]; |
| 409 ; xmm0.dword[i] = r[4+i]; xmm2.dword[i] = g[4+i]; xmm4.dword[i] = b[4+i]; xmm6.dword[i] = a[4+i]; |
| 410 movdqa xmm0, xmm1 |
| 411 movdqa xmm2, xmm3 |
| 412 movdqa xmm4, xmm5 |
| 413 movdqa xmm6, xmm7 |
| 414 |
| 415 movdqa [wk(4)], xmm0 |
| 416 pxor xmm0, xmm0 |
| 417 punpcklwd xmm1, xmm0 |
| 418 punpcklwd xmm3, xmm0 |
| 419 punpcklwd xmm5, xmm0 |
| 420 punpcklwd xmm7, xmm0 |
| 421 movdqa xmm0, [wk(4)] |
| 422 |
| 423 movdqa [wk(4)], xmm1 |
| 424 pxor xmm1, xmm1 |
| 425 punpckhwd xmm0, xmm1 |
| 426 punpckhwd xmm2, xmm1 |
| 427 punpckhwd xmm4, xmm1 |
| 428 punpckhwd xmm6, xmm1 |
| 429 movdqa xmm1, [wk(4)]; |
| 430 |
| 431 ; for (int i = 0; i < 4; ++i) { |
| 432 ; xmm1.float[i] = (float)xmm1.dword[i]; // (float)r[i]; |
| 433 ; xmm3.float[i] = (float)xmm3.dword[i]; // (float)g[i]; |
| 434 ; xmm5.float[i] = (float)xmm5.dword[i]; // (float)b[i]; |
| 435 ; xmm7.float[i] = (float)xmm7.dword[i]; // (float)a[i]; |
| 436 ; xmm0.float[i] = (float)xmm0.dword[i]; // (float)r[4+i]; |
| 437 ; xmm2.float[i] = (float)xmm2.dword[i]; // (float)g[4+i]; |
| 438 ; xmm4.float[i] = (float)xmm4.dword[i]; // (float)b[4+i]; |
| 439 ; xmm6.float[i] = (float)xmm6.dword[i]; // (float)a[4+i]; |
| 440 ; } |
| 441 cvtdq2ps xmm1, xmm1 |
| 442 cvtdq2ps xmm3, xmm3 |
| 443 cvtdq2ps xmm5, xmm5 |
| 444 cvtdq2ps xmm7, xmm7 |
| 445 cvtdq2ps xmm0, xmm0 |
| 446 cvtdq2ps xmm2, xmm2 |
| 447 cvtdq2ps xmm4, xmm4 |
| 448 cvtdq2ps xmm6, xmm6 |
| 449 |
| 450 ; for (int i = 0; i < 4; ++i) { |
| 451 ; xmm7.float[i] = 255.0 / xmm7.float[i]; // 255.0 / (float)a[i]; |
| 452 ; xmm6.float[i] = 255.0 / xmm6.float[i]; // 255.0 / (float)a[4+i]; |
| 453 ; } |
| 454 rcpps xmm7, xmm7 |
| 455 rcpps xmm6, xmm6 |
| 456 mulps xmm7, [GOTOFF(eax,PF_255)] |
| 457 mulps xmm6, [GOTOFF(eax,PF_255)] |
| 458 |
| 459 ; for (int i = 0; i < 4; ++i) { |
| 460 ; xmm1.word[i] = (uint16)(xmm1.float[i] * xmm7.float[i]); // (uint16)((float)r[i] * 255.0 / (float)a[i]); |
| 461 ; xmm3.word[i] = (uint16)(xmm3.float[i] * xmm7.float[i]); // (uint16)((float)g[i] * 255.0 / (float)a[i]); |
| 462 ; xmm5.word[i] = (uint16)(xmm5.float[i] * xmm7.float[i]); // (uint16)((float)b[i] * 255.0 / (float)a[i]); |
| 463 ; xmm1.word[4+i] = (uint16)(xmm0.float[i] * xmm6.float[i]); // (uint16)((float)r[4+i] * 255.0 / (float)a[4+i]); |
| 464 ; xmm3.word[4+i] = (uint16)(xmm2.float[i] * xmm6.float[i]); // (uint16)((float)g[4+i] * 255.0 / (float)a[4+i]); |
| 465 ; xmm5.word[4+i] = (uint16)(xmm4.float[i] * xmm6.float[i]); // (uint16)((float)b[4+i] * 255.0 / (float)a[4+i]); |
| 466 ; } |
| 467 mulps xmm1, xmm7 |
| 468 cvtps2dq xmm1, xmm1 |
| 469 mulps xmm0, xmm6 |
| 470 cvtps2dq xmm0, xmm0 |
| 471 packusdw xmm1, xmm0 |
| 472 |
| 473 mulps xmm3, xmm7 |
| 474 cvtps2dq xmm3, xmm3 |
| 475 mulps xmm2, xmm6 |
| 476 cvtps2dq xmm2, xmm2 |
| 477 packusdw xmm3, xmm2 |
| 478 |
| 479 mulps xmm5, xmm7 |
| 480 cvtps2dq xmm5, xmm5 |
| 481 mulps xmm4, xmm6 |
| 482 cvtps2dq xmm4, xmm4 |
| 483 packusdw xmm5, xmm4 |
| 484 |
| 485 movdqa xmm0, [wk(0)] |
| 486 movdqa xmm2, [wk(1)] |
| 487 movdqa xmm4, [wk(2)] |
| 488 movdqa xmm6, [wk(3)] |
| 489 %endif ; PREMULTIPLY == 1 |
| 490 |
296 ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE | 491 ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE |
297 ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO | 492 ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO |
298 | 493 |
299 ; (Original) | 494 ; (Original) |
300 ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B | 495 ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B |
301 ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE | 496 ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE |
302 ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE | 497 ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE |
303 ; | 498 ; |
304 ; (This implementation) | 499 ; (This implementation) |
305 ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G | 500 ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G |
(...skipping 190 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
496 ; pop ecx ; need not be preserved | 691 ; pop ecx ; need not be preserved |
497 pop ebx | 692 pop ebx |
498 mov esp,ebp ; esp <- aligned ebp | 693 mov esp,ebp ; esp <- aligned ebp |
499 pop esp ; esp <- original ebp | 694 pop esp ; esp <- original ebp |
500 pop ebp | 695 pop ebp |
501 ret | 696 ret |
502 | 697 |
503 ; For some reason, the OS X linker does not honor the request to align the | 698 ; For some reason, the OS X linker does not honor the request to align the |
504 ; segment unless we do this. | 699 ; segment unless we do this. |
505 align 16 | 700 align 16 |
OLD | NEW |