OLD | NEW |
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "media/base/yuv_row.h" | 5 #include "media/base/yuv_row.h" |
6 | 6 |
7 // Enable bilinear filtering by turning on the following macro. | 7 // Enable bilinear filtering by turning on the following macro. |
8 // #define MEDIA_BILINEAR_FILTER 1 | 8 // #define MEDIA_BILINEAR_FILTER 1 |
9 | 9 |
10 namespace media { | 10 namespace media { |
(...skipping 228 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
239 #undef RGBY | 239 #undef RGBY |
240 #undef RGBU | 240 #undef RGBU |
241 #undef RGBV | 241 #undef RGBV |
242 #undef MMX_ALIGNED | 242 #undef MMX_ALIGNED |
243 | 243 |
244 // Warning C4799: function has no EMMS instruction. | 244 // Warning C4799: function has no EMMS instruction. |
245 // EMMS() is slow and should be called by the calling function once per image. | 245 // EMMS() is slow and should be called by the calling function once per image. |
246 #pragma warning(disable: 4799) | 246 #pragma warning(disable: 4799) |
247 | 247 |
// Diff view: the left column is the old ConvertYV12ToRGB32Row, the right column
// its replacement FastConvertYUVToRGB32Row. Both are naked functions, so the
// arguments are read straight off the stack: pushad stores 32 bytes and the
// return address takes 4 more, putting the first argument at [esp + 32 + 4].
// Neither version executes EMMS before returning.
248 __declspec(naked) | 248 __declspec(naked) |
249 void ConvertYV12ToRGB32Row(const uint8* y_buf, | 249 void FastConvertYUVToRGB32Row(const uint8* y_buf, |
250 const uint8* u_buf, | 250 const uint8* u_buf, |
251 const uint8* v_buf, | 251 const uint8* v_buf, |
252 uint8* rgb_buf, | 252 uint8* rgb_buf, |
253 int width) { | 253 int width) { |
254 __asm { | 254 __asm { |
255 pushad | 255 pushad |
256 mov edx, [esp + 32 + 4] // Y | 256 mov edx, [esp + 32 + 4] // Y |
257 mov edi, [esp + 32 + 8] // U | 257 mov edi, [esp + 32 + 8] // U |
258 mov esi, [esp + 32 + 12] // V | 258 mov esi, [esp + 32 + 12] // V |
259 mov ebp, [esp + 32 + 16] // rgb | 259 mov ebp, [esp + 32 + 16] // rgb |
260 mov ecx, [esp + 32 + 20] // width | 260 mov ecx, [esp + 32 + 20] // width |
// Old: width is halved into a pixel-pair count and the loop body is entered
// unconditionally. New: jump to the bottom-of-loop test first, so a width
// below 2 never runs the two-pixel body.
261 shr ecx, 1 | 261 jmp wend |
262 | 262 |
// Two-pixel body: one U and one V sample are looked up and summed into mm0,
// which is then added (signed saturation) to the table entries of two
// consecutive Y samples. The sums are shifted right by 6, packed to bytes with
// unsigned saturation and written with a single 8-byte non-temporal store.
263 wloop : | 263 wloop : |
264 movzx eax, byte ptr [edi] // NOLINT | 264 movzx eax, byte ptr [edi] |
265 add edi, 1 | 265 add edi, 1 |
266 movzx ebx, byte ptr [esi] // NOLINT | 266 movzx ebx, byte ptr [esi] |
267 add esi, 1 | 267 add esi, 1 |
268 movq mm0, [coefficients_RGB_U + 8 * eax] | 268 movq mm0, [coefficients_RGB_U + 8 * eax] |
269 movzx eax, byte ptr [edx] // NOLINT | 269 movzx eax, byte ptr [edx] |
270 paddsw mm0, [coefficients_RGB_V + 8 * ebx] | 270 paddsw mm0, [coefficients_RGB_V + 8 * ebx] |
271 movzx ebx, byte ptr [edx + 1] // NOLINT | 271 movzx ebx, byte ptr [edx + 1] |
272 movq mm1, [coefficients_RGB_Y + 8 * eax] | 272 movq mm1, [coefficients_RGB_Y + 8 * eax] |
273 add edx, 2 | 273 add edx, 2 |
274 movq mm2, [coefficients_RGB_Y + 8 * ebx] | 274 movq mm2, [coefficients_RGB_Y + 8 * ebx] |
275 paddsw mm1, mm0 | 275 paddsw mm1, mm0 |
276 paddsw mm2, mm0 | 276 paddsw mm2, mm0 |
277 psraw mm1, 6 | 277 psraw mm1, 6 |
278 psraw mm2, 6 | 278 psraw mm2, 6 |
279 packuswb mm1, mm2 | 279 packuswb mm1, mm2 |
280 movntq [ebp], mm1 // NOLINT | 280 movntq [ebp], mm1 |
281 add ebp, 8 | 281 add ebp, 8 |
// Old: counts pixel pairs down to zero. NOTE(review): with a width of 0 or 1
// the old counter starts at 0, so the first decrement wraps instead of
// terminating the loop.
// New: subtract 2 per pair and keep looping while the result is non-negative.
282 sub ecx, 1 | 282 wend : |
283 jnz wloop | 283 sub ecx, 2 |
| 284 jns wloop |
| 285 |
// New only: ecx is now -1 (one pixel left) or -2 (none left) for non-negative
// widths, so bit 0 tells whether a final single pixel remains.
| 286 and ecx, 1 // odd number of pixels? |
| 287 jz wdone |
| 288 |
// New only: convert the last pixel from the current U, V and Y samples and
// store just 4 bytes with movd.
| 289 movzx eax, byte ptr [edi] |
| 290 movq mm0, [coefficients_RGB_U + 8 * eax] |
| 291 movzx eax, byte ptr [esi] |
| 292 paddsw mm0, [coefficients_RGB_V + 8 * eax] |
| 293 movzx eax, byte ptr [edx] |
| 294 movq mm1, [coefficients_RGB_Y + 8 * eax] |
| 295 paddsw mm1, mm0 |
| 296 psraw mm1, 6 |
| 297 packuswb mm1, mm1 |
| 298 movd [ebp], mm1 |
| 299 wdone : |
284 | 300 |
285 popad | 301 popad |
286 ret | 302 ret |
287 } | 303 } |
288 } | 304 } |
289 | 305 |
// Diff view: the left column is the old HalfYV12ToRGB32Row, the right column
// its replacement ConvertYUVToRGB32Row, which gains a sixth argument `step`.
// Old: each iteration reads one U, one V and one Y sample, advances the Y
// pointer by 2 and stores one 4-byte pixel.
// New: each iteration stores two pixels (8 bytes); the U and V pointers move
// by `step` once per iteration and the Y pointer moves by `step` twice.
290 __declspec(naked) | 306 __declspec(naked) |
291 void HalfYV12ToRGB32Row(const uint8* y_buf, | 307 void ConvertYUVToRGB32Row(const uint8* y_buf, |
292 const uint8* u_buf, | 308 const uint8* u_buf, |
293 const uint8* v_buf, | 309 const uint8* v_buf, |
294 uint8* rgb_buf, | 310 uint8* rgb_buf, |
295 int width) { | 311 int width, |
| 312 int step) { |
296 __asm { | 313 __asm { |
297 pushad | 314 pushad |
298 mov edx, [esp + 32 + 4] // Y | 315 mov edx, [esp + 32 + 4] // Y |
299 mov edi, [esp + 32 + 8] // U | 316 mov edi, [esp + 32 + 8] // U |
300 mov esi, [esp + 32 + 12] // V | 317 mov esi, [esp + 32 + 12] // V |
301 mov ebp, [esp + 32 + 16] // rgb | 318 mov ebp, [esp + 32 + 16] // rgb |
302 mov ecx, [esp + 32 + 20] // width | 319 mov ecx, [esp + 32 + 20] // width |
// New only: ebx holds `step` for the whole function, so eax is reused as the
// table index for U, V and Y in turn. The loop is entered at its bottom test.
| 320 mov ebx, [esp + 32 + 24] // step |
| 321 jmp wend |
303 | 322 |
304 wloop : | 323 wloop : |
305 movzx eax, byte ptr [edi] | 324 movzx eax, byte ptr [edi] |
306 add edi, 1 | 325 add edi, ebx |
307 movzx ebx, byte ptr [esi] | |
308 add esi, 1 | |
309 movq mm0, [coefficients_RGB_U + 8 * eax] | 326 movq mm0, [coefficients_RGB_U + 8 * eax] |
| 327 movzx eax, byte ptr [esi] |
| 328 add esi, ebx |
| 329 paddsw mm0, [coefficients_RGB_V + 8 * eax] |
310 movzx eax, byte ptr [edx] | 330 movzx eax, byte ptr [edx] |
311 paddsw mm0, [coefficients_RGB_V + 8 * ebx] | 331 add edx, ebx |
// NOTE(review): in the old column the MEDIA_BILINEAR_FILTER block leaves the
// averaged Y value in ebx, but the table lookup that follows indexes with eax,
// so the average appears to be unused -- confirm.
312 #if MEDIA_BILINEAR_FILTER | 332 movq mm1, [coefficients_RGB_Y + 8 * eax] |
313 movzx ebx, byte ptr [edx + 1] | 333 movzx eax, byte ptr [edx] |
314 add ebx, eax | 334 add edx, ebx |
315 shr ebx, 1 | 335 movq mm2, [coefficients_RGB_Y + 8 * eax] |
316 #endif | 336 paddsw mm1, mm0 |
317 paddsw mm0, [coefficients_RGB_Y + 8 * eax] | 337 paddsw mm2, mm0 |
318 add edx, 2 | 338 psraw mm1, 6 |
319 psraw mm0, 6 | 339 psraw mm2, 6 |
320 packuswb mm0, mm0 | 340 packuswb mm1, mm2 |
321 movd [ebp], mm0 | 341 movntq [ebp], mm1 |
322 add ebp, 4 | 342 add ebp, 8 |
// Old: one pixel per iteration, counted down to zero. NOTE(review): a width of
// 0 makes the old counter wrap on its first decrement.
// New: loop while at least two pixels remain.
323 sub ecx, 1 | 343 wend : |
324 jnz wloop | 344 sub ecx, 2 |
| 345 jns wloop |
| 346 |
| 347 and ecx, 1 // odd number of pixels? |
| 348 jz wdone |
| 349 |
// New only: odd-width tail. Reads the samples at the current pointers without
// advancing them and stores 4 bytes.
| 350 movzx eax, byte ptr [edi] |
| 351 movq mm0, [coefficients_RGB_U + 8 * eax] |
| 352 movzx eax, byte ptr [esi] |
| 353 paddsw mm0, [coefficients_RGB_V + 8 * eax] |
| 354 movzx eax, byte ptr [edx] |
| 355 movq mm1, [coefficients_RGB_Y + 8 * eax] |
| 356 paddsw mm1, mm0 |
| 357 psraw mm1, 6 |
| 358 packuswb mm1, mm1 |
| 359 movd [ebp], mm1 |
| 360 wdone : |
325 | 361 |
326 popad | 362 popad |
327 ret | 363 ret |
328 } | 364 } |
329 } | 365 } |
330 | 366 |
// New function RotateConvertYUVToRGB32Row (right column). The left column on
// these lines holds only the first two lines of the old ScaleYV12ToRGB32Row,
// whose remainder appears further down in the diff.
// Converts two pixels per iteration like the other row converters in this
// file, but with separate strides: `ystep` moves the Y pointer (twice per
// iteration) and `uvstep` moves the U and V pointers (once per iteration).
331 __declspec(naked) | 367 __declspec(naked) |
332 void ScaleYV12ToRGB32Row(const uint8* y_buf, | 368 void RotateConvertYUVToRGB32Row(const uint8* y_buf, |
| 369 const uint8* u_buf, |
| 370 const uint8* v_buf, |
| 371 uint8* rgb_buf, |
| 372 int width, |
| 373 int ystep, |
| 374 int uvstep) { |
| 375 __asm { |
| 376 pushad |
| 377 mov edx, [esp + 32 + 4] // Y |
| 378 mov edi, [esp + 32 + 8] // U |
| 379 mov esi, [esp + 32 + 12] // V |
| 380 mov ebp, [esp + 32 + 16] // rgb |
| 381 mov ecx, [esp + 32 + 20] // width |
| 382 jmp wend |
| 383 |
// ebx is reloaded from the stack inside the loop because it alternates between
// holding uvstep (for the U/V pointers) and ystep (for the Y pointer).
| 384 wloop : |
| 385 movzx eax, byte ptr [edi] |
| 386 mov ebx, [esp + 32 + 28] // uvstep |
| 387 add edi, ebx |
| 388 movq mm0, [coefficients_RGB_U + 8 * eax] |
| 389 movzx eax, byte ptr [esi] |
| 390 add esi, ebx |
| 391 paddsw mm0, [coefficients_RGB_V + 8 * eax] |
| 392 movzx eax, byte ptr [edx] |
| 393 mov ebx, [esp + 32 + 24] // ystep |
| 394 add edx, ebx |
| 395 movq mm1, [coefficients_RGB_Y + 8 * eax] |
| 396 movzx eax, byte ptr [edx] |
| 397 add edx, ebx |
| 398 movq mm2, [coefficients_RGB_Y + 8 * eax] |
| 399 paddsw mm1, mm0 |
| 400 paddsw mm2, mm0 |
| 401 psraw mm1, 6 |
| 402 psraw mm2, 6 |
| 403 packuswb mm1, mm2 |
| 404 movntq [ebp], mm1 |
| 405 add ebp, 8 |
// Loop while at least two pixels remain; afterwards bit 0 of ecx tells whether
// one more pixel has to be converted.
| 406 wend : |
| 407 sub ecx, 2 |
| 408 jns wloop |
| 409 |
| 410 and ecx, 1 // odd number of pixels? |
| 411 jz wdone |
| 412 |
// Odd-width tail: one pixel from the current U, V and Y samples, stored as
// 4 bytes.
| 413 movzx eax, byte ptr [edi] |
| 414 movq mm0, [coefficients_RGB_U + 8 * eax] |
| 415 movzx eax, byte ptr [esi] |
| 416 paddsw mm0, [coefficients_RGB_V + 8 * eax] |
| 417 movzx eax, byte ptr [edx] |
| 418 movq mm1, [coefficients_RGB_Y + 8 * eax] |
| 419 paddsw mm1, mm0 |
| 420 psraw mm1, 6 |
| 421 packuswb mm1, mm1 |
| 422 movd [ebp], mm1 |
| 423 wdone : |
| 424 |
| 425 popad |
| 426 ret |
| 427 } |
| 428 } |
| 429 |
// Diff view: the right column is the new DoubleYUVToRGB32Row; the left column
// is the continuation of the old ScaleYV12ToRGB32Row (its first two lines
// appear earlier in the diff).
// New: every converted pixel is written twice, so one iteration consumes one
// U, one V and two Y samples and stores 16 bytes; ecx drops by 4 per iteration.
// Old: x (eax) advances by dx per output pixel; chroma is indexed by x >> 5
// and luma by x >> 4 (presumably dx is fixed point with 4 fractional bits --
// confirm against callers).
| 430 __declspec(naked) |
| 431 void DoubleYUVToRGB32Row(const uint8* y_buf, |
333 const uint8* u_buf, | 432 const uint8* u_buf, |
334 const uint8* v_buf, | 433 const uint8* v_buf, |
335 uint8* rgb_buf, | 434 uint8* rgb_buf, |
336 int width, | 435 int width) { |
337 int dx) { | |
338 __asm { | 436 __asm { |
339 pushad | 437 pushad |
340 mov edx, [esp + 32 + 4] // Y | 438 mov edx, [esp + 32 + 4] // Y |
341 mov edi, [esp + 32 + 8] // U | 439 mov edi, [esp + 32 + 8] // U |
342 mov esi, [esp + 32 + 12] // V | 440 mov esi, [esp + 32 + 12] // V |
343 mov ebp, [esp + 32 + 16] // rgb | 441 mov ebp, [esp + 32 + 16] // rgb |
344 mov ecx, [esp + 32 + 20] // width | 442 mov ecx, [esp + 32 + 20] // width |
345 xor eax, eax // x | 443 jmp wend |
346 | 444 |
347 wloop : | 445 wloop : |
348 mov ebx, eax | 446 movzx eax, byte ptr [edi] |
349 sar ebx, 5 | 447 add edi, 1 |
350 movzx ebx, byte ptr [edi + ebx] | 448 movzx ebx, byte ptr [esi] |
351 movq mm0, [coefficients_RGB_U + 8 * ebx] | 449 add esi, 1 |
352 mov ebx, eax | 450 movq mm0, [coefficients_RGB_U + 8 * eax] |
353 sar ebx, 5 | 451 movzx eax, byte ptr [edx] |
354 movzx ebx, byte ptr [esi + ebx] | |
355 paddsw mm0, [coefficients_RGB_V + 8 * ebx] | 452 paddsw mm0, [coefficients_RGB_V + 8 * ebx] |
356 mov ebx, eax | 453 movq mm1, [coefficients_RGB_Y + 8 * eax] |
357 sar ebx, 4 | 454 paddsw mm1, mm0 |
358 movzx ebx, byte ptr [edx + ebx] | 455 psraw mm1, 6 |
| 456 packuswb mm1, mm1 |
// New only: punpckldq copies the low 4 bytes into the high half, so the 8-byte
// store writes the same pixel twice.
| 457 punpckldq mm1, mm1 |
| 458 movntq [ebp], mm1 |
| 459 |
// New only: second Y sample of the pair. Its table entry is added directly
// into mm0, which is not needed again in this iteration.
| 460 movzx ebx, byte ptr [edx + 1] |
| 461 add edx, 2 |
359 paddsw mm0, [coefficients_RGB_Y + 8 * ebx] | 462 paddsw mm0, [coefficients_RGB_Y + 8 * ebx] |
360 psraw mm0, 6 | 463 psraw mm0, 6 |
361 packuswb mm0, mm0 | 464 packuswb mm0, mm0 |
362 movd [ebp], mm0 | 465 punpckldq mm0, mm0 |
| 466 movntq [ebp+8], mm0 |
| 467 add ebp, 16 |
| 468 wend : |
| 469 sub ecx, 4 |
| 470 jns wloop |
| 471 |
// New only: adding 4 back restores the count of leftover pixels (0..3 for a
// non-negative width). One pixel is converted from the current samples and
// then stored once per leftover pixel by the wloop1 loop below.
// NOTE(review): when 3 pixels are left, the third is also produced from the
// first Y sample rather than the next one -- confirm this is acceptable.
| 472 add ecx, 4 |
| 473 jz wdone |
| 474 |
| 475 movzx eax, byte ptr [edi] |
| 476 movq mm0, [coefficients_RGB_U + 8 * eax] |
| 477 movzx eax, byte ptr [esi] |
| 478 paddsw mm0, [coefficients_RGB_V + 8 * eax] |
| 479 movzx eax, byte ptr [edx] |
| 480 movq mm1, [coefficients_RGB_Y + 8 * eax] |
| 481 paddsw mm1, mm0 |
| 482 psraw mm1, 6 |
| 483 packuswb mm1, mm1 |
| 484 jmp wend1 |
| 485 |
| 486 wloop1 : |
| 487 movd [ebp], mm1 |
363 add ebp, 4 | 488 add ebp, 4 |
// Old: advance x by dx and count one pixel down to zero. NOTE(review): a width
// of 0 makes the old counter wrap on its first decrement.
364 add eax, [esp + 32 + 24] // x += dx | 489 wend1 : |
365 sub ecx, 1 | 490 sub ecx, 1 |
366 jnz wloop | 491 jns wloop1 |
367 | 492 wdone : |
368 popad | 493 popad |
369 ret | 494 ret |
370 } | 495 } |
371 } | 496 } |
372 | 497 |
373 | 498 // This version does general purpose scaling by any amount, up or down. |
| 499 // The only thing it can not do is rotation by 90 or 270. |
| 500 // For performance the chroma is under sampled, reducing cost of a 3x |
| 501 // 1080p scale from 8.4 ms to 5.4 ms. |
// Diff view: the left column is the old Half2Row, the right column is the new
// ScaleYUVToRGB32Row.
// Old Half2Row: each output byte is the sum of two adjacent bytes from each of
// the two input rows, shifted right by 2.
// New ScaleYUVToRGB32Row: x (ebx) advances by dx once per output pixel. Two
// pixels are produced per iteration and share one U/V lookup taken at the x of
// the first pixel (index x >> 5); Y is indexed by x >> 4 for each pixel.
374 __declspec(naked) | 502 __declspec(naked) |
375 void Half2Row(const uint8* in_row0, | 503 void ScaleYUVToRGB32Row(const uint8* y_buf, |
376 const uint8* in_row1, | 504 const uint8* u_buf, |
377 uint8* out_row, | 505 const uint8* v_buf, |
378 int out_width) { | 506 uint8* rgb_buf, |
| 507 int width, |
| 508 int dx) { |
379 __asm { | 509 __asm { |
380 pushad | 510 pushad |
381 mov esi, [esp + 32 + 4] // row0 | 511 mov edx, [esp + 32 + 4] // Y |
382 mov ebx, [esp + 32 + 8] // row1 | 512 mov edi, [esp + 32 + 8] // U |
383 mov edi, [esp + 32 + 12] // out | 513 mov esi, [esp + 32 + 12] // V |
384 mov ecx, [esp + 32 + 16] // width | 514 mov ebp, [esp + 32 + 16] // rgb |
| 515 mov ecx, [esp + 32 + 20] // width |
| 516 xor ebx, ebx // x |
| 517 jmp wend |
385 | 518 |
386 wloop : | 519 wloop : |
387 movzx eax, byte ptr [esi] | 520 mov eax, ebx |
388 movzx edx, byte ptr [esi+1] | 521 sar eax, 5 |
389 add esi, 2 | 522 movzx eax, byte ptr [edi + eax] |
390 add eax, edx | 523 movq mm0, [coefficients_RGB_U + 8 * eax] |
391 movzx edx, byte ptr [ebx] | 524 mov eax, ebx |
392 add eax, edx | 525 sar eax, 5 |
393 movzx edx, byte ptr [ebx+1] | 526 movzx eax, byte ptr [esi + eax] |
394 add eax, edx | 527 paddsw mm0, [coefficients_RGB_V + 8 * eax] |
// New: x is copied to eax for the Y lookup and ebx is advanced by dx right
// away, so the second Y lookup below uses the next x.
395 add ebx, 2 | 528 mov eax, ebx |
396 shr eax, 2 | 529 add ebx, [esp + 32 + 24] // x += dx |
397 mov [edi], al | 530 sar eax, 4 |
398 add edi, 1 | 531 movzx eax, byte ptr [edx + eax] |
399 sub ecx, 1 | 532 movq mm1, [coefficients_RGB_Y + 8 * eax] |
400 jnz wloop | 533 mov eax, ebx |
| 534 add ebx, [esp + 32 + 24] // x += dx |
| 535 sar eax, 4 |
| 536 movzx eax, byte ptr [edx + eax] |
| 537 movq mm2, [coefficients_RGB_Y + 8 * eax] |
| 538 paddsw mm1, mm0 |
| 539 paddsw mm2, mm0 |
| 540 psraw mm1, 6 |
| 541 psraw mm2, 6 |
| 542 packuswb mm1, mm2 |
| 543 movntq [ebp], mm1 |
| 544 add ebp, 8 |
| 545 wend : |
| 546 sub ecx, 2 |
| 547 jns wloop |
| 548 |
| 549 and ecx, 1 // odd number of pixels? |
| 550 jz wdone |
| 551 |
// New only: odd-width tail. NOTE(review): ebx is not advanced between the two
// Y lookups below, so mm1 and mm2 hold the same pixel, and movd stores only
// the low 4 bytes -- the mm2 computation looks redundant.
| 552 mov eax, ebx |
| 553 sar eax, 5 |
| 554 movzx eax, byte ptr [edi + eax] |
| 555 movq mm0, [coefficients_RGB_U + 8 * eax] |
| 556 mov eax, ebx |
| 557 sar eax, 5 |
| 558 movzx eax, byte ptr [esi + eax] |
| 559 paddsw mm0, [coefficients_RGB_V + 8 * eax] |
| 560 mov eax, ebx |
| 561 sar eax, 4 |
| 562 movzx eax, byte ptr [edx + eax] |
| 563 movq mm1, [coefficients_RGB_Y + 8 * eax] |
| 564 mov eax, ebx |
| 565 sar eax, 4 |
| 566 movzx eax, byte ptr [edx + eax] |
| 567 movq mm2, [coefficients_RGB_Y + 8 * eax] |
| 568 paddsw mm1, mm0 |
| 569 paddsw mm2, mm0 |
| 570 psraw mm1, 6 |
| 571 psraw mm2, 6 |
| 572 packuswb mm1, mm2 |
| 573 movd [ebp], mm1 |
| 574 |
| 575 wdone : |
401 | 576 |
402 popad | 577 popad |
403 ret | 578 ret |
404 } | 579 } |
405 } | 580 } |
406 | 581 |
407 } // namespace media | 582 } // namespace media |
408 | 583 |
OLD | NEW |