Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(446)

Side by Side Diff: media/base/yuv_row_win.cc

Issue 113407: ScaleYV12 optimization.... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 11 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « media/base/yuv_row_mac.cc ('k') | media/base/yuv_scale.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "media/base/yuv_row.h" 5 #include "media/base/yuv_row.h"
6 6
7 // Enable bilinear filtering by turning on the following macro. 7 // Enable bilinear filtering by turning on the following macro.
8 // #define MEDIA_BILINEAR_FILTER 1 8 // #define MEDIA_BILINEAR_FILTER 1
9 9
10 namespace media { 10 namespace media {
(...skipping 228 matching lines...) Expand 10 before | Expand all | Expand 10 after
239 #undef RGBY 239 #undef RGBY
240 #undef RGBU 240 #undef RGBU
241 #undef RGBV 241 #undef RGBV
242 #undef MMX_ALIGNED 242 #undef MMX_ALIGNED
243 243
244 // Warning C4799: function has no EMMS instruction. 244 // Warning C4799: function has no EMMS instruction.
245 // EMMS() is slow and should be called by the calling function once per image. 245 // EMMS() is slow and should be called by the calling function once per image.
246 #pragma warning(disable: 4799) 246 #pragma warning(disable: 4799)
247 247
248 __declspec(naked) 248 __declspec(naked)
249 void ConvertYV12ToRGB32Row(const uint8* y_buf, 249 void FastConvertYUVToRGB32Row(const uint8* y_buf,
250 const uint8* u_buf, 250 const uint8* u_buf,
251 const uint8* v_buf, 251 const uint8* v_buf,
252 uint8* rgb_buf, 252 uint8* rgb_buf,
253 int width) { 253 int width) {
254 __asm { 254 __asm {
255 pushad 255 pushad
256 mov edx, [esp + 32 + 4] // Y 256 mov edx, [esp + 32 + 4] // Y
257 mov edi, [esp + 32 + 8] // U 257 mov edi, [esp + 32 + 8] // U
258 mov esi, [esp + 32 + 12] // V 258 mov esi, [esp + 32 + 12] // V
259 mov ebp, [esp + 32 + 16] // rgb 259 mov ebp, [esp + 32 + 16] // rgb
260 mov ecx, [esp + 32 + 20] // width 260 mov ecx, [esp + 32 + 20] // width
261 shr ecx, 1 261 jmp wend
262 262
263 wloop : 263 wloop :
264 movzx eax, byte ptr [edi] // NOLINT 264 movzx eax, byte ptr [edi]
265 add edi, 1 265 add edi, 1
266 movzx ebx, byte ptr [esi] // NOLINT 266 movzx ebx, byte ptr [esi]
267 add esi, 1 267 add esi, 1
268 movq mm0, [coefficients_RGB_U + 8 * eax] 268 movq mm0, [coefficients_RGB_U + 8 * eax]
269 movzx eax, byte ptr [edx] // NOLINT 269 movzx eax, byte ptr [edx]
270 paddsw mm0, [coefficients_RGB_V + 8 * ebx] 270 paddsw mm0, [coefficients_RGB_V + 8 * ebx]
271 movzx ebx, byte ptr [edx + 1] // NOLINT 271 movzx ebx, byte ptr [edx + 1]
272 movq mm1, [coefficients_RGB_Y + 8 * eax] 272 movq mm1, [coefficients_RGB_Y + 8 * eax]
273 add edx, 2 273 add edx, 2
274 movq mm2, [coefficients_RGB_Y + 8 * ebx] 274 movq mm2, [coefficients_RGB_Y + 8 * ebx]
275 paddsw mm1, mm0 275 paddsw mm1, mm0
276 paddsw mm2, mm0 276 paddsw mm2, mm0
277 psraw mm1, 6 277 psraw mm1, 6
278 psraw mm2, 6 278 psraw mm2, 6
279 packuswb mm1, mm2 279 packuswb mm1, mm2
280 movntq [ebp], mm1 // NOLINT 280 movntq [ebp], mm1
281 add ebp, 8 281 add ebp, 8
282 sub ecx, 1 282 wend :
283 jnz wloop 283 sub ecx, 2
284 jns wloop
285
286 and ecx, 1 // odd number of pixels?
287 jz wdone
288
289 movzx eax, byte ptr [edi]
290 movq mm0, [coefficients_RGB_U + 8 * eax]
291 movzx eax, byte ptr [esi]
292 paddsw mm0, [coefficients_RGB_V + 8 * eax]
293 movzx eax, byte ptr [edx]
294 movq mm1, [coefficients_RGB_Y + 8 * eax]
295 paddsw mm1, mm0
296 psraw mm1, 6
297 packuswb mm1, mm1
298 movd [ebp], mm1
299 wdone :
284 300
285 popad 301 popad
286 ret 302 ret
287 } 303 }
288 } 304 }
289 305
290 __declspec(naked) 306 __declspec(naked)
291 void HalfYV12ToRGB32Row(const uint8* y_buf, 307 void ConvertYUVToRGB32Row(const uint8* y_buf,
292 const uint8* u_buf, 308 const uint8* u_buf,
293 const uint8* v_buf, 309 const uint8* v_buf,
294 uint8* rgb_buf, 310 uint8* rgb_buf,
295 int width) { 311 int width,
312 int step) {
296 __asm { 313 __asm {
297 pushad 314 pushad
298 mov edx, [esp + 32 + 4] // Y 315 mov edx, [esp + 32 + 4] // Y
299 mov edi, [esp + 32 + 8] // U 316 mov edi, [esp + 32 + 8] // U
300 mov esi, [esp + 32 + 12] // V 317 mov esi, [esp + 32 + 12] // V
301 mov ebp, [esp + 32 + 16] // rgb 318 mov ebp, [esp + 32 + 16] // rgb
302 mov ecx, [esp + 32 + 20] // width 319 mov ecx, [esp + 32 + 20] // width
320 mov ebx, [esp + 32 + 24] // step
321 jmp wend
303 322
304 wloop : 323 wloop :
305 movzx eax, byte ptr [edi] 324 movzx eax, byte ptr [edi]
306 add edi, 1 325 add edi, ebx
307 movzx ebx, byte ptr [esi]
308 add esi, 1
309 movq mm0, [coefficients_RGB_U + 8 * eax] 326 movq mm0, [coefficients_RGB_U + 8 * eax]
327 movzx eax, byte ptr [esi]
328 add esi, ebx
329 paddsw mm0, [coefficients_RGB_V + 8 * eax]
310 movzx eax, byte ptr [edx] 330 movzx eax, byte ptr [edx]
311 paddsw mm0, [coefficients_RGB_V + 8 * ebx] 331 add edx, ebx
312 #if MEDIA_BILINEAR_FILTER 332 movq mm1, [coefficients_RGB_Y + 8 * eax]
313 movzx ebx, byte ptr [edx + 1] 333 movzx eax, byte ptr [edx]
314 add ebx, eax 334 add edx, ebx
315 shr ebx, 1 335 movq mm2, [coefficients_RGB_Y + 8 * eax]
316 #endif 336 paddsw mm1, mm0
317 paddsw mm0, [coefficients_RGB_Y + 8 * eax] 337 paddsw mm2, mm0
318 add edx, 2 338 psraw mm1, 6
319 psraw mm0, 6 339 psraw mm2, 6
320 packuswb mm0, mm0 340 packuswb mm1, mm2
321 movd [ebp], mm0 341 movntq [ebp], mm1
322 add ebp, 4 342 add ebp, 8
323 sub ecx, 1 343 wend :
324 jnz wloop 344 sub ecx, 2
345 jns wloop
346
347 and ecx, 1 // odd number of pixels?
348 jz wdone
349
350 movzx eax, byte ptr [edi]
351 movq mm0, [coefficients_RGB_U + 8 * eax]
352 movzx eax, byte ptr [esi]
353 paddsw mm0, [coefficients_RGB_V + 8 * eax]
354 movzx eax, byte ptr [edx]
355 movq mm1, [coefficients_RGB_Y + 8 * eax]
356 paddsw mm1, mm0
357 psraw mm1, 6
358 packuswb mm1, mm1
359 movd [ebp], mm1
360 wdone :
325 361
326 popad 362 popad
327 ret 363 ret
328 } 364 }
329 } 365 }
330 366
331 __declspec(naked) 367 __declspec(naked)
332 void ScaleYV12ToRGB32Row(const uint8* y_buf, 368 void RotateConvertYUVToRGB32Row(const uint8* y_buf,
369 const uint8* u_buf,
370 const uint8* v_buf,
371 uint8* rgb_buf,
372 int width,
373 int ystep,
374 int uvstep) {
375 __asm {
376 pushad
377 mov edx, [esp + 32 + 4] // Y
378 mov edi, [esp + 32 + 8] // U
379 mov esi, [esp + 32 + 12] // V
380 mov ebp, [esp + 32 + 16] // rgb
381 mov ecx, [esp + 32 + 20] // width
382 jmp wend
383
384 wloop :
385 movzx eax, byte ptr [edi]
386 mov ebx, [esp + 32 + 28] // uvstep
387 add edi, ebx
388 movq mm0, [coefficients_RGB_U + 8 * eax]
389 movzx eax, byte ptr [esi]
390 add esi, ebx
391 paddsw mm0, [coefficients_RGB_V + 8 * eax]
392 movzx eax, byte ptr [edx]
393 mov ebx, [esp + 32 + 24] // ystep
394 add edx, ebx
395 movq mm1, [coefficients_RGB_Y + 8 * eax]
396 movzx eax, byte ptr [edx]
397 add edx, ebx
398 movq mm2, [coefficients_RGB_Y + 8 * eax]
399 paddsw mm1, mm0
400 paddsw mm2, mm0
401 psraw mm1, 6
402 psraw mm2, 6
403 packuswb mm1, mm2
404 movntq [ebp], mm1
405 add ebp, 8
406 wend :
407 sub ecx, 2
408 jns wloop
409
410 and ecx, 1 // odd number of pixels?
411 jz wdone
412
413 movzx eax, byte ptr [edi]
414 movq mm0, [coefficients_RGB_U + 8 * eax]
415 movzx eax, byte ptr [esi]
416 paddsw mm0, [coefficients_RGB_V + 8 * eax]
417 movzx eax, byte ptr [edx]
418 movq mm1, [coefficients_RGB_Y + 8 * eax]
419 paddsw mm1, mm0
420 psraw mm1, 6
421 packuswb mm1, mm1
422 movd [ebp], mm1
423 wdone :
424
425 popad
426 ret
427 }
428 }
429
430 __declspec(naked)
431 void DoubleYUVToRGB32Row(const uint8* y_buf,
333 const uint8* u_buf, 432 const uint8* u_buf,
334 const uint8* v_buf, 433 const uint8* v_buf,
335 uint8* rgb_buf, 434 uint8* rgb_buf,
336 int width, 435 int width) {
337 int dx) {
338 __asm { 436 __asm {
339 pushad 437 pushad
340 mov edx, [esp + 32 + 4] // Y 438 mov edx, [esp + 32 + 4] // Y
341 mov edi, [esp + 32 + 8] // U 439 mov edi, [esp + 32 + 8] // U
342 mov esi, [esp + 32 + 12] // V 440 mov esi, [esp + 32 + 12] // V
343 mov ebp, [esp + 32 + 16] // rgb 441 mov ebp, [esp + 32 + 16] // rgb
344 mov ecx, [esp + 32 + 20] // width 442 mov ecx, [esp + 32 + 20] // width
345 xor eax, eax // x 443 jmp wend
346 444
347 wloop : 445 wloop :
348 mov ebx, eax 446 movzx eax, byte ptr [edi]
349 sar ebx, 5 447 add edi, 1
350 movzx ebx, byte ptr [edi + ebx] 448 movzx ebx, byte ptr [esi]
351 movq mm0, [coefficients_RGB_U + 8 * ebx] 449 add esi, 1
352 mov ebx, eax 450 movq mm0, [coefficients_RGB_U + 8 * eax]
353 sar ebx, 5 451 movzx eax, byte ptr [edx]
354 movzx ebx, byte ptr [esi + ebx]
355 paddsw mm0, [coefficients_RGB_V + 8 * ebx] 452 paddsw mm0, [coefficients_RGB_V + 8 * ebx]
356 mov ebx, eax 453 movq mm1, [coefficients_RGB_Y + 8 * eax]
357 sar ebx, 4 454 paddsw mm1, mm0
358 movzx ebx, byte ptr [edx + ebx] 455 psraw mm1, 6
456 packuswb mm1, mm1
457 punpckldq mm1, mm1
458 movntq [ebp], mm1
459
460 movzx ebx, byte ptr [edx + 1]
461 add edx, 2
359 paddsw mm0, [coefficients_RGB_Y + 8 * ebx] 462 paddsw mm0, [coefficients_RGB_Y + 8 * ebx]
360 psraw mm0, 6 463 psraw mm0, 6
361 packuswb mm0, mm0 464 packuswb mm0, mm0
362 movd [ebp], mm0 465 punpckldq mm0, mm0
466 movntq [ebp+8], mm0
467 add ebp, 16
468 wend :
469 sub ecx, 4
470 jns wloop
471
472 add ecx, 4
473 jz wdone
474
475 movzx eax, byte ptr [edi]
476 movq mm0, [coefficients_RGB_U + 8 * eax]
477 movzx eax, byte ptr [esi]
478 paddsw mm0, [coefficients_RGB_V + 8 * eax]
479 movzx eax, byte ptr [edx]
480 movq mm1, [coefficients_RGB_Y + 8 * eax]
481 paddsw mm1, mm0
482 psraw mm1, 6
483 packuswb mm1, mm1
484 jmp wend1
485
486 wloop1 :
487 movd [ebp], mm1
363 add ebp, 4 488 add ebp, 4
364 add eax, [esp + 32 + 24] // x += dx 489 wend1 :
365 sub ecx, 1 490 sub ecx, 1
366 jnz wloop 491 jns wloop1
367 492 wdone :
368 popad 493 popad
369 ret 494 ret
370 } 495 }
371 } 496 }
372 497
373 498 // This version does general purpose scaling by any amount, up or down.
499 // The only thing it cannot do is rotation by 90 or 270 degrees.
500 // For performance the chroma is undersampled, reducing cost of a 3x
501 // 1080p scale from 8.4 ms to 5.4 ms.
374 __declspec(naked) 502 __declspec(naked)
375 void Half2Row(const uint8* in_row0, 503 void ScaleYUVToRGB32Row(const uint8* y_buf,
376 const uint8* in_row1, 504 const uint8* u_buf,
377 uint8* out_row, 505 const uint8* v_buf,
378 int out_width) { 506 uint8* rgb_buf,
507 int width,
508 int dx) {
379 __asm { 509 __asm {
380 pushad 510 pushad
381 mov esi, [esp + 32 + 4] // row0 511 mov edx, [esp + 32 + 4] // Y
382 mov ebx, [esp + 32 + 8] // row1 512 mov edi, [esp + 32 + 8] // U
383 mov edi, [esp + 32 + 12] // out 513 mov esi, [esp + 32 + 12] // V
384 mov ecx, [esp + 32 + 16] // width 514 mov ebp, [esp + 32 + 16] // rgb
515 mov ecx, [esp + 32 + 20] // width
516 xor ebx, ebx // x
517 jmp wend
385 518
386 wloop : 519 wloop :
387 movzx eax, byte ptr [esi] 520 mov eax, ebx
388 movzx edx, byte ptr [esi+1] 521 sar eax, 5
389 add esi, 2 522 movzx eax, byte ptr [edi + eax]
390 add eax, edx 523 movq mm0, [coefficients_RGB_U + 8 * eax]
391 movzx edx, byte ptr [ebx] 524 mov eax, ebx
392 add eax, edx 525 sar eax, 5
393 movzx edx, byte ptr [ebx+1] 526 movzx eax, byte ptr [esi + eax]
394 add eax, edx 527 paddsw mm0, [coefficients_RGB_V + 8 * eax]
395 add ebx, 2 528 mov eax, ebx
396 shr eax, 2 529 add ebx, [esp + 32 + 24] // x += dx
397 mov [edi], al 530 sar eax, 4
398 add edi, 1 531 movzx eax, byte ptr [edx + eax]
399 sub ecx, 1 532 movq mm1, [coefficients_RGB_Y + 8 * eax]
400 jnz wloop 533 mov eax, ebx
534 add ebx, [esp + 32 + 24] // x += dx
535 sar eax, 4
536 movzx eax, byte ptr [edx + eax]
537 movq mm2, [coefficients_RGB_Y + 8 * eax]
538 paddsw mm1, mm0
539 paddsw mm2, mm0
540 psraw mm1, 6
541 psraw mm2, 6
542 packuswb mm1, mm2
543 movntq [ebp], mm1
544 add ebp, 8
545 wend :
546 sub ecx, 2
547 jns wloop
548
549 and ecx, 1 // odd number of pixels?
550 jz wdone
551
552 mov eax, ebx
553 sar eax, 5
554 movzx eax, byte ptr [edi + eax]
555 movq mm0, [coefficients_RGB_U + 8 * eax]
556 mov eax, ebx
557 sar eax, 5
558 movzx eax, byte ptr [esi + eax]
559 paddsw mm0, [coefficients_RGB_V + 8 * eax]
560 mov eax, ebx
561 sar eax, 4
562 movzx eax, byte ptr [edx + eax]
563 movq mm1, [coefficients_RGB_Y + 8 * eax]
564 mov eax, ebx
565 sar eax, 4
566 movzx eax, byte ptr [edx + eax]
567 movq mm2, [coefficients_RGB_Y + 8 * eax]
568 paddsw mm1, mm0
569 paddsw mm2, mm0
570 psraw mm1, 6
571 psraw mm2, 6
572 packuswb mm1, mm2
573 movd [ebp], mm1
574
575 wdone :
401 576
402 popad 577 popad
403 ret 578 ret
404 } 579 }
405 } 580 }
406 581
407 } // namespace media 582 } // namespace media
408 583
OLDNEW
« no previous file with comments | « media/base/yuv_row_mac.cc ('k') | media/base/yuv_scale.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698