| Index: media/base/simd/convert_yuva_to_argb_mmx.inc | 
| diff --git a/media/base/simd/convert_yuva_to_argb_mmx.inc b/media/base/simd/convert_yuva_to_argb_mmx.inc | 
| new file mode 100644 | 
| index 0000000000000000000000000000000000000000..4200bf7d6ec9f8bc21b34d2ba08ac10fbece58e7 | 
| --- /dev/null | 
| +++ b/media/base/simd/convert_yuva_to_argb_mmx.inc | 
| @@ -0,0 +1,175 @@ | 
| +; Copyright (c) 2011 The Chromium Authors. All rights reserved. | 
| +; Use of this source code is governed by a BSD-style license that can be | 
| +; found in the LICENSE file. | 
| + | 
| +  ; Declare the entry point as a global symbol. SYMBOL is not defined in this | 
| +  ; file; presumably the including file sets it to the function name. | 
| +  ; NOTE(review): PRIVATE is defined outside this file; presumably it limits | 
| +  ; the visibility of the exported symbol - confirm. | 
| +  global    mangle(SYMBOL) PRIVATE | 
| +  ; Align the function entry to function_align (defined outside this file). | 
| +  align     function_align | 
| + | 
| +; Non-PIC code is the fastest so use this if possible. | 
| +; | 
| +; mangle(SYMBOL): MMX routine that converts one row of Y, U, V and A bytes | 
| +; into ARGB pixels whose channels have been multiplied by alpha. | 
| +; | 
| +; Arguments, in PROLOGUE order: | 
| +;   Y, U, V, A - source byte pointers | 
| +;   ARGB       - destination pointer; 4 bytes are stored per pixel | 
| +;   WIDTH      - number of pixels to convert | 
| +; TEMP is a scratch register that holds the current table index. | 
| +; | 
| +; Every lookup uses 8-byte entries of kCoefficientsRgbY, addressed directly: | 
| +;   + 0    indexed by the Y byte | 
| +;   + 2048 indexed by the U byte | 
| +;   + 4096 indexed by the V byte | 
| +;   + 6144 indexed by the A byte (alpha multipliers) | 
| +; The alpha multipliers come from the same + 6144 section that the PIC | 
| +; variant of this function reads, so both builds use identical data and this | 
| +; variant needs only one external symbol. | 
| +; | 
| +; Uses mm0-mm2. No emms is executed here. | 
| +; NOTE(review): callers presumably clear the MMX state afterwards - confirm. | 
| +%ifndef PIC | 
| +mangle(SYMBOL): | 
| +  ; stack_offset is presumably bookkeeping for the PROLOGUE macro, which is | 
| +  ; defined outside this file; it is reset before each PROLOGUE. | 
| +  %assign   stack_offset 0 | 
| +  ; 6 arguments, 7 named registers (the 7th, TEMP, is scratch). | 
| +  PROLOGUE  6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP | 
| +  extern    mangle(kCoefficientsRgbY) | 
| +  ; Enter at the loop test so that WIDTH < 2 never runs the pair loop. | 
| +  jmp       .convertend | 
| + | 
| +; Pair loop: consumes 2 Y, 1 U, 1 V and 2 A bytes, stores 2 pixels (8 bytes). | 
| +.convertloop: | 
| +  ; mm0 = U entry + V entry (saturating); shared by both pixels of the pair. | 
| +  movzx     TEMPd, BYTE [Uq] | 
| +  movq      mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq] | 
| +  add       Uq, 1 | 
| +  movzx     TEMPd, BYTE [Vq] | 
| +  paddsw    mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq] | 
| +  add       Vq, 1 | 
| +  ; mm1 / mm2 = Y entries for the first / second pixel. | 
| +  movzx     TEMPd, BYTE [Yq] | 
| +  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq] | 
| +  movzx     TEMPd, BYTE [Yq + 1] | 
| +  movq      mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPq] | 
| +  add       Yq, 2 | 
| +  ; Add the UV contribution to each Y entry. | 
| +  paddsw    mm1, mm0 | 
| +  paddsw    mm2, mm0 | 
| +  ; Arithmetic shift right by 6, then pack both pixels into 8 bytes with | 
| +  ; unsigned saturation. | 
| +  ; NOTE(review): the shift suggests 6 fractional bits in the table - confirm. | 
| +  psraw     mm1, 6 | 
| +  psraw     mm2, 6 | 
| +  packuswb  mm1, mm2 | 
| + | 
| +  ; Multiply ARGB by alpha value. | 
| +  ; Widen each pixel's bytes to words (mm0 = first pixel, mm1 = second), | 
| +  ; multiply by that pixel's alpha entry, keep the high byte of each product | 
| +  ; (shift right by 8) and pack back to bytes. | 
| +  movq      mm0, mm1 | 
| +  pxor      mm2, mm2 | 
| +  punpcklbw mm0, mm2 | 
| +  punpckhbw mm1, mm2 | 
| +  movzx     TEMPd, BYTE [Aq] | 
| +  movq      mm2, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq] | 
| +  pmullw    mm0, mm2 | 
| +  psrlw     mm0, 8 | 
| +  movzx     TEMPd, BYTE [Aq + 1] | 
| +  movq      mm2, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq] | 
| +  add       Aq, 2 | 
| +  pmullw    mm1, mm2 | 
| +  psrlw     mm1, 8 | 
| +  packuswb  mm0, mm1 | 
| + | 
| +  ; Store both pixels. MOVQ (upper case) is a macro defined outside this file. | 
| +  MOVQ      [ARGBq], mm0 | 
| +  add       ARGBq, 8 | 
| + | 
| +; Loop test: keep going while at least two pixels remain. | 
| +.convertend: | 
| +  sub       WIDTHq, 2 | 
| +  jns       .convertloop | 
| + | 
| +  ; If number of pixels is odd then compute it. | 
| +  ; WIDTH is now -1 (one pixel left) or -2 (none left); bit 0 tells them apart. | 
| +  and       WIDTHq, 1 | 
| +  jz        .convertdone | 
| + | 
| +  ; Last pixel: same lookups as above for a single Y byte. No pointer is | 
| +  ; advanced because nothing is read afterwards. | 
| +  movzx     TEMPd, BYTE [Uq] | 
| +  movq      mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq] | 
| +  movzx     TEMPd, BYTE [Vq] | 
| +  paddsw    mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq] | 
| +  movzx     TEMPd, BYTE [Yq] | 
| +  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq] | 
| +  paddsw    mm1, mm0 | 
| +  psraw     mm1, 6 | 
| +  packuswb  mm1, mm1 | 
| + | 
| +  ; Multiply ARGB by alpha value. | 
| +  pxor      mm0, mm0 | 
| +  punpcklbw mm1, mm0 | 
| +  movzx     TEMPd, BYTE [Aq] | 
| +  movq      mm0, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq] | 
| +  pmullw    mm1, mm0 | 
| +  psrlw     mm1, 8 | 
| +  packuswb  mm1, mm1 | 
| + | 
| +  ; Store the single remaining pixel (4 bytes). | 
| +  movd      [ARGBq], mm1 | 
| + | 
| +.convertdone: | 
| +  RET | 
| +%endif | 
| + | 
| +; With PIC code we need to load the address of mangle(kCoefficientsRgbY). | 
| +; This code is slower than the above version. | 
| +; | 
| +; mangle(SYMBOL): MMX routine that converts one row of Y, U, V and A bytes | 
| +; into ARGB pixels whose channels have been multiplied by alpha. | 
| +; | 
| +; Arguments, in PROLOGUE order: | 
| +;   Y, U, V, A - source byte pointers | 
| +;   ARGB       - destination pointer; 4 bytes are stored per pixel | 
| +;   WIDTH      - number of pixels to convert | 
| +; The table address needs a register, so WIDTH is pushed on the stack and the | 
| +; sixth register name becomes TABLE; the pixel counter then lives at [rsp]. | 
| +; | 
| +; Every lookup uses 8-byte entries relative to TABLE: | 
| +;   + 0 by the Y byte, + 2048 by the U byte, + 4096 by the V byte and | 
| +;   + 6144 by the A byte (alpha multipliers). | 
| +; | 
| +; Uses mm0-mm2. No emms is executed here. | 
| +; NOTE(review): callers presumably clear the MMX state afterwards - confirm. | 
| +%ifdef PIC | 
| +mangle(SYMBOL): | 
| +  ; stack_offset is presumably bookkeeping for the PROLOGUE and PUSH macros, | 
| +  ; which are defined outside this file; it is reset before each PROLOGUE. | 
| +  %assign   stack_offset 0 | 
| +  ; 6 arguments, 7 named registers (the 7th, TEMP, is scratch). | 
| +  PROLOGUE  6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP | 
| +  extern    mangle(kCoefficientsRgbY) | 
| +  ; Keep the pixel count on the stack; it is read back through [rsp] below. | 
| +  PUSH      WIDTHq | 
| +  ; Re-label the registers: the sixth name changes from WIDTH to TABLE. | 
| +  DEFINE_ARGS Y, U, V, A, ARGB, TABLE, TEMP | 
| +  ; TABLEq = address of kCoefficientsRgbY (LOAD_SYM is defined outside this | 
| +  ; file). | 
| +  LOAD_SYM  TABLEq, mangle(kCoefficientsRgbY) | 
| +  ; Enter at the loop test so that a width below 2 never runs the pair loop. | 
| +  jmp       .convertend | 
| + | 
| +; Pair loop: consumes 2 Y, 1 U, 1 V and 2 A bytes, stores 2 pixels (8 bytes). | 
| +.convertloop: | 
| +  ; mm0 = U entry. | 
| +  movzx     TEMPd, BYTE [Uq] | 
| +  movq      mm0, [TABLEq + 2048 + 8 * TEMPq] | 
| +  add       Uq, 1 | 
| + | 
| +  ; mm0 += V entry (saturating); shared by both pixels of the pair. | 
| +  movzx     TEMPd, BYTE [Vq] | 
| +  paddsw    mm0, [TABLEq + 4096 + 8 * TEMPq] | 
| +  add       Vq, 1 | 
| + | 
| +  ; mm1 = Y entry for the first pixel. | 
| +  movzx     TEMPd, BYTE [Yq] | 
| +  movq      mm1, [TABLEq + 8 * TEMPq] | 
| + | 
| +  ; mm2 = Y entry for the second pixel. | 
| +  movzx     TEMPd, BYTE [Yq + 1] | 
| +  movq      mm2, [TABLEq + 8 * TEMPq] | 
| +  add       Yq, 2 | 
| + | 
| +  ; Add UV components to Y component. | 
| +  paddsw    mm1, mm0 | 
| +  paddsw    mm2, mm0 | 
| + | 
| +  ; Down shift and then pack. | 
| +  ; packuswb saturates each word to an unsigned byte; both pixels end up in | 
| +  ; mm1. | 
| +  psraw     mm1, 6 | 
| +  psraw     mm2, 6 | 
| +  packuswb  mm1, mm2 | 
| + | 
| +  ; Unpack and multiply by alpha value, then repack high bytes of words. | 
| +  ; mm0 = first pixel as words, mm1 = second pixel as words; each is | 
| +  ; multiplied by its own alpha entry and shifted right by 8. | 
| +  movq      mm0, mm1 | 
| +  pxor      mm2, mm2 | 
| +  punpcklbw mm0, mm2 | 
| +  punpckhbw mm1, mm2 | 
| +  movzx     TEMPd, BYTE [Aq] | 
| +  movq      mm2, [TABLEq + 6144 + 8 * TEMPq] | 
| +  pmullw    mm0, mm2 | 
| +  psrlw     mm0, 8 | 
| +  movzx     TEMPd, BYTE [Aq + 1] | 
| +  movq      mm2, [TABLEq + 6144 + 8 * TEMPq] | 
| +  add       Aq, 2 | 
| +  pmullw    mm1, mm2 | 
| +  psrlw     mm1, 8 | 
| +  packuswb  mm0, mm1 | 
| + | 
| +  ; Store both pixels. MOVQ (upper case) is a macro defined outside this file. | 
| +  MOVQ      [ARGBq], mm0 | 
| +  add       ARGBq, 8 | 
| + | 
| +; Loop test: the counter is the value pushed above; only its low 32 bits are | 
| +; touched. Keep going while at least two pixels remain. | 
| +.convertend: | 
| +  sub       dword [rsp], 2 | 
| +  jns       .convertloop | 
| + | 
| +  ; If number of pixels is odd then compute it. | 
| +  ; The counter is now -1 (one pixel left) or -2 (none left); bit 0 tells | 
| +  ; them apart. | 
| +  and       dword [rsp], 1 | 
| +  jz        .convertdone | 
| + | 
| +  ; Last pixel: same lookups as above for a single Y byte. No pointer is | 
| +  ; advanced because nothing is read afterwards. | 
| +  movzx     TEMPd, BYTE [Uq] | 
| +  movq      mm0, [TABLEq + 2048 + 8 * TEMPq] | 
| +  movzx     TEMPd, BYTE [Vq] | 
| +  paddsw    mm0, [TABLEq + 4096 + 8 * TEMPq] | 
| +  movzx     TEMPd, BYTE [Yq] | 
| +  movq      mm1, [TABLEq + 8 * TEMPq] | 
| +  paddsw    mm1, mm0 | 
| +  psraw     mm1, 6 | 
| +  packuswb  mm1, mm1 | 
| + | 
| +  ; Multiply ARGB by alpha value. | 
| +  pxor      mm0, mm0 | 
| +  punpcklbw mm1, mm0 | 
| +  movzx     TEMPd, BYTE [Aq] | 
| +  movq      mm0, [TABLEq + 6144 + 8 * TEMPq] | 
| +  pmullw    mm1, mm0 | 
| +  psrlw     mm1, 8 | 
| +  packuswb  mm1, mm1 | 
| + | 
| +  ; Store the single remaining pixel (4 bytes). | 
| +  movd      [ARGBq], mm1 | 
| + | 
| +.convertdone: | 
| +  ; Drop the saved counter from the stack; TABLE is no longer needed, so it | 
| +  ; serves as the throwaway destination. | 
| +  POP       TABLEq | 
| +  RET | 
| +%endif | 
|  |