Chromium Code Reviews| Index: media/base/simd/convert_yuva_to_argb_mmx.inc |
| diff --git a/media/base/simd/convert_yuva_to_argb_mmx.inc b/media/base/simd/convert_yuva_to_argb_mmx.inc |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..4200bf7d6ec9f8bc21b34d2ba08ac10fbece58e7 |
| --- /dev/null |
| +++ b/media/base/simd/convert_yuva_to_argb_mmx.inc |
| @@ -0,0 +1,175 @@ |
| +; Copyright (c) 2011 The Chromium Authors. All rights reserved. |
|
scherkus (not reviewing)
2013/04/04 00:36:17
FYI I have no idea if this code is correct
it loo
vignesh
2013/04/04 18:17:52
yes, that is correct. Although, I have to admit th
|
| +; Use of this source code is governed by a BSD-style license that can be |
| +; found in the LICENSE file. |
| + |
| + global mangle(SYMBOL) PRIVATE |
| + align function_align |
| + |
| +; Non-PIC code is the fastest so use this if possible: the lookup tables are addressed directly by symbol. Each loop pass reads one U byte, one V byte, two Y bytes and two A bytes; the kCoefficientsRgbY entries (Y at +0, U at +2048, V at +4096, 8 bytes per entry) are summed with signed saturation, shifted right arithmetically by 6, packed to unsigned bytes, scaled by alpha, and 8 bytes (two pixels) are stored at ARGB. |
| +%ifndef PIC |
| +mangle(SYMBOL): |
| + %assign stack_offset 0 |
| + PROLOGUE 6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP |
| + extern mangle(kCoefficientsRgbY) |
| + extern mangle(kWordDup) |
| + jmp .convertend |
| + |
| +.convertloop: |
| + movzx TEMPd, BYTE [Uq] |
| + movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq] |
| + add Uq, 1 |
| + movzx TEMPd, BYTE [Vq] |
| + paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq] |
| + add Vq, 1 |
| + movzx TEMPd, BYTE [Yq] |
| + movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq] |
| + movzx TEMPd, BYTE [Yq + 1] |
| + movq mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPq] |
| + add Yq, 2 |
| + paddsw mm1, mm0 |
| + paddsw mm2, mm0 |
| + psraw mm1, 6 |
| + psraw mm2, 6 |
| + packuswb mm1, mm2 |
| + |
| + ; Multiply ARGB by alpha value: the 8 packed bytes are widened to words against a zeroed mm2 (low pixel in mm0, high pixel in mm1), each pixel is multiplied by the kWordDup entry indexed by its own A byte, every 16-bit product is shifted right by 8, and the two pixels are repacked to bytes. NOTE(review): the PIC variant of this routine reads the multiplier from kCoefficientsRgbY + 6144 instead of kWordDup -- confirm both locations hold the same data. |
| + movq mm0, mm1 |
| + pxor mm2, mm2 |
| + punpcklbw mm0, mm2 |
| + punpckhbw mm1, mm2 |
| + movzx TEMPd, BYTE [Aq] |
| + movq mm2, [mangle(kWordDup) + 8 * TEMPq] |
| + pmullw mm0, mm2 |
| + psrlw mm0, 8 |
| + movzx TEMPd, BYTE [Aq + 1] |
| + movq mm2, [mangle(kWordDup) + 8 * TEMPq] |
| + add Aq, 2 |
| + pmullw mm1, mm2 |
| + psrlw mm1, 8 |
| + packuswb mm0, mm1 |
| + |
| + MOVQ [ARGBq], mm0 |
| + add ARGBq, 8 |
| + |
| +.convertend: |
| + sub WIDTHq, 2 |
| + jns .convertloop |
| + |
| + ; If number of pixels is odd then compute it. For a non-negative starting width, WIDTH is -1 or -2 here (the sub above just went negative), so bit 0 is set exactly when one pixel is left over. That pixel uses the bytes currently pointed to by U, V, Y and A; no pointer is advanced on this path. |
| + and WIDTHq, 1 |
| + jz .convertdone |
| + |
| + movzx TEMPd, BYTE [Uq] |
| + movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq] |
| + movzx TEMPd, BYTE [Vq] |
| + paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq] |
| + movzx TEMPd, BYTE [Yq] |
| + movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq] |
| + paddsw mm1, mm0 |
| + psraw mm1, 6 |
| + packuswb mm1, mm1 |
| + |
| + ; Multiply ARGB by alpha value (single pixel): only the low 4 bytes of mm1 are widened to words, scaled by the kWordDup entry for the A byte and shifted right by 8; after repacking, movd stores just those 4 bytes. |
| + pxor mm0, mm0 |
| + punpcklbw mm1, mm0 |
| + movzx TEMPd, BYTE [Aq] |
| + movq mm0, [mangle(kWordDup) + 8 * TEMPq] |
| + pmullw mm1, mm0 |
| + psrlw mm1, 8 |
| + packuswb mm1, mm1 |
| + |
| + movd [ARGBq], mm1 |
| + |
| +.convertdone: |
| + RET |
| +%endif |
| + |
| +; With PIC code we need to load the address of mangle(kCoefficientsRgbY). |
| +; This code is slower than the above version. WIDTH is pushed on the stack and DEFINE_ARGS then re-declares the argument names with TABLE in WIDTH's position; TABLE receives the table address from LOAD_SYM. From then on the pixel counter lives at [rsp] and is updated with dword operations, so only the low 32 bits of the pushed value act as the counter. The saved slot is popped (into TABLE) before RET. |
| +%ifdef PIC |
| +mangle(SYMBOL): |
| + %assign stack_offset 0 |
| + PROLOGUE 6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP |
| + extern mangle(kCoefficientsRgbY) |
| + PUSH WIDTHq |
| + DEFINE_ARGS Y, U, V, A, ARGB, TABLE, TEMP |
| + LOAD_SYM TABLEq, mangle(kCoefficientsRgbY) |
| + jmp .convertend |
| + |
| +.convertloop: |
| + movzx TEMPd, BYTE [Uq] |
| + movq mm0, [TABLEq + 2048 + 8 * TEMPq] |
| + add Uq, 1 |
| + |
| + movzx TEMPd, BYTE [Vq] |
| + paddsw mm0, [TABLEq + 4096 + 8 * TEMPq] |
| + add Vq, 1 |
| + |
| + movzx TEMPd, BYTE [Yq] |
| + movq mm1, [TABLEq + 8 * TEMPq] |
| + |
| + movzx TEMPd, BYTE [Yq + 1] |
| + movq mm2, [TABLEq + 8 * TEMPq] |
| + add Yq, 2 |
| + |
| + ; Add UV components to Y component. The same U+V sum in mm0 is added (with signed saturation) to both Y entries, so the two pixels of a pass share one U byte and one V byte. |
| + paddsw mm1, mm0 |
| + paddsw mm2, mm0 |
| + |
| + ; Down shift and then pack. After the arithmetic shift by 6, packuswb saturates each word to an unsigned byte; mm1 then holds both pixels (8 bytes). |
| + psraw mm1, 6 |
| + psraw mm2, 6 |
| + packuswb mm1, mm2 |
| + |
| + ; Unpack and multiply by alpha value, then repack high bytes of words. The multiplier for each pixel is the 8-byte entry at TABLE + 6144 indexed by that pixel's A byte. NOTE(review): the non-PIC variant of this routine reads the multiplier from kWordDup instead -- confirm both locations hold the same data. |
| + movq mm0, mm1 |
| + pxor mm2, mm2 |
| + punpcklbw mm0, mm2 |
| + punpckhbw mm1, mm2 |
| + movzx TEMPd, BYTE [Aq] |
| + movq mm2, [TABLEq + 6144 + 8 * TEMPq] |
| + pmullw mm0, mm2 |
| + psrlw mm0, 8 |
| + movzx TEMPd, BYTE [Aq + 1] |
| + movq mm2, [TABLEq + 6144 + 8 * TEMPq] |
| + add Aq, 2 |
| + pmullw mm1, mm2 |
| + psrlw mm1, 8 |
| + packuswb mm0, mm1 |
| + |
| + MOVQ [ARGBq], mm0 |
| + add ARGBq, 8 |
| + |
| +.convertend: |
| + sub dword [rsp], 2 |
| + jns .convertloop |
| + |
| + ; If number of pixels is odd then compute it. For a non-negative starting width, the dword counter at [rsp] is -1 or -2 here (the sub above just went negative), so bit 0 is set exactly when one pixel is left over. That pixel uses the bytes currently pointed to by U, V, Y and A; no pointer is advanced on this path. |
| + and dword [rsp], 1 |
| + jz .convertdone |
| + |
| + movzx TEMPd, BYTE [Uq] |
| + movq mm0, [TABLEq + 2048 + 8 * TEMPq] |
| + movzx TEMPd, BYTE [Vq] |
| + paddsw mm0, [TABLEq + 4096 + 8 * TEMPq] |
| + movzx TEMPd, BYTE [Yq] |
| + movq mm1, [TABLEq + 8 * TEMPq] |
| + paddsw mm1, mm0 |
| + psraw mm1, 6 |
| + packuswb mm1, mm1 |
| + |
| + ; Multiply ARGB by alpha value (single pixel): only the low 4 bytes of mm1 are widened to words, scaled by the TABLE + 6144 entry for the A byte and shifted right by 8; after repacking, movd stores just those 4 bytes. |
| + pxor mm0, mm0 |
| + punpcklbw mm1, mm0 |
| + movzx TEMPd, BYTE [Aq] |
| + movq mm0, [TABLEq + 6144 + 8 * TEMPq] |
| + pmullw mm1, mm0 |
| + psrlw mm1, 8 |
| + packuswb mm1, mm1 |
| + |
| + movd [ARGBq], mm1 |
| + |
| +.convertdone: |
| + POP TABLEq |
| + RET |
| +%endif |