| Index: media/base/simd/convert_yuva_to_argb_mmx.inc
|
| diff --git a/media/base/simd/convert_yuva_to_argb_mmx.inc b/media/base/simd/convert_yuva_to_argb_mmx.inc
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..4200bf7d6ec9f8bc21b34d2ba08ac10fbece58e7
|
| --- /dev/null
|
| +++ b/media/base/simd/convert_yuva_to_argb_mmx.inc
|
| @@ -0,0 +1,175 @@
|
| +; Copyright (c) 2011 The Chromium Authors. All rights reserved.
|
| +; Use of this source code is governed by a BSD-style license that can be
|
| +; found in the LICENSE file.
|
| +
|
| + global mangle(SYMBOL) PRIVATE
|
| + align function_align
|
| +
|
| +; Non-PIC code is the fastest so use this if possible.
|
| +%ifndef PIC
|
| +mangle(SYMBOL):
|
| + %assign stack_offset 0
|
| + PROLOGUE 6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP
|
| + extern mangle(kCoefficientsRgbY)
|
| + extern mangle(kWordDup)
|
| + jmp .convertend
|
| +
|
| +.convertloop:
|
| + movzx TEMPd, BYTE [Uq]
|
| + movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq]
|
| + add Uq, 1
|
| + movzx TEMPd, BYTE [Vq]
|
| + paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq]
|
| + add Vq, 1
|
| + movzx TEMPd, BYTE [Yq]
|
| + movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
|
| + movzx TEMPd, BYTE [Yq + 1]
|
| + movq mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
|
| + add Yq, 2
|
| + paddsw mm1, mm0
|
| + paddsw mm2, mm0
|
| + psraw mm1, 6
|
| + psraw mm2, 6
|
| + packuswb mm1, mm2
|
| +
|
| + ; Multiply ARGB by alpha value.
|
| + movq mm0, mm1
|
| + pxor mm2, mm2
|
| + punpcklbw mm0, mm2
|
| + punpckhbw mm1, mm2
|
| + movzx TEMPd, BYTE [Aq]
|
| + movq mm2, [mangle(kWordDup) + 8 * TEMPq]
|
| + pmullw mm0, mm2
|
| + psrlw mm0, 8
|
| + movzx TEMPd, BYTE [Aq + 1]
|
| + movq mm2, [mangle(kWordDup) + 8 * TEMPq]
|
| + add Aq, 2
|
| + pmullw mm1, mm2
|
| + psrlw mm1, 8
|
| + packuswb mm0, mm1
|
| +
|
| + MOVQ [ARGBq], mm0
|
| + add ARGBq, 8
|
| +
|
| +.convertend:
|
| + sub WIDTHq, 2
|
| + jns .convertloop
|
| +
|
| + ; If number of pixels is odd then compute it.
|
| + and WIDTHq, 1
|
| + jz .convertdone
|
| +
|
| + movzx TEMPd, BYTE [Uq]
|
| + movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq]
|
| + movzx TEMPd, BYTE [Vq]
|
| + paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq]
|
| + movzx TEMPd, BYTE [Yq]
|
| + movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
|
| + paddsw mm1, mm0
|
| + psraw mm1, 6
|
| + packuswb mm1, mm1
|
| +
|
| + ; Multiply ARGB by alpha value.
|
| + pxor mm0, mm0
|
| + punpcklbw mm1, mm0
|
| + movzx TEMPd, BYTE [Aq]
|
| + movq mm0, [mangle(kWordDup) + 8 * TEMPq]
|
| + pmullw mm1, mm0
|
| + psrlw mm1, 8
|
| + packuswb mm1, mm1
|
| +
|
| + movd [ARGBq], mm1
|
| +
|
| +.convertdone:
|
| + RET
|
| +%endif
|
| +
|
| +; With PIC code we need to load the address of mangle(kCoefficientsRgbY).
|
| +; This code is slower than the above version.
|
| +%ifdef PIC
|
| +mangle(SYMBOL):
|
| + %assign stack_offset 0
|
| + PROLOGUE 6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP
|
| + extern mangle(kCoefficientsRgbY)
|
| + PUSH WIDTHq
|
| + DEFINE_ARGS Y, U, V, A, ARGB, TABLE, TEMP
|
| + LOAD_SYM TABLEq, mangle(kCoefficientsRgbY)
|
| + jmp .convertend
|
| +
|
| +.convertloop:
|
| + movzx TEMPd, BYTE [Uq]
|
| + movq mm0, [TABLEq + 2048 + 8 * TEMPq]
|
| + add Uq, 1
|
| +
|
| + movzx TEMPd, BYTE [Vq]
|
| + paddsw mm0, [TABLEq + 4096 + 8 * TEMPq]
|
| + add Vq, 1
|
| +
|
| + movzx TEMPd, BYTE [Yq]
|
| + movq mm1, [TABLEq + 8 * TEMPq]
|
| +
|
| + movzx TEMPd, BYTE [Yq + 1]
|
| + movq mm2, [TABLEq + 8 * TEMPq]
|
| + add Yq, 2
|
| +
|
| + ; Add UV components to Y component.
|
| + paddsw mm1, mm0
|
| + paddsw mm2, mm0
|
| +
|
| + ; Down shift and then pack.
|
| + psraw mm1, 6
|
| + psraw mm2, 6
|
| + packuswb mm1, mm2
|
| +
|
| + ; Unpack and multiply by alpha value, then repack high bytes of words.
|
| + movq mm0, mm1
|
| + pxor mm2, mm2
|
| + punpcklbw mm0, mm2
|
| + punpckhbw mm1, mm2
|
| + movzx TEMPd, BYTE [Aq]
|
| + movq mm2, [TABLEq + 6144 + 8 * TEMPq]
|
| + pmullw mm0, mm2
|
| + psrlw mm0, 8
|
| + movzx TEMPd, BYTE [Aq + 1]
|
| + movq mm2, [TABLEq + 6144 + 8 * TEMPq]
|
| + add Aq, 2
|
| + pmullw mm1, mm2
|
| + psrlw mm1, 8
|
| + packuswb mm0, mm1
|
| +
|
| + MOVQ [ARGBq], mm0
|
| + add ARGBq, 8
|
| +
|
| +.convertend:
|
| + sub dword [rsp], 2
|
| + jns .convertloop
|
| +
|
| + ; If number of pixels is odd then compute it.
|
| + and dword [rsp], 1
|
| + jz .convertdone
|
| +
|
| + movzx TEMPd, BYTE [Uq]
|
| + movq mm0, [TABLEq + 2048 + 8 * TEMPq]
|
| + movzx TEMPd, BYTE [Vq]
|
| + paddsw mm0, [TABLEq + 4096 + 8 * TEMPq]
|
| + movzx TEMPd, BYTE [Yq]
|
| + movq mm1, [TABLEq + 8 * TEMPq]
|
| + paddsw mm1, mm0
|
| + psraw mm1, 6
|
| + packuswb mm1, mm1
|
| +
|
| + ; Multiply ARGB by alpha value.
|
| + pxor mm0, mm0
|
| + punpcklbw mm1, mm0
|
| + movzx TEMPd, BYTE [Aq]
|
| + movq mm0, [TABLEq + 6144 + 8 * TEMPq]
|
| + pmullw mm1, mm0
|
| + psrlw mm1, 8
|
| + packuswb mm1, mm1
|
| +
|
| + movd [ARGBq], mm1
|
| +
|
| +.convertdone:
|
| + POP TABLEq
|
| + RET
|
| +%endif
|
|
|