OLD | NEW |
(Empty) | |
| 1 ; Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 ; Use of this source code is governed by a BSD-style license that can be |
| 3 ; found in the LICENSE file. |
| 4 |
; Export the routine under its mangled name and align whatever code is
; emitted next. SYMBOL, mangle(), PRIVATE and function_align are not defined
; in the visible part of this file; they are expected to be supplied by the
; including source and its macro package.
  global    mangle(SYMBOL) PRIVATE
  align     function_align
| 7 |
; Non-PIC code is the fastest so use this if possible.
;
; Converts one row of planar Y, U, V and A bytes into packed 4-byte pixels.
; Every lookup goes through kCoefficientsRgbY, addressed here directly by its
; symbol. The table is read as four consecutive sub-tables of 8-byte entries,
; each indexed by a byte value: Y at +0, U at +2048, V at +4096 and alpha at
; +6144. The converted pixel is then multiplied, 16-bit lane by lane, with the
; alpha-table entry and bits 8..15 of each product are kept.
;
; Named registers (set up by PROLOGUE; presumably 6 arguments plus one
; scratch register and 3 vector registers, as in x86inc - confirm there):
;   Y, U, V, A  source pointers; each pixel pair consumes two Y bytes, two A
;               bytes and one byte each of U and V
;   ARGB        destination pointer; 8 bytes are stored per pixel pair
;   WIDTH       number of pixels, handled as a 32-bit signed count
;   TEMP        scratch register holding the current table index
;
; mm0-mm2 are used and no emms is issued in this block.
; NOTE(review): confirm the caller clears the MMX state afterwards.
; MOVQ (upper case) is not defined here; it is expected to be provided by the
; including file.
%ifndef PIC
%define YUVA_TBL_U 2048                 ; byte offset of the U sub-table
%define YUVA_TBL_V 4096                 ; byte offset of the V sub-table
%define YUVA_TBL_A 6144                 ; byte offset of the alpha sub-table

mangle(SYMBOL):
  %assign   stack_offset 0
  PROLOGUE  6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP
  extern    mangle(kCoefficientsRgbY)
  jmp       .convertend                 ; test the count before the first pair

.convertloop:
  ; Sum the U and V table entries; the sum is shared by both pixels.
  movzx     TEMPd, BYTE [Uq]
  movq      mm0, [mangle(kCoefficientsRgbY) + YUVA_TBL_U + 8 * TEMPq]
  add       Uq, 1
  movzx     TEMPd, BYTE [Vq]
  paddsw    mm0, [mangle(kCoefficientsRgbY) + YUVA_TBL_V + 8 * TEMPq]
  add       Vq, 1

  ; Fetch the Y table entry for each of the two pixels.
  movzx     TEMPd, BYTE [Yq]
  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
  movzx     TEMPd, BYTE [Yq + 1]
  movq      mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
  add       Yq, 2

  ; Add the UV sum to each Y entry with signed saturation, shift every word
  ; right by 6 and pack the eight words into unsigned saturated bytes
  ; (low dword = first pixel, high dword = second pixel).
  paddsw    mm1, mm0
  paddsw    mm2, mm0
  psraw     mm1, 6
  psraw     mm2, 6
  packuswb  mm1, mm2

  ; Multiply ARGB by alpha value.
  movq      mm0, mm1
  pxor      mm2, mm2
  punpcklbw mm0, mm2                    ; first pixel widened to four words
  punpckhbw mm1, mm2                    ; second pixel widened to four words
  movzx     TEMPd, BYTE [Aq]
  movq      mm2, [mangle(kCoefficientsRgbY) + YUVA_TBL_A + 8 * TEMPq]
  pmullw    mm0, mm2
  psrlw     mm0, 8                      ; keep bits 8..15 of each product
  movzx     TEMPd, BYTE [Aq + 1]
  movq      mm2, [mangle(kCoefficientsRgbY) + YUVA_TBL_A + 8 * TEMPq]
  add       Aq, 2
  pmullw    mm1, mm2
  psrlw     mm1, 8
  packuswb  mm0, mm1

  MOVQ      [ARGBq], mm0
  add       ARGBq, 8

.convertend:
  ; The count is a 32-bit quantity (the PIC variant also updates it as a
  ; dword). Using the 32-bit register view keeps the sign test independent of
  ; whatever the upper half of the 64-bit register holds on x86-64; on x86-32
  ; the two views are the same register.
  sub       WIDTHd, 2
  jns       .convertloop

  ; If number of pixels is odd then compute it.
  and       WIDTHd, 1
  jz        .convertdone

  ; Single trailing pixel: same lookups as above for one Y and one A byte.
  movzx     TEMPd, BYTE [Uq]
  movq      mm0, [mangle(kCoefficientsRgbY) + YUVA_TBL_U + 8 * TEMPq]
  movzx     TEMPd, BYTE [Vq]
  paddsw    mm0, [mangle(kCoefficientsRgbY) + YUVA_TBL_V + 8 * TEMPq]
  movzx     TEMPd, BYTE [Yq]
  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
  paddsw    mm1, mm0
  psraw     mm1, 6
  packuswb  mm1, mm1

  ; Multiply ARGB by alpha value.
  pxor      mm0, mm0
  punpcklbw mm1, mm0
  movzx     TEMPd, BYTE [Aq]
  movq      mm0, [mangle(kCoefficientsRgbY) + YUVA_TBL_A + 8 * TEMPq]
  pmullw    mm1, mm0
  psrlw     mm1, 8
  packuswb  mm1, mm1

  movd      [ARGBq], mm1                ; store the low 4 bytes only

.convertdone:
  RET

%undef YUVA_TBL_U
%undef YUVA_TBL_V
%undef YUVA_TBL_A
%endif
| 85 |
; Position-independent variant. The address of mangle(kCoefficientsRgbY) is
; loaded into a register once with LOAD_SYM and every lookup is made relative
; to it. The original note on this code states that it is slower than the
; non-PIC variant.
;
; The register first named WIDTH is renamed TABLE by DEFINE_ARGS, so the
; pixel count is pushed beforehand and updated in place at [rsp] as a dword.
; The matching POP just before RET removes that slot again.
;
; The table is read as four consecutive sub-tables of 8-byte entries indexed
; by a byte value: Y at +0, U at +2048, V at +4096 and alpha at +6144.
; mm0-mm2 are used and no emms is issued in this block. MOVQ (upper case) is
; expected to be provided by the including file.
%ifdef PIC
%define YUVA_TBL_U 2048                 ; byte offset of the U sub-table
%define YUVA_TBL_V 4096                 ; byte offset of the V sub-table
%define YUVA_TBL_A 6144                 ; byte offset of the alpha sub-table

mangle(SYMBOL):
  %assign   stack_offset 0
  PROLOGUE  6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP
  extern    mangle(kCoefficientsRgbY)
  PUSH      WIDTHq                      ; [rsp] now holds the pixel count
  DEFINE_ARGS Y, U, V, A, ARGB, TABLE, TEMP
  LOAD_SYM  TABLEq, mangle(kCoefficientsRgbY)
  jmp       .convertend                 ; test the count before the first pair

.convertloop:
  ; UV sum shared by both pixels of the pair.
  movzx     TEMPd, BYTE [Uq]
  movq      mm0, [TABLEq + YUVA_TBL_U + 8 * TEMPq]
  movzx     TEMPd, BYTE [Vq]
  paddsw    mm0, [TABLEq + YUVA_TBL_V + 8 * TEMPq]

  ; Y table entries for the first and second pixel.
  movzx     TEMPd, BYTE [Yq]
  movq      mm1, [TABLEq + 8 * TEMPq]
  movzx     TEMPd, BYTE [Yq + 1]
  movq      mm2, [TABLEq + 8 * TEMPq]

  ; All source bytes for this pair except alpha have been read.
  add       Uq, 1
  add       Vq, 1
  add       Yq, 2

  ; Add UV components to Y component.
  paddsw    mm1, mm0
  paddsw    mm2, mm0

  ; Down shift and then pack.
  psraw     mm1, 6
  psraw     mm2, 6
  packuswb  mm1, mm2

  ; Widen each pixel to four words, multiply by its alpha-table entry and
  ; keep bits 8..15 of every product, then repack to bytes.
  pxor      mm2, mm2
  movq      mm0, mm1
  punpcklbw mm0, mm2                    ; first pixel
  punpckhbw mm1, mm2                    ; second pixel
  movzx     TEMPd, BYTE [Aq]
  movq      mm2, [TABLEq + YUVA_TBL_A + 8 * TEMPq]
  pmullw    mm0, mm2
  psrlw     mm0, 8
  movzx     TEMPd, BYTE [Aq + 1]
  movq      mm2, [TABLEq + YUVA_TBL_A + 8 * TEMPq]
  pmullw    mm1, mm2
  psrlw     mm1, 8
  packuswb  mm0, mm1

  MOVQ      [ARGBq], mm0
  add       Aq, 2
  add       ARGBq, 8

.convertend:
  ; Loop while at least two pixels remain.
  sub       dword [rsp], 2
  jns       .convertloop

  ; If number of pixels is odd then compute it.
  and       dword [rsp], 1
  jz        .convertdone

  ; Single trailing pixel: one byte from each plane, four bytes stored.
  movzx     TEMPd, BYTE [Uq]
  movq      mm0, [TABLEq + YUVA_TBL_U + 8 * TEMPq]
  movzx     TEMPd, BYTE [Vq]
  paddsw    mm0, [TABLEq + YUVA_TBL_V + 8 * TEMPq]
  movzx     TEMPd, BYTE [Yq]
  movq      mm1, [TABLEq + 8 * TEMPq]
  paddsw    mm1, mm0
  psraw     mm1, 6
  packuswb  mm1, mm1

  ; Multiply ARGB by alpha value.
  pxor      mm0, mm0
  punpcklbw mm1, mm0
  movzx     TEMPd, BYTE [Aq]
  movq      mm0, [TABLEq + YUVA_TBL_A + 8 * TEMPq]
  pmullw    mm1, mm0
  psrlw     mm1, 8
  packuswb  mm1, mm1

  movd      [ARGBq], mm1

.convertdone:
  POP       TABLEq                      ; drop the counter slot pushed on entry
  RET

%undef YUVA_TBL_U
%undef YUVA_TBL_V
%undef YUVA_TBL_A
%endif
OLD | NEW |