OLD | NEW |
(Empty) | |
| 1 ; Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 ; Use of this source code is governed by a BSD-style license that can be |
| 3 ; found in the LICENSE file. |
| 4 |
; Declare the routine global under the name given by SYMBOL and align what
; follows to function_align.
; NOTE(review): SYMBOL, mangle, PRIVATE and function_align are not defined in
; this file; presumably they are supplied by the including file and x86inc.
; Confirm.
| 5 global mangle(SYMBOL) PRIVATE |
| 6 align function_align |
| 7 |
| 8 ; Non-PIC code is the fastest so use this if possible. |
; Non-PIC variant: both lookup tables are addressed directly by symbol.
; Each loop iteration reads two Y bytes, one U byte, one V byte and two A
; bytes and stores 8 bytes at ARGB; a trailing odd pixel is handled after the
; loop with a 4-byte store.
; NOTE(review): no emms is issued here, so the MMX state is presumably
; cleared by the caller. Confirm.
| 9 %ifndef PIC |
| 10 mangle(SYMBOL): |
| 11 %assign stack_offset 0 |
; Names the arguments Y, U, V, A, ARGB, WIDTH plus one extra register, TEMP.
| 12 PROLOGUE 6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP |
| 13 extern mangle(kCoefficientsRgbY) |
| 14 extern mangle(kWordDup) |
; Enter at the loop test so that a WIDTH below 2 skips the two-pixel body.
| 15 jmp .convertend |
| 16 |
| 17 .convertloop: |
; mm0 = entry for the U byte (table at byte offset 2048) saturating-added to
; the entry for the V byte (table at byte offset 4096). Entries are 8 bytes.
| 18 movzx TEMPd, BYTE [Uq] |
| 19 movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq] |
| 20 add Uq, 1 |
| 21 movzx TEMPd, BYTE [Vq] |
| 22 paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq] |
| 23 add Vq, 1 |
; mm1 / mm2 = entries (table at offset 0) for the two Y bytes that share the
; U/V pair loaded above.
| 24 movzx TEMPd, BYTE [Yq] |
| 25 movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq] |
| 26 movzx TEMPd, BYTE [Yq + 1] |
| 27 movq mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPq] |
| 28 add Yq, 2 |
; Add the U+V term to each Y term, arithmetic-shift every word right by 6,
; then pack both results into 8 unsigned-saturated bytes in mm1.
| 29 paddsw mm1, mm0 |
| 30 paddsw mm2, mm0 |
| 31 psraw mm1, 6 |
| 32 psraw mm2, 6 |
| 33 packuswb mm1, mm2 |
| 34 |
| 35 ; Multiply ARGB by alpha value. |
; Widen the packed bytes to zero-extended words (mm0 = low four bytes,
; mm1 = high four bytes), multiply each half by the kWordDup entry selected by
; its A byte, shift the low 16 bits of each product right by 8, and repack.
; NOTE(review): kWordDup is presumably the index value replicated into each
; word of the entry. Confirm against its definition.
| 36 movq mm0, mm1 |
| 37 pxor mm2, mm2 |
| 38 punpcklbw mm0, mm2 |
| 39 punpckhbw mm1, mm2 |
| 40 movzx TEMPd, BYTE [Aq] |
| 41 movq mm2, [mangle(kWordDup) + 8 * TEMPq] |
| 42 pmullw mm0, mm2 |
| 43 psrlw mm0, 8 |
| 44 movzx TEMPd, BYTE [Aq + 1] |
| 45 movq mm2, [mangle(kWordDup) + 8 * TEMPq] |
| 46 add Aq, 2 |
| 47 pmullw mm1, mm2 |
| 48 psrlw mm1, 8 |
| 49 packuswb mm0, mm1 |
| 50 |
; NOTE(review): upper-case MOVQ is not defined in this file; presumably the
; including file chooses the actual store instruction. Confirm.
| 51 MOVQ [ARGBq], mm0 |
| 52 add ARGBq, 8 |
| 53 |
| 54 .convertend: |
; Keep looping while WIDTH - 2 is non-negative, i.e. two or more pixels remain.
| 55 sub WIDTHq, 2 |
| 56 jns .convertloop |
| 57 |
| 58 ; If number of pixels is odd then compute it. |
; For a non-negative starting width, WIDTH is now -1 (one pixel left) or
; -2 (none left), so bit 0 tells whether a final pixel remains.
| 59 and WIDTHq, 1 |
| 60 jz .convertdone |
| 61 |
; Same lookups as the loop body, but for a single Y byte.
| 62 movzx TEMPd, BYTE [Uq] |
| 63 movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq] |
| 64 movzx TEMPd, BYTE [Vq] |
| 65 paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq] |
| 66 movzx TEMPd, BYTE [Yq] |
| 67 movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq] |
| 68 paddsw mm1, mm0 |
| 69 psraw mm1, 6 |
| 70 packuswb mm1, mm1 |
| 71 |
| 72 ; Multiply ARGB by alpha value. |
; Only the low four bytes are widened, scaled by the kWordDup entry for the
; single A byte, shifted right by 8 and repacked.
| 73 pxor mm0, mm0 |
| 74 punpcklbw mm1, mm0 |
| 75 movzx TEMPd, BYTE [Aq] |
| 76 movq mm0, [mangle(kWordDup) + 8 * TEMPq] |
| 77 pmullw mm1, mm0 |
| 78 psrlw mm1, 8 |
| 79 packuswb mm1, mm1 |
| 80 |
; movd stores just the low 4 bytes of mm1.
| 81 movd [ARGBq], mm1 |
| 82 |
| 83 .convertdone: |
| 84 RET |
| 85 %endif |
| 86 |
| 87 ; With PIC code we need to load the address of mangle(kCoefficientsRgbY). |
| 88 ; This code is slower than the above version. |
; PIC variant: the table address is loaded into a register (TABLE) once, and
; the remaining-pixel counter is kept on the stack because TABLE takes over
; the argument slot that WIDTH occupied.
; Each loop iteration reads two Y bytes, one U byte, one V byte and two A
; bytes and stores 8 bytes at ARGB; a trailing odd pixel is handled after the
; loop with a 4-byte store.
; NOTE(review): no emms is issued here, so the MMX state is presumably
; cleared by the caller. Confirm.
| 89 %ifdef PIC |
| 90 mangle(SYMBOL): |
| 91 %assign stack_offset 0 |
| 92 PROLOGUE 6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP |
| 93 extern mangle(kCoefficientsRgbY) |
; Save the pixel count on the stack, then rename the sixth argument slot from
; WIDTH to TABLE and load the table address into it.
| 94 PUSH WIDTHq |
| 95 DEFINE_ARGS Y, U, V, A, ARGB, TABLE, TEMP |
| 96 LOAD_SYM TABLEq, mangle(kCoefficientsRgbY) |
; Enter at the loop test so that a width below 2 skips the two-pixel body.
| 97 jmp .convertend |
| 98 |
| 99 .convertloop: |
; mm0 = entry for the U byte (table at byte offset 2048) saturating-added to
; the entry for the V byte (table at byte offset 4096). Entries are 8 bytes.
| 100 movzx TEMPd, BYTE [Uq] |
| 101 movq mm0, [TABLEq + 2048 + 8 * TEMPq] |
| 102 add Uq, 1 |
| 103 |
| 104 movzx TEMPd, BYTE [Vq] |
| 105 paddsw mm0, [TABLEq + 4096 + 8 * TEMPq] |
| 106 add Vq, 1 |
| 107 |
; mm1 / mm2 = entries (table at offset 0) for the two Y bytes that share the
; U/V pair loaded above.
| 108 movzx TEMPd, BYTE [Yq] |
| 109 movq mm1, [TABLEq + 8 * TEMPq] |
| 110 |
| 111 movzx TEMPd, BYTE [Yq + 1] |
| 112 movq mm2, [TABLEq + 8 * TEMPq] |
| 113 add Yq, 2 |
| 114 |
| 115 ; Add UV components to Y component. |
| 116 paddsw mm1, mm0 |
| 117 paddsw mm2, mm0 |
| 118 |
| 119 ; Down shift and then pack. |
| 120 psraw mm1, 6 |
| 121 psraw mm2, 6 |
| 122 packuswb mm1, mm2 |
| 123 |
| 124 ; Unpack and multiply by alpha value, then repack high bytes of words. |
; NOTE(review): the multiplier is read from TABLE + 6144 here, while the
; non-PIC path reads it from the separate kWordDup symbol. This assumes the
; same data sits at byte offset 6144 of kCoefficientsRgbY. Confirm.
| 125 movq mm0, mm1 |
| 126 pxor mm2, mm2 |
| 127 punpcklbw mm0, mm2 |
| 128 punpckhbw mm1, mm2 |
| 129 movzx TEMPd, BYTE [Aq] |
| 130 movq mm2, [TABLEq + 6144 + 8 * TEMPq] |
| 131 pmullw mm0, mm2 |
| 132 psrlw mm0, 8 |
| 133 movzx TEMPd, BYTE [Aq + 1] |
| 134 movq mm2, [TABLEq + 6144 + 8 * TEMPq] |
| 135 add Aq, 2 |
| 136 pmullw mm1, mm2 |
| 137 psrlw mm1, 8 |
| 138 packuswb mm0, mm1 |
| 139 |
; NOTE(review): upper-case MOVQ is not defined in this file; presumably the
; including file chooses the actual store instruction. Confirm.
| 140 MOVQ [ARGBq], mm0 |
| 141 add ARGBq, 8 |
| 142 |
| 143 .convertend: |
; The counter is the low 32 bits of the value pushed above, updated in place
; at the top of the stack. Keep looping while counter - 2 is non-negative.
| 144 sub dword [rsp], 2 |
| 145 jns .convertloop |
| 146 |
| 147 ; If number of pixels is odd then compute it. |
; For a non-negative starting width, the counter is now -1 (one pixel left)
; or -2 (none left), so bit 0 tells whether a final pixel remains.
| 148 and dword [rsp], 1 |
| 149 jz .convertdone |
| 150 |
; Same lookups as the loop body, but for a single Y byte.
| 151 movzx TEMPd, BYTE [Uq] |
| 152 movq mm0, [TABLEq + 2048 + 8 * TEMPq] |
| 153 movzx TEMPd, BYTE [Vq] |
| 154 paddsw mm0, [TABLEq + 4096 + 8 * TEMPq] |
| 155 movzx TEMPd, BYTE [Yq] |
| 156 movq mm1, [TABLEq + 8 * TEMPq] |
| 157 paddsw mm1, mm0 |
| 158 psraw mm1, 6 |
| 159 packuswb mm1, mm1 |
| 160 |
| 161 ; Multiply ARGB by alpha value. |
; Only the low four bytes are widened, scaled by the entry at TABLE + 6144 for
; the single A byte, shifted right by 8 and repacked.
| 162 pxor mm0, mm0 |
| 163 punpcklbw mm1, mm0 |
| 164 movzx TEMPd, BYTE [Aq] |
| 165 movq mm0, [TABLEq + 6144 + 8 * TEMPq] |
| 166 pmullw mm1, mm0 |
| 167 psrlw mm1, 8 |
| 168 packuswb mm1, mm1 |
| 169 |
; movd stores just the low 4 bytes of mm1.
| 170 movd [ARGBq], mm1 |
| 171 |
| 172 .convertdone: |
; Balance the PUSH of the counter; the value popped into TABLE is not used
; again before RET.
| 173 POP TABLEq |
| 174 RET |
| 175 %endif |
OLD | NEW |