OLD | NEW |
| (Empty) |
1 ; Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
2 ; Use of this source code is governed by a BSD-style license that can be | |
3 ; found in the LICENSE file. | |
4 | |
; Table-driven MMX row conversion: Y, U, V and A bytes are looked up in
; mangle(kCoefficientsRgbY), summed, alpha-multiplied and written to ARGB.
; SYMBOL and the mangle / PRIVATE / function_align macros come from outside this view.
5 global mangle(SYMBOL) PRIVATE | |
6 align function_align | |
7 | |
8 ; Non-PIC code is the fastest so use this if possible. | |
9 %ifndef PIC | |
; Non-PIC path: every lookup addresses the table directly by symbol.
10 mangle(SYMBOL): | |
11 %assign stack_offset 0 | |
; NOTE(review): PROLOGUE is defined elsewhere; presumably x86inc-style
; (6 arguments, 7 general registers, 3 vector registers, then register names) -- confirm.
12 PROLOGUE 6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP | |
13 extern mangle(kCoefficientsRgbY) | |
; Table regions used below: +0 indexed by Y, +2048 by U, +4096 by V, +6144 by A.
; Indices are bytes scaled by 8, so each region spans 256 * 8 = 2048 bytes.
; Enter at the loop test so a width below 2 never runs the two-pixel body.
14 jmp .convertend | |
15 | |
; Two pixels per iteration: Y and A advance by 2 bytes, U and V by 1 byte,
; so one U/V pair is shared by both pixels.
16 .convertloop: | |
; mm0 = U entry + V entry (saturating signed word add).
17 movzx TEMPd, BYTE [Uq] | |
18 movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq] | |
19 add Uq, 1 | |
20 movzx TEMPd, BYTE [Vq] | |
21 paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq] | |
22 add Vq, 1 | |
; mm1 / mm2 = table entries for Y[0] / Y[1].
23 movzx TEMPd, BYTE [Yq] | |
24 movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq] | |
25 movzx TEMPd, BYTE [Yq + 1] | |
26 movq mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPq] | |
27 add Yq, 2 | |
; Add the shared U+V term to each Y term, arithmetic-shift right by 6, then pack
; both pixels into 8 unsigned saturated bytes (first pixel in the low 4 bytes).
28 paddsw mm1, mm0 | |
29 paddsw mm2, mm0 | |
30 psraw mm1, 6 | |
31 psraw mm2, 6 | |
32 packuswb mm1, mm2 | |
33 | |
34 ; Multiply ARGB by alpha value. | |
; Widen bytes to words against a zeroed mm2: mm0 = first pixel, mm1 = second pixel.
35 movq mm0, mm1 | |
36 pxor mm2, mm2 | |
37 punpcklbw mm0, mm2 | |
38 punpckhbw mm1, mm2 | |
; First pixel: multiply each word by the entry selected by A[0]
; (pmullw keeps the low 16 bits of each product), then shift right by 8.
39 movzx TEMPd, BYTE [Aq] | |
40 movq mm2, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq] | |
41 pmullw mm0, mm2 | |
42 psrlw mm0, 8 | |
; Second pixel: same, using A[1].
43 movzx TEMPd, BYTE [Aq + 1] | |
44 movq mm2, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq] | |
45 add Aq, 2 | |
46 pmullw mm1, mm2 | |
47 psrlw mm1, 8 | |
; Repack both pixels into 8 bytes.
48 packuswb mm0, mm1 | |
49 | |
; Store 8 bytes (two pixels) and advance the output pointer.
; NOTE(review): upper-case MOVQ is not defined in this view; presumably a macro
; chosen by the including file -- confirm.
50 MOVQ [ARGBq], mm0 | |
51 add ARGBq, 8 | |
52 | |
; WIDTH -= 2; keep looping while the result is non-negative.
53 .convertend: | |
54 sub WIDTHq, 2 | |
55 jns .convertloop | |
56 | |
57 ; If number of pixels is odd then compute it. | |
; For a non-negative starting width WIDTH is now -2 or -1; bit 0 is set only for -1,
; i.e. when exactly one pixel is left.
58 and WIDTHq, 1 | |
59 jz .convertdone | |
60 | |
; Trailing single pixel: same lookups for one Y/U/V byte; pointers are not advanced.
61 movzx TEMPd, BYTE [Uq] | |
62 movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq] | |
63 movzx TEMPd, BYTE [Vq] | |
64 paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq] | |
65 movzx TEMPd, BYTE [Yq] | |
66 movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq] | |
67 paddsw mm1, mm0 | |
68 psraw mm1, 6 | |
69 packuswb mm1, mm1 | |
70 | |
71 ; Multiply ARGB by alpha value. | |
; Widen the low 4 bytes to words, multiply by the entry selected by A[0],
; shift right by 8 and repack.
72 pxor mm0, mm0 | |
73 punpcklbw mm1, mm0 | |
74 movzx TEMPd, BYTE [Aq] | |
75 movq mm0, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq] | |
76 pmullw mm1, mm0 | |
77 psrlw mm1, 8 | |
78 packuswb mm1, mm1 | |
79 | |
; movd stores only the low 4 bytes (one pixel).
80 movd [ARGBq], mm1 | |
81 | |
; NOTE(review): no emms is executed on this path; presumably the caller clears
; MMX state -- confirm.
82 .convertdone: | |
83 RET | |
84 %endif | |
85 | |
86 ; With PIC code we need to load the address of mangle(kCoefficientsRgbY). | |
87 ; This code is slower than the above version. | |
88 %ifdef PIC | |
; PIC path: the table address is held in a register (TABLE) and the pixel
; counter lives on the stack instead of in a register.
89 mangle(SYMBOL): | |
90 %assign stack_offset 0 | |
; NOTE(review): PROLOGUE is defined elsewhere; presumably x86inc-style
; (6 arguments, 7 general registers, 3 vector registers, then register names) -- confirm.
91 PROLOGUE 6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP | |
92 extern mangle(kCoefficientsRgbY) | |
; Save the width on the stack. DEFINE_ARGS then lists TABLE in the position WIDTH
; had, presumably rebinding that register under the new name (macro defined
; elsewhere -- confirm).
93 PUSH WIDTHq | |
94 DEFINE_ARGS Y, U, V, A, ARGB, TABLE, TEMP | |
; NOTE(review): LOAD_SYM is defined elsewhere; presumably loads the symbol's
; address into TABLE in a position-independent way -- confirm.
95 LOAD_SYM TABLEq, mangle(kCoefficientsRgbY) | |
; Table regions used below: +0 indexed by Y, +2048 by U, +4096 by V, +6144 by A.
; Indices are bytes scaled by 8, so each region spans 256 * 8 = 2048 bytes.
; Enter at the loop test so a width below 2 never runs the two-pixel body.
96 jmp .convertend | |
97 | |
; Two pixels per iteration: Y and A advance by 2 bytes, U and V by 1 byte,
; so one U/V pair is shared by both pixels.
98 .convertloop: | |
; mm0 = U entry.
99 movzx TEMPd, BYTE [Uq] | |
100 movq mm0, [TABLEq + 2048 + 8 * TEMPq] | |
101 add Uq, 1 | |
102 | |
; mm0 += V entry (saturating signed word add).
103 movzx TEMPd, BYTE [Vq] | |
104 paddsw mm0, [TABLEq + 4096 + 8 * TEMPq] | |
105 add Vq, 1 | |
106 | |
; mm1 = entry for Y[0].
107 movzx TEMPd, BYTE [Yq] | |
108 movq mm1, [TABLEq + 8 * TEMPq] | |
109 | |
; mm2 = entry for Y[1].
110 movzx TEMPd, BYTE [Yq + 1] | |
111 movq mm2, [TABLEq + 8 * TEMPq] | |
112 add Yq, 2 | |
113 | |
114 ; Add UV components to Y component. | |
115 paddsw mm1, mm0 | |
116 paddsw mm2, mm0 | |
117 | |
118 ; Down shift and then pack. | |
; Arithmetic shift right by 6, then pack both pixels into 8 unsigned saturated
; bytes (first pixel in the low 4 bytes).
119 psraw mm1, 6 | |
120 psraw mm2, 6 | |
121 packuswb mm1, mm2 | |
122 | |
123 ; Unpack and multiply by alpha value, then repack high bytes of words. | |
; Widen bytes to words against a zeroed mm2: mm0 = first pixel, mm1 = second pixel.
124 movq mm0, mm1 | |
125 pxor mm2, mm2 | |
126 punpcklbw mm0, mm2 | |
127 punpckhbw mm1, mm2 | |
; First pixel: multiply by the entry selected by A[0]
; (pmullw keeps the low 16 bits of each product), then shift right by 8.
128 movzx TEMPd, BYTE [Aq] | |
129 movq mm2, [TABLEq + 6144 + 8 * TEMPq] | |
130 pmullw mm0, mm2 | |
131 psrlw mm0, 8 | |
; Second pixel: same, using A[1].
132 movzx TEMPd, BYTE [Aq + 1] | |
133 movq mm2, [TABLEq + 6144 + 8 * TEMPq] | |
134 add Aq, 2 | |
135 pmullw mm1, mm2 | |
136 psrlw mm1, 8 | |
137 packuswb mm0, mm1 | |
138 | |
; Store 8 bytes (two pixels) and advance the output pointer.
; NOTE(review): upper-case MOVQ is not defined in this view; presumably a macro
; chosen by the including file -- confirm.
139 MOVQ [ARGBq], mm0 | |
140 add ARGBq, 8 | |
141 | |
; The counter is the stack slot written by PUSH WIDTHq; only its low 32 bits
; are updated. Subtract 2 and keep looping while the result is non-negative.
142 .convertend: | |
143 sub dword [rsp], 2 | |
144 jns .convertloop | |
145 | |
146 ; If number of pixels is odd then compute it. | |
; For a non-negative starting width the counter is now -2 or -1; bit 0 is set
; only for -1, i.e. when exactly one pixel is left.
147 and dword [rsp], 1 | |
148 jz .convertdone | |
149 | |
; Trailing single pixel: same lookups for one Y/U/V byte; pointers are not advanced.
150 movzx TEMPd, BYTE [Uq] | |
151 movq mm0, [TABLEq + 2048 + 8 * TEMPq] | |
152 movzx TEMPd, BYTE [Vq] | |
153 paddsw mm0, [TABLEq + 4096 + 8 * TEMPq] | |
154 movzx TEMPd, BYTE [Yq] | |
155 movq mm1, [TABLEq + 8 * TEMPq] | |
156 paddsw mm1, mm0 | |
157 psraw mm1, 6 | |
158 packuswb mm1, mm1 | |
159 | |
160 ; Multiply ARGB by alpha value. | |
; Widen the low 4 bytes to words, multiply by the entry selected by A[0],
; shift right by 8 and repack.
161 pxor mm0, mm0 | |
162 punpcklbw mm1, mm0 | |
163 movzx TEMPd, BYTE [Aq] | |
164 movq mm0, [TABLEq + 6144 + 8 * TEMPq] | |
165 pmullw mm1, mm0 | |
166 psrlw mm1, 8 | |
167 packuswb mm1, mm1 | |
168 | |
; movd stores only the low 4 bytes (one pixel).
169 movd [ARGBq], mm1 | |
170 | |
; Pop the slot pushed by PUSH WIDTHq so the stack is balanced before RET; the
; popped value lands in the TABLE register, which is not used again.
; NOTE(review): no emms is executed on this path; presumably the caller clears
; MMX state -- confirm.
171 .convertdone: | |
172 POP TABLEq | |
173 RET | |
174 %endif | |
OLD | NEW |