OLD | NEW |
---|---|
(Empty) | |
1 ; Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
scherkus (not reviewing)
2013/04/04 00:36:17
FYI I have no idea if this code is correct
it loo
vignesh
2013/04/04 18:17:52
yes, that is correct. Although, I have to admit th
| |
2 ; Use of this source code is governed by a BSD-style license that can be | |
3 ; found in the LICENSE file. | |
4 | |
; Export the routine under the name produced by mangle(SYMBOL). | |
; SYMBOL, mangle, PRIVATE and function_align are not defined in this | |
; listing; presumably they come from the including file or from shared | |
; macro headers - TODO confirm. | |
5 global mangle(SYMBOL) PRIVATE | |
; Align the code that follows to function_align. | |
6 align function_align | |
7 | |
; mangle(SYMBOL), non-PIC variant. | |
; | |
; Converts Y, U, V and A source bytes into packed 4-byte pixels stored at | |
; ARGB, multiplying each converted pixel by a per-pixel alpha factor. | |
; | |
; Names bound by PROLOGUE, in order: Y, U, V, A (source pointers), ARGB | |
; (destination pointer), WIDTH (pixel count), TEMP (scratch index). | |
; | |
; Each loop iteration reads 2 Y bytes, 1 U byte, 1 V byte and 2 A bytes and | |
; stores 8 bytes (two pixels). An odd final pixel is handled after the loop. | |
; | |
; Table layout as used here (8-byte entries, indexed by a source byte): | |
;   kCoefficientsRgbY + 0    indexed by Y | |
;   kCoefficientsRgbY + 2048 indexed by U (2048 = 256 entries * 8 bytes) | |
;   kCoefficientsRgbY + 4096 indexed by V | |
;   kWordDup                 indexed by A | |
; | |
; NOTE(review): MMX registers are used and no EMMS appears in this listing; | |
; presumably the caller clears MMX state - confirm. | |
8 ; Non-PIC code is the fastest so use this if possible. | |
9 %ifndef PIC | |
10 mangle(SYMBOL): | |
11 %assign stack_offset 0 | |
; PROLOGUE is defined elsewhere. Presumably x86inc-style: 6 arguments, | |
; 7 general registers, 3 MMX registers, then the register names - confirm. | |
12 PROLOGUE 6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP | |
13 extern mangle(kCoefficientsRgbY) | |
14 extern mangle(kWordDup) | |
; Enter at the loop test, so no source byte is read unless the test at | |
; .convertend finds pixels left to convert. | |
15 jmp .convertend | |
16 | |
17 .convertloop: | |
; mm0 = U entry + V entry (saturating signed word add). This sum is shared | |
; by both pixels of the pair, so U and V advance by one byte per iteration. | |
18 movzx TEMPd, BYTE [Uq] | |
19 movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq] | |
20 add Uq, 1 | |
21 movzx TEMPd, BYTE [Vq] | |
22 paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq] | |
23 add Vq, 1 | |
; mm1 = Y entry for the first pixel, mm2 = Y entry for the second pixel. | |
24 movzx TEMPd, BYTE [Yq] | |
25 movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq] | |
26 movzx TEMPd, BYTE [Yq + 1] | |
27 movq mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPq] | |
28 add Yq, 2 | |
; Add the shared U/V term to each pixel, arithmetic-shift every word right | |
; by 6, then pack to bytes with unsigned saturation: the first pixel ends | |
; up in the low 4 bytes of mm1 and the second pixel in the high 4 bytes. | |
29 paddsw mm1, mm0 | |
30 paddsw mm2, mm0 | |
31 psraw mm1, 6 | |
32 psraw mm2, 6 | |
33 packuswb mm1, mm2 | |
34 | |
35 ; Multiply ARGB by alpha value. | |
; Widen the bytes back to words against a zeroed mm2 (mm0 = first pixel, | |
; mm1 = second pixel). For each pixel, multiply all four words by the | |
; kWordDup entry selected by that pixel's A byte and shift right by 8, | |
; keeping bits 8..15 of each product. Finally repack both pixels into mm0. | |
36 movq mm0, mm1 | |
37 pxor mm2, mm2 | |
38 punpcklbw mm0, mm2 | |
39 punpckhbw mm1, mm2 | |
40 movzx TEMPd, BYTE [Aq] | |
41 movq mm2, [mangle(kWordDup) + 8 * TEMPq] | |
42 pmullw mm0, mm2 | |
43 psrlw mm0, 8 | |
44 movzx TEMPd, BYTE [Aq + 1] | |
45 movq mm2, [mangle(kWordDup) + 8 * TEMPq] | |
46 add Aq, 2 | |
47 pmullw mm1, mm2 | |
48 psrlw mm1, 8 | |
49 packuswb mm0, mm1 | |
50 | |
; Store two pixels (8 bytes) and advance the destination. | |
; Upper-case MOVQ is presumably a macro defined elsewhere - TODO confirm. | |
51 MOVQ [ARGBq], mm0 | |
52 add ARGBq, 8 | |
53 | |
54 .convertend: | |
; Loop while at least two pixels remain (WIDTH - 2 is non-negative). | |
55 sub WIDTHq, 2 | |
56 jns .convertloop | |
57 | |
58 ; If the pixel count is odd, convert the one remaining pixel. | |
; Subtracting 2 never changes parity, so the low bit of WIDTH is set here | |
; exactly when the original count was odd. | |
59 and WIDTHq, 1 | |
60 jz .convertdone | |
61 | |
; Same table lookups as the loop body, for a single pixel. | |
62 movzx TEMPd, BYTE [Uq] | |
63 movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq] | |
64 movzx TEMPd, BYTE [Vq] | |
65 paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq] | |
66 movzx TEMPd, BYTE [Yq] | |
67 movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq] | |
68 paddsw mm1, mm0 | |
69 psraw mm1, 6 | |
70 packuswb mm1, mm1 | |
71 | |
72 ; Multiply ARGB by alpha value. | |
73 pxor mm0, mm0 | |
74 punpcklbw mm1, mm0 | |
75 movzx TEMPd, BYTE [Aq] | |
76 movq mm0, [mangle(kWordDup) + 8 * TEMPq] | |
77 pmullw mm1, mm0 | |
78 psrlw mm1, 8 | |
79 packuswb mm1, mm1 | |
80 | |
; Store the single remaining pixel (4 bytes). | |
81 movd [ARGBq], mm1 | |
82 | |
83 .convertdone: | |
84 RET | |
85 %endif | |
86 | |
; mangle(SYMBOL), PIC variant. | |
; | |
; Performs the same per-pixel steps as the non-PIC variant, but every table | |
; access goes through a base register (TABLE) instead of an absolute symbol | |
; address. The pixel count is kept on the stack because its register name | |
; is rebound to TABLE. | |
; | |
; Each loop iteration reads 2 Y bytes, 1 U byte, 1 V byte and 2 A bytes and | |
; stores 8 bytes (two pixels). An odd final pixel is handled after the loop. | |
; | |
; Table offsets from TABLE (8-byte entries, indexed by a source byte): | |
;   + 0 by Y, + 2048 by U, + 4096 by V, + 6144 by A. | |
; | |
; NOTE(review): the alpha factors are read from kCoefficientsRgbY + 6144, | |
; whereas the non-PIC variant reads the separate symbol kWordDup. This | |
; presumes a fourth 256-entry table follows the Y/U/V tables and matches | |
; kWordDup - confirm against the table definition. | |
; NOTE(review): MMX registers are used and no EMMS appears in this listing; | |
; presumably the caller clears MMX state - confirm. | |
87 ; With PIC code we need to load the address of mangle(kCoefficientsRgbY). | |
88 ; This code is slower than the above version. | |
89 %ifdef PIC | |
90 mangle(SYMBOL): | |
91 %assign stack_offset 0 | |
; PROLOGUE is defined elsewhere. Presumably x86inc-style: 6 arguments, | |
; 7 general registers, 3 MMX registers, then the register names - confirm. | |
92 PROLOGUE 6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP | |
93 extern mangle(kCoefficientsRgbY) | |
; Save the pixel count on the stack, then rename the registers: the sixth | |
; name changes from WIDTH to TABLE. Assuming DEFINE_ARGS binds names by | |
; position, the register that held the count now holds the table base. | |
94 PUSH WIDTHq | |
95 DEFINE_ARGS Y, U, V, A, ARGB, TABLE, TEMP | |
; LOAD_SYM is defined elsewhere; presumably it loads the symbol's address | |
; in a position-independent way - TODO confirm. | |
96 LOAD_SYM TABLEq, mangle(kCoefficientsRgbY) | |
; Enter at the loop test, so no source byte is read unless the test at | |
; .convertend finds pixels left to convert. | |
97 jmp .convertend | |
98 | |
99 .convertloop: | |
; mm0 = U entry + V entry (saturating signed word add), shared by both | |
; pixels of the pair. | |
100 movzx TEMPd, BYTE [Uq] | |
101 movq mm0, [TABLEq + 2048 + 8 * TEMPq] | |
102 add Uq, 1 | |
103 | |
104 movzx TEMPd, BYTE [Vq] | |
105 paddsw mm0, [TABLEq + 4096 + 8 * TEMPq] | |
106 add Vq, 1 | |
107 | |
; mm1 = Y entry for the first pixel, mm2 = Y entry for the second pixel. | |
108 movzx TEMPd, BYTE [Yq] | |
109 movq mm1, [TABLEq + 8 * TEMPq] | |
110 | |
111 movzx TEMPd, BYTE [Yq + 1] | |
112 movq mm2, [TABLEq + 8 * TEMPq] | |
113 add Yq, 2 | |
114 | |
115 ; Add UV components to Y component. | |
116 paddsw mm1, mm0 | |
117 paddsw mm2, mm0 | |
118 | |
119 ; Arithmetic-shift each word right by 6, then pack to bytes with unsigned | |
; saturation: first pixel in the low 4 bytes of mm1, second in the high 4. | |
120 psraw mm1, 6 | |
121 psraw mm2, 6 | |
122 packuswb mm1, mm2 | |
123 | |
124 ; Unpack and multiply by alpha value, then repack high bytes of words. | |
; mm0 = first pixel as words, mm1 = second pixel as words. Each is | |
; multiplied by the entry at TABLE + 6144 selected by that pixel's A byte, | |
; and the shift by 8 keeps bits 8..15 of each product. | |
125 movq mm0, mm1 | |
126 pxor mm2, mm2 | |
127 punpcklbw mm0, mm2 | |
128 punpckhbw mm1, mm2 | |
129 movzx TEMPd, BYTE [Aq] | |
130 movq mm2, [TABLEq + 6144 + 8 * TEMPq] | |
131 pmullw mm0, mm2 | |
132 psrlw mm0, 8 | |
133 movzx TEMPd, BYTE [Aq + 1] | |
134 movq mm2, [TABLEq + 6144 + 8 * TEMPq] | |
135 add Aq, 2 | |
136 pmullw mm1, mm2 | |
137 psrlw mm1, 8 | |
138 packuswb mm0, mm1 | |
139 | |
; Store two pixels (8 bytes) and advance the destination. | |
; Upper-case MOVQ is presumably a macro defined elsewhere - TODO confirm. | |
140 MOVQ [ARGBq], mm0 | |
141 add ARGBq, 8 | |
142 | |
143 .convertend: | |
; [rsp] is the slot written by PUSH WIDTHq (assuming LOAD_SYM left the | |
; stack pointer where it was). Loop while at least two pixels remain. | |
; NOTE(review): this is a 32-bit (dword) update, while the non-PIC variant | |
; subtracts from the full-width WIDTHq - confirm the count fits in 32 bits. | |
144 sub dword [rsp], 2 | |
145 jns .convertloop | |
146 | |
147 ; If the pixel count is odd, convert the one remaining pixel. | |
; Subtracting 2 never changes parity, so the low bit of the saved count is | |
; set here exactly when the original count was odd. | |
148 and dword [rsp], 1 | |
149 jz .convertdone | |
150 | |
; Same table lookups as the loop body, for a single pixel. | |
151 movzx TEMPd, BYTE [Uq] | |
152 movq mm0, [TABLEq + 2048 + 8 * TEMPq] | |
153 movzx TEMPd, BYTE [Vq] | |
154 paddsw mm0, [TABLEq + 4096 + 8 * TEMPq] | |
155 movzx TEMPd, BYTE [Yq] | |
156 movq mm1, [TABLEq + 8 * TEMPq] | |
157 paddsw mm1, mm0 | |
158 psraw mm1, 6 | |
159 packuswb mm1, mm1 | |
160 | |
161 ; Multiply ARGB by alpha value. | |
162 pxor mm0, mm0 | |
163 punpcklbw mm1, mm0 | |
164 movzx TEMPd, BYTE [Aq] | |
165 movq mm0, [TABLEq + 6144 + 8 * TEMPq] | |
166 pmullw mm1, mm0 | |
167 psrlw mm1, 8 | |
168 packuswb mm1, mm1 | |
169 | |
; Store the single remaining pixel (4 bytes). | |
170 movd [ARGBq], mm1 | |
171 | |
172 .convertdone: | |
; Pop the saved count to rebalance the stack before returning; the popped | |
; value lands in the TABLE register, which is not read again. | |
173 POP TABLEq | |
174 RET | |
175 %endif | |
OLD | NEW |