#include <xmmintrin.h>

#include "qcmsint.h"

/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
#define FLOATSCALE  (float)(PRECACHE_OUTPUT_SIZE)
#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
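/* scaling a clamped value by FLOATSCALE yields an output-table index in
 * [0, PRECACHE_OUTPUT_SIZE - 1]; CLAMPMAXVAL is (N-1)/N so the scaled
 * index can never reach PRECACHE_OUTPUT_SIZE */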
static const ALIGN float floatScaleX4[4] =
        { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
static const ALIGN float clampMaxValueX4[4] =
        { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};

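/* Transform a buffer of packed RGB pixels: each channel is looked up in
 * its input gamma table, the resulting floats are multiplied by the 3x3
 * colour matrix (stored as three 4-wide rows), summed, clamped, and
 * scaled into indices for the per-channel output tables. */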
void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
                                          unsigned char *src,
                                          unsigned char *dest,
                                          size_t length)
{
        unsigned int i;
        float (*mat)[4] = transform->matrix;
        char input_back[32];
        /* Ensure we have a buffer that's 16 byte aligned regardless of the original
         * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
         * because they don't work on stack variables. gcc 4.4 does do the right thing
         * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
        float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
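        /* note: rounding &input_back[16] down to a 16-byte boundary always
         * stays inside input_back and leaves at least 16 usable bytes */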
        /* share input and output locations to save having to keep the
         * locations in separate registers */
        uint32_t const * output = (uint32_t*)input;

        /* deref *transform now to avoid it in loop */
        const float *igtbl_r = transform->input_gamma_table_r;
        const float *igtbl_g = transform->input_gamma_table_g;
        const float *igtbl_b = transform->input_gamma_table_b;

        /* deref *transform now to avoid it in loop */
        const uint8_t *otdata_r = &transform->output_table_r->data[0];
        const uint8_t *otdata_g = &transform->output_table_g->data[0];
        const uint8_t *otdata_b = &transform->output_table_b->data[0];

        /* input matrix values never change */
        const __m128 mat0 = _mm_load_ps(mat[0]);
        const __m128 mat1 = _mm_load_ps(mat[1]);
        const __m128 mat2 = _mm_load_ps(mat[2]);

        /* these values don't change, either */
        const __m128 max = _mm_load_ps(clampMaxValueX4);
        const __m128 min = _mm_setzero_ps();
        const __m128 scale = _mm_load_ps(floatScaleX4);

        /* working variables */
        __m128 vec_r, vec_g, vec_b, result;

        /* CYA */
        if (!length)
                return;

        /* one pixel is handled outside of the loop */
        length--;

        /* setup for transforming 1st pixel */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 3;

        /* transform all but final pixel */

        for (i=0; i<length; i++)
        {
                /* position values from gamma tables */
                vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
                vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
                vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

                /* gamma * matrix */
                vec_r = _mm_mul_ps(vec_r, mat0);
                vec_g = _mm_mul_ps(vec_g, mat1);
                vec_b = _mm_mul_ps(vec_b, mat2);

                /* crunch, crunch, crunch */
                vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
                vec_r = _mm_max_ps(min, vec_r);
                vec_r = _mm_min_ps(max, vec_r);
                result = _mm_mul_ps(vec_r, scale);

                /* store calc'd output tables indices */
                *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
                result = _mm_movehl_ps(result, result);
                *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
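                /* _mm_cvtps_pi32 rounds to nearest by default and returns an
                 * MMX register, which is why _mm_empty() is needed on exit */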

                /* load for next loop while store completes */
                vec_r = _mm_load_ss(&igtbl_r[src[0]]);
                vec_g = _mm_load_ss(&igtbl_g[src[1]]);
                vec_b = _mm_load_ss(&igtbl_b[src[2]]);
                src += 3;

                /* use calc'd indices to output RGB values */
                dest[0] = otdata_r[output[0]];
                dest[1] = otdata_g[output[1]];
                dest[2] = otdata_b[output[2]];
                dest += 3;
        }

        /* handle final (maybe only) pixel */

        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
        result = _mm_movehl_ps(result, result);
        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

        dest[0] = otdata_r[output[0]];
        dest[1] = otdata_g[output[1]];
        dest[2] = otdata_b[output[2]];

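        /* clear MMX state (emms) so callers can safely use x87 floating point */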
        _mm_empty();
}

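/* Same as the RGB transform above, except pixels are four bytes wide and
 * the alpha byte is passed through unchanged. */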
void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
        unsigned int i;
        float (*mat)[4] = transform->matrix;
        char input_back[32];
        /* Ensure we have a buffer that's 16 byte aligned regardless of the original
         * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
         * because they don't work on stack variables. gcc 4.4 does do the right thing
         * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
        float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
        /* share input and output locations to save having to keep the
         * locations in separate registers */
        uint32_t const * output = (uint32_t*)input;

        /* deref *transform now to avoid it in loop */
        const float *igtbl_r = transform->input_gamma_table_r;
        const float *igtbl_g = transform->input_gamma_table_g;
        const float *igtbl_b = transform->input_gamma_table_b;

        /* deref *transform now to avoid it in loop */
        const uint8_t *otdata_r = &transform->output_table_r->data[0];
        const uint8_t *otdata_g = &transform->output_table_g->data[0];
        const uint8_t *otdata_b = &transform->output_table_b->data[0];

        /* input matrix values never change */
        const __m128 mat0 = _mm_load_ps(mat[0]);
        const __m128 mat1 = _mm_load_ps(mat[1]);
        const __m128 mat2 = _mm_load_ps(mat[2]);

        /* these values don't change, either */
        const __m128 max = _mm_load_ps(clampMaxValueX4);
        const __m128 min = _mm_setzero_ps();
        const __m128 scale = _mm_load_ps(floatScaleX4);

        /* working variables */
        __m128 vec_r, vec_g, vec_b, result;
        unsigned char alpha;

        /* CYA */
        if (!length)
                return;

        /* one pixel is handled outside of the loop */
        length--;

        /* setup for transforming 1st pixel */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        alpha = src[3];
        src += 4;

        /* transform all but final pixel */

        for (i=0; i<length; i++)
        {
                /* position values from gamma tables */
                vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
                vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
                vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

                /* gamma * matrix */
                vec_r = _mm_mul_ps(vec_r, mat0);
                vec_g = _mm_mul_ps(vec_g, mat1);
                vec_b = _mm_mul_ps(vec_b, mat2);

                /* store alpha for this pixel; load alpha for next */
                dest[3] = alpha;
                alpha = src[3];

                /* crunch, crunch, crunch */
                vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
                vec_r = _mm_max_ps(min, vec_r);
                vec_r = _mm_min_ps(max, vec_r);
                result = _mm_mul_ps(vec_r, scale);

                /* store calc'd output tables indices */
                *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
                result = _mm_movehl_ps(result, result);
                *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

                /* load gamma values for next loop while store completes */
                vec_r = _mm_load_ss(&igtbl_r[src[0]]);
                vec_g = _mm_load_ss(&igtbl_g[src[1]]);
                vec_b = _mm_load_ss(&igtbl_b[src[2]]);
                src += 4;

                /* use calc'd indices to output RGB values */
                dest[0] = otdata_r[output[0]];
                dest[1] = otdata_g[output[1]];
                dest[2] = otdata_b[output[2]];
                dest += 4;
        }

        /* handle final (maybe only) pixel */

        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

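        /* the alpha copy is interleaved with the vector math, presumably to
         * overlap the byte store with the multiplies */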
        dest[3] = alpha;

        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
        result = _mm_movehl_ps(result, result);
        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

        dest[0] = otdata_r[output[0]];
        dest[1] = otdata_g[output[1]];
        dest[2] = otdata_b[output[2]];

        _mm_empty();
}