OLD | NEW |
| (Empty) |
1 #include <emmintrin.h> | |
2 | |
3 #include "qcmsint.h" | |
4 | |
/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
/* Scale factor that maps a clamped [0,1) float channel value to an index
 * into the precached output tables. */
#define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE)
/* Largest value a channel may hold so that value * FLOATSCALE still rounds
 * to at most PRECACHE_OUTPUT_SIZE - 1 (the last valid table index). */
#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
/* Four identical lanes so the constants can be loaded with one aligned
 * _mm_load_ps instead of a load-scalar/shuffle sequence. */
static const ALIGN float floatScaleX4[4] =
    { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
static const ALIGN float clampMaxValueX4[4] =
    { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
12 | |
13 void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform, | |
14 unsigned char *src, | |
15 unsigned char *dest, | |
16 size_t length) | |
17 { | |
18 unsigned int i; | |
19 float (*mat)[4] = transform->matrix; | |
20 char input_back[32]; | |
21 /* Ensure we have a buffer that's 16 byte aligned regardless of the original | |
22 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(
align(32)) | |
23 * because they don't work on stack variables. gcc 4.4 does do the right thi
ng | |
24 * on x86 but that's too new for us right now. For more info: gcc bug #16660
*/ | |
25 float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); | |
26 /* share input and output locations to save having to keep the | |
27 * locations in separate registers */ | |
28 uint32_t const * output = (uint32_t*)input; | |
29 | |
30 /* deref *transform now to avoid it in loop */ | |
31 const float *igtbl_r = transform->input_gamma_table_r; | |
32 const float *igtbl_g = transform->input_gamma_table_g; | |
33 const float *igtbl_b = transform->input_gamma_table_b; | |
34 | |
35 /* deref *transform now to avoid it in loop */ | |
36 const uint8_t *otdata_r = &transform->output_table_r->data[0]; | |
37 const uint8_t *otdata_g = &transform->output_table_g->data[0]; | |
38 const uint8_t *otdata_b = &transform->output_table_b->data[0]; | |
39 | |
40 /* input matrix values never change */ | |
41 const __m128 mat0 = _mm_load_ps(mat[0]); | |
42 const __m128 mat1 = _mm_load_ps(mat[1]); | |
43 const __m128 mat2 = _mm_load_ps(mat[2]); | |
44 | |
45 /* these values don't change, either */ | |
46 const __m128 max = _mm_load_ps(clampMaxValueX4); | |
47 const __m128 min = _mm_setzero_ps(); | |
48 const __m128 scale = _mm_load_ps(floatScaleX4); | |
49 | |
50 /* working variables */ | |
51 __m128 vec_r, vec_g, vec_b, result; | |
52 | |
53 /* CYA */ | |
54 if (!length) | |
55 return; | |
56 | |
57 /* one pixel is handled outside of the loop */ | |
58 length--; | |
59 | |
60 /* setup for transforming 1st pixel */ | |
61 vec_r = _mm_load_ss(&igtbl_r[src[0]]); | |
62 vec_g = _mm_load_ss(&igtbl_g[src[1]]); | |
63 vec_b = _mm_load_ss(&igtbl_b[src[2]]); | |
64 src += 3; | |
65 | |
66 /* transform all but final pixel */ | |
67 | |
68 for (i=0; i<length; i++) | |
69 { | |
70 /* position values from gamma tables */ | |
71 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); | |
72 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); | |
73 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); | |
74 | |
75 /* gamma * matrix */ | |
76 vec_r = _mm_mul_ps(vec_r, mat0); | |
77 vec_g = _mm_mul_ps(vec_g, mat1); | |
78 vec_b = _mm_mul_ps(vec_b, mat2); | |
79 | |
80 /* crunch, crunch, crunch */ | |
81 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); | |
82 vec_r = _mm_max_ps(min, vec_r); | |
83 vec_r = _mm_min_ps(max, vec_r); | |
84 result = _mm_mul_ps(vec_r, scale); | |
85 | |
86 /* store calc'd output tables indices */ | |
87 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); | |
88 | |
89 /* load for next loop while store completes */ | |
90 vec_r = _mm_load_ss(&igtbl_r[src[0]]); | |
91 vec_g = _mm_load_ss(&igtbl_g[src[1]]); | |
92 vec_b = _mm_load_ss(&igtbl_b[src[2]]); | |
93 src += 3; | |
94 | |
95 /* use calc'd indices to output RGB values */ | |
96 dest[0] = otdata_r[output[0]]; | |
97 dest[1] = otdata_g[output[1]]; | |
98 dest[2] = otdata_b[output[2]]; | |
99 dest += 3; | |
100 } | |
101 | |
102 /* handle final (maybe only) pixel */ | |
103 | |
104 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); | |
105 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); | |
106 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); | |
107 | |
108 vec_r = _mm_mul_ps(vec_r, mat0); | |
109 vec_g = _mm_mul_ps(vec_g, mat1); | |
110 vec_b = _mm_mul_ps(vec_b, mat2); | |
111 | |
112 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); | |
113 vec_r = _mm_max_ps(min, vec_r); | |
114 vec_r = _mm_min_ps(max, vec_r); | |
115 result = _mm_mul_ps(vec_r, scale); | |
116 | |
117 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); | |
118 | |
119 dest[0] = otdata_r[output[0]]; | |
120 dest[1] = otdata_g[output[1]]; | |
121 dest[2] = otdata_b[output[2]]; | |
122 } | |
123 | |
124 void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform, | |
125 unsigned char *src, | |
126 unsigned char *dest, | |
127 size_t length) | |
128 { | |
129 unsigned int i; | |
130 float (*mat)[4] = transform->matrix; | |
131 char input_back[32]; | |
132 /* Ensure we have a buffer that's 16 byte aligned regardless of the original | |
133 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(
align(32)) | |
134 * because they don't work on stack variables. gcc 4.4 does do the right thi
ng | |
135 * on x86 but that's too new for us right now. For more info: gcc bug #16660
*/ | |
136 float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); | |
137 /* share input and output locations to save having to keep the | |
138 * locations in separate registers */ | |
139 uint32_t const * output = (uint32_t*)input; | |
140 | |
141 /* deref *transform now to avoid it in loop */ | |
142 const float *igtbl_r = transform->input_gamma_table_r; | |
143 const float *igtbl_g = transform->input_gamma_table_g; | |
144 const float *igtbl_b = transform->input_gamma_table_b; | |
145 | |
146 /* deref *transform now to avoid it in loop */ | |
147 const uint8_t *otdata_r = &transform->output_table_r->data[0]; | |
148 const uint8_t *otdata_g = &transform->output_table_g->data[0]; | |
149 const uint8_t *otdata_b = &transform->output_table_b->data[0]; | |
150 | |
151 /* input matrix values never change */ | |
152 const __m128 mat0 = _mm_load_ps(mat[0]); | |
153 const __m128 mat1 = _mm_load_ps(mat[1]); | |
154 const __m128 mat2 = _mm_load_ps(mat[2]); | |
155 | |
156 /* these values don't change, either */ | |
157 const __m128 max = _mm_load_ps(clampMaxValueX4); | |
158 const __m128 min = _mm_setzero_ps(); | |
159 const __m128 scale = _mm_load_ps(floatScaleX4); | |
160 | |
161 /* working variables */ | |
162 __m128 vec_r, vec_g, vec_b, result; | |
163 unsigned char alpha; | |
164 | |
165 /* CYA */ | |
166 if (!length) | |
167 return; | |
168 | |
169 /* one pixel is handled outside of the loop */ | |
170 length--; | |
171 | |
172 /* setup for transforming 1st pixel */ | |
173 vec_r = _mm_load_ss(&igtbl_r[src[0]]); | |
174 vec_g = _mm_load_ss(&igtbl_g[src[1]]); | |
175 vec_b = _mm_load_ss(&igtbl_b[src[2]]); | |
176 alpha = src[3]; | |
177 src += 4; | |
178 | |
179 /* transform all but final pixel */ | |
180 | |
181 for (i=0; i<length; i++) | |
182 { | |
183 /* position values from gamma tables */ | |
184 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); | |
185 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); | |
186 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); | |
187 | |
188 /* gamma * matrix */ | |
189 vec_r = _mm_mul_ps(vec_r, mat0); | |
190 vec_g = _mm_mul_ps(vec_g, mat1); | |
191 vec_b = _mm_mul_ps(vec_b, mat2); | |
192 | |
193 /* store alpha for this pixel; load alpha for next */ | |
194 dest[3] = alpha; | |
195 alpha = src[3]; | |
196 | |
197 /* crunch, crunch, crunch */ | |
198 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); | |
199 vec_r = _mm_max_ps(min, vec_r); | |
200 vec_r = _mm_min_ps(max, vec_r); | |
201 result = _mm_mul_ps(vec_r, scale); | |
202 | |
203 /* store calc'd output tables indices */ | |
204 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); | |
205 | |
206 /* load gamma values for next loop while store completes */ | |
207 vec_r = _mm_load_ss(&igtbl_r[src[0]]); | |
208 vec_g = _mm_load_ss(&igtbl_g[src[1]]); | |
209 vec_b = _mm_load_ss(&igtbl_b[src[2]]); | |
210 src += 4; | |
211 | |
212 /* use calc'd indices to output RGB values */ | |
213 dest[0] = otdata_r[output[0]]; | |
214 dest[1] = otdata_g[output[1]]; | |
215 dest[2] = otdata_b[output[2]]; | |
216 dest += 4; | |
217 } | |
218 | |
219 /* handle final (maybe only) pixel */ | |
220 | |
221 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); | |
222 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); | |
223 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); | |
224 | |
225 vec_r = _mm_mul_ps(vec_r, mat0); | |
226 vec_g = _mm_mul_ps(vec_g, mat1); | |
227 vec_b = _mm_mul_ps(vec_b, mat2); | |
228 | |
229 dest[3] = alpha; | |
230 | |
231 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); | |
232 vec_r = _mm_max_ps(min, vec_r); | |
233 vec_r = _mm_min_ps(max, vec_r); | |
234 result = _mm_mul_ps(vec_r, scale); | |
235 | |
236 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); | |
237 | |
238 dest[0] = otdata_r[output[0]]; | |
239 dest[1] = otdata_g[output[1]]; | |
240 dest[2] = otdata_b[output[2]]; | |
241 } | |
242 | |
243 | |
OLD | NEW |